In [15]:
from nltk.corpus import semcor, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.wordnet import Synset
from nltk import Tree
import nltk
import re
import random
from typing import Set, List, AnyStr, Union

In [16]:
with open('data/stop_words_FULL.txt') as f:
    stop_words = {line for line in f.read().splitlines()}
    
sentences = semcor.sents()
tagged_sentences = semcor.tagged_sents(tag='sem')

In [17]:
def get_random_index(evaluated_indices: Set[int], max_index: int) -> int:
    while True:
        index = random.randrange(max_index)
        if index not in evaluated_indices:
            return index


def get_random_word(tagged_sentence: List[Union[AnyStr, Tree]], pos: AnyStr=None) -> Union[AnyStr, None]:
    words = []
    for word in tagged_sentence:
        if type(word) is Tree and type(word.label()) != str:
            if pos: # if pos is filtered
                if word.label().synset().pos() == pos:
                    words.append(' '.join(word.leaves()))
            else:
                words.append(' '.join(word.leaves()))
    
    while words:
        word = random.choice(words)
        if word not in stop_words and len(wordnet.synsets(word, pos=pos)) > 0:
            return word.lower()
        words.remove(word)
        
    return None


### Pre-Processing

In [18]:
def bag_of_words(sentence: AnyStr) -> Set[AnyStr]:
    return set(remove_stopwords(tokenize_sentence(remove_punctuation(sentence))))


def remove_stopwords(words: List[AnyStr]) -> List[AnyStr]:
    return [value for value in words if value not in stop_words]


def tokenize_sentence(sentence: AnyStr) -> List[AnyStr]:
    words = []
    lmtzr = WordNetLemmatizer()
    for tag in nltk.pos_tag(word_tokenize(sentence)):
        words.append(lmtzr.lemmatize(tag[0]).lower())
    return words

# remove punctuation and multiple spaces
def remove_punctuation(sentence: AnyStr) -> AnyStr:
    return re.sub('\s\s+', ' ', re.sub(r'[^\w\s]', '', sentence))

### Lesk Algorithm

In [19]:
def get_signature(sense: Synset) -> Set[AnyStr]:
    signature = set()
    sentence_list = [sense.definition()] + sense.examples()

    for sentence in sentence_list:
        signature.update(bag_of_words(sentence))
    return signature


def get_word_sense(noun: AnyStr, tagged_sentence: List[Union[AnyStr, Tree]]) -> Union[Synset, None]:
    for word in tagged_sentence:
        if type(word) is Tree and word[0] == noun:
            return word.label().synset()
    return None


def lesk_algorithm(word: AnyStr, sentence: AnyStr, pos: AnyStr=None) -> Synset:
    max_overlap = 0
    context = bag_of_words(sentence)
    synsets = wordnet.synsets(word, pos=pos)
    best_sense = synsets[0]
        
    for sense in synsets:
        signature = get_signature(sense)
        overlap = len(signature & context)
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
            
    return best_sense

### Disambiguation (1)

In [20]:
max_k = 50
max_index = 3000
evaluated_indices = set()
predictions = list()
reference = list()

for i in range(max_k):
    while True:
        index = get_random_index(evaluated_indices, max_index)
        word = get_random_word(tagged_sentences[index], wordnet.NOUN)
        evaluated_indices.add(index)
        if word:
            break

    sentence = sentences[index]
    best_sense = lesk_algorithm(word, ' '.join(word for word in sentence), pos=wordnet.NOUN)
    target_sense = get_word_sense(word, tagged_sentences[index])
    
    predictions.append(best_sense)
    reference.append(target_sense)
    
    print(f'Sentence: "{" ".join(sentence)}"')
    print(f'Word: {word.upper()}')
    print(f'Best sense: "{str(best_sense)} - {best_sense.definition()}"\n')
    

Sentence: "He flounders and lets music sprawl ."
Word: MUSIC
Best sense: "Synset('music.n.02') - any agreeable (pleasing and harmonious) sounds"

Sentence: "Some time ago , however , Mr. Khrushchev decided that when bigger bombs were made , the Soviet Union would make them ."
Word: BOMBS
Best sense: "Synset('bomb.n.01') - an explosive device fused to explode under specific conditions"

Sentence: "It would authorize the Texas Education Agency to establish county-wide day schools for the deaf in counties of 300000 or more population , require deaf children between 6 and 13 years of age to attend the day schools , permitting older ones to attend the residential Texas School for the Deaf here ."
Word: CHILDREN
Best sense: "Synset('child.n.02') - a human offspring (son or daughter) of any age"

Sentence: "A tape of cellulose acetate is pulled between the blocks and the tape pulls the fluid or paste with it between the parallel faces of the blocks ."
Word: FACES
Best sense: "Synset('face.n.0

### Evaluation

In [21]:
correctly_evaluated = [target for (prediction, target) in zip(predictions, reference) if prediction is target]
accuracy = len(correctly_evaluated) / len(reference)
print(f'Accuracy: {accuracy}')

Accuracy: 0.48


### Disambiguation (2)

In [23]:
max_runs = 10
max_k = 50
max_index = 7000
run_predictions = list()
run_reference = list()

for run in range(max_runs):
    evaluated_indices = set()
    predictions = list()
    reference = list()

    for i in range(max_k):
        while True:
            index = get_random_index(evaluated_indices, max_index)
            word = get_random_word(tagged_sentences[index], wordnet.NOUN)
            evaluated_indices.add(index)
            if word:
                break

        sentence = sentences[index]
        best_sense = lesk_algorithm(word, ' '.join(word for word in sentence), pos=wordnet.NOUN)
        target_sense = get_word_sense(word, tagged_sentences[index])
        
        predictions.append(best_sense)
        reference.append(target_sense)
    
    run_predictions.append(predictions)
    run_reference.append(reference)

### Evaluation

In [24]:
accuracy_list = list()

for (prediction, reference) in zip(run_predictions, run_reference):
    correctly_evaluated = [target for (prediction, target) in zip(prediction, reference) if prediction is target]
    accuracy_list.append(len(correctly_evaluated) / len(reference))

print(f'Total executions: {max_runs}')
print(f'Accuracy list: {accuracy_list}')
print(f'Average accuracy: {sum(accuracy_list) / len(accuracy_list)}')


Total executions: 10
Accuracy list: [0.46, 0.48, 0.48, 0.58, 0.54, 0.36, 0.6, 0.48, 0.52, 0.54]
Average accuracy: 0.504
