In [144]:
from nltk.corpus import semcor, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.wordnet import Synset
from nltk import Tree
import nltk
import re
import random
from typing import Set, List, AnyStr, Union

In [145]:
with open('data/stop_words_FULL.txt') as f:
    stop_words = {line for line in f.read().splitlines()}
    
sentences = semcor.sents()
tagged_sentences = semcor.tagged_sents(tag='sem')

In [146]:
def get_random_index(evaluated_indices: Set[int], max_index: int) -> int:
    while True:
        index = random.randrange(max_index)
        if index not in evaluated_indices:
            return index


def get_random_word(tagged_sentence: List[Union[AnyStr, Tree]], pos: AnyStr=None) -> Union[AnyStr, None]:
    words = []
    for word in tagged_sentence:
        if type(word) is Tree and type(word.label()) != str:
            if pos: # if pos is filtered
                if word.label().synset().pos() == pos:
                    words.append(' '.join(word.leaves()))
            else:
                words.append(' '.join(word.leaves()))
    
    while words:
        word = random.choice(words)
        if word not in stop_words and len(wordnet.synsets(word, pos=pos)) > 0:
            return word.lower()
        words.remove(word)
        
    return None


### Pre-Processing

In [147]:
def pre_processing(sentence: AnyStr) -> Set[AnyStr]:
    return set(remove_stopwords(tokenize_sentence(remove_punctuation(sentence))))


def remove_stopwords(words: List[AnyStr]) -> List[AnyStr]:
    return [value for value in words if value not in stop_words]


def tokenize_sentence(sentence: AnyStr) -> List[AnyStr]:
    words = []
    lmtzr = WordNetLemmatizer()
    for tag in nltk.pos_tag(word_tokenize(sentence)):
        words.append(lmtzr.lemmatize(tag[0]).lower())
    return words

# remove punctuation and multiple spaces
def remove_punctuation(sentence: AnyStr) -> AnyStr:
    return re.sub('\s\s+', ' ', re.sub(r'[^\w\s]', '', sentence))

### Lesk Algorithm

In [148]:
def get_signature(sense: Synset) -> Set[AnyStr]:
    signature = set()
    sentence_list = [sense.definition()] + sense.examples()

    for sentence in sentence_list:
        for word in pre_processing(sentence):
            signature.add(word)
    return signature


def get_word_sense(noun: AnyStr, tagged_sentence: List[Union[AnyStr, Tree]]) -> Union[Synset, None]:
    for word in tagged_sentence:
        if type(word) is Tree and word[0] == noun:
            return word.label().synset()
    return None


def lesk_algorithm(word: AnyStr, sentence: AnyStr, pos: AnyStr=None) -> Synset:
    max_overlap = 0
    context = pre_processing(sentence)
    synsets = wordnet.synsets(word, pos=pos)
    best_sense = synsets[0]
        
    for sense in synsets:
        signature = get_signature(sense)
        overlap = len(signature & context)
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
            
    return best_sense

### Disambiguation (1)

In [167]:
max_k = 50
max_index = 3000
evaluated_indices = set()
predictions = list()
reference = list()

for i in range(max_k):
    while True:
        index = get_random_index(evaluated_indices, max_index)
        word = get_random_word(tagged_sentences[index], wordnet.NOUN)
        evaluated_indices.add(index)
        if word:
            break

    sentence = sentences[index]
    best_sense = lesk_algorithm(word, ' '.join(word for word in sentence), pos=wordnet.NOUN)
    target_sense = get_word_sense(word, tagged_sentences[index])
    
    predictions.append(best_sense)
    reference.append(target_sense)
    
    print(f'Sentence: "{" ".join(sentence)}"')
    print(f'Word: {word.upper()}')
    print(f'Best sense: "{str(best_sense)} - {best_sense.definition()}"\n')
    

Sentence: "The Birds got five hits and all three of their runs off Kunkel before Hartman took over in the top of the fourth ."
Word: BIRDS
Best sense: "Synset('bird.n.01') - warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings"

Sentence: "While emphasizing that technical details were not fully worked out , Pelham said his resolution would seek to set aside the privilege resolution which the House voted through 87 - 31 ."
Word: PRIVILEGE
Best sense: "Synset('privilege.n.01') - a special advantage or immunity or benefit not enjoyed by all"

Sentence: "Practical management problems and their suggested solutions are dealt with in a series of SBA publications ."
Word: SOLUTIONS
Best sense: "Synset('solution.n.02') - a statement that solves a problem or explains how to solve the problem"

Sentence: "The real question that follows is - how are those four years used and what is their value as training ?"
Word: TRAINING
Best sense: "Synset('training.n.0

### Evaluation

In [168]:
correctly_evaluated = [target for (prediction, target) in zip(predictions, reference) if prediction is target]
accuracy = len(correctly_evaluated) / len(reference)
print(f'Accuracy: {accuracy}')

Accuracy: 0.46


### Disambiguation (2)

In [169]:
max_runs = 10
max_k = 50
max_index = 7000
run_predictions = list()
run_reference = list()

for run in range(max_runs):
    evaluated_indices = set()
    predictions = list()
    reference = list()

    for i in range(max_k):
        while True:
            index = get_random_index(evaluated_indices, max_index)
            word = get_random_word(tagged_sentences[index])
            evaluated_indices.add(index)
            if word:
                break

        sentence = sentences[index]
        best_sense = lesk_algorithm(word, ' '.join(word for word in sentence))
        target_sense = get_word_sense(word, tagged_sentences[index])
        
        predictions.append(best_sense)
        reference.append(target_sense)
    
    run_predictions.append(predictions)
    run_reference.append(reference)

### Evaluation

In [158]:
accuracy_list = list()

for (prediction, reference) in zip(run_predictions, run_reference):
    correctly_evaluated = [target for (prediction, target) in zip(prediction, reference) if prediction is target]
    accuracy_list.append(len(correctly_evaluated) / len(reference))

print(f'Total executions: {max_runs}')
print(f'Accuracy list: {accuracy_list}')
print(f'Average accuracy: {sum(accuracy_list) / len(accuracy_list)}')


Total executions: 10
Accuracy list: [0.4, 0.42, 0.44, 0.42, 0.6, 0.3, 0.46, 0.44, 0.4, 0.44]
Average accuracy: 0.43199999999999994
