In [None]:
from nltk.corpus import semcor, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import Tree
import nltk
import re
import random

In [None]:
# Load the stop-word list (one entry per line). Entries are stripped and blank
# lines are skipped, so stray whitespace in the file cannot produce entries
# that would never match a token.
with open('data/stop_words_FULL.txt') as f:
    stop_words = {line.strip() for line in f.read().splitlines() if line.strip()}

# SemCor sentences (joined with spaces further down) and the same sentences
# requested with tag='sem', whose chunks are inspected for sense labels.
sentences = semcor.sents()
tagged_sentences = semcor.tagged_sents(tag='sem')

In [None]:
def get_random_index(evaluated_indices, max_index):
    """Draw a random index in ``range(max_index)`` that has not been used yet.

    Args:
        evaluated_indices: collection of indices that must not be returned.
        max_index: exclusive upper bound of the candidate indices.

    Returns:
        An integer ``i`` with ``0 <= i < max_index`` and
        ``i not in evaluated_indices``.

    Raises:
        ValueError: if no such index exists (``max_index`` is not positive or
            every index in the range was already evaluated). Without this
            check the sampling loop below would never terminate.
    """
    # Only indices inside the sampling range reduce the number of free slots.
    used_in_range = len({i for i in evaluated_indices if 0 <= i < max_index})
    if used_in_range >= max_index:
        raise ValueError(
            f'no unevaluated index left in range({max_index})'
        )

    # Rejection sampling: guaranteed to terminate because a free index exists.
    while True:
        index = random.randrange(max_index)
        if index not in evaluated_indices:
            return index


def get_random_word(tagged_sentence, pos):
    """Pick a random word of part of speech ``pos`` from a sense-tagged sentence.

    Only ``Tree`` chunks whose label is not a plain string are considered; the
    POS is read from ``label.synset().pos()``. A candidate is accepted when its
    lowercased form is not a stop word and WordNet knows at least one sense of
    it for ``pos``.

    Args:
        tagged_sentence: iterable of chunks from ``tagged_sentences``.
        pos: WordNet POS constant (e.g. ``wordnet.NOUN``).

    Returns:
        The chosen word in lowercase (leaves of the chunk joined by spaces),
        or ``None`` when the sentence has no acceptable candidate.
    """
    candidates = []
    for chunk in tagged_sentence:
        if not isinstance(chunk, Tree):
            continue
        label = chunk.label()
        # String labels carry no synset to read the POS from.
        if isinstance(label, str):
            continue
        if label.synset().pos() == pos:
            # Lowercase up front: stop words are matched against lowercased
            # tokens elsewhere, and the function returns lowercase anyway.
            candidates.append(' '.join(chunk.leaves()).lower())

    # A random permutation visits candidates in the same distribution as
    # repeatedly choosing one at random and discarding it on failure.
    random.shuffle(candidates)
    for candidate in candidates:
        # Require a sense in the requested POS so the disambiguation step
        # always has candidate senses of the right kind.
        if candidate not in stop_words and wordnet.synsets(candidate, pos=pos):
            return candidate

    return None


### Pre-Processing

In [None]:
def pre_processing(sentence):
    """Reduce a raw sentence to the set of its lemmatized, non-stop-word tokens."""
    cleaned = remove_punctuation(sentence)
    tokens = tokenize_sentence(cleaned)
    content_tokens = remove_stopwords(tokens)
    return set(content_tokens)


def remove_stopwords(words_list):
    """Return the entries of ``words_list`` that are not in ``stop_words``, in order."""
    kept = []
    for token in words_list:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def tokenize_sentence(sentence):
    """Tokenize, POS-tag and lemmatize ``sentence``.

    Only tokens whose tag starts with NN, VB, RB or JJ are kept; each is
    lemmatized with the matching WordNet POS and then lowercased.

    Returns:
        List of lowercased lemmas in sentence order.
    """
    # First two characters of the tag from nltk.pos_tag -> WordNet POS constant
    prefix_to_pos = {
        "NN": wordnet.NOUN,
        "VB": wordnet.VERB,
        "RB": wordnet.ADV,
        "JJ": wordnet.ADJ,
    }
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in nltk.pos_tag(word_tokenize(sentence)):
        wn_pos = prefix_to_pos.get(tag[:2])
        if wn_pos is None:
            # Any other tag is dropped from the result.
            continue
        lemma = lemmatizer.lemmatize(token, pos=wn_pos)
        if lemma:
            lemmas.append(lemma.lower())
    return lemmas

# remove punctuation and multiple spaces
def remove_punctuation(sentence):
    """Strip punctuation from ``sentence`` and collapse whitespace runs.

    Every character that is neither a word character nor whitespace is
    removed, then each run of two or more whitespace characters is replaced
    by a single space. A lone whitespace character is left untouched.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', sentence)
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\s\s+', ' ', without_punctuation)

### Lesk Algorithm

In [None]:
def get_signature(sense):
    """Build the signature of ``sense``.

    The signature is the union of the pre-processed words of the sense's
    definition and of each of its examples.
    """
    texts = [sense.definition()]
    texts.extend(sense.examples())

    bag = set()
    for text in texts:
        bag.update(pre_processing(text))
    return bag


def get_word_sense(noun, tagged_sentence):
    """Return the annotated synset of ``noun`` in a sense-tagged sentence.

    The match is case-insensitive and uses all leaves of a chunk joined by
    spaces, i.e. the same surface form that ``get_random_word`` builds (and
    returns lowercased). Chunks whose label is a plain string have no synset
    and are skipped instead of raising ``AttributeError``.

    Args:
        noun: the word to look up (any case).
        tagged_sentence: iterable of chunks from ``tagged_sentences``.

    Returns:
        ``label.synset()`` of the first matching chunk, or ``None`` if the
        word is not found.
    """
    target = noun.lower()
    for chunk in tagged_sentence:
        if not isinstance(chunk, Tree):
            continue
        label = chunk.label()
        if isinstance(label, str):
            continue
        # chunk[0] may itself be a nested Tree, so compare the leaves instead.
        if ' '.join(chunk.leaves()).lower() == target:
            return label.synset()
    return None


def lesk_algorithm(word, sentence, pos=None):
    """Disambiguate ``word`` in ``sentence`` by signature/context overlap.

    For every candidate sense, the overlap between its signature
    (``get_signature``) and the pre-processed sentence is counted; the sense
    with the largest overlap wins. Ties and zero overlap resolve to the
    earliest candidate in the order WordNet returns them.

    Args:
        word: the word to disambiguate.
        sentence: the sentence (string) providing the context.
        pos: optional WordNet POS restricting the candidate senses.

    Returns:
        The best sense. If WordNet has no sense for ``pos``, the first sense
        of any POS is returned; if the word is unknown altogether, ``None``.
    """
    synsets = wordnet.synsets(word, pos=pos)
    if not synsets:
        # Nothing to score in the requested POS: fall back to the first sense
        # of any POS, or None for an unknown word.
        any_pos_synsets = wordnet.synsets(word)
        return any_pos_synsets[0] if any_pos_synsets else None

    context = pre_processing(sentence)
    # Default to the first sense of the *requested* POS, so a zero overlap can
    # never yield a sense of a different part of speech.
    best_sense = synsets[0]
    max_overlap = 0

    for sense in synsets:
        overlap = len(get_signature(sense) & context)
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense

### Disambiguation (1)

In [None]:
# Fix the seed so the sampled sentences, and therefore the reported accuracy,
# are reproducible across kernel restarts.
random.seed(42)

max_k = 50          # number of sentences to disambiguate
max_index = 3000    # indices are drawn from range(max_index)
evaluated_indices = set()
predictions = list()
reference = list()

for i in range(max_k):
    # Keep drawing unseen sentences until one yields a usable noun.
    while True:
        index = get_random_index(evaluated_indices, max_index)
        word = get_random_word(tagged_sentences[index], wordnet.NOUN)
        evaluated_indices.add(index)
        if word:
            break

    sentence = sentences[index]
    sentence_text = ' '.join(sentence)
    best_sense = lesk_algorithm(word, sentence_text, pos=wordnet.NOUN)
    target_sense = get_word_sense(word, tagged_sentences[index])
    
    predictions.append(best_sense)
    reference.append(target_sense)
    
    print(f'Sentence: "{sentence_text}"')
    print(f'Word: {word.upper()}')
    print(f'Best sense: "{str(best_sense)} - {best_sense.definition()}"\n')
    

### Evaluation

In [None]:
# A prediction is correct when it equals the annotated sense. Equality is used
# rather than identity so the result does not depend on both code paths
# returning the very same object; None entries (no sense found) never count as
# correct and are excluded before comparing.
correctly_evaluated = [
    target
    for (prediction, target) in zip(predictions, reference)
    if prediction is not None and target is not None and prediction == target
]
# Guard against an empty reference list to avoid a ZeroDivisionError.
accuracy = len(correctly_evaluated) / len(reference) if reference else 0.0
print(f'Accuracy: {accuracy}')

### Disambiguation (2)

In [None]:
# Fix the seed so every run samples the same sentences after a kernel restart.
random.seed(42)

max_runs = 10       # number of independent runs
max_k = 50          # sentences disambiguated per run
max_index = 7000    # indices are drawn from range(max_index)
run_predictions = list()
run_reference = list()

for run in range(max_runs):
    # Each run starts from scratch: indices may repeat across runs but not
    # within one.
    evaluated_indices = set()
    predictions = list()
    reference = list()

    for i in range(max_k):
        # Keep drawing unseen sentences until one yields a usable noun.
        while True:
            index = get_random_index(evaluated_indices, max_index)
            word = get_random_word(tagged_sentences[index], wordnet.NOUN)
            evaluated_indices.add(index)
            if word:
                break

        sentence = sentences[index]
        best_sense = lesk_algorithm(word, ' '.join(sentence), pos=wordnet.NOUN)
        target_sense = get_word_sense(word, tagged_sentences[index])
        
        predictions.append(best_sense)
        reference.append(target_sense)
    
    run_predictions.append(predictions)
    run_reference.append(reference)

### Evaluation

In [None]:
accuracy_list = list()

# Distinct loop names: the per-run lists no longer shadow the comprehension
# variables or overwrite the notebook-level `reference` list.
for (run_preds, run_refs) in zip(run_predictions, run_reference):
    # Equality instead of identity; None entries never count as correct and
    # are excluded before comparing.
    correct_count = sum(
        1
        for (predicted, target) in zip(run_preds, run_refs)
        if predicted is not None and target is not None and predicted == target
    )
    accuracy_list.append(correct_count / len(run_refs) if run_refs else 0.0)

print(f'Total executions: {max_runs}')
print(f'Accuracy list: {accuracy_list}')
# Guard against an empty list so the cell cannot fail with ZeroDivisionError.
average_accuracy = sum(accuracy_list) / len(accuracy_list) if accuracy_list else 0.0
print(f'Average accuracy: {average_accuracy}')
