In [None]:
from nltk.corpus import semcor, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import Tree
import nltk
import re
import random

In [None]:
# Read the stop-word file (one entry per line) into a set for fast lookups.
with open('data/stop_words_FULL.txt') as f:
    stop_words = set(f.read().splitlines())

# SemCor sentences, and the same corpus requested with semantic ('sem') tags.
sentences = semcor.sents()
tagged_sentences = semcor.tagged_sents(tag='sem')

# Exclusive upper bound for the sentence indices drawn by get_random_index.
MAX_INDEX = 3000

In [None]:
def get_random_index(evaluated_indices):
    """Return a random index in ``[0, MAX_INDEX)`` that was not used yet.

    Args:
        evaluated_indices: Collection of indices that must not be returned.

    Returns:
        An int chosen uniformly among the indices below ``MAX_INDEX`` that
        are absent from ``evaluated_indices``.

    Raises:
        ValueError: If every index below ``MAX_INDEX`` is already in
            ``evaluated_indices``.
    """
    # Enumerate the free indices up front: plain rejection sampling would
    # spin forever once all indices have been consumed.
    remaining = [
        index for index in range(MAX_INDEX) if index not in evaluated_indices
    ]
    if not remaining:
        raise ValueError(
            f'all {MAX_INDEX} sentence indices have already been evaluated'
        )
    return random.choice(remaining)


def get_random_word(tagged_sentence, pos):
    """Pick a random sense-tagged word of the given part of speech.

    Candidates are the ``Tree`` chunks of ``tagged_sentence`` whose label is
    not a plain string and whose ``label().synset().pos()`` equals ``pos``.
    The leaves of each chunk are joined with single spaces.

    Args:
        tagged_sentence: Iterable of chunks from the sense-tagged corpus.
        pos: WordNet part-of-speech constant to select (e.g. ``wordnet.NOUN``).

    Returns:
        The lowercased word, or ``None`` when no candidate is both outside
        ``stop_words`` and known to WordNet.
    """
    candidates = [
        ' '.join(chunk.leaves())
        for chunk in tagged_sentence
        if isinstance(chunk, Tree)
        # String labels carry no synset, so they cannot be matched on POS.
        and not isinstance(chunk.label(), str)
        and chunk.label().synset().pos() == pos
    ]

    # A single shuffle visits candidates in uniformly random order, replacing
    # the repeated choose-and-remove loop.
    random.shuffle(candidates)
    for candidate in candidates:
        lowered = candidate.lower()
        # Compare the lowercased form, as remove_stopwords does; otherwise a
        # capitalised stop word (e.g. at sentence start) slips through.
        if lowered not in stop_words and wordnet.synsets(candidate):
            return lowered

    return None


### Pre-Processing

In [None]:
def pre_processing(sentence):
    """Turn a raw sentence into a set of lowercased, lemmatised content words.

    The sentence is stripped of punctuation, tokenised and lemmatised, and
    finally filtered against the stop-word list.
    """
    cleaned = remove_punctuation(sentence)
    lemmas = tokenize_sentence(cleaned)
    content_words = remove_stopwords(lemmas)
    return set(content_words)


def remove_stopwords(words_list):
    """Return the lowercased words of ``words_list`` that are not stop words.

    Each word is lowercased before it is checked against ``stop_words``;
    the original order (and any duplicates) is preserved.
    """
    kept = []
    for token in words_list:
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return kept


def tokenize_sentence(sentence):
    """Tokenise, POS-tag and lemmatise ``sentence``.

    Only tokens whose tag starts with ``NN``, ``VB``, ``RB`` or ``JJ`` are
    kept; each is lemmatised with the matching WordNet part of speech.
    Tokens with any other tag are dropped.

    Returns:
        A list of lemmas in sentence order.
    """
    # Two-letter tag prefix -> WordNet part of speech used for lemmatisation.
    prefix_to_pos = {
        "NN": wordnet.NOUN,
        "VB": wordnet.VERB,
        "RB": wordnet.ADV,
        "JJ": wordnet.ADJ,
    }
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, tag in nltk.pos_tag(word_tokenize(sentence)):
        wn_pos = prefix_to_pos.get(tag[:2])
        if wn_pos is not None:
            lemmas.append(lemmatizer.lemmatize(token, pos=wn_pos))
    return lemmas

def remove_punctuation(sentence):
    """Strip punctuation from ``sentence`` and collapse whitespace runs.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), then each run of two or more
    whitespace characters is replaced by a single space.

    Args:
        sentence: The text to clean.

    Returns:
        The cleaned string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', sentence)
    # Raw string: '\s' in a normal literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r'\s\s+', ' ', without_punctuation)

### Lesk Algorithm

In [None]:
def get_signature(sense):
    """Build the word signature of a sense.

    The signature is the union of the pre-processed words of the sense's
    definition and of each of its usage examples.
    """
    signature = set(pre_processing(sense.definition()))
    for example in sense.examples():
        signature.update(pre_processing(example))
    return signature


def lesk_algorithm(word, sentence, pos=None):
    """Choose a WordNet sense for ``word`` with a Lesk-style overlap count.

    For every candidate sense, the overlap between the sense signature
    (see ``get_signature``) and the pre-processed ``sentence`` is counted;
    the sense with the strictly largest overlap wins, earlier senses winning
    ties.

    Args:
        word: The word to disambiguate.
        sentence: The sentence in which ``word`` occurs, as a string.
        pos: Optional WordNet part of speech restricting the candidates.

    Returns:
        The selected synset. With no overlap at all, the first synset for
        the requested ``pos`` is returned. If WordNet has no synset for that
        ``pos``, the first synset of any part of speech is returned, or
        ``None`` when the word is unknown to WordNet.
    """
    candidates = wordnet.synsets(word, pos=pos)
    if not candidates:
        # Nothing for the requested POS: fall back to the first sense of any
        # POS, and avoid an IndexError for words WordNet does not know.
        any_pos = wordnet.synsets(word)
        return any_pos[0] if any_pos else None

    context = pre_processing(sentence)
    # Default to the first sense *of the requested POS*, so a zero-overlap
    # result never carries a different part of speech than the one asked for.
    best_sense = candidates[0]
    max_overlap = 0

    for sense in candidates:
        overlap = len(get_signature(sense) & context)
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
    return best_sense

### Disambiguation

In [None]:
# Indices of sentences that were already drawn, so none is sampled twice.
evaluated_indices = set()
max_runs = 10
max_k = 50

# Disambiguate one random noun in each of max_k random sentences.
for i in range(max_k):

    # Keep drawing sentences until one yields a usable noun; every drawn
    # index is recorded whether or not it produced a word.
    word = None
    while not word:
        index = get_random_index(evaluated_indices)
        word = get_random_word(tagged_sentences[index], wordnet.NOUN)
        evaluated_indices.add(index)

    sentence = sentences[index]
    best_sense = lesk_algorithm(word, ' '.join(sentence), pos=wordnet.NOUN)

    print(f'Sentence: "{" ".join(sentence)}"')
    print(f'Word: {word.upper()}')
    print(f'Best sense: "{str(best_sense)} - {best_sense.definition()}"\n')

### Evaluation