In [13]:
import nltk
nltk.download('punkt')
nltk.download('semcor')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.corpus import wordnet as wn
from nltk.corpus import semcor
from nltk.corpus.reader.semcor import SemcorCorpusReader

from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
from nltk.corpus import stopwords

import string

import networkx as nx

import random

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package semcor to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
NUM_OF_FILES = 1

In [15]:
def wsd(files=None):
    """Run the PageRank-based word sense disambiguation and print the mean accuracy.

    For every file the words are preprocessed, a graph of candidate senses is
    built, PageRank is run on it and the chosen senses are scored against the
    SemCor annotations.

    Args:
        files: optional iterable of SemCor file ids to evaluate. When omitted,
            the single file ``semcor.fileids()[7]`` is used, as before.

    Returns:
        None. The per-file and average accuracies are printed.
    """
    if files is None:
        # files = random.sample(semcor.fileids(), NUM_OF_FILES)
        files = [semcor.fileids()[7]]
    files = list(files)
    print(files)

    total_acc = 0
    for file in files:
        clean_words, original_to_lemma = preprocess(file)
        word_senses = dict([(w, None) for w in clean_words])

        G = create_graph(clean_words, word_senses)

        pr = nx.pagerank(G)

        total_acc += evaluate(pr, word_senses, original_to_lemma, file)

    # Average over the files that were actually processed rather than over a
    # separately maintained constant, and do not divide by zero on an empty list.
    num_files = len(files)
    avg_acc = total_acc / num_files if num_files else 0.0
    print('Average wsd accuracy: {} achieved on {} files'.format(avg_acc, num_files))

In [16]:
def preprocess(file):
    """Turn one SemCor file into the cleaned word list used for disambiguation.

    Steps: POS-tag the words, keep only those whose tag maps to a WordNet POS,
    lemmatize and lower-case them, drop stop words and punctuation, merge
    WordNet collocations and finally remove duplicates.

    Args:
        file: SemCor file id, e.g. one element of ``semcor.fileids()``.

    Returns:
        A tuple ``(clean_words, original_to_lemma)``: the de-duplicated list of
        lemmas / collocations, and the mapping returned by ``wn_collocations``
        from lower-cased original text to its lemma or collocation.
    """
    reader = SemcorCorpusReader(semcor.root, semcor.fileids(), wn)

    words = reader.words(file)

    # POS-tag in context, convert to WordNet POS and drop words without one.
    tagged_words = []
    for word, nltk_tag in nltk.pos_tag(words):
        wn_pos = to_wordnet_pos(nltk_tag[0])
        if wn_pos is not None:
            tagged_words.append((word, wn_pos))

    lemmatizer = WordNetLemmatizer()
    english_stopwords = set(stopwords.words('english'))
    punctuation = string.punctuation

    # Filter the (original, lemma) pairs once, so that the word list and the
    # mapping are built from exactly the same sequence.
    kept_pairs = []
    for word, wn_pos in tagged_words:
        lemma = lemmatizer.lemmatize(word, wn_pos).lower()
        if lemma not in english_stopwords and lemma not in punctuation:
            kept_pairs.append((word.lower(), lemma))

    clean_words = [lemma for _, lemma in kept_pairs]
    # The index stored in each key must be the position of the lemma in
    # clean_words, because wn_collocations addresses the word list with it.
    # Numbering the words before filtering (as was done previously) left the
    # two structures misaligned.
    original_to_lemma = {
        (original, position): lemma
        for position, (original, lemma) in enumerate(kept_pairs)
    }

    print('uspelo: {}'.format(clean_words == list(original_to_lemma.values())))
    print(clean_words)
    print(list(original_to_lemma.values()))
    print(len(clean_words))
    print(len(original_to_lemma.values()))

    # collocations
    clean_words, original_to_lemma = wn_collocations(clean_words, original_to_lemma)

    print('posle kolokejsna - original_to_lemma: {}'.format(original_to_lemma))
    print('posle kolokejsna - words: {}'.format(clean_words))

    # remove duplicates - order doesn't matter to pagerank
    clean_words = list(set(clean_words))

    return clean_words, original_to_lemma

In [17]:
def to_wordnet_pos(nltk_pos):
    """Map the first letter of a Penn Treebank tag to a WordNet POS constant.

    Args:
        nltk_pos: first character of a tag produced by ``nltk.pos_tag``.

    Returns:
        ``wn.ADJ``, ``wn.NOUN``, ``wn.VERB`` or ``wn.ADV`` for ``"J"``, ``"N"``,
        ``"V"`` and ``"R"`` respectively, otherwise ``None``.
    """
    if nltk_pos == "J":
        return wn.ADJ
    if nltk_pos == "N":
        return wn.NOUN
    if nltk_pos == "V":
        return wn.VERB
    if nltk_pos == "R":
        return wn.ADV
    # "S" is deliberately not mapped to wn.ADJ_SAT: the tagger has no satellite
    # adjective tag, and the only Penn Treebank tag starting with "S" is SYM
    # (symbol), which has no WordNet part of speech.
    return None

In [18]:
def wn_collocations(words, original_to_lemma):
    """Merge runs of consecutive words that WordNet knows as one multi-word entry.

    Windows of 5, 4, 3 and then 2 consecutive words are joined with "_" and
    looked up with ``wn.synsets``; when the lookup returns anything, the window
    is replaced by the joined string in both ``words`` and the mapping.

    Args:
        words: list of words. NOTE(review): elements of this list are
            overwritten in place when a collocation is found, so the caller's
            list is modified even though a new list is returned.
        original_to_lemma: dict keyed by ``(original_word, index)`` with the
            lemma as value. NOTE(review): ``index`` is used directly to index
            ``words``, so it is presumably expected to be the word's position
            in ``words`` - confirm that callers guarantee this.

    Returns:
        A tuple ``(words, original_to_lemma)`` where ``words`` is the merged
        word list and the mapping is keyed by the original text only (the
        index is dropped, so repeated originals collapse into one entry).
    """
    max_col_size = 5
    # Try the longest windows first: 5, 4, 3, 2.
    for col_size in range(max_col_size, 1, -1):
        # i = 0
        to_delete = []          # positions in `words` absorbed into a collocation
        to_delete_o2l = []      # mapping indices to replace: collocation start + absorbed words
        cols_to_add = []        # (joined original words, joined lemmas) per collocation found
        cols_to_add_ind = []    # start index of each collocation found
        for (k, ix), v in original_to_lemma.items():
            # Stop once a full window no longer fits; this relies on the items
            # being visited in ascending index order - TODO confirm.
            if ix > len(words) - col_size:
                break
            # Word was already absorbed by a collocation found earlier in this pass.
            if ix in to_delete:
                continue
            clean_col = "_".join([words[j] for j in range(ix, ix + col_size)])
            orig_col = "_".join([k for (k, index) in original_to_lemma.keys() if index in range(ix, ix + col_size)])
            print("kolokacija i synset {} : {}".format(orig_col, ix))
            # Any synset for the joined string means WordNet has this collocation.
            if len(wn.synsets(clean_col)) != 0:
                cols_to_add.append((orig_col, clean_col))
                cols_to_add_ind.append(ix)
                print('kolokacija: {}'.format(original_to_lemma[(k, ix)]))
                to_delete_o2l.append(ix)
                # The first word of the window becomes the collocation itself.
                words[ix] = clean_col

                # The remaining words of the window are marked for removal.
                for j in range(ix + col_size - 1, ix, -1):
                    to_delete.append(j)
                    to_delete_o2l.append(j)
        # NOTE(review): new_indexes is never read.
        new_indexes = list(range(len(words) - len(to_delete_o2l)))
        # original_to_lemma = {(k, ix) : v for ((k, ix), v) in original_to_lemma.items() if ix not in to_delete}
        # Rebuild the mapping with fresh consecutive indices: untouched entries
        # are copied, each collocation start is replaced by the collocation
        # entry, and absorbed words are dropped.
        new_or_to_lemma = {}
        new_ix = 0
        help_i = 0  # position of the next collocation to emit from cols_to_add
        for (k, ix), v in original_to_lemma.items():
            if ix not in to_delete_o2l:
                new_or_to_lemma[(k, new_ix)] = v
                new_ix += 1
            elif ix in cols_to_add_ind:
                new_or_to_lemma[(cols_to_add[help_i][0], new_ix)] = cols_to_add[help_i][1]
                help_i += 1
                new_ix += 1

        original_to_lemma = new_or_to_lemma
        # Drop the absorbed words; from here on `words` is a new list.
        words = [w for (i, w) in enumerate(words) if i not in to_delete]

    # Drop the index from the keys: original text -> lemma / collocation.
    original_to_lemma = {k : v for ((k, ix), v) in original_to_lemma.items()}
    print('u kolokejsnima - original_to_lemma: {}'.format(original_to_lemma))
    
    print('u kolokejsnima - words: {}'.format(words))

    return words, original_to_lemma
    

        # while i <= len(words) - col_size:
        #     col = "_".join([words[j] for j in range(i, i + col_size)])
        #     if len(wn.synsets(col)) != 0:               
        #         # ???
        #         # original_to_lemma[words[i]] = col
        #         original_to_lemma[col] = col
        #         # original_to_lemma[words[i]] = 'collocation: ' + col 
        #         keys_to_delete = []
        #         for k, v in original_to_lemma.items():
        #             if v == words[i]:
        #                 print("1 {}".format(k))
        #                 keys_to_delete.append(k)
                
        #         words[i] = col
        #         for j in range(i + col_size - 1, i, -1):
        #             # original_to_lemma[words[j]] = 'collocation: ' + col
        #             for k, v in original_to_lemma.items():
        #                 if v == words[j]:
        #                     print("2 {}: {}".format(k, original_to_lemma[k]))
        #                     keys_to_delete.append(k)

        #             del words[j]

        #         for k in keys_to_delete:
        #             print("To be deleted {}".format(original_to_lemma[k]))
        #             del original_to_lemma[k]
                
#                 print(col, len(wn.synsets(col)))
            # i+=1

In [19]:
def create_graph(clean_words, word_senses):
    """Build the undirected graph of candidate senses for the ambiguous words.

    Each word with more than one WordNet synset contributes one node
    ``(word, synset)`` per synset. A word with exactly one synset gets that
    synset written straight into ``word_senses`` and adds no node; a word
    without synsets is left untouched. Two nodes of different words are linked
    when ``are_connected`` reports their synsets as related.

    Args:
        clean_words: iterable of words / collocations.
        word_senses: dict word -> sense, updated in place for unambiguous words.

    Returns:
        The ``nx.Graph`` holding the candidate-sense nodes and their edges.
    """
    graph = nx.Graph()

    # nodes
    for word in clean_words:
        # TODO: try adding pos when getting synsets
        candidates = wn.synsets(word)
        if len(candidates) == 1:
            # Unambiguous: nothing for PageRank to decide.
            word_senses[word] = candidates[0]
            continue
        if len(candidates) > 1:
            for sense in candidates:
                graph.add_node((word, sense))

    # edges - senses of the same word are never linked to each other
    for left in graph.nodes:
        for right in graph.nodes:
            if left[0] == right[0]:
                continue
            if are_connected(left[1], right[1]):
                graph.add_edge(left, right)

    return graph

In [20]:
def are_connected(u, v):
    """Tell whether two synsets are directly related.

    ``u`` and ``v`` count as connected when ``u`` is a hypernym, hyponym, part
    holonym/meronym or substance holonym/meronym of ``v``, or when the two
    share at least one hypernym (coordinate relation, e.g. wolf and dog).

    Args:
        u: synset-like object exposing the relation methods used below.
        v: synset-like object exposing the relation methods used below.

    Returns:
        ``True`` if one of the relations holds, ``False`` otherwise.
    """
    direct_relations = (
        v.hypernyms,
        v.hyponyms,
        v.part_holonyms,
        v.part_meronyms,
        v.substance_holonyms,
        v.substance_meronyms,
    )
    # The relation methods are only called until the first match.
    if any(u in related() for related in direct_relations):
        return True

    # coordinate relation - e.g. wolf and dog
    shared_hypernyms = set(u.hypernyms()) & set(v.hypernyms())
    # Always answer with a real bool instead of falling through to None.
    return len(shared_hypernyms) > 0

In [21]:
def evaluate(pr, word_senses, original_to_lemma, file):
    """Pick the best-ranked sense per word and score it against SemCor.

    Args:
        pr: dict mapping ``(word, synset)`` nodes to their PageRank score.
        word_senses: dict word -> sense; updated in place with the synset of
            the highest-scoring node of each word. A word keeps its current
            entry when none of its nodes scores above 0.0.
        original_to_lemma: mapping from original text to lemma / collocation.
        file: SemCor file id whose annotations serve as the reference.

    Returns:
        The accuracy computed by ``calculate_accuracy``.
    """
    best_score = {word: 0.0 for word in word_senses.keys()}

    # keep, for every word, the sense of its highest-scoring node
    for node, score in pr.items():
        word = node[0]
        if score > best_score[word]:
            best_score[word] = score
            word_senses[word] = node[1]

    # reference senses taken from the original semcor file
    gold_senses = get_actual_meaning(file)

    accuracy = calculate_accuracy(original_to_lemma, word_senses, gold_senses)
    print('Accuracy: {}'.format(accuracy))

    return accuracy

In [22]:
def get_actual_meaning(file):
    """Collect the sense annotation of every tagged chunk in a SemCor file.

    Args:
        file: SemCor file id.

    Returns:
        Dict mapping the lower-cased chunk text (leaves joined with "_") to the
        chunk's label. When the same text occurs more than once, the label of
        the last occurrence is kept.
    """
    lemma_type = nltk.corpus.reader.wordnet.Lemma
    senses_by_text = {}
    non_lemma_labels = 0
    chunks_without_tree = 0

    for chunk in semcor.tagged_chunks(fileids=file, tag='sem'):
        if not isinstance(chunk, nltk.tree.Tree):
            chunks_without_tree += 1
            continue

        label = chunk.label()
        senses_by_text['_'.join(chunk.leaves()).lower()] = label
        if not isinstance(label, lemma_type):
            # TODO: handle these - create Lemma (or Synset) from string if possible...
            # There are mistakes in semcor file: e.g. toe.a.00 sense doesn't exist in wordnet
            non_lemma_labels += 1

    print('Number of words that do not have lemma sense in semcor file: {}'.format(non_lemma_labels))
    print('Number of words that do not have tree in semcor file: {}'.format(chunks_without_tree))

    return senses_by_text

In [23]:
def calculate_accuracy(original_to_lemma, word_senses, actual_meaning):
    """Compare the predicted senses with the SemCor annotations.

    Only annotations that are WordNet lemmas, whose text is a key of
    ``original_to_lemma`` and whose lemma received a sense are scored; every
    other annotation is reported with a message and skipped.

    Args:
        original_to_lemma: dict original text -> lemma / collocation.
        word_senses: dict lemma / collocation -> chosen synset (or None).
        actual_meaning: dict original text -> SemCor label.

    Returns:
        ``correct / (correct + incorrect)``, or ``0.0`` when nothing could be
        scored (instead of raising ``ZeroDivisionError``).
    """
    correct = 0
    incorrect = 0
    print(actual_meaning)

    for k, v in actual_meaning.items():
        if not isinstance(v, nltk.corpus.reader.wordnet.Lemma):
            print("Oni nisu umeli lepo da kazu znacenje: {}".format(k))
        elif k not in original_to_lemma.keys():
            print("Oni imaju kolokaciju koju mi nemamo: {}".format(k))
        elif original_to_lemma[k] not in word_senses.keys():
            print("Nemam pojma sta ovo znaci: {}".format(k))
        elif word_senses[original_to_lemma[k]] is None:
            print("Oni imaju znacenje, a mi nemamo: {}".format(k))
        elif v in word_senses[original_to_lemma[k]].lemmas():
            # The annotated lemma belongs to the synset we picked.
            correct += 1
        else:
            incorrect += 1

    print("Correct = {}, incorrect = {}".format(correct, incorrect))

    scored = correct + incorrect
    if scored == 0:
        # Nothing was comparable (e.g. empty annotations): report 0.0.
        return 0.0
    # Alternative definition: also count the words that received no sense in
    # the denominator.
    return correct / scored

In [24]:
wn.synsets('jawaharlal_nehru')

[Synset('nehru.n.01')]

In [25]:
wsd()

['brown1/tagfiles/br-b13.xml']
uspelo: True
['sizzling', 'temperature', 'hot', 'summer', 'pavement', 'anything', 'kind', 'foot', 'important', 'invest', 'comfortable', 'airy', 'type', 'shoe', 'many', 'soft', 'light', 'shoe', 'leather', 'available', 'many', 'style', 'perforation', 'almost', 'weightlessness', 'achieve', 'unlined', 'leather', 'softness', 'find', 'crushed', 'texture', 'styles', 'run', 'gamut', 'slender', 'taper', 'elongated', 'toe', 'newer', 'square', 'toe', 'shape', 'heels', 'place', 'emphasis', 'long', 'legged', 'silhouette', 'wine', 'glass', 'heel', 'find', 'high', 'semi-heights', 'stacked', 'heel', 'also', 'popular', 'dressy', 'tailor', 'shoe', 'bare', 'suggestion', 'heel', 'find', 'teenage', 'pump', 'white', 'cool', 'summer', 'shade', 'lot', 'pastel', 'hue', 'tintable', 'fabric', 'blend', 'wardrobe', 'color', 'tintable', 'group', 'high', 'little', 'heel', 'square', 'oval', 'throat', 'shantung', 'texture', "n't", 'overlook', 'straw', 'year', 'come', 'crisp', 'basket', '

In [27]:
# collocations that contain stopwords
wn.synsets('responsible_for')
wn.synsets('out_of_bounds')

[Synset('sideline.n.01')]

In [None]:
# some testing...

In [None]:
nltk.pos_tag(['refuse'])

[('refuse', 'NN')]

In [None]:
nltk.pos_tag(['they', 'refuse'])

[('they', 'PRP'), ('refuse', 'VBP')]

In [None]:
# POS tagging depends on context: 'refuse' on its own is tagged NN, but after 'they' it is tagged VBP

In [None]:
wn.synsets('are') # there are stopwords in wordnet

[Synset('are.n.01'),
 Synset('be.v.01'),
 Synset('be.v.02'),
 Synset('be.v.03'),
 Synset('exist.v.01'),
 Synset('be.v.05'),
 Synset('equal.v.01'),
 Synset('constitute.v.01'),
 Synset('be.v.08'),
 Synset('embody.v.02'),
 Synset('be.v.10'),
 Synset('be.v.11'),
 Synset('be.v.12'),
 Synset('cost.v.01')]

In [None]:
wn.synsets('bertrand_russell')

[Synset('russell.n.07')]

In [None]:
wn.synsets('bertrand_russell')[0].lemmas()

[Lemma('russell.n.07.Russell'),
 Lemma('russell.n.07.Bertrand_Russell'),
 Lemma('russell.n.07.Bertrand_Arthur_William_Russell'),
 Lemma('russell.n.07.Earl_Russell')]

In [None]:
test_words = ['I', 'am', 'Bertrand', 'Arthur', 'William', 'Russell']

In [None]:
pam = {}

In [None]:
wn_collocations(test_words, pam)

In [None]:
test_words

['I', 'am', 'Bertrand_Arthur_William_Russell']

In [None]:
original_to_lemma['easy']

'easy_money'

In [None]:
original_to_lemma['money']

'collocation: easy_money'

In [None]:
wn.synsets("n't") # => errors in semcor file ???

[]