In [71]:
import nltk
nltk.download('punkt')

from nltk.corpus import wordnet as wn
from nltk.corpus import semcor
from nltk.corpus.reader.semcor import SemcorCorpusReader

from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
from nltk.corpus import stopwords

import string

import networkx as nx

import random

[nltk_data] Downloading package punkt to /home/stefan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/stefan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [72]:
# Number of SemCor files that wsd() randomly samples and evaluates per run.
NUM_OF_FILES = 5

In [73]:
def wsd(num_files=None, seed=None):
    """Run graph-based word sense disambiguation on a random sample of SemCor files.

    For every sampled file the words are preprocessed, a graph of candidate
    senses is built, PageRank scores the candidates, and the chosen senses are
    compared with the SemCor annotations. Per-file results and the average
    accuracy are printed; nothing is returned.

    Parameters:
        num_files: how many files to sample; defaults to NUM_OF_FILES.
        seed: optional seed for the file sampling, to make a run repeatable.
            When None, the module-level random generator is used.
    """
    available = semcor.fileids()
    requested = NUM_OF_FILES if num_files is None else num_files
    # random.sample raises ValueError when asked for more items than exist
    # (or for a negative count), so clamp the request to the valid range.
    sample_size = max(0, min(requested, len(available)))

    rng = random.Random(seed) if seed is not None else random
    files = rng.sample(available, sample_size)
    if not files:
        # Nothing to evaluate; averaging over zero files would divide by zero.
        print('No files selected for evaluation')
        return

    total_acc = 0
    for file in files:
        clean_words, original_to_lemma = preprocess(file)
        # Every word starts without a sense; create_graph and evaluate fill them in.
        word_senses = dict([(w, None) for w in clean_words])

        G = create_graph(clean_words, word_senses)

        pr = nx.pagerank(G)

        total_acc += evaluate(pr, word_senses, original_to_lemma, file)

    # Average over the files actually processed, not over the requested count.
    avg_acc = total_acc / len(files)
    print('Average wsd accuracy: {} achieved on {} files'.format(avg_acc, len(files)))

In [74]:
def preprocess(file):
    """Turn one SemCor file into a de-duplicated list of normalized lemmas.

    Steps: POS-tag the words, keep only those with a WordNet part of speech,
    lemmatize, lowercase, drop stopwords and punctuation, merge WordNet
    collocations, and remove duplicates.

    Parameters:
        file: SemCor file id to read.

    Returns:
        (clean_words, original_to_lemma) where clean_words is a list of unique
        lowercased lemmas/collocations and original_to_lemma maps each kept
        original word to its lemma (wn_collocations adds further entries to it).
    """
    reader = SemcorCorpusReader(semcor.root, semcor.fileids(), wn)

    words = reader.words(file)

    # POS tagging, then conversion to a WordNet POS; words without one are dropped.
    tagged_words = []
    for word, penn_tag in nltk.pos_tag(words):
        # to_wordnet_pos only looks at the first letter of the tag. The Penn
        # Treebank symbol tag "SYM" starts with "S" and would be mistaken for
        # a satellite adjective, so skip it (and empty tags) explicitly.
        if not penn_tag or penn_tag == "SYM":
            continue
        wn_pos = to_wordnet_pos(penn_tag[0])
        if wn_pos is not None:
            tagged_words.append((word, wn_pos))

    # Lemmatize each word once and reuse the result for both outputs.
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    original_to_lemma = {}
    for word, wn_pos in tagged_words:
        lemma = lemmatizer.lemmatize(word, wn_pos)
        lemmatized_words.append(lemma)
        # Map from original word to lemma; a repeated word keeps its last lemma.
        original_to_lemma[word] = lemma

    # Stopword and punctuation removal. Sets give constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    clean_words = []
    for lemma in lemmatized_words:
        token = lemma.lower()
        if token in english_stopwords:
            continue
        # Drop tokens made up only of punctuation characters. A substring test
        # against string.punctuation would only catch tokens that happen to be
        # a contiguous slice of that constant.
        if all(ch in punctuation for ch in token):
            continue
        clean_words.append(token)

    # Collocations: merges adjacent words in place and records them in the map.
    wn_collocations(clean_words, original_to_lemma)

    # Remove duplicates - order doesn't matter to pagerank.
    clean_words = list(set(clean_words))

    return clean_words, original_to_lemma

In [75]:
def to_wordnet_pos(nltk_pos):
    """Translate a one-letter POS code into the matching WordNet POS constant.

    Recognized codes are "J", "N", "V", "R" and "S"; any other value yields None.
    """
    # (code, name of the attribute on the WordNet reader) pairs.
    code_table = (
        ("J", "ADJ"),
        ("N", "NOUN"),
        ("V", "VERB"),
        ("R", "ADV"),
        ("S", "ADJ_SAT"),
    )
    for code, attribute in code_table:
        if nltk_pos == code:
            return getattr(wn, attribute)
    return None

In [76]:
def wn_collocations(words, original_to_lemma):
    """Merge runs of adjacent words that WordNet knows as a collocation.

    Both arguments are modified in place. Window sizes from 5 down to 2 are
    tried, so longer collocations are merged first. When the words in a window,
    joined with "_", have at least one synset:
      * the first word of the window is mapped to the collocation in
        original_to_lemma and replaced by it in `words`;
      * every other word of the window is mapped to
        'collocation: <collocation>' and removed from `words`.
    """
    longest = 5
    for size in range(longest, 1, -1):
        start = 0
        while start <= len(words) - size:
            window_end = start + size
            candidate = "_".join(words[start:window_end])
            if len(wn.synsets(candidate)) > 0:
                # Record the head word first; an identical absorbed word below
                # overwrites this entry, as the tail is processed afterwards.
                original_to_lemma[words[start]] = candidate
                absorbed_words = words[start + 1:window_end]
                words[start] = candidate
                # Last word of the window is recorded first.
                for absorbed in reversed(absorbed_words):
                    original_to_lemma[absorbed] = 'collocation: ' + candidate
                del words[start + 1:window_end]
            start += 1

In [77]:
def create_graph(clean_words, word_senses):
    """Build the undirected graph of candidate senses for the given words.

    A word with more than one synset contributes one node per synset, stored as
    the tuple (word, synset). A word with exactly one synset gets that synset
    written straight into word_senses and adds no node. A word with no synsets
    is left untouched (its entry in word_senses is not modified).

    Two nodes are linked when they belong to different words and
    are_connected() reports a relation between their synsets.
    """
    graph = nx.Graph()

    # Nodes.
    for word in clean_words:
        # TODO: try adding pos when getting synsets
        candidates = wn.synsets(word)
        sense_count = len(candidates)
        if sense_count == 1:
            # Unambiguous word: no need to rank anything.
            word_senses[word] = candidates[0]
        elif sense_count > 1:
            for candidate in candidates:
                graph.add_node((word, candidate))

    # Edges: every ordered pair of nodes is examined.
    all_nodes = list(graph.nodes)
    for first in all_nodes:
        for second in all_nodes:
            if first[0] == second[0]:
                # Senses of the same word are never linked to each other.
                continue
            if are_connected(first[1], second[1]):
                graph.add_edge(first, second)

    return graph

In [78]:
def are_connected(u, v):
    """Tell whether two synsets are related closely enough to be linked.

    Returns True when u appears among v's hypernyms, hyponyms, part
    holonyms/meronyms or substance holonyms/meronyms, or when u and v share at
    least one hypernym (coordinate relation - e.g. wolf and dog). Returns False
    otherwise, so the result is always a bool.
    """
    # Bound methods are called lazily, so evaluation stops at the first hit.
    direct_relations = (
        v.hypernyms,
        v.hyponyms,
        v.part_holonyms,
        v.part_meronyms,
        v.substance_holonyms,
        v.substance_meronyms,
    )
    if any(u in relation() for relation in direct_relations):
        return True

    # Coordinate relation: the two synsets have a common hypernym.
    shared_hypernyms = set(u.hypernyms()) & set(v.hypernyms())
    return len(shared_hypernyms) > 0

In [79]:
def evaluate(pr, word_senses, original_to_lemma, file):
    """Pick the best-ranked sense per word and score it against SemCor.

    Parameters:
        pr: mapping from graph node, indexed as (word, synset), to its PageRank score.
        word_senses: mapping word -> chosen synset; updated in place.
        original_to_lemma: mapping from original word to lemma/collocation.
        file: SemCor file id whose annotations serve as the reference.

    Returns:
        The accuracy computed by calculate_accuracy (also printed).
    """
    # Highest score seen so far for every word; a sense must beat 0.0 to be chosen.
    best_score = {word: 0.0 for word in word_senses.keys()}

    for node, score in pr.items():
        word, synset = node[0], node[1]
        if score > best_score[word]:
            word_senses[word] = synset
            best_score[word] = score

    # Reference senses taken from the original SemCor file.
    gold_senses = get_actual_meaning(file)

    accuracy = calculate_accuracy(original_to_lemma, word_senses, gold_senses)
    print('Accuracy: {}'.format(accuracy))

    return accuracy

In [80]:
def get_actual_meaning(file):
    """Collect the sense annotation of every tagged chunk in a SemCor file.

    Returns a dict keyed by the chunk's leaves joined with "_", whose value is
    the chunk's label. Also prints how many labels are not WordNet Lemma objects.
    """
    lemma_class = nltk.corpus.reader.wordnet.Lemma
    actual_meaning = {}
    labels_without_lemma = 0

    for chunk in semcor.tagged_chunks(fileids=file, tag='sem'):
        if not isinstance(chunk, nltk.tree.Tree):
            # Only tree chunks carry a label.
            continue
        label = chunk.label()
        actual_meaning['_'.join(chunk.leaves())] = label
        if not isinstance(label, lemma_class):
            # TODO: handle these - create Lemma (or Synset) from string if possible...
            # Author's note: there are mistakes in the semcor file, e.g. the
            # toe.a.00 sense doesn't exist in wordnet.
            labels_without_lemma += 1

    print('Number of words that do not have lemma sense in semcor file: {}'.format(labels_without_lemma))

    return actual_meaning

In [81]:
def calculate_accuracy(original_to_lemma, word_senses, actual_meaning):
    """Compare the chosen senses with the reference annotations.

    Parameters:
        original_to_lemma: mapping original word -> lemma (or collocation marker).
        word_senses: mapping lowercased lemma -> chosen synset, or None.
        actual_meaning: mapping original word -> reference label.

    A word is scored only when its lowercased lemma is in word_senses and the
    original word is in actual_meaning. It counts as correct when the reference
    label is a WordNet Lemma contained in the chosen synset's lemmas.

    Returns:
        correct / (correct + incorrect), or 0.0 when no word could be scored.
    """
    correct = 0
    incorrect = 0
    no_word_sense = 0
    num_collocations = 0
    no_actual_meaning = 0

    for original, lemma in original_to_lemma.items():
        key = lemma.lower()
        if key not in word_senses:
            # Words absorbed into a collocation are tracked separately.
            if 'collocation: ' in key:
                num_collocations += 1
            else:
                no_word_sense += 1
        elif original not in actual_meaning:
            no_actual_meaning += 1
        else:
            reference = actual_meaning[original]
            chosen = word_senses[key]
            if (isinstance(reference, nltk.corpus.reader.wordnet.Lemma)
                    and chosen is not None
                    and reference in chosen.lemmas()):
                correct += 1
            else:
                incorrect += 1

    print('Correct: {}, incorrect: {}, num_collocations: {}, no_word_sense: {}, no_actual_meaning: {}'.format(correct, incorrect, num_collocations, no_word_sense, no_actual_meaning))

    scored = correct + incorrect
    if scored == 0:
        # Nothing was scored; avoid dividing by zero.
        return 0.0
    return correct / scored
    # Alternative metric (counts words without any sense as errors):
    #     return correct / (correct + incorrect + no_word_sense)

In [82]:
# Run the evaluation; per-file statistics and the average accuracy are printed below.
wsd()

Number of words that do not have lemma sense in semcor file: 65
Correct: 573, incorrect: 82, num_collocations: 15, no_word_sense: 51, no_actual_meaning: 60
Accuracy: 0.8748091603053435
Number of words that do not have lemma sense in semcor file: 49
Correct: 390, incorrect: 59, num_collocations: 22, no_word_sense: 42, no_actual_meaning: 51
Accuracy: 0.8685968819599109
Number of words that do not have lemma sense in semcor file: 77
Correct: 382, incorrect: 57, num_collocations: 20, no_word_sense: 34, no_actual_meaning: 48
Accuracy: 0.8701594533029613
Number of words that do not have lemma sense in semcor file: 43
Correct: 304, incorrect: 54, num_collocations: 20, no_word_sense: 37, no_actual_meaning: 45
Accuracy: 0.8491620111731844
Number of words that do not have lemma sense in semcor file: 3
Correct: 147, incorrect: 9, num_collocations: 22, no_word_sense: 51, no_actual_meaning: 512
Accuracy: 0.9423076923076923
Average wsd accuracy: 0.8810070398098185 achieved on 5 files


In [None]:
# some testing...

In [122]:
# Tag the word on its own, without any surrounding context.
nltk.pos_tag(['refuse'])

[('refuse', 'NN')]

In [123]:
# Tag the same word again, this time preceded by a pronoun.
nltk.pos_tag(['they', 'refuse'])

[('they', 'PRP'), ('refuse', 'VBP')]

In [124]:
# POS tagging depends on context: 'refuse' is tagged NN on its own but VBP after 'they'.

In [138]:
wn.synsets('are') # WordNet has synsets even for stopwords such as 'are'

[Synset('are.n.01'),
 Synset('be.v.01'),
 Synset('be.v.02'),
 Synset('be.v.03'),
 Synset('exist.v.01'),
 Synset('be.v.05'),
 Synset('equal.v.01'),
 Synset('constitute.v.01'),
 Synset('be.v.08'),
 Synset('embody.v.02'),
 Synset('be.v.10'),
 Synset('be.v.11'),
 Synset('be.v.12'),
 Synset('cost.v.01')]

In [147]:
wn.synsets('bertrand_russell')

[Synset('russell.n.07')]

In [148]:
wn.synsets('bertrand_russell')[0].lemmas()

[Lemma('russell.n.07.Russell'),
 Lemma('russell.n.07.Bertrand_Russell'),
 Lemma('russell.n.07.Bertrand_Arthur_William_Russell'),
 Lemma('russell.n.07.Earl_Russell')]

In [83]:
test_words = ['I', 'am', 'Bertrand', 'Arthur', 'William', 'Russell']

In [86]:
pam = {}

In [87]:
wn_collocations(test_words, pam)

In [152]:
test_words

['I', 'am', 'Bertrand_Arthur_William_Russell']

In [154]:
# NOTE(review): no cell shown here assigns `original_to_lemma` at notebook scope
# (it only appears as a local/parameter inside functions), so this lookup seems to
# rely on leftover kernel state and would fail after Restart & Run All - TODO confirm.
original_to_lemma['easy']

'easy_money'

In [155]:
original_to_lemma['money']

'collocation: easy_money'

In [199]:
wn.synsets("n't") # WordNet has no synsets for the token "n't"; possibly an error in the SemCor file - TODO confirm

[]