In [3]:
# Import of the main utilities
from nltk.wsd import lesk 
from nltk.corpus import stopwords, wordnet, semcor
from nltk import tree


In [16]:
def lesk2(context_sentence, ambiguous_word, pos=None):
    # Attempt to make lesk more accurate removing stop words
    stop_words = set(stopwords.words('english'))
    context = set(context_sentence)
    synsets = wordnet.synsets(ambiguous_word)

    if pos:
        synsets = [ss for ss in synsets if str(ss.pos()) == pos]
    if not synsets:
        return None
    _, sense = max(
        (len(
            context.intersection(set(ss.definition().split())-stop_words)), ss
        ) for ss in synsets
    )
    return sense

In [5]:
def retrieve_sent(ambiguos_word, all_tagged_sents):
    # This method retrieve all the sentences containing the specified ambiguos word
    # and the corresponding synset from a tagged corpora structured as list of sentences. 
    # Each sentence is represented as a list of tagged chunks (in tree form).
    saved_tagged_sent = []
    saved_synset = []
    
    for tagged_sent in all_tagged_sents:
        sent_to_save = False
        sent_words = []
        for tagged_chunk in tagged_sent:
            synset = ""
            if isinstance(tagged_chunk, tree.Tree):
                current_word = tagged_chunk.leaves()
                sent_words = sent_words + current_word
                synset = tagged_chunk.label()
            else:
                current_word = tagged_chunk
                sent_words = sent_words + current_word
                if len(tagged_chunk) == 2:
                    synset = tagged_chunk[1]
            if ambiguos_word in current_word:
                sent_to_save = True
                saved_synset.append(synset)
        if sent_to_save:
            saved_tagged_sent.append(sent_words)
    return (saved_tagged_sent, saved_synset)

In [6]:
# Test retrieve_sentence method
all_tagged_sents = semcor.tagged_sents(tag='sem')
retrievedBreak = retrieve_sent("break", all_tagged_sents)
retrievedMake = retrieve_sent("make", all_tagged_sents)
retrievedRun = retrieve_sent("run", all_tagged_sents)



In [7]:
# Check differences obtained by using regular lesk and lesk2
sent3 = "the break was short".split()

sense1 = lesk(sent3, 'break', 'n')
sense2 = lesk2(sent3, 'break', 'n')

print(sense1)
print(sense1.definition())

print(sense2)
print(sense2.definition())
# Which one is better???

Synset('fault.n.04')
(geology) a crack in the earth's crust resulting from the displacement of one side with respect to the other
Synset('rupture.n.02')
a personal or social separation (as between opposing factions)


In [17]:
print(len(retrievedMake[0]))
print(len(retrievedBreak[0]))
print(len(retrievedRun[0]))

lesk1Syns=[]
lesk2Syns=[]

i = 0
correct_synsLesk1 = 0
correct_synsLesk2 = 0

retrievedMakeSent = retrievedMake[0]
for sent in retrievedMakeSent:
    lesk1 = lesk(sent, "make")
    lesk1Syns.append(lesk1)
    leskTwo = lesk2(sent, "make")
    lesk2Syns.append(lesk2)
    if retrievedMake[1][i] == lesk1:
        correct_synsLesk1 = correct_synsLesk1 + 1
    if retrievedMake[1][i] == leskTwo:
        correct_synsLesk2 = correct_synsLesk2 + 1
    

539
67
162
['When', 'the', 'crowd', 'was', 'asked', 'whether', 'it', 'wanted', 'to', 'wait', 'one', 'more', 'term', 'to', 'make', 'the', 'race', ',', 'it', 'voted', 'no', '-', 'and', 'there', 'were', 'no', 'dissents', '.']
['When', 'the', 'crowd', 'was', 'asked', 'whether', 'it', 'wanted', 'to', 'wait', 'one', 'more', 'term', 'to', 'make', 'the', 'race', ',', 'it', 'voted', 'no', '-', 'and', 'there', 'were', 'no', 'dissents', '.']
['Gov.', 'Vandiver', 'is', 'expected', 'to', 'make', 'the', 'traditional', 'visit', 'to', 'both', 'chambers', 'as', 'they', 'work', 'toward', 'adjournment', '.']
['Gov.', 'Vandiver', 'is', 'expected', 'to', 'make', 'the', 'traditional', 'visit', 'to', 'both', 'chambers', 'as', 'they', 'work', 'toward', 'adjournment', '.']
['The', 'department', 'apparently', 'intends', 'to', 'make', 'the', 'Rural', 'Roads', 'Authority', 'a', 'revolving', 'fund', 'under', 'which', 'new', 'bonds', 'would', 'be', 'issued', 'every', 'time', 'a', 'portion', 'of', 'the', 'old', 'one

['I', 'said', ',', '``', 'Mr.', 'McKenzie', ',', 'it', 'is', 'as', 'authentic', 'as', 'careful', 'research', 'can', 'make', 'it', "''", '.']
['I', 'said', ',', '``', 'Mr.', 'McKenzie', ',', 'it', 'is', 'as', 'authentic', 'as', 'careful', 'research', 'can', 'make', 'it', "''", '.']
['I', 'kept', 'saying', ',', '``', 'If', 'I', 'could', 'just', 'build', 'up', 'a', 'reputation', 'for', 'myself', ',', 'make', 'some', 'real', 'money', ',', 'get', 'to', 'be', 'well', 'known', 'as', 'an', 'illustrator', '-', 'like', 'Peter', 'Askington', ',', 'for', 'instance', '-', 'then', 'I', 'could', 'take', 'some', 'time', 'off', 'and', 'paint', "''", '.']
['I', 'kept', 'saying', ',', '``', 'If', 'I', 'could', 'just', 'build', 'up', 'a', 'reputation', 'for', 'myself', ',', 'make', 'some', 'real', 'money', ',', 'get', 'to', 'be', 'well', 'known', 'as', 'an', 'illustrator', '-', 'like', 'Peter', 'Askington', ',', 'for', 'instance', '-', 'then', 'I', 'could', 'take', 'some', 'time', 'off', 'and', 'paint', "


['Taxation', 'of', 'tangible', 'movable', 'property', 'in', 'Rhode', 'Island', 'has', 'been', 'generally', 'of', 'a', '``', 'hands', 'off', "''", 'nature', 'due', 'possibly', 'to', 'several', 'reasons', ':', '(', '1', ')', 'local', 'assessors', ',', 'in', 'the', 'main', ',', 'are', 'not', 'well', 'paid', 'and', 'have', 'inadequate', 'office', 'staffs', ',', '(', '2', ')', 'the', 'numerous', 'categories', 'of', 'this', 'component', 'of', 'personal', 'property', 'make', 'locating', 'extremely', 'difficult', ',', 'and', '(', '3', ')', 'the', 'inexperience', 'of', 'the', 'majority', 'of', 'assessors', 'in', 'evaluating', 'this', 'type', 'of', 'property', '.']
['To', 'this', 'end', ',', 'the', 'community', 'assistance', 'program', 'of', 'the', 'planning', 'division', 'will', 'continue', 'to', 'be', 'operated', 'as', 'a', 'staff', 'function', 'to', 'make', 'available', ',', 'on', 'a', 'shared', 'cost', 'basis', ',', 'technical', 'planning', 'assistance', 'to', 'those', 'communities', 'in', 

In [21]:
print(lesk1Syns)
print(lesk2Syns)

[Synset('make.v.38'), Synset('make.v.42'), Synset('name.v.03'), Synset('make.v.38'), Synset('make.v.08'), Synset('construct.v.01'), Synset('make.v.42'), Synset('make.v.42'), Synset('make.v.08'), Synset('shuffle.n.01'), Synset('make.v.03'), Synset('shuffle.n.01'), Synset('make.v.08'), Synset('make.v.42'), Synset('make.v.42'), Synset('make.v.03'), Synset('form.v.02'), Synset('name.v.03'), Synset('make.v.38'), Synset('make.v.08'), Synset('shuffle.n.01'), Synset('construct.v.01'), Synset('make.v.42'), Synset('make.v.08'), Synset('make.v.38'), Synset('make.v.42'), Synset('shuffle.n.01'), Synset('make.v.08'), Synset('make.v.38'), Synset('make.v.42'), Synset('make.v.46'), Synset('make.v.03'), Synset('make.v.08'), Synset('make.v.42'), Synset('form.v.02'), Synset('shuffle.n.01'), Synset('make.v.08'), Synset('form.v.02'), Synset('make.v.38'), Synset('make.v.08'), Synset('make.v.38'), Synset('make.v.08'), Synset('make.v.08'), Synset('make.v.08'), Synset('make.v.08'), Synset('make.v.42'), Synset('