In [22]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import *
from nltk.corpus import senseval

In [253]:
class Disambiguator:
    """Gloss-overlap (Lesk-style) word sense disambiguator built on WordNet.

    Each candidate synset of the target word is scored against every synset
    of the surrounding context ("window") words.  The score of a synset pair
    is the sum, over a POS-dependent list of relation pairs, of the squared
    lengths of the token sequences shared by the two relation texts (glosses
    of the synset itself, of its hyponyms, meronyms, antonyms, similar-tos or
    entailments).  The candidate with the highest total score wins.
    """

    # POS tags (universal tagset) that are kept as content words and for
    # which a dedicated relation-pair list exists in ``rel_pairs``.
    CONTENT_POS = ("NOUN", "ADJ", "VERB")

    def __init__(self, window_size=3):
        """Create a disambiguator.

        Args:
            window_size: Number of context words wanted on each side of the
                target word; the window holds up to ``2 * window_size`` words.
        """
        self.window_size = window_size
        # Context words selected by the most recent disambiguate() call.
        self.window_words = []

        self.stemmer = PorterStemmer()
        # Maps the POS tags carried by the processed word tuples to the
        # corresponding WordNet POS constants.
        self.porter_to_wn = {
            "NOUN": wn.NOUN,
            "VERB": wn.VERB,
            "ADJ" : wn.ADJ
        }
        # Relation pairs compared for each target POS:
        #   noun:      gloss, hyponym, meronym
        #   adjective: gloss, antonym, similar-to
        #   verb:      gloss, entailment
        # Each pair is (relation of the target sense, relation of the window sense).
        self.rel_pairs = {
            "NOUN": [
                ("gloss", "gloss"), ("hypo", "hypo"), ("mero", "mero"),
                ("gloss", "hypo"), ("gloss", "mero"), ("hypo", "mero"),
                ("hypo", "gloss"), ("mero", "gloss"), ("mero", "hypo")
            ],
            "ADJ" : [
                ("gloss", "gloss"), ("anto", "anto"), ("sim", "sim"),
                ("gloss", "anto"), ("gloss", "sim"), ("anto", "sim"),
                ("anto", "gloss"), ("sim", "gloss"), ("sim", "anto")
            ],
            "VERB": [
                ("gloss", "gloss"), ("entl", "entl"),
                ("gloss", "entl"), ("entl", "gloss")
            ],
            "default": [("gloss", "gloss")]
        }

    def get_longest_common_substring(self, tokens_a, tokens_b):
        """Find the longest run of tokens shared by both lists and mask it.

        The matched run is overwritten with "*" in BOTH input lists (they are
        modified in place), so that a subsequent call finds the next run.
        Tokens equal to "*" never take part in a match.  When several runs
        share the maximal length, the first one found while scanning
        ``tokens_a`` (then ``tokens_b``) left to right is used.

        Example:
            a = ["a", "b", "c", "d"]
            b = ["b", "c", "x", "c", "d"]
            length, new_a, new_b = disambiguator.get_longest_common_substring(a, b)
            # length == 2
            # new_a  == ["a", "*", "*", "d"]
            # new_b  == ["*", "*", "x", "c", "d"]

        Returns:
            Tuple ``(length, tokens_a, tokens_b)``; ``length`` is 0 when the
            lists share no token.
        """
        # lcs[i][j] = length of the common run ending at tokens_a[i-1] / tokens_b[j-1].
        lcs = [[0] * (1 + len(tokens_b)) for _ in range(1 + len(tokens_a))]
        best, best_position_a, best_position_b = 0, 0, 0
        for i in range(1, 1 + len(tokens_a)):
            for j in range(1, 1 + len(tokens_b)):
                if tokens_a[i - 1] == tokens_b[j - 1] and tokens_a[i - 1] != "*":
                    lcs[i][j] = lcs[i - 1][j - 1] + 1
                    if lcs[i][j] > best:
                        best = lcs[i][j]
                        best_position_a = i
                        best_position_b = j
                else:
                    lcs[i][j] = 0

        # Mask the matched run so it cannot be counted again.
        tokens_a[best_position_a - best:best_position_a] = ["*"] * best
        tokens_b[best_position_b - best:best_position_b] = ["*"] * best

        return best, tokens_a, tokens_b

    def get_overlap_score(self, text_a, text_b):
        """Score the token overlap between two texts.

        Repeatedly extracts the longest common token run and adds the square
        of its length, until no common token is left.

        Example:
            get_overlap_score("a b c d", "b c x c d")  # 5 = 2^2 + 1^2
        """
        overlap_score = 0
        tokens_a = nltk.word_tokenize(text_a)
        tokens_b = nltk.word_tokenize(text_b)
        sequence_length, tokens_a, tokens_b = self.get_longest_common_substring(tokens_a, tokens_b)
        while sequence_length > 0:
            # Longer shared sequences are rewarded quadratically.
            overlap_score += sequence_length * sequence_length
            sequence_length, tokens_a, tokens_b = self.get_longest_common_substring(tokens_a, tokens_b)

        return overlap_score

    def get_texts(self, target_tuple, window_tuple):
        """Collect every relation text for a target sense and a window sense.

        Args:
            target_tuple: ``(synset, pos)`` of the candidate target sense.
            window_tuple: ``(synset, pos)`` of a context-word sense.

        Returns:
            Two dicts ``(target, window)`` keyed by "gloss", "hypo", "mero",
            "anto", "sim" and "entl".
        """
        extractors = {
            "gloss": self.get_gloss_for_sense,
            "hypo": self.get_hyponyms_for_sense,
            "mero": self.get_meronyms_for_sense,
            "anto": self.get_antonyms_for_sense,
            "sim": self.get_similarity_for_sense,
            "entl": self.get_entailments_for_sense,
        }
        target = {key: extract(target_tuple[0]) for key, extract in extractors.items()}
        window = {key: extract(window_tuple[0]) for key, extract in extractors.items()}

        return target, window

    def get_enhanced_relatedness(self, target_pos, target_texts, window_texts):
        """Sum the overlap scores over the relation pairs listed for ``target_pos``.

        ``target_pos`` must be a key of ``self.rel_pairs``.
        """
        return sum(
            self.get_overlap_score(target_texts[target_rel], window_texts[window_rel])
            for target_rel, window_rel in self.rel_pairs[target_pos]
        )

    def get_relatedness(self, target_tuple, window_tuple):
        """Return the relatedness score of a target sense and a window sense.

        The relation pairs are chosen by the POS of the target; any POS other
        than NOUN / ADJ / VERB falls back to the "default" (gloss-gloss) pair.

        Example:
            mass = wn.synsets("mass", wn.NOUN)[0]
            kilogram = wn.synsets("kilogram", wn.NOUN)[0]
            disambiguator.get_relatedness((mass, 'NOUN'), (kilogram, 'NOUN'))
        """
        target_texts, window_texts = self.get_texts(target_tuple, window_tuple)
        target_pos = target_tuple[1]
        if target_pos not in self.CONTENT_POS:
            target_pos = "default"
        return self.get_enhanced_relatedness(target_pos, target_texts, window_texts)

    def get_gloss_for_sense(self, sense):
        """Return the definition of ``sense`` followed by its examples, joined by ". "."""
        return ". ".join([sense.definition()] + list(sense.examples()))

    def merge_glosses(self, synsets):
        """Concatenate the glosses of ``synsets``, each one prefixed with ". "."""
        return "".join(". " + self.get_gloss_for_sense(synset) for synset in synsets)

    def get_hyponyms_for_sense(self, sense):
        """Return the merged glosses of the hyponyms of ``sense``."""
        return self.merge_glosses(sense.hyponyms())

    def get_meronyms_for_sense(self, sense):
        """Return the merged glosses of the member, substance and part meronyms of ``sense``."""
        return ". ".join([
            self.merge_glosses(sense.member_meronyms()),
            self.merge_glosses(sense.substance_meronyms()),
            self.merge_glosses(sense.part_meronyms()),
        ])

    def get_antonyms_for_sense(self, sense):
        """Return the merged glosses of the synsets of every antonym of every lemma of ``sense``."""
        return "".join(
            ". " + self.merge_glosses([ant.synset() for ant in lemma.antonyms()])
            for lemma in sense.lemmas()
        )

    def get_similarity_for_sense(self, sense):
        """Return the merged glosses of the similar-to synsets of ``sense``."""
        return self.merge_glosses(sense.similar_tos())

    def get_entailments_for_sense(self, sense):
        """Return the merged glosses of the entailments of ``sense``."""
        return self.merge_glosses(sense.entailments())

    def get_target_sense_score(self, target_sense_tuple):
        """Score one candidate target sense against all senses of all window words.

        Reads ``self.window_words`` (set by disambiguate()).
        """
        target_sense_score = 0

        for window_word in self.window_words:
            for window_word_sense in self.get_tuple_senses(window_word):
                target_sense_score += self.get_relatedness(
                    target_sense_tuple, (window_word_sense, window_word[1])
                )

        return target_sense_score

    def remove_stop_words(self, processed_text):
        """Keep only NOUN / ADJ / VERB tuples whose word is not an English stop word."""
        # Load the stop-word list once instead of once per token.
        stop_words = set(stopwords.words('english'))
        return [
            word_tuple for word_tuple in processed_text
            if word_tuple[0] not in stop_words and word_tuple[1] in self.CONTENT_POS
        ]

    def remove_punctuation(self, processed_text):
        """Keep only 2-element tuples whose POS tag is not "." (punctuation)."""
        return [
            word_tuple for word_tuple in processed_text
            if len(word_tuple) == 2 and word_tuple[1] != "."
        ]

    def get_target_position(self, processed_text, stemmed_target):
        """Return ``(tuple, index)`` of the first tuple whose word equals ``stemmed_target``.

        Returns ``(None, -1)`` when the target does not occur.
        """
        for index, word_tuple in enumerate(processed_text):
            if stemmed_target == word_tuple[0]:
                return word_tuple, index
        return None, -1

    def get_tuple_senses(self, word_tuple):
        """Return the WordNet synsets of ``word_tuple = (word, pos)``.

        A POS without a WordNet counterpart in ``self.porter_to_wn``
        (punctuation, determiners, ...) yields an empty list instead of
        raising ``KeyError``.

        Example:
            get_tuple_senses(('mass', 'NOUN'))
            # [Synset('mass.n.01'), Synset('batch.n.02'), Synset('mass.n.03'), ...]
        """
        wordnet_pos = self.porter_to_wn.get(word_tuple[1])
        if wordnet_pos is None:
            return []
        return wn.synsets(word_tuple[0], wordnet_pos)

    def num_words_on_the_right(self, target_position, processed_text):
        """Return how many tuples follow ``target_position`` in ``processed_text``."""
        return (len(processed_text) - target_position - 1)

    def get_window_words(self, processed_text, stemmed_target):
        """Select the context window around the target word.

        Punctuation, stop words and non-content POS are removed first.  The
        window then holds up to ``2 * self.window_size`` words; when one side
        of the target is too short, the window is extended on the other side.

        Example (window_size=3):
            processed_text = [('the', 'DET'), ('mass', 'NOUN'), ('of', 'ADP'),
                              ('the', 'DET'), ('object', 'NOUN'), ('is', 'VERB'),
                              ('ten', 'ADJ'), ('kilogram', 'NOUN')]
            get_window_words(processed_text, "mass")
            # (('mass', 'NOUN'), [('object', 'NOUN'), ('ten', 'ADJ'), ('kilogram', 'NOUN')])

        Returns:
            ``(target_tuple, window_words)``, or ``(None, -1)`` (after
            printing a message) when the target is not in the filtered text.
        """
        processed_text = self.remove_punctuation(processed_text)
        processed_text = self.remove_stop_words(processed_text)
        target_tuple, target_position = self.get_target_position(processed_text, stemmed_target)
        if target_position == -1:
            print("Target word is not in the text!")
            return None, -1

        # A span of 2 * window_size + 1 tuples centred on the target, shifted
        # so that it stays inside the text; shorter texts are used entirely.
        span = 2 * self.window_size + 1
        start = max(0, min(target_position - self.window_size, len(processed_text) - span))
        end = start + span
        window_words = processed_text[start:target_position] + processed_text[target_position + 1:end]
        return target_tuple, window_words

    def get_stemmed_pairs(self, word_pos_pairs):
        """Return the ``(word, pos)`` pairs with every word replaced by its stem."""
        return [(self.stemmer.stem(word), pos) for word, pos in word_pos_pairs]

    def get_processed(self, text_string):
        """Tokenize, POS-tag (universal tagset) and stem ``text_string``.

        Example:
            disambiguator.get_processed("the mass of the object is ten kilograms")

        Returns:
            List of ``(stemmed_word, pos)`` tuples.
        """
        tokenized_words = nltk.word_tokenize(text_string)
        word_pos_pairs = nltk.pos_tag(tokenized_words, tagset='universal')
        return self.get_stemmed_pairs(word_pos_pairs)

    def disambiguate(self, text_string, target_word):
        """Pick the WordNet sense of ``target_word`` that best fits ``text_string``.

        The POS of the target is taken from its occurrence in the text; its
        candidate senses are looked up with the unstemmed ``target_word``.
        Stores the selected context words in ``self.window_words``.

        Example:
            disambiguator.disambiguate("the mass of the object is ten kilograms", "mass")
            disambiguator.disambiguate("the angry mass of people went after him", "mass")

        Returns:
            ``(predicted_synset, gloss)``; ``(None, None)`` when the target is
            not found among the content words of the text or has no WordNet
            sense for its POS.
        """
        processed_text = self.get_processed(text_string)
        stemmed_target = self.stemmer.stem(target_word)

        target_tuple, window_words = self.get_window_words(processed_text, stemmed_target)
        if target_tuple is None:
            # Keep window_words a list so later scoring calls stay well-defined.
            self.window_words = []
            return None, None
        self.window_words = window_words

        target_pos = target_tuple[1]
        best_sense, best_score = None, -1
        for target_sense in self.get_tuple_senses((target_word, target_pos)):
            target_sense_score = self.get_target_sense_score((target_sense, target_pos))
            # Strict comparison: on ties the earliest WordNet sense is kept.
            if target_sense_score > best_score:
                best_sense, best_score = target_sense, target_sense_score

        if best_sense is None:
            return None, None
        return best_sense, self.get_gloss_for_sense(best_sense)

In [254]:
class DisambiguatorTester:
    """Evaluates a Disambiguator on instances of NLTK's senseval corpus."""

    def __init__(self, window_size=3):
        """Create the tester and the Disambiguator it evaluates.

        Args:
            window_size: Passed through to ``Disambiguator``.
        """
        self.disambiguator = Disambiguator(window_size)
        # Maps senseval sense labels to the names of the WordNet synsets that
        # are accepted as a correct prediction for that label.
        self.sense_map = {
            "HARD1": ["difficult.a.01"],    # not easy, requiring great physical or mental
            "HARD2": ["hard.a.02",          # dispassionate
                      "difficult.a.01"],
            "HARD3": ["hard.a.03"],         # resisting weight or pressure
            "interest_1": ["interest.n.01"], # readiness to give attention
            "interest_2": ["interest.n.03"], # quality of causing attention to be given to
            "interest_3": ["pastime.n.01"],  # activity, etc. that one gives attention to
            "interest_4": ["sake.n.01"],     # advantage, advancement or favor
            "interest_5": ["interest.n.05"], # a share in a company or business
            "interest_6": ["interest.n.04"], # money paid for the use of money
            "cord": ["line.n.18"],          # something (as a cord or rope) that is long and thin and flexible
            "formation": ["line.n.01","line.n.03"], # a formation of people or things one beside another
            "text": ["line.n.05"],                 # text consisting of a row of words written across a page or computer screen
            "phone": ["telephone_line.n.02"],   # a telephone connection
            "product": ["line.n.22"],       # a particular kind of product or merchandise
            "division": ["line.n.29"],      # a conceptual separation or distinction
            "SERVE12": ["serve.v.02"],       # do duty or hold offices; serve in a specific function
            "SERVE10": ["serve.v.06"], # provide (usually but not necessarily food)
            "SERVE2": ["serve.v.01"],       # serve a purpose, role, or function
            "SERVE6": ["service.v.01"]      # be used by; as of a utility
        }

    @staticmethod
    def _join_words(context_items):
        """Join the words of ``(word, tag)`` context items with single spaces.

        Items that are bare strings (such as the 'FRASL' marker) carry no
        ``(word, tag)`` pair and are skipped.
        """
        return ' '.join(item[0] for item in context_items if not isinstance(item, str))

    @staticmethod
    def _percentage(correct, total):
        """Return ``correct / total`` as a percentage, or 0.0 when ``total`` is 0."""
        return correct / total * 100 if total else 0.0

    def get_test_case(self, instance):
        """Turn a senseval instance into a disambiguation test case.

        Args:
            instance: Object exposing ``position``, ``context`` and ``senses``
                the way the senseval corpus instances used by test() do.

        Returns:
            ``(phrase, target_word, target_synsets)`` where ``target_synsets``
            are the synsets accepted for the instance's first sense label.
        """
        pos = instance.position
        target_word = self._join_words(instance.context[pos:pos+1])

        # Both sides go through the same helper so that bare-string context
        # items are ignored on the right of the target as well as on the left.
        left = self._join_words(instance.context[0:pos])
        right = self._join_words(instance.context[pos+1:])
        phrase = left + " " + target_word + " " + right

        target_synsets_names = self.sense_map[instance.senses[0]]
        target_synsets = [wn.synset(name) for name in target_synsets_names]

        return phrase, target_word, target_synsets

    def test(self, first_corpus=2, instances_per_corpus=2):
        """Run the disambiguator on senseval instances and print a report.

        Args:
            first_corpus: Index into ``senseval.fileids()`` of the first
                corpus file to test; all following files are tested too.
            instances_per_corpus: Number of leading instances taken from
                each corpus file.
        """
        correct_predictions, total_tests = 0, 0
        corpuses = senseval.fileids()
        for corpus in corpuses[first_corpus:]:
            print("=" * 100)
            print("Testing " + corpus)
            print("=" * 100)
            for instance in senseval.instances(corpus)[0:instances_per_corpus]:
                phrase, target_word, target_synsets = self.get_test_case(instance)
                print("-" * 100)
                print("Phrase: " + phrase)
                print("_" * 5)
                print("Target Word: " + target_word)
                print("_" * 5)
                print("Target Synsets: " + str(target_synsets) + " meaning " + target_synsets[0].definition())
                print("_" * 5)
                result = self.disambiguator.disambiguate(phrase, target_word)
                # disambiguate() may yield no prediction; count that as a miss
                # instead of failing on the tuple unpacking.
                predicted_synset, description = result if result is not None else (None, None)
                print("Predicted Synset: " + str(predicted_synset) + " meaning: ")
                print(description)
                if predicted_synset in target_synsets:
                    correct_predictions += 1
                total_tests +=1
        print("=" * 100)
        print("=" * 100)
        print("Run " + str(total_tests) + " tests")
        print("Predicted " + str(correct_predictions) + " predictions")
        print("Percentage " + str(self._percentage(correct_predictions, total_tests)) + "%")
        print("=" * 100)
        print("=" * 100)

In [None]:
# Build a tester whose disambiguator is created with window_size=5.
tester = DisambiguatorTester(5)

In [None]:
# Run the senseval evaluation; prints each test case, the prediction and a final summary.
tester.test()

Testing line.pos
----------------------------------------------------------------------------------------------------
Phrase: the company argued that its foreman needn 't have told the worker not to move the plank to which his lifeline was tied because " that comes with common sense . " the commission noted , however , that dellovade hadn 't instructed its employees on how to secure their lifelines and didn 't heed a federal inspector 's earlier suggestion that the company install special safety lines inside the a-frame structure it was building .
_____
Target Word: lines
_____
Target Synsets: [Synset('line.n.18')] meaning something (as a cord or rope) that is long and thin and flexible
_____
Predicted Synset: Synset('line.n.18') meaning: 
something (as a cord or rope) that is long and thin and flexible. a washing line
----------------------------------------------------------------------------------------------------
Phrase: the set , designed by mr . hall 's longtime associate eugene

In [27]:
# List the senseval corpus files (one per ambiguous word).
senseval.fileids()

['hard.pos', 'interest.pos', 'line.pos', 'serve.pos']

In [124]:
# Scratch cell: initialise a counter and display it.
x = 0
x

0

In [126]:
# Scratch cell: increments x in place, so the displayed value depends on
# how many times this cell has been executed.
x +=1
x

2

In [147]:
# Sample sentence containing the ambiguous word "hard", plus a disambiguator
# with the default window size.
phrase = "`` he may lose all popular support , but someone has to kill him to defeat him and that 's hard to do . ''"
target_word = "hard"
diz = Disambiguator()

In [148]:
# Disambiguate "hard" in the sample sentence; the recorded run ended in KeyError: '.'.
diz.disambiguate(phrase, target_word)

hard
hard
Extract everything from the right and (self.window_size - self.num_words_on_the_right) more from the left


KeyError: '.'

In [137]:
# Disambiguate "object" in a short physics sentence.
diz.disambiguate("the mass of the object is ten kilograms", "object")

(Synset('object.n.01'),
 'a tangible and visible entity; an entity that can cast a shadow. it was full of rackets, balls and other objects')

In [149]:
# Look up all noun synsets of "mass" through the disambiguator's POS mapping.
word_tuple = ('mass', 'NOUN')
disambiguator = Disambiguator()
tuple_senses = disambiguator.get_tuple_senses(word_tuple)
print(tuple_senses)

[Synset('mass.n.01'), Synset('batch.n.02'), Synset('mass.n.03'), Synset('mass.n.04'), Synset('mass.n.05'), Synset('multitude.n.03'), Synset('bulk.n.02'), Synset('mass.n.08'), Synset('mass.n.09')]


In [150]:
# Disambiguate "mass" in the physics sentence with a fresh default disambiguator.
disambiguator = Disambiguator()
disambiguator.disambiguate("the mass of the object is ten kilograms", "mass")

mass
mass


(Synset('mass.n.01'),
 'the property of a body that causes it to have weight in a gravitational field')

In [151]:
# Same "hard" sentence without the surrounding quote tokens; the recorded run
# still ended in KeyError: '.'.
disambiguator.disambiguate("he may lose all popular support , but someone has to kill him to defeat him and that 's hard to do", "hard")

hard
hard
Extract everything from the right and (self.window_size - self.num_words_on_the_right) more from the left


KeyError: '.'

In [178]:
# Scratch cell: a 3-element tuple used to check len() on tuples.
tup = ("x","y","z")

In [179]:
# Number of elements in the tuple.
len(tup)

3

In [193]:
# Sanity check: single- and double-quoted "." are the same string, so this is False.
"." != '.'

False

In [205]:
# Inspect how the sample sentence is tokenized (quote marks and "." become separate tokens).
nltk.word_tokenize("`` he may lose all popular support , but someone has to kill him to defeat him and that 's hard to do . ''")

['``',
 'he',
 'may',
 'lose',
 'all',
 'popular',
 'support',
 ',',
 'but',
 'someone',
 'has',
 'to',
 'kill',
 'him',
 'to',
 'defeat',
 'him',
 'and',
 'that',
 "'s",
 'hard',
 'to',
 'do',
 '.',
 "''"]

In [217]:
# Probe for an adposition POS constant; the recorded run shows the WordNet reader has no ADP attribute.
wn.synsets("inside", wn.ADP)

AttributeError: 'WordNetCorpusReader' object has no attribute 'ADP'

In [226]:
# Fourth senseval corpus file.
senseval.fileids()[3]

'serve.pos'

In [239]:
# All synsets of "serve" across every POS, used to pick the names for the SERVE* sense labels.
wn.synsets("serve")

[Synset('serve.n.01'),
 Synset('serve.v.01'),
 Synset('serve.v.02'),
 Synset('serve.v.03'),
 Synset('service.v.01'),
 Synset('serve.v.05'),
 Synset('serve.v.06'),
 Synset('serve.v.07'),
 Synset('serve.v.08'),
 Synset('serve.v.09'),
 Synset('serve.v.10'),
 Synset('serve.v.11'),
 Synset('suffice.v.01'),
 Synset('serve.v.13'),
 Synset('serve.v.14'),
 Synset('serve.v.15')]