In [56]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
# from nltk.stem import *

In [12]:
text = nltk.word_tokenize("the mass of the object is ten kilograms")

In [10]:
text = "the mass of the object is ten kilograms"

In [17]:
pos = nltk.pos_tag(text, tagset='universal')

In [131]:
syns = wn.synsets('dying')

In [132]:
syns[0].hyponyms()

[Synset('grave.n.01')]

In [135]:
syns[0].lemmas()

[Lemma('death.n.04.death'),
 Lemma('death.n.04.dying'),
 Lemma('death.n.04.demise')]

In [134]:
syns[0].lemmas()[0].antonyms()

[Lemma('birth.n.01.birth')]

In [24]:
stemmer = PorterStemmer()

In [23]:
pos[5][0]

'is'

In [39]:
pos[7][0]

'kilograms'

In [43]:
stemmed_pairs = []

In [44]:
stemmed = (stemmer.stem(pos[7][0]), pos[7][1])
stemmed_pairs.append(stemmed)

In [45]:
stemmed_pairs

[('kilogram', 'NOUN')]

In [37]:
stemmer.stem("dying")

'die'

In [128]:
class Disambiguator:
    
    def __init__(self, window_size):
        self.window_size = window_size
        self.window_words = []
        
        self.stemmer = PorterStemmer()
        self.porter_to_wn = {
            "NOUN": wn.NOUN,
            "VERB": wn.VERB,
            "ADJ" : wn.ADJ
        }
        # RELPAIRS
            # subst: gloss, hiponim, meronim
            # adj: gloss, antonim, similarity
            # verb: gloss, entailment
        
    def get_longest_common_subsequence(text_a, text_b):
        # !IMPORTANT, after finding the sequence, mark the words from both text_a and text_b
        # We do this to avoid using a word in multiple sequences
        # eg:
        # a b c d
        # b c x c d 
        # the c in the first string will be part of the bc sequence but not of cd
        return sequence_length, text_a, text_b
        
    def get_overlap_score(text_a, text_b):
        overlap_score = 0
        # while length > 0
            # sequence_length, text_a, text_b = get_longest_common_subsequence(text_a, text_b)
            # overlap_score = overlap_score + sequence_length * sequence_length # we square the length 
        
        return overlap_score
    
    def get_relatedness(sense_a, sense_b):
        relatedness = 0
        relatedness= relatedness + self.get_overlap_score(self.get_text_for_sense(sense_a), self.get_text_for_sense(sense_b))
        return relatedness
    
    def get_text_for_sense(self, sense):
        text_for_sense = ""
        text_for_sense = text_for_sense + sense.definition()
        for example in sense.examples():
            text_for_sense = text_for_sense + ". " + example
        return text_for_sense
        
    def get_target_sense_score(self, target_sense):
        target_sense_score = 0
        
        for window_word in self.window_words:
            window_word_senses = self.get_tuple_senses(window_word)
            for window_word_sense in window_word_senses:
                relatedness = self.get_relatedness(target_sense, window_word_sense)
                target_sense_score = target_sense_score + relatedness
        # for each window_word in self.window_words
            # get all window_word_senses for the window_word
            # for each window_word_sense in window_word_senses
                # relatedness = get_relatedness(target_sense, window_word_sense)
                # target_sense_score = target_sense_score + relatedness
        
        return target_sense_score
    
#     def get_window_words(self, text_string):
#         print("Set self.window_words here")
        # do POS-tagging
        # remove words without meaning( the)
        # get self.window_size words from the left and from the right
            # if not enough words on one side, 
                # add the rest from the opposite side
            # if not enough words on either sides(i.e. len(words(text_string)) < 2 * self.window_size + 1 ) 
                # just use all words in text_string except the target_word
    
    def remove_stop_words(self, processed_text):
        filtered_text = []
        for word_tuple in processed_text: 
          if word_tuple[0] not in stopwords.words('english'): 
            filtered_text.append(word_tuple)
        return filtered_text
    
    def get_target_position(self, processed_text, stemmed_target):
        for i in range(0,len(processed_text)):
            if stemmed_target == processed_text[i][0]:
                return processed_text[i], i
        return None, -1
    
    def get_tuple_senses(self, word_tuple):
        return wn.synsets(word_tuple[0], self.porter_to_wn[word_tuple[1]])
    
    def num_words_on_the_right(self, target_position, processed_text):
        return (len(processed_text) - target_position - 1)
    
    
    def get_window_words(self, processed_text, stemmed_target):
        processed_text = self.remove_stop_words(processed_text)
        target_tuple, target_position = self.get_target_position(processed_text, stemmed_target)
        print("Target position is " + str(target_position))
        if target_position == -1:
            print("Target word is not in the text!")
            return None, -1
        if (self.window_size * 2 + 1) > len(processed_text):
            window_words = processed_text[0:target_position] + processed_text[target_position+1:]
        elif 0 < (self.window_size - target_position): 
            left_words = processed_text[0:target_position]
            right_words = processed_text[target_position+1:2 * self.window_size + 1]
            # target_position + self.window_size + 1 + (self.window_size - target_position) =  2 * self.window_size + 1
            print("Extract everything from the left and (self.window_size - target_position) more from the right")
            window_words=left_words+right_words
        elif 0 < (self.window_size - self.num_words_on_the_right(target_position, processed_text) ):
            right_words = processed_text[target_position+1:]
            left_words = processed_text[len(processed_text) - 2*self.window_size - 1:target]
            # target_position-self.window_size-(self.window_size - (len(processed_text) - target_position - 1)) =
            # = -2*self.window_size + target_position - target_position + len(processed_text) - 1 = 
            # = len(processed_text) - 2*self.window_size - 1
            print("Extract everything from the right and (self.window_size - self.num_words_on_the_right) more from the left")
            window_words=left_words+right_words
        else:
            left_words = processed_text[target_position-self.window_size:target_position]
            right_words = processed_text[target_position+1:target_position+1+self.window_size]
            print("Extract self.window_size word tuples from each side")
            window_words=left_words+right_words
        return target_tuple, window_words
    
    def get_stemmed_pairs(self, word_pos_pairs):
        stemmed_pairs = []
        for word_pos_pair in word_pos_pairs:
            stemmed_pair = (self.stemmer.stem(word_pos_pair[0]), word_pos_pair[1])
            stemmed_pairs.append(stemmed_pair)
            
        return stemmed_pairs
    
    def get_processed(self, text_string):
        tokenized_words = nltk.word_tokenize(text_string)
        word_pos_pairs = nltk.pos_tag(text, tagset='universal')
        stemmed_pairs = self.get_stemmed_pairs(word_pos_pairs)
        
        return stemmed_pairs
        
    
    # eg:
    # > disambiguator.disambiguate("the mass of the object is ten kilograms", "mass")
    # The word "mass" has the meaning from synset *****( gloss of synset ***** - property of physical body)
    # > disambiguator.disambiguate("the angry mass of people went after him", "mass")
    # The word "mass" has the meaning from synset *****( gloss of synset ***** - crowd)       
    def disambiguate(self, text_string, target_word):
        disambiguated_sense = "Not Implemented Yet"
        processed_text = self.get_processed(text_string)
        print(processed_text)
        stemmed_target = self.stemmer.stem(target_word)
        target_tuple, self.window_words = self.get_window_words(processed_text, stemmed_target)
        if target_tuple == None:
            return
        print(self.window_words)
        target_senses = self.get_tuple_senses(target_tuple)
        print(target_senses)
        best_sense = (None, 0)
        for target_sense in target_senses:
            print(self.get_text_for_sense(target_sense))
        # for each target_sense in target_senses
            # calculate the target_sense_score
            target_sense_score = self.get_target_sense_score(target_sense)
            if target_sense_score > best_score[1] :
                best_score = (target_sense, target_sense_score)
        # take the target_sense with the 
        disambiguated_sense = self.get_text_for_sense(best_score[0])
        return disambiguated_sense

In [129]:
disambiguator = Disambiguator(3)

In [130]:
disambiguator.disambiguate("the mass of the object is ten kilograms", "object")

[('the', 'DET'), ('mass', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('object', 'NOUN'), ('is', 'VERB'), ('ten', 'ADJ'), ('kilogram', 'NOUN')]
Target position is 1
[('mass', 'NOUN'), ('ten', 'ADJ'), ('kilogram', 'NOUN')]
[Synset('object.n.01'), Synset('aim.n.02'), Synset('object.n.03'), Synset('object.n.04'), Synset('object.n.05')]
a tangible and visible entity; an entity that can cast a shadow. it was full of rackets, balls and other objects
the goal intended to be attained (and which is believed to be attainable). the sole object of her trip was to see her children
(grammar) a constituent that is acted upon. the object of the verb
the focus of cognitions or feelings. objects of thought. the object of my affection
(computing) a discrete item that provides a description of virtually anything known to a computer. in object-oriented programming, objects include data and define its status, its methods of operation and how it interacts with other objects


'Not Implemented Yet'

AttributeError: 'WordNetCorpusReader' object has no attribute 'Synset'