In [None]:
## Dynamic Programming ##
# bottom up - efficient
def fib1(n):
    a = []
    for i in range(n):
        if i>=2:
            a.append(a[i-1]+a[i-2])
        else:
            a.append(1)
    return a[n-1]

# recursive calls - lazy
def fib2(n):
    if n==1 or n==2:
        return 1
    else:
        return fib2(n-1)+fib2(n-2)

print(fib1(35))
fib2(35)

In [1]:
import nltk
import numpy as np
import pandas as pd
import os, sys, glob

nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from __future__ import print_function
from BeautifulSoup import BeautifulSoup as bs

[nltk_data] Downloading package wordnet to /Users/TaMmY/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
xml = bs(open('dictionary.xml').read())
train= open('train.data').read().split('\n')
validate= open('validate.data').read().split('\n')
test= open('test.data').read().split('\n')

# Corpus Lex Model - Augmenting dictionary using Training Data

In [6]:
def augmentXML(train, xml):
    for i in train:
        try:
            feature = i.split(' | ')
            train_word = feature[0]
            train_sense = feature[1]
            train_context = ''.join(feature[2].split('%% ')) # removing %% around target word

            # looking up the target word in train data inside the xml dict
            for j in xml.findAll('lexelt',{'item':train_word}):

                # looking for sense inside the target word xml tag and adding context to it
                for k in j.findAll('sense',{'id':train_sense}):
                    k['examples'] = k['examples']+' | '+train_context
                    k['examples'] = os.path.join(k['examples'])

        except Exception as e:
                print(e)

    ## Writing the newly augmented dictionary
    f = open('augmentedDictionary.xml', "w")
    f.write(augDict.prettify())
    f.close()  
    return xml

# augXml = augmentXML(train, xml)
augXml = bs(open('augmentedDictionary.xml').read())  

# Lesk Sense Using XML dict


In [8]:
def leskSense_xml(context_sentence, ambiguous_word, xml_item, corpus='gloss' , stem=True):
    max_overlaps = 0; lesk_sense = None
    context_sentence = context_sentence.split()

    for item in xml_item.findAll('sense'):
        
        item_dict = dict(item.attrs)
        lesk_dictionary = []

        # Includes gloss=definition or examples.
        lesk_dictionary+= item_dict[corpus].split()
        
        if stem == True: # Matching exact words causes sparsity, so lets match stems.
            lesk_dictionary = [ps.stem(i) for i in lesk_dictionary]
            context_sentence = [ps.stem(i) for i in context_sentence] 
        
        # gives all the possible overlaps of single, consecutive, or more overlaps.
        overlaps = set(lesk_dictionary).intersection(context_sentence)

        if len(overlaps) > max_overlaps:
            lesk_sense = item
            max_overlaps = len(overlaps)
    
    if lesk_sense is None:
        return 0, max_overlaps
    else:
        return str(lesk_sense.attrs[0][1]), max_overlaps

# Lesk Sense Using Wordnet

In [9]:
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from itertools import chain

ps = PorterStemmer()

def leskSense_wn(context_sentence, ambiguous_word, stem=True, hyperhypo=True):
    max_overlaps = 0; lesk_sense = None
    context_sentence = context_sentence.split()
    for ss in wn.synsets(ambiguous_word):
        
        lesk_dictionary = []

        # Includes definition.
        lesk_dictionary+= ss.definition().split()
        # Includes lemma_names.
        lesk_dictionary+= ss.lemma_names()

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            lesk_dictionary+= list(chain(*[i.lemma_names() for i in ss.hypernyms()+ss.hyponyms()]))       

        if stem == True: # Matching exact words causes sparsity, so lets match stems.
            lesk_dictionary = [ps.stem(i) for i in lesk_dictionary]
            context_sentence = [ps.stem(i) for i in context_sentence] 

        overlaps = set(lesk_dictionary).intersection(context_sentence)

        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)

    return lesk_sense


In [10]:
def sensePrediction(data=train, dictionary=None, corpus='gloss'):
    import time
    time_init = time.time()
    count = 0
    for i in data:
        
        lookup_dict = i.split(' | ')
        target_word = lookup_dict[0]
        target_sense = lookup_dict[1]
        context = lookup_dict[2]

        ## prediction based on Wordnet
        if dictionary is None:
            target_word = target_word.split('.')
            a = leskSense_wn(context, target_word[0], pos=target_word[1])
            if int(a.unicode_repr()[-4:-2]) == int(target_sense):
                count+=1
#                 print(target_word,"wn_sense: ", (a.unicode_repr()[-4:-2]), "train_sense: ",target_sense)

        ## prediction based on given dictionary
        else:
            for j in dictionary.findAll('lexelt',{'item':target_word}):
                a,b = leskSense_xml(context, target_word, j, corpus=corpus)
                if str(a) == str(target_sense):
                    count+=1
#                     print(target_word,"sense: ", a, "train_sense: ",target_sense,"max_overlaps: ", b)

    print('Time Taken: ', time.time()/60 - time_init/60, " minutes")
    return 100*count/float(len(data))


In [11]:
def sensePredictionTest(data, dictionary=None, corpus='gloss'):

    sense_list=[]
    for i in data:
        lookup_dict = i.split(' | ')
        target_word = lookup_dict[0]
        target_sense = lookup_dict[1]
        context = lookup_dict[2]
        
        ## prediction using Wordnet
        if dictionary is None:
            target_word = target_word.split('.')
            a = leskSense_wn(context, target_word[0], pos=target_word[1])
            sense_list.append(str(a.unicode_repr()[-4:-2]))
            
        ## prediction based on given dictionary
        else:
            for j in dictionary.findAll('lexelt',{'item':target_word}):
                a,b = leskSense_xml(context, target_word, j, corpus=corpus)
                sense_list.append(str(a))
                
    return sense_list

def writeResults(sense_list, filename):
    with open(filename, mode="w") as outfile:  
        for sense in sense_list:
            outfile.write("%s\n" % sense)

In [None]:
if __name__ == '__main__':
    
    print("=== Train Accuracy ===")
    print("Simple Lesk using definitions - Accuracy: ",sensePrediction(train[:3000],dictionary=xml)) #this will be same for augDict as using definitions
    print("Simple Lesk using examples - Accuracy: ",sensePrediction(train[:3000],dictionary=xml, corpus='examples'))
    print("Corpus Lesk Accuracy: ",sensePrediction(train[:3000],dictionary=augXml, corpus='examples'))
    print("Wordnet Accuracy: ",sensePrediction(train[:3000],dictionary=None))
    
    print("=== Vaidate Accuracy ===")
    print("Simple Lesk using definitions - Accuracy: ",sensePrediction(validate,dictionary=xml)) #this will be same for augDict as using definitions
    print("Simple Lesk using examples - Accuracy: ",sensePrediction(validate,dictionary=xml, corpus='examples'))
    print("Corpus Lesk Accuracy: ",sensePrediction(validate,dictionary=augXml, corpus='examples'))
    print("Wordnet Accuracy: ",sensePrediction(validate,dictionary=None))
    
    print("=== Test Prediction ===")
    print("==Writing output for Simple/Corpus Lesk with definition")
    writeResults(sensePredictionTest(data = test, dictionary=xml, corpus='gloss'), filename='testPredictionGloss.txt')
#     print("==Writing output for Simple Lesk with examples")
#     writeResults(sensePredictionTest(data = test, dictionary=xml, corpus='examples'), filename='testPredictionSimpleLeskExamples.txt')
#     print("==Writing output for Corpus Lesk with examples")
#     writeResults(sensePredictionTest(data = test, dictionary=augXml, corpus='examples'), filename='testPredictionCorpusLeskExamples.txt')


=== Train Accuracy ===
Time Taken:  1.80645370111  minutes
Simple Lesk using definitions - Accuracy:  20.0666666667
Time Taken:  2.31064834818  minutes
Simple Lesk using examples - Accuracy:  49.2333333333


In [13]:
import pandas as pd

prob_s = [i.split(' | ') for i in train]
df = pd.DataFrame(prob_s)
df.columns = ['word.pos','sense','example']
df['sense_prior'] = 1
df1 = df.groupby(['word.pos','sense']).agg({'sense_prior': 'sum'})
sense_priors = df1.groupby(level=0).apply(lambda x: x / float(x.sum()))
sense_priors.reset_index()



Unnamed: 0,word.pos,sense,sense_prior
0,affect.v,1,1.000000
1,allow.v,1,0.907407
2,allow.v,2,0.092593
3,announce.v,1,0.988636
4,announce.v,2,0.011364
5,approve.v,1,0.943396
6,approve.v,2,0.056604
7,area.n,1,0.736196
8,area.n,2,0.233129
9,area.n,3,0.030675


# Naive Bayes WSD

In [None]:
def get_V(train):    
    un_train = []
    t = ""
    a = []
    for i in train[:1000]:
            feature  = i.split(' | ')
            t  = feature[0]
            un_train.append(t)
            [a.append(x) for x in un_train  if x not in a]
    return a

In [None]:
def preparingTrainingData(train,vocab):
    docs = {}
    words = {}
    sense_count = []
    index = 0 
    idx = 0
    temp = 0
    for i in train:
        feature = i.split(' | ')
        train_word = feature[0]
        if train_word == vocab:
            train_sense = feature[1]
            sense_count += train_sense
            train_context = ''.join(feature[2].split('%% ')) # removing %% around target word
            if train_word <> temp:
                ct = {}
                sense = []
                temp = train_word
                if train_sense not in sense:
                    sense += train_sense
                    words[train_word] = sense
                    ct[train_sense] = ct.get(train_sense,'')+''+train_context
                else:
                    ct[train_sense] = ct.get(train_sense,'')+''+train_context
            else:

                if train_sense not in sense:
                    sense += train_sense 
                    words[train_word] = sense
                    ct[train_sense] = ct.get(train_sense,'')+''+train_context
                else:
                    ct[train_sense] = ct.get(train_sense,'')+''+train_context
            docs[train_word] = ct
    return docs,sense_count

In [None]:
def getUniquesensecount(sense_count):
    sc = {}
    usense = []
    [usense.append(x) for x in sense_count if x not in usense]
    usense
    for s in sense_count:
        sc[s]  = sense_count.count(s)
    return sc

In [None]:
def revise_documents(docs):
    stemmer = PorterStemmer()
    senses = {}     # {reference:sense}
    count = 0 
    ulist = []
    b ={}
    for word, l in docs.items():
        for s, text in l.items():
            words = re.findall(r"[\w']+", text)
            word_list = []
            for w in words:
                word_list.append(stemmer.stem(w.lower(),))
            count += len(word_list)
            #print(word_list)# get word count for the whole doc
            b[s] = word_list
            [ulist.append(x) for x in word_list if x not in ulist]
    docs[word] = b
    #print(l.get('1'))
    count = len(ulist)
    return docs, count

In [None]:
def revise_documents_test(docs_test):
    stemmer = PorterStemmer()
    senses = {}     # {reference:sense}
    count = 0 
    ulist = []
    l ={}
    for word, text in docs.items():
        
        words = re.findall(r"[\w']+", str(text))
        word_list = []
        for w in words:
            word_list.append(stemmer.stem(w.lower(),))
        count += len(word_list)
        #print(word_list)# get word count for the whole doc
        l[s] = word_list
        [ulist.append(x) for x in word_list if x not in ulist]
    docs[word] = l
    #print(l.get('1'))
    count = len(ulist)
    return docs, count

In [None]:
def get_count_of_word_in_sense(word, context):
    count = 0       # Returns count of given word in given sense
    for w in context:      
        if w == word:
            count += 1
    #print("count each word",count)
    return count

In [None]:
def get_count_of_word_in_sense_test(word,sense_train,train_docs):
    count = 0 
    for word_train,sense in train_docs.items():
        for s, test_text in sense.items():
            for w in test_text:
                if s == sense_train:
                    if w == word:
                        count += 1
    return count

In [None]:
def naive_bayes(docs,count):
    output = []
    for word, senseList in docs.items():
        V = count # Total count of words in train set
        print("Variables ",V)
        probabilities = {}
        
        for sense, context in senseList.items():
            prob = float(sc.get(sense))/len(sense_count)  # P(class) prior
            for word in context:       # P(word|sense) = count(word,sense) + 1 / count(sense) + V
                # selected word count in given sense + 1 / all word count in given sense
                p = (get_count_of_word_in_sense(word, context) + 1) / (float(len(context))+ V)
                prob *=  p
            probabilities[sense] = prob
        best = max(probabilities, key=probabilities.get)
        print(probabilities)
        print("best" +" " + str(best))
        
        #output.append(str(ref) + " " + str(best))
    #print_txt(output, path)

In [None]:
def naive_bayes_test(sense_train,test_docs,count,docs):
    output = []
    for word, context in test_docs.items():
        V = count       
        #print("Total count::::",count, sense_train)
       
        #print(sc.get(sense_train)," ",len(sense_count))
        prob_test = float(sc.get(sense_train))/len(sense_count)
        #print("sense probabilities:::",prob_test)
        for word in context:       # P(word|sense) = count(word,sense) + 1 / count(sense) + V
            # selected word count in given sense + 1 / all word count in given sense
            #print("word:::",word)
            p_test = (get_count_of_word_in_sense_test(word,sense_train,docs) + 1) / (float(len(context))+ V)
            #print("print test",p_test)
            prob_test *=  p_test
            #print("prob test",prob_test)
        probabilities_test= float(prob_test)
        #best_test = max(probabilities_test, key=probabilities_test.get)
        #output += best_test 
        
    return probabilities_test

In [None]:
def print_txt(item, path):
    f = open(path, "w")
    for i in item:
        f.write(i + "\n")
    f.close()

In [None]:
if __name__ == '__main__':
    vocab  =  get_V(train)
    print(vocab)
    output = []
    for v in vocab:
        print(v)
        docs,sense_count = preparingTrainingData(train,v)
        sc = getUniquesensecount(sense_count)
        print("sense ",sc )
        docs,count = revise_documents(docs) #get total variables
        #print(docs)
        for i in test:
            feature = i.split(' | ')
            test_word = feature[0]
            if feature[0] == v:
                test_context = ''.join(feature[2].split('%% '))
                docs_test[test_word] = test_context
                docs_test,count       = revise_documents_test(docs_test)
                prob = {}
               
                for s,n in sc.items():
                    #for word, senseList in docs.items():
                        #for sense, context in senseList.items():
                            #print("how many sense:::",sense)
                    prob[s]=naive_bayes_test(s,docs_test,count,docs)
                
                best_test = max(prob, key=prob.get)
                print(prob)
                output.append(str(prob)+" "+str(test_word)+ " best sense "+ str(max(best_test)))
    print_txt(output, "ouput_naive")