## Run the following command in terminal to download GoogleNews dataset in the models folder we have made: <br/>
## wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [1]:
from gensim.models import KeyedVectors
model_w2v = KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin.gz',binary=True)

In [13]:
from nltk.corpus import semcor
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import time
import re
import warnings
from sklearn import metrics
warnings.filterwarnings("ignore")

# Generating word vectors for Semcor data

In [3]:
vectors = {}
semcor_sents = semcor.sents()
for sentence in semcor_sents:
    for word in sentence:
        if word in model_w2v:
            vectors[word] = model_w2v.get_vector(word)


## Function to get a single vector for entire sentence.

In [4]:
def sent_vec(sentence):
    li = []
    for word in sentence:
        if word in vectors:
            li.append(vectors[word])
    if len(li) == 0:
        return np.zeros(300)
    return np.average(li,axis=0)

tf_idf_model = TfidfVectorizer()
tf_idf_model.fit([" ".join(list(x)) for x in (semcor_sents)])
idf_values = dict(zip(tf_idf_model.get_feature_names(),list(tf_idf_model.idf_)))


def tf_idf_sent_vec(sentence):
    sentence_vector = np.zeros(300)
    total_weight = 0
    for word in sentence:
        if word in vectors:
            if word in idf_values.keys():
                tf_idf = idf_values[word.lower()]*(sentence.count(word)/len(sentence))
            else:
                tf_idf = 0
            sentence_vector += (vectors[word] * tf_idf)
            total_weight += tf_idf
    if total_weight != 0:
        return sentence_vector/total_weight
    else:
        return sentence_vector


### Predict_synset without TF_IDF with Extended Lesk approach

In [5]:
stop_words = set(stopwords.words('english'))

def predict_synset_extended(word, sentence, label):
    sentence = [w for w in sentence if (w not in stop_words) and (w.isalnum())]

    if word in sentence:
        sentence.remove(word)
        
    context_bag = sent_vec(sentence)
    
    
    sense_bag = {}
    senses = wn.synsets(word, pos = label)

    if len(senses)>0:
        for sense in senses:
            sense_bag[sense] = []
            sense_bag[sense].append(sent_vec([w for w in sense.definition().split() if (w not in stop_words) and (w.isalnum())]))

            for hypo in sense.hyponyms():
                sense_bag[sense].append(sent_vec([w for w in hypo.definition().split() if (w not in stop_words) and (w.isalnum())]))
        
        if len(sense_bag.keys()) > 0:
            synset = ""

            cos_sims = []
            for key,val in sense_bag.items():
                cos_sims.append((key,model_w2v.cosine_similarities(context_bag,val).mean()))#,model_w2v.cosine_similarities(context_bag,val)))
            cos_sims.sort(key = lambda x: x[1],reverse=True)
            return str(cos_sims[0][0])[8:-2]
    else:
        return "NA"

sentence = ['The', 'in', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
print(predict_synset_extended("charge",sentence,'v'))



charge.v.17


### Predict_synset without TF_IDF with Lesk approach

In [6]:
stop_words = set(stopwords.words('english'))

def predict_synset(word, sentence, label):
    sentence = [w for w in sentence if (w not in stop_words) and (w.isalnum())]
    
    if word in sentence:
        sentence.remove(word)
        
    context_bag = sent_vec(sentence)
        
    sense_bag = []
    senses = wn.synsets(word, pos = label)
    
    if len(senses)>0:
        max_cosine_similarity = 0
        synset = ""
        i = 0 
        for sense in senses:
            sense_bag.append(sent_vec([w for w in sense.definition().split() if (w not in stop_words) and (w.isalnum())]))   
            i += 1

        cosine_similarities = model_w2v.cosine_similarities(context_bag,sense_bag)
        max_cosine_similarity = 0
        synset = ""
        for i in range(len(cosine_similarities)):
            if(max_cosine_similarity < cosine_similarities[i]):
                max_cosine_similarity = cosine_similarities[i]
                synset = senses[i]
        return str(synset)[8:-2]
    else:
        return "NA"

sentence = ['The', 'in', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
print(predict_synset("charge",sentence,'v'))


charge.v.02


### Predict_synset with TF_IDF Lesk approach

In [7]:
def predict_synset_with_tf_idf(word, sentence, label):
    sentence = [w for w in sentence if (w not in stop_words) and (w.isalnum())]
    
    if word in sentence:
        sentence.remove(word)
    context_bag = tf_idf_sent_vec(sentence)
    sense_bag = []
    senses = wn.synsets(word, pos = label)
    
    if len(senses)>0:
        max_cosine_similarity = 0
        synset = ""
        i = 0 
        for sense in senses:
            sense_bag.append(tf_idf_sent_vec([w for w in sense.definition().split() if (w not in stop_words) and (w.isalnum())]))   
            i += 1

        cosine_similarities = model_w2v.cosine_similarities(context_bag,sense_bag)
        max_cosine_similarity = 0
        synset = ""
        for i in range(len(cosine_similarities)):
            if(max_cosine_similarity < cosine_similarities[i]):
                max_cosine_similarity = cosine_similarities[i]
                synset = senses[i]
        return str(synset)[8:-2]
    else:
        return "NA"

sentence = ['The', 'in', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
print(predict_synset_with_tf_idf("charge",sentence,"v"))


charge.v.02


### Predict_synset with TF_IDF with Extended Lesk approach

In [9]:
stop_words = set(stopwords.words('english'))

def predict_synset_with_tf_idf_extended(word, sentence, label):
    sentence = [w for w in sentence if (w not in stop_words) and (w.isalnum())]

    if word in sentence:
        sentence.remove(word)
        
    context_bag = tf_idf_sent_vec(sentence)

    sense_bag = {}
    senses = wn.synsets(word, pos = label)

    if len(senses)>0:
        for sense in senses:
            sense_bag[sense] = []
            sense_bag[sense].append(tf_idf_sent_vec([w for w in sense.definition().split() if (w not in stop_words) and (w.isalnum())]))

            for hypo in sense.hyponyms():
                sense_bag[sense].append(tf_idf_sent_vec([w for w in hypo.definition().split() if (w not in stop_words) and (w.isalnum())]))

        if len(sense_bag.keys()) > 0:
            synset = ""

            cos_sims = []
            for key,val in sense_bag.items():
                cos_sims.append((key,model_w2v.cosine_similarities(context_bag,val).mean()))#,model_w2v.cosine_similarities(context_bag,val)))
            cos_sims.sort(key = lambda x: x[1],reverse=True)
            return str(cos_sims[0][0])[8:-2]
    else:
        return "NA"

sentence = ['The', 'in', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
print(predict_synset_extended("charge",sentence,'v'))

charge.v.17


# Getting sysnets for entire data without TF_IDF

In [10]:
start = time.time()

max=0
words = []
actual_synsets = []
predicted_synsets = []
predicted_synsets_extended = []

i=0
j=0
semcor_tagged_sents = semcor.tagged_sents(tag='sem')
for sent in semcor_tagged_sents[:] :
    sentence = semcor_sents[i]
    i += 1
    for word in sent:
        if type(word) == list:
            continue
        if (word.height() == 3):
            max = word.height() 
            if str(word[0])[1:3] == "NE":
                continue

        elif ((word.height() == 2) and str(word.label()) != "NE"):
            words.extend(word.leaves())
            if not isinstance(word.label(), str) :
                actual_synsets.extend([word.label().synset().name() for x in word.leaves()])
            else:
                actual_synsets.extend([word.label() for x in word.leaves()])

            for x in word.leaves():
                    predicted_synsets.append(predict_synset(x,sentence, re.findall('.*\.([asrnv])\..*',str(word.label()))[0]))
                    predicted_synsets_extended.append(predict_synset_extended(x,sentence, re.findall('.*\.([asrnv])\..*',str(word.label()))[0]))
                    j += 1

print(time.time() - start)


425.95264196395874


# Extended Lesk without TF_IDF Results

In [14]:
true_pos = 0
total = 0
for i in range(len(actual_synsets)):
    total += 1
    if actual_synsets[i]==predicted_synsets_extended[i]:
        true_pos += 1
print("TP={}\tTOTAL={}\tAccuracy={}".format(true_pos,total,true_pos/total))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets, predicted_synsets_extended,average="macro"))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets, predicted_synsets_extended,average="micro"))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets, predicted_synsets_extended,average="weighted"))


TP=89445	TOTAL=240415	Accuracy=0.37204417361645487
Precision, Recall, F1 Score (0.3646944860867053, 0.3588727112118985, 0.33204431721211286, None)
Precision, Recall, F1 Score (0.37204417361645487, 0.37204417361645487, 0.37204417361645487, None)
Precision, Recall, F1 Score (0.5298931771302642, 0.37204417361645487, 0.3710714289336667, None)


# Lesk without TF_IDF Results

In [15]:
true_pos = 0
total = 0
for i in range(len(actual_synsets)):
    total += 1
    if actual_synsets[i]==predicted_synsets[i]:
        true_pos += 1
print("TP={}\tTOTAL={}\tAccuracy={}".format(true_pos,total,true_pos/total))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets, predicted_synsets,average="macro"))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets, predicted_synsets,average="micro"))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets, predicted_synsets,average="weighted"))


TP=82599	TOTAL=240415	Accuracy=0.34356841295260276
Precision, Recall, F1 Score (0.35864556702979183, 0.3537667531101894, 0.32804743922137636, None)
Precision, Recall, F1 Score (0.34356841295260276, 0.34356841295260276, 0.34356841295260276, None)
Precision, Recall, F1 Score (0.5133966728897991, 0.34356841295260276, 0.3496021300937267, None)


# Getting sysnets for entire data with TF_IDF

In [16]:
start = time.time()

max=0
words_with_tf_idf = []
actual_synsets_with_tf_idf = []
predicted_synsets_with_tf_idf = []
predicted_synsets_with_tf_idf_extended = []

i=0
j=0
for sent in semcor_tagged_sents[:int(len(semcor_tagged_sents)*0.1)]:
    sentence = semcor.sents()[i]
    i += 1
    for word in sent:
        if type(word) == list:
            continue
        if (word.height() == 3):
            max = word.height() 
            if str(word[0])[1:3] == "NE":
                continue

        elif ((word.height() == 2) and str(word.label()) != "NE"):
                words_with_tf_idf.extend(word.leaves())
                if not isinstance(word.label(), str) :
                    actual_synsets_with_tf_idf.extend([word.label().synset().name() for x in word.leaves()])
                else:
                    actual_synsets_with_tf_idf.extend([word.label() for x in word.leaves()])

                for x in word.leaves():
                        predicted_synsets_with_tf_idf.append(predict_synset_with_tf_idf(x,sentence, re.findall('.*\.([asrnv])\..*',str(word.label()))[0]))
                        predicted_synsets_with_tf_idf_extended.append(predict_synset_with_tf_idf_extended(x,sentence, re.findall('.*\.([asrnv])\..*',str(word.label()))[0]))
                        j += 1

print(time.time() - start)

2309.4560351371765


# Lesk with Tf-Idf values results:

In [17]:
true_pos = 0
total = 0
for i in range(len(actual_synsets_with_tf_idf)):
    total += 1
    if actual_synsets_with_tf_idf[i]==predicted_synsets_with_tf_idf[i]:
        true_pos += 1
print("TP={}\tTOTAL={}\tAccuracy={}".format(true_pos,total,true_pos/total))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets_with_tf_idf, predicted_synsets_with_tf_idf,average="macro"))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets_with_tf_idf, predicted_synsets_with_tf_idf,average="micro"))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets_with_tf_idf, predicted_synsets_with_tf_idf,average="weighted"))

TP=16947	TOTAL=43529	Accuracy=0.38932665579268994
Precision, Recall, F1 Score (0.31789091914840145, 0.30383029489680646, 0.2921225720181405, None)
Precision, Recall, F1 Score (0.38932665579268994, 0.38932665579268994, 0.38932665579268994, None)
Precision, Recall, F1 Score (0.538804069693409, 0.38932665579268994, 0.4064881152938287, None)


# Extended Lesk with Tf-Idf values results:

In [18]:
true_pos = 0
total = 0
for i in range(len(actual_synsets_with_tf_idf)):
    total += 1
    if actual_synsets_with_tf_idf[i]==predicted_synsets_with_tf_idf_extended[i]:
        true_pos += 1
print("TP={}\tTOTAL={}\tAccuracy={}".format(true_pos,total,true_pos/total))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets_with_tf_idf, predicted_synsets_with_tf_idf_extended,average="macro"))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets_with_tf_idf, predicted_synsets_with_tf_idf_extended,average="micro"))
print("Precision, Recall, F1 Score",metrics.precision_recall_fscore_support(actual_synsets_with_tf_idf, predicted_synsets_with_tf_idf_extended,average="weighted"))


TP=18065	TOTAL=43529	Accuracy=0.41501068253348344
Precision, Recall, F1 Score (0.3212415951340641, 0.3065779458714964, 0.29455392045190104, None)
Precision, Recall, F1 Score (0.41501068253348344, 0.41501068253348344, 0.41501068253348344, None)
Precision, Recall, F1 Score (0.5490601696657545, 0.41501068253348344, 0.4251691901585739, None)


## Function to check for individual test cases:

In [19]:
def predict_ad_hoc(word,sentence,pos_tag):
    print("Sentence = ",sentence)
    print("Lesk Result = ",predict_synset(word,sentence.split(" "),pos_tag))
    print("Extended Lesk Result = ",predict_synset_extended(word,sentence.split(" "),pos_tag))
    print("Lesk with tf-idf sentence vectorizer Result = ",predict_synset_with_tf_idf(word,sentence.split(" "),pos_tag))
    print("Extended Lesk tf-idf sentence vectorizer Result = ",predict_synset_with_tf_idf_extended(word,sentence.split(" "),pos_tag),"\n\n")


def print_all_gloss(word,pos_tag):
    for gloss in wn.synsets(word,pos_tag):
        print(str(gloss)," = ",gloss.definition())

In [20]:
predict_ad_hoc("bank", "I went to the bank to withdraw some money.", "n")
predict_ad_hoc("bank", "I went to the bank to deposit some money.", "n")
predict_ad_hoc("bank", "I went to the bank institution to withdraw some money.", "n")

predict_ad_hoc("bank", "I went to the bank to have a bath in the river.", "n")
predict_ad_hoc("bank", "I went to the bank to have a bath in the river body.", "n")

predict_ad_hoc("school", "The school is seeing return of students.", "n")

predict_ad_hoc("school", "The school of fish is swimming past the island.", "n")


Sentence =  I went to the bank to withdraw some money.
Lesk Result =  bank.n.07
Extended Lesk Result =  bank.n.07
Lesk with tf-idf sentence vectorizer Result =  bank.n.07
Extended Lesk tf-idf sentence vectorizer Result =  bank.n.07 


Sentence =  I went to the bank to deposit some money.
Lesk Result =  depository_financial_institution.n.01
Extended Lesk Result =  savings_bank.n.02
Lesk with tf-idf sentence vectorizer Result =  depository_financial_institution.n.01
Extended Lesk tf-idf sentence vectorizer Result =  depository_financial_institution.n.01 


Sentence =  I went to the bank institution to withdraw some money.
Lesk Result =  depository_financial_institution.n.01
Extended Lesk Result =  depository_financial_institution.n.01
Lesk with tf-idf sentence vectorizer Result =  depository_financial_institution.n.01
Extended Lesk tf-idf sentence vectorizer Result =  depository_financial_institution.n.01 


Sentence =  I went to the bank to have a bath in the river.
Lesk Result =  savin

In [21]:
print_all_gloss("bank","n")

Synset('bank.n.01')  =  sloping land (especially the slope beside a body of water)
Synset('depository_financial_institution.n.01')  =  a financial institution that accepts deposits and channels the money into lending activities
Synset('bank.n.03')  =  a long ridge or pile
Synset('bank.n.04')  =  an arrangement of similar objects in a row or in tiers
Synset('bank.n.05')  =  a supply or stock held in reserve for future use (especially in emergencies)
Synset('bank.n.06')  =  the funds held by a gambling house or the dealer in some gambling games
Synset('bank.n.07')  =  a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
Synset('savings_bank.n.02')  =  a container (usually with a slot in the top) for keeping money at home
Synset('bank.n.09')  =  a building in which the business of banking transacted
Synset('bank.n.10')  =  a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turni

In [22]:
print_all_gloss("school","n")

Synset('school.n.01')  =  an educational institution
Synset('school.n.02')  =  a building where young people receive education
Synset('school.n.03')  =  the process of being formally educated at a school
Synset('school.n.04')  =  a body of creative artists or writers or thinkers linked by a similar style or by similar teachers
Synset('school.n.05')  =  the period of instruction in a school; the time period when school is in session
Synset('school.n.06')  =  an educational institution's faculty and students
Synset('school.n.07')  =  a large group of fish


In [23]:
predict_ad_hoc("bank", "The water overflowed the bank rooms filled with money.", "n")


Sentence =  The water overflowed the bank rooms filled with money.
Lesk Result =  savings_bank.n.02
Extended Lesk Result =  bank.n.01
Lesk with tf-idf sentence vectorizer Result =  bank.n.01
Extended Lesk tf-idf sentence vectorizer Result =  bank.n.01 




In [24]:
predict_ad_hoc("bank", "I went to a bank which is situated on the river bank.", "n")


Sentence =  I went to a bank which is situated on the river bank.
Lesk Result =  bank.n.01
Extended Lesk Result =  bank.n.01
Lesk with tf-idf sentence vectorizer Result =  bank.n.01
Extended Lesk tf-idf sentence vectorizer Result =  bank.n.01 


