In [13]:
import nltk
import random, math, copy, os, string
from scipy.stats import pearsonr
from nltk.metrics import jaccard_distance
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn  
from nltk.corpus import wordnet_ic
from nltk.wsd import lesk


from google.colab import drive
drive.mount('/content/drive')

nltk.download('wordnet')
nltk.download('universal_tagset') 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('wordnet_ic')
nltk.download('averaged_perceptron_tagger')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet_ic.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]      

True

In [6]:
import pandas as pd
dt = pd.read_csv('/content/drive/My Drive/content/drive/test-gold/STS.input.SMTeuroparl.txt',sep='\t',header=None)
dt['gs'] = pd.read_csv('/content/drive/My Drive/content/drive/test-gold/STS.gs.SMTeuroparl.txt',sep='\t',header=None)
dt.head()
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [14]:
#Document structure
sw = nltk.corpus.stopwords.words('english')
Jaccard_Dist=[] #Here we will store the Jaccard Distances between the pairs
tok=[] #Here we will keep a tokenized version of the pairs of phrases
for i in range(len(dt)):
  sent=[set(nltk.word_tokenize(dt[0][i])),set(nltk.word_tokenize(dt[1][i]))] 
  for n in [0,1]:
    sent[n]=set([w.lower() for w in sent[n] if w.lower() not in sw and w not in string.punctuation and w.isalpha()]) 
  tok.append(sent)
  Jaccard_Dist.append(jaccard_distance(sent[0],sent[1]))
  
dt['Jaccard']=Jaccard_Dist 
dt.head()

In [15]:
#morphology
def lemmatize(p):
    if p[1][0] in {'N','V'}:
        return wnl.lemmatize(p[0], pos=p[1][0].lower()) 
    return p[0]

wnl = nltk.stem.WordNetLemmatizer() #We will use this lemmatizer
PoS=[] #Here we will store the PoS tags

for i in range(len(dt)):
   PoS.append([set(nltk.pos_tag(tok[i][0])),set(nltk.pos_tag(tok[i][1]))])

Lemmas=[] 
lem=[] #auxiliary vector
for i in range(len(dt)): 
  for n in [0,1]:
    lem.append(set([lemmatize(w) for w in PoS[i][n]]))
  Lemmas.append(lem.copy())
  lem.clear()
del lem

Lem_Dist=[]
for i in range(len(dt)):
  Lem_Dist.append(jaccard_distance(Lemmas[i][0],Lemmas[i][1]))
dt['Lemma']=Lem_Dist
dt.head()

In [20]:
#Lesk algorithm

# Create the list to store Part of Speech Tags by nltk standard that wordnet deals with
Admissible_PoS=['NOUN', 'VERB', 'ADJ', 'ADV']

#Now we are going to change the Part of Speech tag to the format that Wordnet uses
pos_in_wn = {'DET':None, 'NOUN':'n', 'VERB':'v', 'ADJ':'a', 'ADV': 'r' , 'ADP':None, 'CONJ':None, 'NUM':None, 'PRT':None, 'PRON':None, '.':None, 'X':None }

#Tokenize the sentences
tokenized_sents = []
for i in range(len(dt)):
  sent = [nltk.word_tokenize(dt[0][i]),nltk.word_tokenize(dt[1][i])] 
  tokenized_sents.append(sent)

#First we apply the lesk algorithm, and keep the synsets
lesk = copy.deepcopy(tok)
for i, sent in enumerate(lesk): 
 for n in range(2):
   sent[n]=nltk.pos_tag(sent[n],tagset='universal')   
   sent[n] = set([nltk.wsd.lesk(tokenized_sents[i][n], w[0],pos_in_wn[w[1]]) for w in sent[n] if pos_in_wn[w[1]] is not None])


#Calculating the Jaccard distance for the resulting synsets 
Jaccard_Lesk = []
for sent in lesk:
  Jaccard_Lesk.append(jaccard_distance(sent[0],sent[1]))
dt['Lesk']=Jaccard_Lesk 

dt.head()


Unnamed: 0,0,1,gs,Jaccard,Lemma,Lesk
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5,0.692308,0.692308,0.555556
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0,0.285714,0.285714,0.5
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25,0.727273,0.727273,0.625
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5,0.25,0.25,0.25
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0,0.0,0.0,0.0


In [21]:
print([pearsonr(dt['gs'], dt['Jaccard'])[0],pearsonr(dt['gs'], dt['Lemma'])[0], pearsonr(dt['gs'], dt['Lesk'])[0]])

[-0.45279693414784167, -0.47130316108106723, -0.3540038658213318]


4)From the above pearson correlation we can conclude that lesk's algorithm in comparison to document structure and morphology is very inefficient and the results are mostly empty.

5)Compare the results with gold standard by giving the pearson correlation between them.

In [22]:
print([pearsonr(dt['Lesk'], dt['Jaccard'])[0]])

[0.8349367273950777]
