# Exercise 1

## Necessary imports and dataset loading

In [1]:
from nltk.corpus import wordnet as wn
import nltk
nltk.download('wordnet')
import pandas as pd
import math

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lores\AppData\Roaming\nltk_data...


In [2]:
# Reading the file
df = pd.read_csv('dataset\combined.csv')
display(df)

Unnamed: 0,Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.00
3,book,paper,7.46
4,computer,keyboard,7.62
...,...,...,...
348,shower,flood,6.03
349,weather,forecast,8.34
350,disaster,area,6.25
351,governor,office,6.34


## Support functions

In [3]:
# Defining the function to get the max depth of the synset

def max_depth_synset(synset):
    return max(len(path) for path in synset.hypernym_paths())

In [4]:
# Defining the function to get the least common subsumer of two synsets

def LCS(syn1,syn2):
    path1 = syn1.hypernym_paths()
    path2 = syn2.hypernym_paths()
    result = []

    # We loop through the paths of the first synset
    for pat1 in path1:
        x = pat1
        x.reverse()
        found = False
        for syn in x:
            if(not found):
                # We loop through the paths of the second synset
                for pat2 in path2:
                    if(syn in pat2):
                        result.append(syn)
                        found = True
                        break
            else:
                break
    if not result:
            return None
    else:
         max_depth = max(max_depth_synset(s) for s in result)
         max_hypernym = max(s for s in result if max_depth_synset(s) == max_depth)
         return max_hypernym

In [5]:
def path_len(syn1,syn2):
    path1 = syn1.hypernym_paths()
    path2 = syn2.hypernym_paths()
    result_syn1 = []
    result_syn2 = []

    common_ancestor = LCS(syn1,syn2)

    if(common_ancestor is None):
        return None
    else:
        for pat1 in path1:
            if(common_ancestor in pat1):
                result_syn1.append(pat1.index(syn1)-pat1.index(common_ancestor))
        for pat2 in path2:
            if(common_ancestor in pat2):
                result_syn2.append(pat2.index(syn2)-pat2.index(common_ancestor))
        return min(result_syn1) + min(result_syn2)
  

In [6]:
depth_max = max(max(len(hyp_path) for hyp_path in ss.hypernym_paths()) for ss in wn.all_synsets())

## Wu Palmer

$$ LCS(s1, s2) = {2 * depth(LCS(s1,s2)) \over depth(s1) + depth(s2)} $$

In [100]:
def wu_palmer(word1,word2):
    # We get all the synsets of the two words
    synw1 = wn.synsets(word1)
    synw2 = wn.synsets(word2)
    result = []
    print(synw1)
    print(synw2)
    for syn1 in synw1:
        depht1 = max_depth_synset(syn1)
        for syn2 in synw2:
            depht2 = max_depth_synset(syn2)
            # Finding the least common subsumer of the two synsets
            common_ancestor = LCS(syn1,syn2)
            if(common_ancestor is None):
                break
            result.append( (2 * max_depth_synset(common_ancestor)) / ( depht1 + depht2 ) )
            
    return max(result)
    

In [126]:
# Test
# print(wn.synsets('stock'))
# print(wn.synsets('live'))
# print(wn.synset('live.v.01').wup_similarity(wn.synset('stock.v.01')))
print(wn.synset('live.v.01').hypernym_paths())
print(wn.synset('stock.v.02').hypernym_paths())
print(wn.synset("canine.n.02").hypernyms())

#print("WU Palmer between dog and cat:", wu_palmer('stock','live'))

[[Synset('be.v.03'), Synset('populate.v.01')]]
[[Synset('transfer.v.05'), Synset('give.v.03'), Synset('supply.v.01'), Synset('equip.v.01'), Synset('stock.v.02')]]
[Synset('carnivore.n.01')]


## Shortest Path

$$ LCS(s1, s2) = 2 * depthMax - len(s1,s2) $$

In [97]:
def shortest_path(word1,word2):
    synw1 = wn.synsets(word1)
    synw2 = wn.synsets(word2)
    result = []

    for syn1 in synw1:
        for syn2 in synw2:
            pat = path_len(syn1,syn2)
            if pat is None:
                break
            elif (pat == 0):
                result.append(2 * depth_max)
            elif (pat == depth_max):
                result.append(0)
            else:
                result.append(2 * depth_max - pat)
    return max(result)

In [57]:
# Test

print("Shortest Path between dog and cat:", shortest_path('Jerusalem','Palestinian'))

Shortest Path between dog and cat: 24


## Leakcock & Chodorow

$$ sim_LC(s1, s2) = -log(\frac{len(s1,s2)}{2*depthMax}) $$

In [45]:
def leakcock_chodorow (word1,word2):
    synw1 = wn.synsets(word1)
    synw2 = wn.synsets(word2)
    result = []

    for syn1 in synw1:
        for syn2 in synw2:
            pat = path_len(syn1,syn2)
            if pat is None:
                break
            elif (pat == 0):
                result.append( -math.log( (1)/(2 * depth_max + 1) ))
            else:
                result.append(-math.log( (pat)/(2 * depth_max) ))
    return max(result)

In [46]:
# Test

print("Shortest Path between dog and cat:", leakcock_chodorow('dog','cat'))
print("lch_similarity", wn.synset('dog.n.01').lch_similarity(wn.synset('cat.n.01')))

Shortest Path between dog and cat: 2.3025850929940455
lch_similarity 2.0281482472922856


## Loop over all pair of words in the dataset and i print the three similarity measures

In [127]:
# Loop through the dataset and calculate the similarity for each pair of words
for index, row in df.iterrows():
    print("--------------------------------------")
    print("Word 1:", row['Word 1'])
    print("Word 2:", row['Word 2'])
    print("Human Similarity:", row['Human (mean)'])
    print("WU Palmer:", wu_palmer(row['Word 1'], row['Word 2']))
    print("Shortest Path:", shortest_path(row['Word 1'], row['Word 2']))
    print("Leacock Chodorow:", leakcock_chodorow(row['Word 1'], row['Word 2']))

--------------------------------------
Word 1: love
Word 2: sex
Human Similarity: 6.77
[Synset('love.n.01'), Synset('love.n.02'), Synset('beloved.n.01'), Synset('love.n.04'), Synset('love.n.05'), Synset('sexual_love.n.02'), Synset('love.v.01'), Synset('love.v.02'), Synset('love.v.03'), Synset('sleep_together.v.01')]
[Synset('sexual_activity.n.01'), Synset('sex.n.02'), Synset('sex.n.03'), Synset('sex.n.04'), Synset('arouse.v.07'), Synset('sex.v.02')]
WU Palmer: 0.9230769230769231
Shortest Path: 39
Leacock Chodorow: 3.6888794541139363
--------------------------------------
Word 1: tiger
Word 2: cat
Human Similarity: 7.35
[Synset('tiger.n.01'), Synset('tiger.n.02')]
[Synset('cat.n.01'), Synset('guy.n.01'), Synset('cat.n.03'), Synset('kat.n.01'), Synset('cat-o'-nine-tails.n.01'), Synset('caterpillar.n.02'), Synset('big_cat.n.01'), Synset('computerized_tomography.n.01'), Synset('cat.v.01'), Synset('vomit.v.01')]
WU Palmer: 0.9655172413793104
Shortest Path: 39
Leacock Chodorow: 3.68887945411

ValueError: max() arg is an empty sequence