# Exercise 1

## Necessary imports and dataset loading

In [4]:
from nltk.corpus import wordnet as wn
import nltk
from nltk.corpus.reader.wordnet import Synset
nltk.download('wordnet')
import pandas as pd
import math

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lores\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Load the word-pair dataset (columns 'Word 1', 'Word 2', 'Human (mean)').
# A forward slash is used as separator: '\c' is not a valid string escape
# sequence, and a backslash separator would only resolve on Windows.
df = pd.read_csv('dataset/combined.csv')
display(df)

Unnamed: 0,Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.00
3,book,paper,7.46
4,computer,keyboard,7.62
...,...,...,...
348,shower,flood,6.03
349,weather,forecast,8.34
350,disaster,area,6.25
351,governor,office,6.34


## Support functions

In [6]:
# Defining the function to get the max depth of the synset

def max_depth_synset(synset: Synset) -> int:
    """Return the length of the longest hypernym path of ``synset``.

    The length is the number of entries in a path returned by
    ``synset.hypernym_paths()``; the maximum over all such paths is returned.
    Raises ``ValueError`` if ``hypernym_paths()`` yields no path at all.
    """
    return max(map(len, synset.hypernym_paths()))

In [7]:
# Defining the function to get the least common subsumer of two synsets

def LCS(syn1: Synset, syn2: Synset) -> Synset | None:
    """Return the least common subsumer of two synsets, or ``None``.

    Each hypernym path of ``syn1`` is walked backwards (from its last entry
    to its first); the first synset that also appears on any hypernym path
    of ``syn2`` becomes a candidate for that path. Among all candidates the
    one with the largest ``max_depth_synset`` value is returned. If several
    candidates share that depth, ``max`` over the synsets themselves picks
    the winner.

    Returns ``None`` when no path of ``syn1`` shares a synset with ``syn2``.
    """
    # Gather every synset on any hypernym path of syn2 once, so each
    # membership test below is a set lookup instead of a rescan of all paths.
    ancestors2 = {syn for path in syn2.hypernym_paths() for syn in path}

    candidates = []
    for path in syn1.hypernym_paths():
        # reversed() walks the path backwards without mutating the list.
        for syn in reversed(path):
            if syn in ancestors2:
                candidates.append(syn)
                break  # only the first shared synset of this path counts

    if not candidates:
        return None

    # Compute each candidate's depth once and keep the deepest one(s).
    depths = [max_depth_synset(s) for s in candidates]
    deepest = max(depths)
    return max(s for s, d in zip(candidates, depths) if d == deepest)

In [8]:
def path_len(syn1: Synset, syn2: Synset) -> int | None:
    """Return the length of the path between two synsets through their LCS.

    The length is the sum, over both synsets, of the smallest index distance
    between the synset and the least common subsumer on any of the synset's
    hypernym paths that contains that subsumer.

    Returns ``None`` when ``LCS`` finds no common ancestor.
    """
    common_ancestor = LCS(syn1, syn2)
    if common_ancestor is None:
        return None

    def steps_to_ancestor(synset: Synset) -> int:
        # Shortest distance from `synset` to the common ancestor, looking
        # only at the hypernym paths that actually contain the ancestor.
        return min(
            path.index(synset) - path.index(common_ancestor)
            for path in synset.hypernym_paths()
            if common_ancestor in path
        )

    return steps_to_ancestor(syn1) + steps_to_ancestor(syn2)
  

In [9]:
# Longest hypernym path (in number of entries) over every synset in WordNet;
# used as the taxonomy depth by the path-based similarity measures.
depth_max = max(
    len(hyp_path)
    for ss in wn.all_synsets()
    for hyp_path in ss.hypernym_paths()
)

## Wu Palmer

$$ sim_{WP}(s_1, s_2) = \frac{2 \cdot depth(LCS(s_1, s_2))}{depth(s_1) + depth(s_2)} $$

In [10]:
def wu_palmer(syn1: Synset, syn2: Synset) -> float:
    """Wu & Palmer similarity between two synsets.

    Computes ``2 * depth(LCS) / (depth(syn1) + depth(syn2))`` where every
    depth is obtained with ``max_depth_synset``. Returns ``0`` when the two
    synsets have no least common subsumer.
    """
    # Combined depth of the two synsets (denominator of the formula).
    depth_sum = max_depth_synset(syn1) + max_depth_synset(syn2)

    # Deepest shared ancestor; None means the synsets are unrelated.
    subsumer = LCS(syn1, syn2)
    if subsumer is not None:
        return 2 * max_depth_synset(subsumer) / depth_sum
    return 0

## Shortest Path

$$ sim_{SP}(s_1, s_2) = 2 \cdot depthMax - len(s_1, s_2) $$

In [11]:
def shortest_path(syn1: Synset, syn2: Synset) -> int:
    """Shortest-path similarity: ``2 * depth_max - len(syn1, syn2)``.

    ``len`` is the value returned by ``path_len``. Returns ``0`` when
    ``path_len`` reports no common ancestor (``None``).

    The formula is applied uniformly to every path length: a length of 0
    yields ``2 * depth_max`` and a length equal to ``depth_max`` yields
    ``depth_max``. No length is special-cased to 0, since that would break
    the linear relation between path length and similarity.
    """
    pat = path_len(syn1, syn2)
    if pat is None:
        return 0
    return 2 * depth_max - pat

## Leacock & Chodorow

$$ sim_{LC}(s_1, s_2) = -\log\left(\frac{len(s_1, s_2)}{2 \cdot depthMax}\right) $$

In [12]:
def leakcock_chodorow(syn1: Synset, syn2: Synset) -> float:
    """Leacock & Chodorow similarity: ``-log(len / (2 * depth_max))``.

    ``len`` is the value returned by ``path_len``. Returns ``0`` when there
    is no common ancestor. A path length of 0 cannot be fed to the logarithm,
    so in that case the ratio ``1 / (2 * depth_max + 1)`` is used instead.
    """
    distance = path_len(syn1, syn2)
    if distance is None:
        return 0

    if distance == 0:
        # Substitute ratio that keeps the logarithm's argument positive.
        ratio = 1 / (2 * depth_max + 1)
    else:
        ratio = distance / (2 * depth_max)
    return -math.log(ratio)

## Cycle for every synset of the words

In [25]:
def synsets_cycle(word1: str, word2: str, metric: callable) -> float | str:
    """Return the best ``metric`` score over all synset pairs of two words.

    ``metric`` is called on every combination of a synset of ``word1`` with a
    synset of ``word2`` (as returned by ``wn.synsets``) and the maximum score
    is returned. When there is no pair to score, i.e. at least one of the
    words has no synsets, a fixed explanatory string is returned instead.
    """
    scores = [
        metric(first, second)
        for first in wn.synsets(word1)
        for second in wn.synsets(word2)
    ]
    if scores:
        return max(scores)
    return "No common ancestor founded. The two words are not related."

In [26]:
# Quick sanity check of the three measures on a few hand-picked word pairs.
demo_cases = [
    ("WU Palmer between dog and cat:", 'dog', 'cat', wu_palmer),
    ("WU Palmer between stock and live:", 'stock', 'live', wu_palmer),
    ("Shortest Path between Jerusalem and Palestinian:", 'Jerusalem', 'Palestinian', shortest_path),
    ("Leakcock Chodorow between dog and cat:", 'dog', 'cat', leakcock_chodorow),
]
for label, first_word, second_word, measure in demo_cases:
    print(label, synsets_cycle(first_word, second_word, measure))

WU Palmer between dog and cat: 0.8571428571428571
WU Palmer between stock and live: 0.2857142857142857
Shortest Path between Jerusalem and Palestinian: 24
Leakcock Chodorow between dog and cat: 2.3025850929940455


## Loop over all pairs of words in the dataset and print the three similarity measures

In [27]:
# For every word pair in the dataset, print the human rating followed by the
# three WordNet-based similarity scores (best score over all synset pairs).
measures = {
    "WU Palmer:": wu_palmer,
    "Shortest Path:": shortest_path,
    "Leacock Chodorow:": leakcock_chodorow,
}
for _, row in df.iterrows():
    first_word, second_word = row['Word 1'], row['Word 2']
    print("--------------------------------------")
    print("Word 1:", first_word)
    print("Word 2:", second_word)
    print("Human Similarity:", row['Human (mean)'])
    for label, measure in measures.items():
        print(label, synsets_cycle(first_word, second_word, measure))

--------------------------------------
Word 1: love
Word 2: sex
Human Similarity: 6.77
WU Palmer: 0.9230769230769231
Shortest Path: 39
Leacock Chodorow: 3.6888794541139363
--------------------------------------
Word 1: tiger
Word 2: cat
Human Similarity: 7.35
WU Palmer: 0.9655172413793104
Shortest Path: 39
Leacock Chodorow: 3.6888794541139363
--------------------------------------
Word 1: tiger
Word 2: tiger
Human Similarity: 10.0
WU Palmer: 1.0
Shortest Path: 40
Leacock Chodorow: 3.713572066704308
--------------------------------------
Word 1: book
Word 2: paper
Human Similarity: 7.46
WU Palmer: 0.875
Shortest Path: 38
Leacock Chodorow: 2.995732273553991
--------------------------------------
Word 1: computer
Word 2: keyboard
Human Similarity: 7.62
WU Palmer: 0.8235294117647058
Shortest Path: 37
Leacock Chodorow: 2.5902671654458267
--------------------------------------
Word 1: computer
Word 2: internet
Human Similarity: 7.58
WU Palmer: 0.631578947368421
Shortest Path: 33
Leacock Chod