In [86]:
import csv
import math
import nltk
from nltk.corpus import wordnet
from enum import Enum
from collections import deque

### Wordnet functions

In [87]:
# get all hypernym paths of a sense by recursing on the hypernyms of its ancestors
def hypernym_paths(sense):
    """Return every hypernym path of ``sense``, ordered from the sense upwards.

    Each path is a list whose first element is ``sense`` itself and whose
    last element is a sense without hypernyms, so ``path.index(x)`` is the
    number of hypernym links between ``sense`` and ``x`` along that path.

    Args:
        sense: object exposing ``hypernyms()``, which returns its direct
            hypernyms.

    Returns:
        A list of paths; ``[[sense]]`` when ``sense`` has no hypernyms.
    """
    hypernyms = sense.hypernyms()
    if not hypernyms:
        return [[sense]]

    # Prepend the sense to each ancestor path. The previous append + reverse
    # ran at every recursion level and scrambled paths longer than two nodes.
    return [
        [sense] + ancestor_path
        for hypernym in hypernyms
        for ancestor_path in hypernym_paths(hypernym)
    ]


# depth of a sense = length of its longest hypernym path
def depth(sense):
    """Return the length of the longest path in ``sense.hypernym_paths()``.

    A falsy ``sense`` (for example ``None``) has depth 0.
    """
    if sense:
        return max(map(len, sense.hypernym_paths()))

    return 0


def lowest_common_sense(path1, path2):
    """Pick the deepest sense shared by two hypernym paths.

    Returns:
        A ``(sense, depth)`` tuple for a shared sense of maximal ``depth``,
        or ``None`` when the two paths have no sense in common. When several
        shared senses reach the same maximal depth, the first one in the
        iteration order of the set intersection is returned.
    """
    shared = list(set(path1) & set(path2))
    if not shared:
        return None

    # depth of every shared sense, in the same order as ``shared``
    depths = [depth(candidate) for candidate in shared]
    deepest = max(depths)
    return shared[depths.index(deepest)], deepest


# lowest common subsumer: the deepest hypernym shared by the two senses
def lowest_common_subsumer(sense1, sense2):
    """Return the deepest sense common to the hypernym paths of both senses.

    Every hypernym path of ``sense1`` is compared with every hypernym path
    of ``sense2``; the deepest shared sense seen over all pairs wins.

    Returns:
        The selected common sense, or ``None`` when no pair of paths shares
        a sense.
    """
    paths_a = hypernym_paths(sense1)
    paths_b = hypernym_paths(sense2)

    best_sense = None
    best_depth = None

    for path_a in paths_a:
        for path_b in paths_b:
            found = lowest_common_sense(path_a, path_b)
            if not found:
                continue

            candidate, candidate_depth = found
            # keep the current choice unless the candidate is strictly deeper
            if best_sense and candidate_depth <= best_depth:
                continue
            best_sense, best_depth = candidate, candidate_depth

    return best_sense

# Length of the longest hypernym path over all synsets in WordNet; the
# path-based metrics below use it as their normalising constant.
# NOTE(review): the previous comment reported 40 for the WordNet version in use - not verified here.
max_depth = max(
    len(path)
    for sense in wordnet.all_synsets()
    for path in hypernym_paths(sense)
)


def compute_shortest_path(sense1, sense2):
    """Return the smallest path length between two senses via a shared hypernym.

    For every pair of hypernym paths (one per sense) and every sense the two
    paths share, the candidate length is the sum of that sense's positions in
    both paths; the minimum over all candidates is returned.
    NOTE(review): this is a link count only if each path lists its own sense
    first - confirm against ``hypernym_paths``.

    Returns:
        0 for equal senses, the minimal summed position otherwise, or
        ``2 * max_depth`` when no pair of paths shares a sense.
    """
    if sense1 == sense2:
        return 0

    paths_a = hypernym_paths(sense1)
    paths_b = hypernym_paths(sense2)

    best = float("inf")
    for path_a in paths_a:
        for path_b in paths_b:
            for shared in set(path_a) & set(path_b):
                best = min(path_a.index(shared) + path_b.index(shared), best)

    if math.isinf(best):
        # the senses share no ancestor: fall back to the largest distance
        return 2 * max_depth
    return best

### Similarity Metrics

In [88]:
# word similarity = best score over every pair of senses of the two words
def similarity(word1, word2, metric):
    """Return the highest ``metric`` score over all sense pairs of two words.

    Args:
        word1: first word, looked up with ``wordnet.synsets``.
        word2: second word, looked up with ``wordnet.synsets``.
        metric: callable taking two senses and returning a score.

    Returns:
        The maximum score found, or 0 when no sense pair scores above 0
        (including when either word has no senses).
    """
    best = 0

    for first_sense in wordnet.synsets(word1):
        for second_sense in wordnet.synsets(word2):
            score = metric(first_sense, second_sense)
            # the new score replaces the current best unless the best is strictly greater
            if not best > score:
                best = score

    return best

# nominal interval: (0, 1]; evaluates to 0 when no common subsumer is found
def wu_palmer(sense1, sense2):
    """Wu & Palmer score: twice the subsumer's depth over the summed depths of the senses."""
    numerator = 2 * depth(lowest_common_subsumer(sense1, sense2))
    denominator = depth(sense1) + depth(sense2)
    return numerator / denominator


# nominal interval: (0, 2 * max_depth]; evaluates to 0 when the senses share no hypernym
def shortest_path(sense1, sense2):
    """Shortest-path score: ``2 * max_depth`` minus the path length between the senses."""
    distance = compute_shortest_path(sense1, sense2)
    return 2 * max_depth - distance


# nominal interval: (0, log(2 * max_depth + 1)]
def leakcock_chodorow(sense1, sense2):
    """Leacock & Chodorow score: negative log of the scaled path length."""
    shifted_distance = compute_shortest_path(sense1, sense2) + 1  # +1 avoids log(0)
    scale = 2 * max_depth + 1
    return -math.log(shifted_distance / scale)


class SimilarityMetrics(Enum):
    """Namespace exposing the available sense-similarity functions."""

    # NOTE(review): plain functions assigned in an Enum body are descriptors,
    # so Enum keeps them as methods instead of turning them into members.
    # Attribute access therefore yields the function itself, which is why
    # callers can invoke it directly and read its ``__name__``.
    wu_palmer = wu_palmer
    shortest_path = shortest_path
    leakcock_chodorow = leakcock_chodorow

### Similarity Computation

In [89]:
# Score every word pair of the WordSim353 dataset with the selected metric and
# print it next to the human judgement stored in the third CSV column.
metric = SimilarityMetrics.wu_palmer
print('-----------------------------------------')
print(f'SIMILARITY METRIC: {metric.__name__}')
print('-----------------------------------------\n')

# newline='' leaves newline handling to the csv module, as its documentation requires
with open('data/WordSim353.csv', newline='') as f:
    csv_reader = csv.reader(f, delimiter=',')
    next(csv_reader, None)  # skip the header row (no error on an empty file)

    for row in csv_reader:
        if not row:
            # blank lines parse to an empty list; indexing them would raise
            continue

        sim = similarity(row[0], row[1], metric)
        print(f'Words: \t{row[0].upper()} - {row[1].upper()}'.expandtabs(20))
        print(f'Metric similarity: \t{sim}'.expandtabs(20))
        print(f'Human similarity: \t{row[2]}\n'.expandtabs(20))

-----------------------------------------
SIMILARITY METRIC: wu_palmer
-----------------------------------------

Words:              LOVE - SEX
Metric similarity:  0.9230769230769231
Human similarity:   6.77

Words:              TIGER - CAT
Metric similarity:  0.9655172413793104
Human similarity:   7.35

Words:              TIGER - TIGER
Metric similarity:  1.0
Human similarity:   10.00

Words:              BOOK - PAPER
Metric similarity:  0.875
Human similarity:   7.46

Words:              COMPUTER - KEYBOARD
Metric similarity:  0.8235294117647058
Human similarity:   7.62

Words:              COMPUTER - INTERNET
Metric similarity:  0.631578947368421
Human similarity:   7.58

Words:              PLANE - CAR
Metric similarity:  0.7272727272727273
Human similarity:   5.77

Words:              TRAIN - CAR
Metric similarity:  0.7368421052631579
Human similarity:   6.31

Words:              TELEPHONE - COMMUNICATION
Metric similarity:  0.16666666666666666
Human similarity:   7.50

Words:  