In [1]:
import nltk
from nltk.corpus import wordnet as wn
import fasttext
from scipy.spatial.distance import cosine 
import numpy as np
from scipy import stats
# Load the pretrained fastText model from the local binary file 'cc.en.300.bin'.
ft = fasttext.load_model('cc.en.300.bin')
from scipy.stats import spearmanr

In [2]:
# Load the SimLex word pairs and their human similarity ratings.
# Produces:
#   simlex    -- list of [word1, word2, pos, score] records, one per data row
#   sl_scores -- float array holding the scores in the same row order
path = 'SimLex-999.txt'
simlex = []
with open(path, 'r') as file:
    next(file)  # skip the header row
    for line in file:
        cols = line.strip().split('\t')
        if cols == ['']:
            continue  # blank line (e.g. trailing newline at end of file)
        w1 = cols[0]
        w2 = cols[1]
        pos = cols[2].lower()  # lower-cased because it is later passed to wn.synsets as the POS tag
        # Scale the rating down by 10; rank correlations (Kendall/Spearman) are
        # unaffected by this monotonic rescaling.
        sl_score = float(cols[3]) / 10
        simlex.append([w1, w2, pos, sl_score])

# Size the array from the rows actually read instead of hard-coding 999, so a
# shorter file cannot leave silent zero padding and a longer one cannot overflow.
sl_scores = np.array([row[3] for row in simlex], dtype=float)  # sl = simlex

In [3]:
# For every SimLex pair, compute two automatic similarity estimates:
#   * WordNet: the highest path similarity over all synset combinations of the
#     two words (restricted to the pair's part of speech);
#   * fastText: cosine similarity of the two words' vectors.
# Results are stored in wn_scores / ft_scores (same row order as simlex) and
# printed as a table next to the SimLex score.
n_pairs = len(simlex)  # derive the size from the loaded data rather than hard-coding it
wn_scores = np.zeros(n_pairs, dtype=float)  # wn = wordnet
ft_scores = np.zeros(n_pairs, dtype=float)  # ft = fasttext

print("{:<15} {:<15} {:<10} {:<10} {:<10}".format('word1', 'word2', 'simlex', 'wordnet', 'fasttext'))

for i, pair in enumerate(simlex):
    word0, word1, pos = pair[0], pair[1], pair[2]
    # Pass the POS tag as a plain string (the original wrapped it in a set literal).
    wn0_s = wn.synsets(word0, pos=pos)  # synsets of the first word
    wn1_s = wn.synsets(word1, pos=pos)  # synsets of the second word
    # the highest similarity among synsets combinations; stays 0 if either word
    # has no synsets for this POS
    wn_score = 0.0
    for s0 in wn0_s:
        for s1 in wn1_s:
            wn_cur = wn.path_similarity(s0, s1)
            # path_similarity may return None when no connecting path exists;
            # comparing None with a number would raise TypeError.
            if wn_cur is not None and wn_cur > wn_score:
                wn_score = wn_cur
    wn_scores[i] = wn_score
    # cosine similarity of vectors (scipy's cosine() is a distance, hence 1 - ...)
    v0 = ft.get_sentence_vector(word0)
    v1 = ft.get_sentence_vector(word1)
    ft_score = 1 - cosine(v0, v1)
    ft_scores[i] = ft_score
    print("{:<15} {:<15} {:<10.3f} {:<10.3f} {:<10.3f}".format(word0, word1, pair[3], wn_score, ft_score))

word1           word2           simlex     wordnet    fasttext  
old             new             0.158      0.333      0.442     
smart           intelligent     0.920      0.333      0.705     
hard            difficult       0.877      1.000      0.631     
happy           cheerful        0.955      0.333      0.546     
hard            easy            0.095      0.333      0.486     
fast            rapid           0.875      0.333      0.526     
happy           glad            0.917      1.000      0.674     
short           long            0.123      0.333      0.662     
stupid          dumb            0.958      0.333      0.871     
weird           strange         0.893      0.333      0.851     
wide            narrow          0.103      0.333      0.535     
bad             awful           0.842      0.333      0.669     
easy            difficult       0.058      0.333      0.617     
bad             terrible        0.778      0.333      0.732     
hard            simple   

In [4]:
# Compare each automatic measure against the SimLex ratings using two rank
# correlations (Kendall's tau and Spearman's rho), then report the variance of
# every score array. All statistics are computed first and printed afterwards.
tau_wn, p_wn = stats.kendalltau(sl_scores, wn_scores)
tau_ft, p_ft = stats.kendalltau(sl_scores, ft_scores)
sp_wn, _ = spearmanr(sl_scores, wn_scores)  # p-value intentionally discarded
sp_ft, _ = spearmanr(sl_scores, ft_scores)  # p-value intentionally discarded

# Kendall's tau with its p-value
print(
    f'wordnet tau {tau_wn} p-value {p_wn}',
    f'fasttext tau {tau_ft} p-value {p_ft}',
    sep='\n',
)

# Spearman's rho
print(
    f'wordnet spearman: {sp_wn}',
    f'fasttext spearman: {sp_ft}',
    sep='\n',
)

# Variance of each score array
print(
    f'variance sl {np.var(sl_scores)}',
    f'variance wn {np.var(wn_scores)}',
    f'variance ft {np.var(ft_scores)}',
    sep='\n',
)

wordnet tau 0.35344887126870356 p-value 7.744308980342708e-55
fasttext tau 0.3301400933912036 p-value 7.744002627565699e-55
wordnet spearman: 0.4756537349907476
fasttext spearman: 0.4644247750821859
variance sl 0.06829621454086718
variance wn 0.04435063589902128
variance ft 0.027196266781775794
