# Evaluation of Embeddings on a Word-Similarity-Ranking Application

The wordsim353 data we are currently using can be downloaded from here: https://drive.google.com/file/d/1RXJ1fxzk82srGqQQWI8xFavWLBv5T8a6/view?usp=sharing

In [69]:
import os
import pandas as pd
import random
import numpy as np
from tabulate import tabulate
from gensim.models import KeyedVectors
from tqdm import tqdm
from scipy import stats

## Params
*wordsim_file*: path to tsv file that contains wordsim data  
*embedding_files*: dictionary of {(str) name : (str) file path of .kv embedding file}

In [168]:
wordsim_file = "./data/wordsim353/wordsim353crowd_results_v2_similarity.tsv"
# wordsim_file = "./data/similarity/ISI Examples for similarity algorithms - qnode_similarity_scores.csv"

embedding_files = {"H_3x6" : "./output/wikidata-20210215-dwd/H_walks_analysis/h_embeddings_3x6,min_count=8.kv",
                   "H_5x8" : "./output/wikidata-20210215-dwd/H_walks_analysis/h_embeddings_5x8,min_count=21.kv",
                   "A" : "./output/wikidata-20210215-dwd/A_walks_analysis/a_embeddings_10x10,min_count=0.kv",
                   "S" : "./output/wikidata-20210215-dwd/S_walks_analysis/s_embeddings_5x10,min_count=0.kv"
                  }

param setup work

In [169]:
# ensure paths are absolute
wordsim_file = os.path.abspath(wordsim_file)

for name, file_path in embedding_files.items():
    embedding_files[name] = os.path.abspath(file_path)

# Set up output directories if not created yet
output_dir = os.path.abspath("./output/embeddings_wordsim")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

## Loading embeddings and wordsim data

In [138]:
embedding_models = {}
for name, file_path in embedding_files.items():
    print("now loading {} embeddings".format(name))
    embedding_models[name] = KeyedVectors.load(file_path)

now loading H_3x6 embeddings
now loading H_5x8 embeddings
now loading A embeddings
now loading S embeddings


In [170]:
df = pd.read_csv(wordsim_file, sep='\t')
# df = pd.read_csv(wordsim_file)
display(df.head())

Unnamed: 0,Word 1,word1_kg_id,word1_kg_label,word1_kg_description,Word 2,word2_kg_id,word2_kg_label,word2_kg_description,Human (Mean),complex,transe,text,class
0,admission,Q847337,confession,statement made by a person or a group of perso...,ticket,Q551800,ticket,paper or cardboard document showing payment fo...,5.536,0.361115,0.446632,0.485653,
1,alcohol,Q154,alcoholic beverage,"drink containing alcohols, typically ethanol",chemistry,Q2329,chemistry,branch of physical science concerned with the ...,4.125,0.302163,0.274388,0.479313,0.005851
2,aluminum,Q663,aluminium,metallic chemical element of silvery appearanc...,metal,Q11426,metal,"element, compound or alloy that is a good cond...",6.625,0.694462,0.429289,0.626784,0.393657
3,announcement,Q567303,announcement,"printed, spoken, or published statement or not...",effort,Q14536140,exertion,use of physical or perceived energy by a person,2.0625,0.324825,0.379077,0.618644,0.208812
4,announcement,Q567303,announcement,"printed, spoken, or published statement or not...",news,Q38926,news,communication of selected information on curre...,7.1875,0.361214,0.360383,0.863846,0.888621


## Preliminary analysis
Let's see how many of these entities are represented in our embeddings

In [139]:
headers = ["", "# rows with 2 words", "# words"] + ["in {}".format(name) for name in embedding_models]
rows=[]

word1_non_empty = np.array([word != "" for word in df.word1_kg_id])
word2_non_empty = np.array([word != "" for word in df.word2_kg_id])
row = ["Total",
       sum(word1_non_empty & word2_non_empty),
       sum(word1_non_empty) + sum(word2_non_empty)
      ]
rows.append(row)

for name, model in embedding_models.items():
    word1_has_embedding = np.array([word in model for word in df.word1_kg_id])
    word2_has_embedding = np.array([word in model for word in df.word2_kg_id])
    row = [name,
           sum(word1_has_embedding & word2_has_embedding),
           sum(word1_has_embedding) + sum(word2_has_embedding)
          ]
    rows.append(row)
    
print(tabulate(rows, headers=headers))

for name, model in embedding_models.items():
    print("\nEntities missed by {}:".format(name))
    for i in range(len(df)):
        if df.word1_kg_id[i] not in model:
            print("{} - {} ({})".format(df.word1_kg_id[i], df['Word 1'][i], df.word1_kg_label[i]))
        if df.word2_kg_id[i] not in model:
            print("{} - {} ({})".format(df.word2_kg_id[i], df['Word 2'][i], df.word2_kg_label[i]))

         # rows with 2 words    # words
-----  ---------------------  ---------
Total                    349        698
H_3x6                    344        693
H_5x8                    342        691
A                         19         87
S                        239        572

Entities missed by H_3x6:
Q181201 - dividend (dividend)
Q3149193 - impartiality (impartiality)
Q2357965 - quarrel (Quarrel)
Q10956426 - match (match)
Q11812495 - viewer (viewer)

Entities missed by H_5x8:
Q181201 - dividend (dividend)
Q32341 - fuck (fuck)
Q3149193 - impartiality (impartiality)
Q2357965 - quarrel (Quarrel)
Q1746570 - hardware (Hardware)
Q10956426 - match (match)
Q11812495 - viewer (viewer)

Entities missed by A:
Q847337 - admission (confession)
Q551800 - ticket (ticket)
Q154 - alcohol (alcoholic beverage)
Q2329 - chemistry (chemistry)
Q11426 - metal (metal)
Q567303 - announcement (announcement)
Q14536140 - effort (exertion)
Q567303 - announcement (announcement)
Q38926 - news (news)
Q567303 - an

In [147]:
headers = ["", "# rows with 2 words", "# words"] + ["in {}".format(name) for name in embedding_models]
rows=[]

word1_non_empty = np.array([word != "" for word in df.g_qnode])
word2_non_empty = np.array([word != "" for word in df.p_qnode])
row = ["Total",
       sum(word1_non_empty & word2_non_empty),
       sum(word1_non_empty) + sum(word2_non_empty)
      ]
rows.append(row)

for name, model in embedding_models.items():
    word1_has_embedding = np.array([word in model for word in df.g_qnode])
    word2_has_embedding = np.array([word in model for word in df.p_qnode])
    row = [name,
           sum(word1_has_embedding & word2_has_embedding),
           sum(word1_has_embedding) + sum(word2_has_embedding)
          ]
    rows.append(row)
    
print(tabulate(rows, headers=headers))

for name, model in embedding_models.items():
    print("\nEntities missed by {}:".format(name))
    for i in range(len(df)):
        if df.g_qnode[i] not in model:
            print("{} - {}".format(df.g_qnode[i], df.q1_label[i]))
        if df.p_qnode[i] not in model:
            print("{} - {}".format(df.p_qnode[i], df.q2_label[i]))

         # rows with 2 words    # words
-----  ---------------------  ---------
Total                    118        236
H_3x6                    118        236
H_5x8                    118        236
A                          0          0
S                         85        199

Entities missed by H_3x6:

Entities missed by H_5x8:

Entities missed by A:
Q17221 - spokesperson
Q189290 - military officer
Q702269 - professional
Q2285706 - head of government
Q2285706 - head of government
Q82955 - politician
Q107711 - firefighter
Q189290 - military officer
Q702269 - professional
Q48352 - head of state
Q48352 - head of state
Q82955 - politician
Q2285706 - head of government
Q83307 - minister
Q48352 - head of state
Q83307 - minister
Q1251441 - leader
Q83307 - minister
Q9352089 - spy
Q189290 - military officer
Q201948 - sniper
Q1930187 - journalist
Q1414937 - combatant
Q1930187 - journalist
Q82955 - politician
Q11499147 - political activist
Q245065 - intergovernmental organization
Q327333 - go

## add word similarities according to embeddings

In [171]:
for name, model in embedding_models.items():
    sims = []
    for i in range(len(df)):
        ent1 = df.word1_kg_id[i]
        ent2 = df.word2_kg_id[i]
        if ent1 not in model or ent2 not in model:
            sims.append(np.nan)
            continue
        sims.append(model.similarity(ent1,ent2))
    df[name] = sims

In [172]:
df.to_csv(path_or_buf = "{}/wordsim_with_embedding_scores_out.csv".format(output_dir), index = False)

In [173]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

Unnamed: 0,Word 1,word1_kg_id,word1_kg_label,word1_kg_description,Word 2,word2_kg_id,word2_kg_label,word2_kg_description,Human (Mean),complex,transe,text,class,H_3x6,H_5x8,A,S
0,admission,Q847337,confession,statement made by a person or a group of perso...,ticket,Q551800,ticket,paper or cardboard document showing payment fo...,5.536,0.361115,0.446632,0.485653,,0.461174,0.183901,,0.589556
1,alcohol,Q154,alcoholic beverage,"drink containing alcohols, typically ethanol",chemistry,Q2329,chemistry,branch of physical science concerned with the ...,4.125,0.302163,0.274388,0.479313,0.005851,0.436731,0.212472,,-0.011093
2,aluminum,Q663,aluminium,metallic chemical element of silvery appearanc...,metal,Q11426,metal,"element, compound or alloy that is a good cond...",6.625,0.694462,0.429289,0.626784,0.393657,0.752512,0.727398,,0.154339
3,announcement,Q567303,announcement,"printed, spoken, or published statement or not...",effort,Q14536140,exertion,use of physical or perceived energy by a person,2.0625,0.324825,0.379077,0.618644,0.208812,0.406013,0.440686,,
4,announcement,Q567303,announcement,"printed, spoken, or published statement or not...",news,Q38926,news,communication of selected information on curre...,7.1875,0.361214,0.360383,0.863846,0.888621,0.667217,0.493082,,0.02433
5,announcement,Q567303,announcement,"printed, spoken, or published statement or not...",production,Q739302,production,act of creating goods or services,1.6875,0.459382,0.456597,0.652602,0.184294,0.682299,0.465328,,0.139569
6,announcement,Q567303,announcement,"printed, spoken, or published statement or not...",warning,Q1759104,warning,signal used to warn of danger,4.625,0.306571,0.553333,0.671417,0.000265,0.600576,0.540587,,
7,Arafat,Q34211,Yasser Arafat,former Palestinian President,peace,Q454,peace,state of harmony characterized by lack of viol...,2.125,0.307899,0.084068,0.35512,0.004245,0.25767,0.292418,,0.083322
8,Arafat,Q34211,Yasser Arafat,former Palestinian President,terror,Q13648784,terror,policy of political repression and violence,3.0625,0.298777,0.348326,0.48849,0.007114,0.396161,0.33391,,
9,architecture,Q12271,architecture,"both the process and product of planning, desi...",century,Q578,century,unit of time lasting 100 years,1.929,0.300268,0.193794,0.288866,0.051621,0.446766,0.3073,,0.159141


In [148]:
for name, model in embedding_models.items():
    if "H" not in name:
        continue
    sims = []
    for i in range(len(df)):
        ent1 = df.g_qnode[i]
        ent2 = df.p_qnode[i]
        if ent1 not in model or ent2 not in model:
            sims.append(np.nan)
            continue
        sims.append(model.similarity(ent1,ent2))
    df[name] = sims

In [149]:
df.to_csv(path_or_buf = "{}/sim_with_embedding_scores_out.csv".format(output_dir), index = False)

In [154]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

Unnamed: 0,Index,g_qnode,p_qnode,q1_label,q2_label,complex,transe,text,class,node2vec-v0,H_3x6,H_5x8
0,11,Q17221,Q189290,spokesperson,military officer,0.25,0.11,0.62,0.65,0.27,0.682234,0.446492
1,3,Q702269,Q2285706,professional,head of government,0.29,0.39,0.59,0.57,0.13,0.46892,0.287753
2,5,Q2285706,Q82955,head of government,politician,0.36,0.26,0.84,0.55,0.05,0.607809,0.639436
3,11,Q107711,Q189290,firefighter,military officer,0.29,0.22,0.58,0.52,0.34,0.784308,0.492099
4,3,Q702269,Q48352,professional,head of state,0.3,0.27,0.61,0.52,0.04,0.492855,0.331725
5,5,Q48352,Q82955,head of state,politician,0.26,0.16,0.72,0.51,0.17,0.571318,0.535259
6,2,Q2285706,Q83307,head of government,minister,0.69,0.74,0.88,0.49,0.39,0.794205,0.726871
7,2,Q48352,Q83307,head of state,minister,0.59,0.54,0.79,0.47,0.3,0.773367,0.668117
8,16,Q1251441,Q83307,leader,minister,0.46,0.33,0.72,0.45,0.22,0.47203,0.407
9,11,Q9352089,Q189290,spy,military officer,0.47,0.49,0.66,0.45,0.4,0.825277,0.709352


In [167]:
list(df.groupby(by="Index")["H_5x8"].max())

[0.420357882976532,
 0.7268710136413574,
 0.420357882976532,
 0.4384082555770874,
 0.6394362449645996,
 0.35159027576446533,
 0.38020044565200806,
 0.4384082555770874,
 0.4243793189525604,
 0.3309544026851654,
 0.7093519568443298,
 0.39174821972846985,
 0.5734885931015015,
 0.26561102271080017,
 0.5734885931015015,
 0.6793391704559326,
 0.8601336479187012,
 0.31316307187080383,
 0.5979108810424805,
 0.5036538243293762,
 0.5296689867973328,
 0.5734885931015015,
 0.5734885931015015,
 0.2586284279823303]

## Compute Kendall's Tau and Spearman Coefficient

In [142]:
def print_agreement_coeffs(df, metric1_col, metric2_cols):
    no_nan_mask = np.full(len(df), True)
    for col in ([metric1_col] + metric2_cols):
        no_nan_mask = no_nan_mask & (~ np.array(np.isnan(df.loc[:,col])))
    df_no_nans = df.loc[no_nan_mask, :]

    metric1_order = np.array(df_no_nans.loc[:,metric1_col])

    headers = ["Metric 1", "Metric 2", "Kendall Tau", "Spearman"]
    rows = []
    for col in metric2_cols:
        metric2_order = np.array(df_no_nans.loc[:,col])
        tau, tau_p_val = stats.kendalltau(metric1_order, metric2_order)
        corr, corr_p_val = stats.spearmanr(metric1_order, metric2_order)
        row = [metric1_col, col, tau, corr]
        rows.append(row)

    print(tabulate(rows, headers=headers))
    
print("Comparing all methods to human baseline:\n")
print_agreement_coeffs(df, "Human (Mean)", ["complex", "transe", "text", "class"] + list(embedding_models.keys()))

print("\nComparing each of the embeddings against all  other methods:")
for name in embedding_models:
    print("\n")
    metric2_cols = ["Human (Mean)", "complex", "transe", "text", "class"] + [model_name for model_name in embedding_models if model_name != name]
    print_agreement_coeffs(df, name, metric2_cols)

Comparing all methods to human baseline:

Metric 1      Metric 2      Kendall Tau    Spearman
------------  ----------  -------------  ----------
Human (Mean)  complex         0.274725    0.389011
Human (Mean)  transe          0.296703    0.406593
Human (Mean)  text            0.340659    0.437363
Human (Mean)  class           0.164835    0.243956
Human (Mean)  H_3x6           0.56044     0.740659
Human (Mean)  H_5x8           0.450549    0.613187
Human (Mean)  A               0.0549451   0.0813187
Human (Mean)  S               0.0549451   0.0197802

Comparing each of the embeddings against all  other methods:


Metric 1    Metric 2        Kendall Tau    Spearman
----------  ------------  -------------  ----------
H_3x6       Human (Mean)       0.56044     0.740659
H_3x6       complex            0.318681    0.450549
H_3x6       transe             0.428571    0.556044
H_3x6       text               0.340659    0.476923
H_3x6       class              0.296703    0.402198
H_3x6       H_5x

In [176]:
def print_agreement_coeffs(df, metric1_col, metric2_cols):

    metric1_order = np.array(df.loc[:,metric1_col])

    headers = ["Metric 1", "Metric 2", "Kendall Tau", "Spearman"]
    rows = []
    for col in metric2_cols:
        metric2_order = np.array(df.loc[:,col])
        tau, tau_p_val = stats.kendalltau(metric1_order, metric2_order, nan_policy="omit")
        corr, corr_p_val = stats.spearmanr(metric1_order, metric2_order, nan_policy="omit")
        row = [metric1_col, col, tau, corr]
        rows.append(row)

    print(tabulate(rows, headers=headers))
    
print("Comparing all methods to human baseline:\n")
print_agreement_coeffs(df, "Human (Mean)", ["complex", "transe", "text", "class"] + list(embedding_models.keys()))

print("\nComparing each of the embeddings against all  other methods:")
for name in embedding_models:
    print("\n")
    metric2_cols = ["Human (Mean)", "complex", "transe", "text", "class"] + [model_name for model_name in embedding_models if model_name != name]
    print_agreement_coeffs(df, name, metric2_cols)

Comparing all methods to human baseline:

Metric 1      Metric 2      Kendall Tau    Spearman
------------  ----------  -------------  ----------
Human (Mean)  complex          0.313876    0.455019
Human (Mean)  transe           0.235773    0.344647
Human (Mean)  text             0.433488    0.608956
Human (Mean)  class            0.289971    0.415368
Human (Mean)  H_3x6            0.278908    0.405741
Human (Mean)  H_5x8            0.295199    0.435055
Human (Mean)  A                0.111765    0.19403
Human (Mean)  S                0.120836    0.177313

Comparing each of the embeddings against all  other methods:


Metric 1    Metric 2        Kendall Tau    Spearman
----------  ------------  -------------  ----------
H_3x6       Human (Mean)      0.278908     0.405741
H_3x6       complex           0.399253     0.554779
H_3x6       transe            0.426553     0.593709
H_3x6       text              0.306283     0.446432
H_3x6       class             0.292832     0.423766
H_3x6      