In [4]:
import pickle
import collections
import numpy as np
import ot
import scipy
import numpy.core.numeric as _nx
from scipy.spatial.distance import cdist
import sklearn
import csv, re, os
from scipy.stats import spearmanr, pearsonr
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from scipy.stats import entropy
import sklearn.metrics
from sklearn.metrics import  accuracy_score, brier_score_loss, hamming_loss, jaccard_similarity_score
import random

# Import embeddings and gold standard

In [2]:
emb_path = '/people/montariol/data/scalable_semantic_change/embeddings/'
data_path = '/people/montariol/data/scalable_semantic_change/'
code_path = '/people/montariol/code/dynamic_word_embeddings/Code/Modeling/BERT_embeddings/scalable_extraction/scalable_semantic_shift/'
gold_standard_dir = '/people/montariol/data/Evaluation/semeval2020_ulscd_posteval/test_data_truth/task2/'

In [20]:
langs = ['english', 'german', 'swedish', 'latin']

In [26]:
gold_dict=collections.defaultdict(dict)
for p in os.listdir(gold_standard_dir):
    with open(os.path.join(gold_standard_dir,p), 'r', encoding='utf8') as f:
        for line in f:
            word, score = line.split('\t')
            word = word.strip()
            score = float(score.strip())
            gold_dict[p[:-4]][word] = score


In [27]:
gold_dict.keys()

dict_keys(['english', 'german', 'latin', 'swedish'])

In [46]:
emb_dict = {}
for lang in langs:
    p = emb_path + lang + '_5_epochs_scalable.pickle'
    emb_scalable, count2sent_scalable = pickle.load(open(p, 'rb'))
    emb_dict[lang] = process_scalable_embeddings(emb_scalable)


In [45]:

def process_scalable_embeddings(emb_scalable):
    # use the counts to recreate the full scalable embeddings dictionary
    corpus_slices = ['1', '2']
    targets = list(emb_scalable.keys())
    emb_scalable_processed = collections.defaultdict(lambda:collections.defaultdict(list))
    emb_scalable_counts = collections.defaultdict(lambda:collections.defaultdict(list))
    for i, word in enumerate(targets):
        emb = emb_scalable[word]
        for cs in corpus_slices:
            cs_embeddings = []        
            for idx in range(len(emb[cs])):
                #get summed embedding and its count, divide embedding by count
                try:
                    e, count_emb = emb[cs][idx]
                    e = e/count_emb
                    emb_scalable_counts[word][cs].append(count_emb)
                except:
                    e = emb[cs][idx]
                emb_scalable_processed[word][cs].append(e)
    return emb_scalable_processed, emb_scalable_counts



# Experiment with averaging embeddings inside clusters

In [75]:
def compute_jsd(p, q):
    p = np.asarray(p)
    q = np.asarray(q)
    p /= p.sum()
    q /= q.sum()
    m = (p + q) / 2
    return (entropy(p, m) + entropy(q, m)) / 2

def cluster_drift(word, emb_dict_lang, emb_counts, n_clusters):
    emb1 = np.array(emb_dict_lang[word]['1'])
    emb2 = np.array(emb_dict_lang[word]['2'])
    counts200_1 = np.array(emb_counts[word]['1'])
    counts200_1_norm =  np.array(counts200_1) / np.array(counts200_1).sum()
    counts200_2 = np.array(emb_counts[word]['2'])
    counts200_2_norm =  np.array(counts200_2) / np.array(counts200_2).sum()
    
    # average pairwise distance:
    apd_cos = np.mean(cdist(emb1, emb2, metric='cosine'))
    apd_canb = np.mean(cdist(emb1, emb2, metric='canberra'))
    
    embeddings_concat = np.concatenate((emb1, emb2), axis=0)
    clustering = KMeans(n_clusters=n_clusters, n_init=100).fit(embeddings_concat)
    cluster_labels = clustering.labels_
    labels_list = list(set(cluster_labels))
    # construct count vectors from the cluster labels
    counts1 = Counter(list(cluster_labels[:emb1.shape[0]]))
    counts2 = Counter(list(cluster_labels[emb1.shape[0]:]))
    t1 = [counts1[i] for i in labels_list]
    t2 = [counts2[i] for i in labels_list]
    # compute JS divergence between count vectors by turning them into distributions
    t1_dist = np.array(t1) / np.array(t1).sum()
    t2_dist = np.array(t2) / np.array(t2).sum()
    jsd = compute_jsd(t1_dist, t2_dist)
    # compute distance between avg embeddings
    cos_avg = 1 - cosine_similarity(np.mean(emb2, 0).reshape(1, -1), np.mean(emb1, 0).reshape(1, -1))
    # do averaging by cluster and compute wasserstein distance
    emb1_means = np.array([np.mean(emb1[cluster_labels[:emb1.shape[0]] == clust], 0) for clust in labels_list])
    emb2_means = np.array([np.mean(emb2[cluster_labels[emb1.shape[0]:] == clust], 0) for clust in labels_list])
    M = np.nan_to_num(np.array([cdist(emb1_means, emb2_means, metric = 'cosine')])[0], nan = 1)
    #M /= M.max()
    #M is the metric cost matrix defining the cost to move mass from bin a_i to bin b_j
    # a and b are histograms on the simplex (positive, sum to 1) that represent the weights 
    #of each samples in the source an target distributions.
    # Thus M is the distance between each pair of elements in emb1_means and emb2_means
    # and a and b are the weights of each cluster in periods 1 and 2
    wass = ot.emd2(t1_dist, t2_dist, M)
    sinkhorn10 = ot.sinkhorn2(t1_dist, t2_dist, M, reg=10, method='sinkhorn')
    sinkhorn_1 = ot.sinkhorn2(t1_dist, t2_dist, M, reg=0.1, method='sinkhorn')

    # do 2 clusterings:
    labels1, centroids1, t1_dist = cluster_emb(emb1, n_clusters)
    labels2, centroids2, t2_dist = cluster_emb(emb2, n_clusters)
    emb1_means = np.array([np.mean(emb1[labels1 == clust], 0) for clust in list(set(labels1))])
    emb2_means = np.array([np.mean(emb2[labels2 == clust], 0) for clust in list(set(labels2))])
    M = np.nan_to_num(np.array([cdist(emb1_means, emb2_means, metric = 'cosine')])[0], nan = 1)
    # Some elements of the distance matrix are nan because one of the clusers is empty for a time slice.
    # replace them with 1
    # M /= M.max()
    indep_wass = ot.emd2(t1_dist, t2_dist, M)
    #indep_bregman = ot.bregman.empirical_sinkhorn_divergence(emb1_means, emb2_means, t1_dist, t2_dist)
    #indep_bregman2 = ot.bregman.empirical_sinkhorn2(emb1_means, emb2_means, t1_dist, t2_dist)
    indep_sinkhorn10 = ot.sinkhorn2(t1_dist, t2_dist, M, reg=10, method='sinkhorn')
    indep_sinkhorn_1 = ot.sinkhorn2(t1_dist, t2_dist, M, reg=0.1, method='sinkhorn')
    
    # with weighted average inside clusters:
    emb1_means = np.array([np.average(emb1[labels1 == clust], 0, counts200_1_norm[labels1 == clust]) for clust in list(set(labels1))])
    emb2_means = np.array([np.average(emb2[labels2 == clust], 0, counts200_2_norm[labels2 == clust]) for clust in list(set(labels2))])
    M = np.nan_to_num(np.array([cdist(emb1_means, emb2_means, metric = 'cosine')])[0], nan = 1)
    indep_wass_weighted = ot.emd2(t1_dist, t2_dist, M)
    indep_sinkhorn10_weighted = ot.sinkhorn2(t1_dist, t2_dist, M, reg=10, method='sinkhorn')
    indep_sinkhorn_1_weighted = ot.sinkhorn2(t1_dist, t2_dist, M, reg=0.1, method='sinkhorn')
    
    # with centroids it's the same as average for kmeans
    # M = np.nan_to_num(np.array([cdist(centroids1, centroids2, metric = 'cosine')])[0], nan = 1)
    # indep_wass_centroids = ot.emd2(t1_dist, t2_dist, M)
    # indep_sinkhorn_centroids = ot.sinkhorn2(t1_dist, t2_dist, M, reg= 1e-1, method='sinkhorn')
    
    # without clustering:
    M = np.nan_to_num(np.array([cdist(emb1, emb2, metric = 'cosine')])[0], nan = 1)
    #M /= M.max()
    full_wass = ot.emd2(counts200_1_norm, counts200_2_norm, M)
    full_sinkhorn10 = ot.sinkhorn2(counts200_1_norm, counts200_2_norm, M, reg=10, method='sinkhorn')
    full_sinkhorn_1 = ot.sinkhorn2(counts200_1_norm, counts200_2_norm, M, reg=0.1, method='sinkhorn')
    
    return apd_cos, apd_canb, jsd, cos_avg.flatten().item(), wass, sinkhorn10, sinkhorn_1, indep_wass, indep_sinkhorn10, indep_sinkhorn_1, indep_wass_weighted, indep_sinkhorn10_weighted, indep_sinkhorn_1_weighted, full_wass, full_sinkhorn10, full_sinkhorn_1

def cluster_emb(embeddings, n_clusters):
    clustering = KMeans(n_clusters=n_clusters, n_init=100).fit(embeddings)
    cluster_labels = clustering.labels_
    centroids = clustering.cluster_centers_
    labels_list = list(set(cluster_labels))
    # construct count vectors from the cluster labels
    counts = Counter(list(cluster_labels))
    t = [counts[i] for i in labels_list]
    t_dist = np.array(t) / np.array(t).sum()
    return cluster_labels, centroids, t_dist

In [78]:
n_clusters = 7
results_dict = collections.defaultdict(lambda: collections.defaultdict(list))


for lang in emb_dict.keys():
    emb_dict_lang, emb_counts = emb_dict[lang]
    targets = list(emb_dict_lang.keys())
    for i, word in enumerate(targets):
        print(lang, i, word)
        apd_cos, apd_canb, jsd, cos, wass, sinkhorn10, sinkhorn_1, indep_wass, indep_sinkhorn10, indep_sinkhorn_1, indep_wass_weighted, indep_sinkhorn10_weighted, indep_sinkhorn_1_weighted, full_wass, full_sinkhorn10, full_sinkhorn_1 = cluster_drift(word, emb_dict_lang, emb_counts, n_clusters)
        results_dict[lang]['apd_cos'].append(apd_cos)
        results_dict[lang]['apd_canb'].append(apd_canb)
        results_dict[lang]['jsd'].append(jsd)
        results_dict[lang]['cos'].append(cos)
        results_dict[lang]['wass'].append(wass)
        results_dict[lang]['sinkhorn10'].append(sinkhorn10)
        results_dict[lang]['sinkhorn_1'].append(sinkhorn_1)
        results_dict[lang]['indep_wass'].append(indep_wass)
        results_dict[lang]['indep_sinkhorn10'].append(indep_sinkhorn10)
        results_dict[lang]['indep_sinkhorn_1'].append(indep_sinkhorn_1)
        results_dict[lang]['indep_wass_weighted'].append(indep_wass_weighted)
        results_dict[lang]['indep_sinkhorn10_weighted'].append(indep_sinkhorn10_weighted)
        results_dict[lang]['indep_sinkhorn_1_weighted'].append(indep_sinkhorn_1_weighted)
        results_dict[lang]['full_wass'].append(full_wass)
        results_dict[lang]['full_sinkhorn10'].append(full_sinkhorn10)
        results_dict[lang]['full_sinkhorn_1'].append(full_sinkhorn_1)
        results_dict[lang]['gold'].append(gold_dict[lang][word])       


english 0 part_nn
english 1 face_nn
english 2 tip_vb
english 3 tree_nn
english 4 head_nn
english 5 multitude_nn
english 6 land_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 7 ball_nn
english 8 pin_vb
english 9 word_nn
english 10 record_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 11 savage_nn
english 12 lane_nn
english 13 stab_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 14 attack_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 15 risk_nn
english 16 player_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 17 contemplation_nn
english 18 prop_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 19 edge_nn
english 20 bit_nn
english 21 rag_nn
english 22 plane_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 23 gas_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 24 donkey_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 25 circle_vb
english 26 quilt_nn
english 27 stroke_vb


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


english 28 ounce_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 29 relationship_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


english 30 twist_nn
english 31 thump_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 32 chairman_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 33 lass_nn
english 34 bag_nn
english 35 graft_nn


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


english 36 fiction_nn
german 0 Tier


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 1 Fuß
german 2 Entscheidung
german 3 packen
german 4 Tragfähigkeit
german 5 beimischen


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


german 6 Titel


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 7 Mulatte


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 8 Truppenteil


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 9 Frechheit
german 10 vorliegen
german 11 abbauen


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 12 ausspannen
german 13 vergönnen
german 14 zersetzen


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 15 Sensation


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


german 16 Spielball


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 17 Ausnahmegesetz


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 18 vorweisen


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 19 Rezeption


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 20 Lyzeum
german 21 voranstellen
german 22 Einreichung
german 23 abdecken


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 24 Armenhaus


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 25 Kubikmeter


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


german 26 Manschette
german 27 Schmiere
german 28 Knotenpunkt


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


german 29 Naturschönheit
german 30 Mißklang


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 31 Gesichtsausdruck


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 32 Abgesang


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 33 Dynamik


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 34 Festspiel


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 35 verbauen
german 36 Seminar


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 37 Unentschlossenheit
german 38 weitgreifend


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 39 artikulieren


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 40 Ackergerät


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 41 Eintagsfliege


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 42 überspannen
german 43 abgebrüht


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


german 44 aufrechterhalten
german 45 Ohrwurm


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 46 Pachtzins


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


german 47 Engpaß


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 0 krita


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 1 färg
swedish 2 by
swedish 3 uträtta
swedish 4 uppfostran
swedish 5 konduktör


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 6 studie


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 7 bedömande
swedish 8 medium


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 9 bröllop


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 10 vegetation
swedish 11 annandag


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


swedish 12 antyda


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


swedish 13 ledning
swedish 14 granskare


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 15 beredning
swedish 16 blockera
swedish 17 bearbeta
swedish 18 bolagsstämma


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 19 central


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 20 kemisk


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 21 notis
swedish 22 vaktmästare
swedish 23 aktiv


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 24 förhandling


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 25 gagn
swedish 26 uppfattning


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 27 undertrycka
swedish 28 uppläggning


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 29 kokärt


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


swedish 30 motiv
latin 0 jus


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


latin 1 hostis
latin 2 oportet


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


latin 3 virtus
latin 4 consilium
latin 5 dux
latin 6 voluntas
latin 7 honor
latin 8 potestas
latin 9 senatus
latin 10 imperator
latin 11 consul


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


latin 12 dubius
latin 13 necessarius
latin 14 pontifex


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


latin 15 credo
latin 16 sensus
latin 17 adsumo


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


latin 18 civitas


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


latin 19 salus
latin 20 beatus


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


latin 21 licet
latin 22 templum
latin 23 sapientia
latin 24 cohors
latin 25 sanctus


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


latin 26 regnum
latin 27 poena
latin 28 ancilla


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


latin 29 fidelis
latin 30 nobilitas
latin 31 simplex
latin 32 humanitas
latin 33 itero
latin 34 acerbus
latin 35 titulus


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


latin 36 nepos
latin 37 dolus
latin 38 scriptura


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


latin 39 sacramentum


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  Kp = (1 / a).reshape(-1, 1) * K


In [80]:
res_all = collections.defaultdict(list)
for lang in emb_dict.keys():
    for measure in results_dict[lang].keys():
        corrs = spearmanr(results_dict[lang][measure], results_dict[lang]['gold'])
        # corrp = pearsonr(results_dict[method][clustering][measure], gold_standard_vec)
        print(lang.upper(), measure, corrs[0])
        res_all[measure].append(corrs[0])
for measure in res_all.keys():
    print(measure, np.mean(res_all[measure]))

ENGLISH apd_cos 0.5042385520488138
ENGLISH apd_canb 0.48289763049490214
ENGLISH jsd 0.24020392815680622
ENGLISH cos 0.31940245925687855
ENGLISH wass 0.3776157508289377
ENGLISH sinkhorn10 0.6153299048044543
ENGLISH sinkhorn_1 0.2060584536705475
ENGLISH indep_wass 0.3965854588768592
ENGLISH indep_sinkhorn10 0.4858616473773899
ENGLISH indep_sinkhorn_1 0.47376845849683996
ENGLISH indep_wass_weighted 0.4004979611617431
ENGLISH indep_sinkhorn10_weighted 0.4973620328814423
ENGLISH indep_sinkhorn_1_weighted 0.46013398083739626
ENGLISH full_wass 0.3857964374246039
ENGLISH full_sinkhorn10 0.4064259949267185
ENGLISH full_sinkhorn_1 0.3691979428826726
ENGLISH gold 1.0
GERMAN apd_cos 0.28669425286158623
GERMAN apd_canb 0.47949205782201537
GERMAN jsd 0.4751943640884166
GERMAN cos 0.5629978791014337
GERMAN wass 0.4989132814029615
GERMAN sinkhorn10 0.4203579680951569
GERMAN sinkhorn_1 0.41350341859599943
GERMAN indep_wass 0.4672517908592342
GERMAN indep_sinkhorn10 0.46077804966558544
GERMAN indep_sink

In [84]:
langs = list(emb_dict.keys())
print(f"  & & & semeval-avg & {langs[0]} & {langs[1]} & {langs[2]} & {langs[3]}  \\\\")
for measure in results_dict['english'].keys():
    temp = measure.replace('_1', '0.1').replace('_', '-')
    corrs = []
    corrs.append(np.mean(res_all[measure]))
    for lang in results_dict.keys():
        corr = spearmanr(results_dict[lang][measure], results_dict[lang]['gold'])[0]
        corrs.append(corr)
        # corrp = pearsonr(results_dict[method][clustering][measure], gold_standard_vec) 
    print(f" scalable & kmeans7 & {temp} & {corrs[0]:.3f} & {corrs[1]:.3f} & {corrs[2]:.3f} & {corrs[3]:.3f}  & {corrs[4]:.3f}\\\\")


  & & & semeval-avg & english & german & swedish & latin  \\
 scalable & kmeans7 & apd-cos & 0.333 & 0.504 & 0.287 & 0.219  & 0.324\\
 scalable & kmeans7 & apd-canb & 0.373 & 0.483 & 0.479 & 0.226  & 0.304\\
 scalable & kmeans7 & jsd & 0.290 & 0.240 & 0.475 & 0.073  & 0.370\\
 scalable & kmeans7 & cos & 0.413 & 0.319 & 0.563 & 0.275  & 0.495\\
 scalable & kmeans7 & wass & 0.398 & 0.378 & 0.499 & 0.266  & 0.451\\
 scalable & kmeans7 & sinkhorn10 & 0.440 & 0.615 & 0.420 & 0.250  & 0.476\\
 scalable & kmeans7 & sinkhorn0.1 & 0.287 & 0.206 & 0.414 & 0.071  & 0.458\\
 scalable & kmeans7 & indep-wass & 0.397 & 0.397 & 0.467 & 0.292  & 0.431\\
 scalable & kmeans7 & indep-sinkhorn10 & 0.419 & 0.486 & 0.461 & 0.257  & 0.472\\
 scalable & kmeans7 & indep-sinkhorn0.1 & 0.422 & 0.474 & 0.475 & 0.263  & 0.478\\
 scalable & kmeans7 & indep-wass-weighted & 0.401 & 0.400 & 0.473 & 0.325  & 0.407\\
 scalable & kmeans7 & indep-sinkhorn10-weighted & 0.424 & 0.497 & 0.464 & 0.266  & 0.468\\
 scalable & km