## Load the embeddings

In [1]:
import numpy as np
import scipy
from scipy import spatial
import os

def glove_to_dict(embedding: str, embedding_length: int = 300) -> dict:
    """Parse a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace; the first token is the word
    and the remaining tokens are converted to a float64 NumPy array.

    Args:
        embedding: Path of the embedding text file.
        embedding_length: Not used by the parser; kept so that existing
            callers passing it keep working.

    Returns:
        A dict mapping each word to its vector, in file order. If a word
        occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component is not a valid float.
    """
    result = dict()
    # The context manager closes the file; UTF-8 is given explicitly so that
    # parsing does not depend on the platform's default encoding.
    with open(embedding, 'r', encoding='utf-8') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
            result[fields[0]] = np.array(fields[1:], dtype=np.float64)
    return result

def cos_dist(x: str, y: str, emb: dict) -> float:
    """Return the cosine distance between the vectors of words x and y in emb.

    Raises KeyError if either word is missing from emb.
    """
    vec_x, vec_y = emb[x], emb[y]
    return spatial.distance.cosine(vec_x, vec_y)

In [2]:
#parse both embedding files into {word: vector} dictionaries
#(glove_to_dict is called with its default embedding_length of 300)
full_histo = glove_to_dict('./HistoGlove.txt')
full_glove = glove_to_dict('./glove.6B.300d.txt')

## Equalize vocabularies
In order to rotate one matrix into the other, they must have the same dimensions.

In [3]:
import nltk

#restrict both embeddings to a common vocabulary
shisto = set(full_histo)
sglove = set(full_glove)

#only the words present in both embeddings survive
voc = shisto.intersection(sglove)

#both reduced dicts are filled by iterating voc, so they list the words in the same order
redux_histo = {word: full_histo[word] for word in voc}
redux_glove = {word: full_glove[word] for word in voc}

## Rotate embeddings
Finding the rotation matrix that brings one embedding as close as possible to the other reduces to the [orthogonal Procrustes problem](https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem).

In [4]:
#Find the orthogonal matrix R that best maps A onto B (orthogonal Procrustes problem)
#Row i of A and row i of B belong to the same word: redux_histo and redux_glove
#were both built by iterating the same vocabulary
A = np.array([v for v in redux_histo.values()])
B = np.array([v for v in redux_glove.values()])
#orthogonal_procrustes returns (R, scale); only R is kept
R = scipy.linalg.orthogonal_procrustes(A,B)[0]
#bare expression: it is not the last statement of the cell, so nothing is displayed
R.shape

#SVD of A^T B; M, U, s and V are not used again in this cell
M = np.dot(A.T, B)
U, s, V = np.linalg.svd(M)
U.shape, V.shape, s.shape
#an orthogonal matrix has determinant +1 or -1, so the value is checked:
#determinant == 1, it is a proper rotation matrix (only rotation, no reflection)
print(np.linalg.det(R))

0.9999999999998683


In [5]:
#Rotate Histo into GloVe
#the keys come from redux_histo, the dict whose values are the rows of A, so each word is
#paired with its own rotated row without relying on the iteration order of the set voc
rot_histo = dict(zip(redux_histo, A @ R))

### Test how rotation affects the angles between words of different embeddings
The rotation should shrink the cosine distances between the same word in the two embeddings.  
The test then uses a set of words that contains verbs conveying the realization of an event (as described in the event detection [guidelines](https://github.com/dhfbk/Histo/blob/master/Guidelines.pdf)), some common words, and words whose meaning is supposed to have evolved or changed over time. Common words are taken as a baseline against which to measure the distance between the other words in the two embeddings.

In [6]:
#for a few common word pairs, print:
# - the distance between the two words inside each embedding
# - the distance of the first word to itself across the embeddings, with and without rotation
ws = [('man', 'woman'), ('he', 'she'), ('and',  'but')]
for w1, w2 in ws:
    pair_histo = cos_dist(w1, w2, rot_histo)
    pair_glove = cos_dist(w1, w2, full_glove)
    print(w1, w2, 'Histo:', pair_histo, 'GloVe:', pair_glove)

    rotated_gap = spatial.distance.cosine(rot_histo[w1], full_glove[w1])
    original_gap = spatial.distance.cosine(full_histo[w1], full_glove[w1])
    print(w1, 'after rot:', rotated_gap, 'before rot:', original_gap)

man woman Histo: 0.3378917570693979 GloVe: 0.3001336620380982
man after rot: 0.2560016223450837 before rot: 0.9332433812838266
he she Histo: 0.2837719053470301 GloVe: 0.29290269811032577
he after rot: 0.18554595423735565 before rot: 1.0279926588039803
and but Histo: 0.40557248737786233 GloVe: 0.41867866286983835
and after rot: 0.1548723119445482 before rot: 1.039363584463667


In [7]:
#words whose distance between the two embeddings is measured in the cells below
analyzed_words = ['is', 'be', 'was', 'were', 'do', 'does', 'did', 'done', 'make', 'makes', 'made',
                  'get', 'gets', 'got', 'gotten', 'have', 'has', 'had', 'sex', 'keyboard', 'walk',
                  'computer', 'airplane', 'gun', 'hotel', 'and', 'but', 'fame', 'sport']

In [8]:
#cosine distance between the rotated Histo vector and the GloVe vector of each analysed word,
#listed from the smallest distance to the largest
sorted(((w, spatial.distance.cosine(rot_histo[w], full_glove[w])) for w in analyzed_words),
       key=lambda pair: pair[1])

[('and', 0.1548723119445482),
 ('had', 0.16644375107820375),
 ('was', 0.1874767669807318),
 ('have', 0.19247986779105208),
 ('were', 0.1936423598387852),
 ('be', 0.1970632570984333),
 ('is', 0.20375246760367283),
 ('but', 0.23312515747075746),
 ('has', 0.24105492103206017),
 ('made', 0.2599861813067589),
 ('do', 0.2622409008738674),
 ('got', 0.2630604146022777),
 ('does', 0.2645355032178911),
 ('did', 0.26493237606452624),
 ('make', 0.2690426887029075),
 ('get', 0.27040756533221677),
 ('hotel', 0.2890237526744289),
 ('gets', 0.3094831914474201),
 ('done', 0.3147224944654403),
 ('makes', 0.3287810561318196),
 ('walk', 0.3318151115027669),
 ('gun', 0.42062724994285816),
 ('airplane', 0.45241414065377594),
 ('fame', 0.49467169745190076),
 ('sport', 0.5563455720425707),
 ('sex', 0.5665992716174366),
 ('gotten', 0.5848037183813043),
 ('keyboard', 0.7635135908862476),
 ('computer', 1.0758251455929173)]

## NN search
The previous analysis is compared against a different technique for measuring the distance between a word's representations in two embeddings, to see whether the two techniques produce similar results. Given a word, the set of its K nearest neighbours is taken in each embedding, and the Jaccard distance between the two resulting sets is computed. The analysis is done both with the equalized and with the original vocabularies of the embeddings.

In [9]:
#scale every vector to unit length so that Euclidean distance can stand in for cosine distance
#(the KDTree used for the fast NN search works with Euclidean distance)
def normalize_embedding(emb: dict) -> dict:
    """Return a new dict mapping each word of emb to its vector divided by its L2 norm.

    For unit vectors the squared Euclidean distance equals twice the cosine
    distance, so both distances rank neighbours identically.
    """
    unit = {}
    for word, vec in emb.items():
        unit[word] = vec / np.linalg.norm(vec)
    return unit

#unit-length copies of the shared-vocabulary embeddings ...
nhisto = normalize_embedding(redux_histo)
nglove = normalize_embedding(redux_glove)
#... and of the embeddings with their original vocabularies
nfull_histo = normalize_embedding(full_histo)
nfull_glove = normalize_embedding(full_glove)

In [10]:
#create KDTree representation of embeddings for fast NN search
#each tree is built from the dict's values in iteration order, so point i of a tree
#is the vector of the i-th key of the corresponding dict
tree_nhisto = spatial.KDTree(list(nhisto.values()))

tree_nglove = spatial.KDTree(list(nglove.values()))

tree_fhisto = spatial.KDTree(list(nfull_histo.values()))

tree_fglove = spatial.KDTree(list(nfull_glove.values()))

In [11]:
#English stopword list with the analysed words taken out,
#so that words meaningful for the analysis are not treated as stopwords
#NOTE(review): presumably needs the NLTK 'stopwords' corpus to be installed locally - confirm
stopwords = set(nltk.corpus.stopwords.words('english')) - set(analyzed_words)

In [12]:
#get Jaccard distance of the selected word
def _nearest_words(word: str, emb: dict, emb_tree: spatial.KDTree, k: int) -> dict:
    """Map each of the (up to) k nearest neighbours of word in emb to its distance."""
    #build the index -> word table once; rebuilding list(emb.keys()) for every
    #neighbour costs O(k * vocabulary size)
    vocab = list(emb)
    dists, idxs = emb_tree.query(emb[word], k=k)
    #query() returns scalars instead of arrays when k == 1
    dists, idxs = np.atleast_1d(dists), np.atleast_1d(idxs)
    #when k exceeds the number of points, missing neighbours are reported with
    #index == number of points (and infinite distance); drop them
    return {vocab[i]: d for i, d in zip(idxs, dists) if i < len(vocab)}


def word_jaccard_distance(word: str, emb1: dict, emb1_tree: spatial.KDTree,
                          emb2: dict, emb2_tree: spatial.KDTree, k: int=100) -> tuple:
    """Compare the k-nearest-neighbour sets of word in two embeddings.

    Args:
        word: Word to look up; it must be a key of both emb1 and emb2.
        emb1: First embedding, as a {word: vector} dict.
        emb1_tree: KDTree whose i-th point is the vector of the i-th key of emb1.
        emb2: Second embedding, as a {word: vector} dict.
        emb2_tree: KDTree whose i-th point is the vector of the i-th key of emb2.
        k: Number of nearest neighbours taken from each embedding.

    Returns:
        A tuple (dist, neighbours1, neighbours2). dist is the Jaccard distance
        (1 - |intersection| / |union|) between the two neighbour sets. Each
        neighbours dict maps a neighbour word to its distance from word in the
        corresponding tree, nearest first.

    Raises:
        KeyError: If word is missing from emb1 or emb2.
    """
    neigh_emb1 = _nearest_words(word, emb1, emb1_tree, k)
    neigh_emb2 = _nearest_words(word, emb2, emb2_tree, k)

    s1, s2 = set(neigh_emb1), set(neigh_emb2)
    dist = 1 - len(s1.intersection(s2))/len(s1.union(s2))

    return dist, neigh_emb1, neigh_emb2

In [13]:
#Jaccard distance of every analysed word on the shared-vocabulary embeddings, smallest first
sorted(((w, word_jaccard_distance(w, nhisto, tree_nhisto, nglove, tree_nglove)[0]) for w in analyzed_words),
       key=lambda pair: pair[1])

[('did', 0.4126984126984127),
 ('does', 0.4375),
 ('do', 0.4496124031007752),
 ('be', 0.46153846153846156),
 ('is', 0.4732824427480916),
 ('make', 0.4732824427480916),
 ('made', 0.49624060150375937),
 ('have', 0.49624060150375937),
 ('had', 0.5294117647058824),
 ('but', 0.5294117647058824),
 ('and', 0.5507246376811594),
 ('done', 0.5714285714285714),
 ('get', 0.5714285714285714),
 ('were', 0.5815602836879432),
 ('got', 0.5815602836879432),
 ('was', 0.5915492957746479),
 ('has', 0.5915492957746479),
 ('makes', 0.6666666666666667),
 ('walk', 0.7012987012987013),
 ('gets', 0.7096774193548387),
 ('hotel', 0.7654320987654322),
 ('gun', 0.7730061349693251),
 ('airplane', 0.7878787878787878),
 ('sport', 0.8304093567251463),
 ('fame', 0.8372093023255813),
 ('gotten', 0.9189189189189189),
 ('sex', 0.9304812834224598),
 ('keyboard', 0.9847715736040609),
 ('computer', 0.9949748743718593)]

In [14]:
#Jaccard distance of every analysed word on the original-vocabulary embeddings, smallest first
sorted(((w, word_jaccard_distance(w, nfull_histo, tree_fhisto, nfull_glove, tree_fglove)[0]) for w in analyzed_words),
       key=lambda pair: pair[1])

[('did', 0.4126984126984127),
 ('does', 0.4375),
 ('do', 0.4496124031007752),
 ('be', 0.46153846153846156),
 ('is', 0.4732824427480916),
 ('make', 0.4732824427480916),
 ('made', 0.49624060150375937),
 ('have', 0.49624060150375937),
 ('had', 0.5294117647058824),
 ('but', 0.5294117647058824),
 ('and', 0.5507246376811594),
 ('done', 0.5714285714285714),
 ('get', 0.5714285714285714),
 ('were', 0.5815602836879432),
 ('got', 0.5815602836879432),
 ('was', 0.5915492957746479),
 ('has', 0.5915492957746479),
 ('makes', 0.6666666666666667),
 ('walk', 0.7012987012987013),
 ('gets', 0.7096774193548387),
 ('hotel', 0.7804878048780488),
 ('gun', 0.7951807228915663),
 ('airplane', 0.8095238095238095),
 ('fame', 0.8372093023255813),
 ('sport', 0.8636363636363636),
 ('gotten', 0.9304812834224598),
 ('sex', 0.9304812834224598),
 ('keyboard', 0.9847715736040609),
 ('computer', 0.9949748743718593)]