## Load the embeddings

In [1]:
import embedding_comparison as comp

In [2]:
# Load the two embeddings to compare. Later cells index both results by word
# (e.g. full_glove['man']), so each is used as a word -> vector mapping.
# NOTE(review): presumably both files are in GloVe text format — confirm
# against comp.glove_to_dict.
full_histo = comp.glove_to_dict('./HistoGlove.txt')
full_glove = comp.glove_to_dict('./glove.6B.300d.txt')

## Equalize vocabularies
In order to rotate one matrix into the other, they have to have the same dimensions.

In [3]:
# Derive a reduced version of each embedding from the full ones; the reduced
# pair is what gets rotated and used for the equal-vocabulary NN search below.
# NOTE(review): presumably keeps only the words present in both embeddings —
# confirm against comp.equalize_voc.
redux_histo, redux_glove = comp.equalize_voc(full_histo, full_glove)

## Rotate embeddings
Finding the rotation matrix that brings one embedding as close as possible to the other reduces to the [orthogonal Procrustes problem](https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem).

In [9]:
# Rotate the (vocabulary-equalized) Histo embedding onto the GloVe embedding.
# rot_histo is indexed by word in the cells below, like the input embeddings.
rot_histo = comp.rotate_embeddings(redux_histo, redux_glove)

### Test how rotation affects the angles between words of different embeddings
The rotation should shrink the cosine distances between the same word in the two embeddings.  
Then, a set of words is analyzed. It contains verbs that convey the realization of an event (as described in the event detection [guidelines](https://github.com/dhfbk/Histo/blob/master/Guidelines.pdf)), some common words, and words whose meaning is supposed to have evolved or changed over time. Common words are taken as a baseline to measure the distance between other words in the different embeddings.

In [10]:
# Sanity-check the rotation on a few word pairs.
ws = [('man', 'woman'), ('he', 'she'), ('and',  'but')]
for w1, w2 in ws:
    # Distance between the two words of the pair, measured inside each embedding.
    print(w1, w2, 'Histo:', comp.cos_dist(rot_histo[w1], rot_histo[w2]), 'GloVe:', comp.cos_dist(full_glove[w1], full_glove[w2]))
    # Distance between the Histo and GloVe vectors of the same word, after and
    # before rotating Histo. Both comparisons use full_glove: `nglove` is only
    # assigned in a later cell, so referencing it here raises NameError when
    # the notebook is run top to bottom.
    print(w1, 'after rot:', comp.cos_dist(rot_histo[w1], full_glove[w1]),
          'before rot:', comp.cos_dist(full_histo[w1], full_glove[w1]))

man woman Histo: 0.3378917570693979 GloVe: 0.3001336620380982
man after rot: 0.2560016223450836 before rot: 0.9332433812838266
he she Histo: 0.28377190534703045 GloVe: 0.29290269811032577
he after rot: 0.18554595423735554 before rot: 1.0279926588039803
and but Histo: 0.40557248737786233 GloVe: 0.41867866286983835
and after rot: 0.15487231194454842 before rot: 1.039363584463667


In [11]:
# Probe words compared across the two embeddings. The order is kept exactly
# as-is because it decides how ties are ordered in the sorted rankings below.
analyzed_words = [
    # forms of "to be"
    'is', 'be', 'was', 'were',
    # forms of "to do"
    'do', 'does', 'did', 'done',
    # forms of "to make"
    'make', 'makes', 'made',
    # forms of "to get"
    'get', 'gets', 'got', 'gotten',
    # forms of "to have"
    'have', 'has', 'had',
    # remaining probe words
    'sex', 'keyboard', 'walk',
    'computer', 'airplane', 'gun', 'hotel',
    'and', 'but', 'fame', 'sport',
]

In [12]:
# Rank the probe words by the distance between their rotated Histo vector and
# their GloVe vector, smallest distance first.
sorted(
    ((word, comp.cos_dist(rot_histo[word], full_glove[word])) for word in analyzed_words),
    key=lambda pair: pair[1],
)

[('and', 0.1548723119445482),
 ('had', 0.16644375107820275),
 ('was', 0.18747676698073124),
 ('have', 0.19247986779105275),
 ('were', 0.1936423598387852),
 ('be', 0.19706325709843442),
 ('is', 0.20375246760367294),
 ('but', 0.23312515747075713),
 ('has', 0.24105492103205994),
 ('made', 0.259986181306759),
 ('do', 0.26224090087386687),
 ('got', 0.2630604146022776),
 ('does', 0.264535503217891),
 ('did', 0.2649323760645266),
 ('make', 0.2690426887029076),
 ('get', 0.2704075653322171),
 ('hotel', 0.2890237526744279),
 ('gets', 0.3094831914474211),
 ('done', 0.3147224944654414),
 ('makes', 0.3287810561318193),
 ('walk', 0.3318151115027683),
 ('gun', 0.4206272499428587),
 ('airplane', 0.45241414065377494),
 ('fame', 0.4946716974519002),
 ('sport', 0.5563455720425707),
 ('sex', 0.5665992716174372),
 ('gotten', 0.5848037183813057),
 ('keyboard', 0.7635135908862496),
 ('computer', 1.0758251455929164)]

## NN search
The previous analysis is compared against a different technique for measuring the distance between words in two different embeddings, to see whether they produce similar results. Given a word, the set of its K nearest neighbours is taken. This is done for each embedding, then the Jaccard distance between the resulting sets is computed. The analysis is done both with equal and original vocabularies for the embeddings.

In [14]:
from scipy import spatial

# KDTree, the structure used here for efficient NN search, only supports
# Euclidean distance. The embeddings are therefore normalized first so that
# Euclidean distance ~ cosine distance.
nhisto = comp.normalize_embedding(redux_histo)
nglove = comp.normalize_embedding(redux_glove)

# One KDTree per embedding, built over its vectors, for fast NN search.
tree_nhisto = spatial.KDTree([*nhisto.values()])
tree_nglove = spatial.KDTree([*nglove.values()])

# Rank the probe words by the first element of word_jaccard_distance's result,
# smallest first.
sorted(
    (
        (word, comp.word_jaccard_distance(word, nhisto, tree_nhisto, nglove, tree_nglove)[0])
        for word in analyzed_words
    ),
    key=lambda pair: pair[1],
)

[('did', 0.4126984126984127),
 ('does', 0.4375),
 ('do', 0.4496124031007752),
 ('be', 0.46153846153846156),
 ('is', 0.4732824427480916),
 ('make', 0.4732824427480916),
 ('made', 0.49624060150375937),
 ('have', 0.49624060150375937),
 ('had', 0.5294117647058824),
 ('but', 0.5294117647058824),
 ('and', 0.5507246376811594),
 ('done', 0.5714285714285714),
 ('get', 0.5714285714285714),
 ('were', 0.5815602836879432),
 ('got', 0.5815602836879432),
 ('was', 0.5915492957746479),
 ('has', 0.5915492957746479),
 ('makes', 0.6666666666666667),
 ('walk', 0.7012987012987013),
 ('gets', 0.7096774193548387),
 ('hotel', 0.7654320987654322),
 ('gun', 0.7730061349693251),
 ('airplane', 0.7878787878787878),
 ('sport', 0.8304093567251463),
 ('fame', 0.8372093023255813),
 ('gotten', 0.9189189189189189),
 ('sex', 0.9304812834224598),
 ('keyboard', 0.9847715736040609),
 ('computer', 0.9949748743718593)]

In [16]:
# Same nearest-neighbour comparison as the previous cell, this time on the
# original (non-equalized) embeddings.
nfull_histo = comp.normalize_embedding(full_histo)
nfull_glove = comp.normalize_embedding(full_glove)

# One KDTree per normalized embedding.
tree_fhisto = spatial.KDTree([*nfull_histo.values()])
tree_fglove = spatial.KDTree([*nfull_glove.values()])

# Rank the probe words by the first element of word_jaccard_distance's result,
# smallest first.
sorted(
    (
        (word, comp.word_jaccard_distance(word, nfull_histo, tree_fhisto, nfull_glove, tree_fglove)[0])
        for word in analyzed_words
    ),
    key=lambda pair: pair[1],
)

[('did', 0.4126984126984127),
 ('does', 0.4375),
 ('do', 0.4496124031007752),
 ('be', 0.46153846153846156),
 ('is', 0.4732824427480916),
 ('make', 0.4732824427480916),
 ('made', 0.49624060150375937),
 ('have', 0.49624060150375937),
 ('had', 0.5294117647058824),
 ('but', 0.5294117647058824),
 ('and', 0.5507246376811594),
 ('done', 0.5714285714285714),
 ('get', 0.5714285714285714),
 ('were', 0.5815602836879432),
 ('got', 0.5815602836879432),
 ('was', 0.5915492957746479),
 ('has', 0.5915492957746479),
 ('makes', 0.6666666666666667),
 ('walk', 0.7012987012987013),
 ('gets', 0.7096774193548387),
 ('hotel', 0.7804878048780488),
 ('gun', 0.7951807228915663),
 ('airplane', 0.8095238095238095),
 ('fame', 0.8372093023255813),
 ('sport', 0.8636363636363636),
 ('gotten', 0.9304812834224598),
 ('sex', 0.9304812834224598),
 ('keyboard', 0.9847715736040609),
 ('computer', 0.9949748743718593)]