## Load the embeddings

In [1]:
from scipy import spatial

import embedding_comparison as comp

In [2]:
# Locations of the two pre-trained embedding files, named once so they are easy to change.
glove_path = './glove.6B.300d.txt'
current_path = './glove.42B.300d.txt'

full_glove = comp.glove_to_dict(glove_path)

# Memory constraint: only load the words that also appear in the GloVe embedding.
# NOTE(review): the "current" embedding is read from the GloVe 42B file — confirm this is the intended source.
full_current = comp.glove_to_dict(current_path, full_glove.keys())

## Equalize vocabularies
In order to rotate one embedding matrix into the other, they have to have the same dimensions, i.e. cover the same vocabulary.

In [3]:
# Reduce both embeddings to a common vocabulary so the two matrices have the same dimensions.
# presumably the returned pair follows the argument order (Current, GloVe) — verify in embedding_comparison.equalize_voc
redux_current, redux_glove = comp.equalize_voc(full_current, full_glove)

## Rotate embeddings
Finding the rotation matrix that brings one embedding as close as possible to the other reduces to the [orthogonal Procrustes problem](https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem).

In [4]:
# Rotate the Current embedding into the GloVe space; the result is later compared
# word-by-word against the GloVe vectors.
# presumably the first argument is the embedding being rotated and the second the fixed target —
# confirm in embedding_comparison.rotate_embeddings
rot_current = comp.rotate_embeddings(redux_current, redux_glove)

### Test how rotation affects the angles between words of different embeddings
The rotation should shrink the cosine distances between the same word in the two embeddings.  
Then, a set of words is analyzed. It contains verbs, which convey the realization of an event (as described in the event detection [guidelines](https://github.com/dhfbk/current/blob/master/Guidelines.pdf)), some common words, and words whose meaning is supposed to have evolved or changed over time. Common words are taken as a baseline to measure the distance between other words in the different embeddings.

In [8]:
# Word pairs used as a sanity check of the rotation.
ws = [('man', 'woman'), ('he', 'she'), ('and', 'but')]
for w1, w2 in ws:
    # Distance between the two words of the pair inside each embedding.
    pair_dist_current = comp.cos_dist(rot_current[w1], rot_current[w2])
    pair_dist_glove = comp.cos_dist(full_glove[w1], full_glove[w2])
    print(w1, w2, 'Current:', pair_dist_current, 'GloVe:', pair_dist_glove)

    # Distance between the Current and GloVe vectors of the first word,
    # with and without the rotation applied to Current.
    dist_after_rot = comp.cos_dist(rot_current[w1], full_glove[w1])
    dist_before_rot = comp.cos_dist(redux_current[w1], full_glove[w1])
    print(w1, 'after rot:', dist_after_rot, 'before rot:', dist_before_rot)

man woman Current: 0.19520073596550114 GloVe: 0.3001336620380982
man after rot: 0.1778400167634343 before rot: 0.9667149934641015
he she Current: 0.13041001186124335 GloVe: 0.29290269811032577
he after rot: 0.18513043737372248 before rot: 1.036213448955623
and but Current: 0.24316111263086237 GloVe: 0.41867866286983835
and after rot: 0.19931965724107548 before rot: 0.881821337280183


In [7]:
# Words whose distance across the two embeddings is analysed in the cells below.
analyzed_words = [
    # forms of "be"
    'is', 'be', 'was', 'were',
    # forms of "do"
    'do', 'does', 'did', 'done',
    # forms of "make"
    'make', 'makes', 'made',
    # forms of "get"
    'get', 'gets', 'got', 'gotten',
    # forms of "have"
    'have', 'has', 'had',
    # remaining words
    'sex', 'keyboard', 'walk',
    'computer', 'airplane', 'gun', 'hotel',
    'and', 'but', 'fame', 'sport',
]

In [10]:
# Cosine distance between the rotated Current vector and the GloVe vector of every
# analysed word, listed from the closest word to the farthest.
sorted(
    ((word, comp.cos_dist(rot_current[word], full_glove[word])) for word in analyzed_words),
    key=lambda pair: pair[1],
)

[('do', 0.12519744733474436),
 ('make', 0.15295521940549617),
 ('does', 0.15764962647936975),
 ('be', 0.1602329575641891),
 ('done', 0.1652218381482874),
 ('have', 0.16911801843482643),
 ('had', 0.17258309920704373),
 ('computer', 0.174469882834433),
 ('did', 0.17654384206506335),
 ('get', 0.18270704460419185),
 ('made', 0.18309746638086644),
 ('is', 0.18410232929541437),
 ('but', 0.18767834844705222),
 ('has', 0.1919236824880206),
 ('were', 0.19908112092800978),
 ('and', 0.19931965724107548),
 ('got', 0.20079406143094547),
 ('sport', 0.21013923448358474),
 ('fame', 0.21854641266759078),
 ('was', 0.2190861711704336),
 ('airplane', 0.22354896589894446),
 ('makes', 0.22436744577893575),
 ('gun', 0.2261838567033444),
 ('walk', 0.2294932747049221),
 ('gotten', 0.23136626027539997),
 ('gets', 0.23601584524410513),
 ('keyboard', 0.2550318857457389),
 ('hotel', 0.30678723712006517),
 ('sex', 0.3097621064840107)]

## NN search
The previous analysis is compared against a different technique for measuring word distance across two embeddings, to see whether they produce similar results. Given a word, the set of its K nearest neighbours is taken. This is done for each embedding, then the Jaccard distance between the resulting sets is computed. The analysis is done both with equal and original vocabularies for the embeddings.

In [11]:
# Normalize the embedding vectors so that Euclidean distance can stand in for cosine distance:
# for unit-length vectors ||a - b||^2 = 2 * (1 - cos(a, b)), so both distances rank neighbours identically.
# (assumes comp.normalize_embedding scales every vector to unit L2 norm — confirm in embedding_comparison)
# This is needed because the data structure used for efficient NN search (KDTree) only works with
# Euclidean-style distances.
# `spatial` (scipy.spatial) is imported in the notebook's import cell.
ncurrent = comp.normalize_embedding(redux_current)
nglove = comp.normalize_embedding(redux_glove)

# Build one KD-tree per embedding for fast NN search. Row i of a tree is the i-th value of the
# corresponding dict, so the trees depend on the dicts' iteration order.
tree_ncurrent = spatial.KDTree(list(ncurrent.values()))
tree_nglove = spatial.KDTree(list(nglove.values()))

# Jaccard distance between the nearest-neighbour sets of each analysed word in the two embeddings,
# listed in ascending order. Only element [0] of the helper's return value is used here.
sorted(
    ((word, comp.word_jaccard_distance(word, ncurrent, tree_ncurrent, nglove, tree_nglove)[0])
     for word in analyzed_words),
    key=lambda pair: pair[1],
)

[('do', 0.31932773109243695),
 ('make', 0.360655737704918),
 ('does', 0.4126984126984127),
 ('have', 0.4126984126984127),
 ('but', 0.4126984126984127),
 ('is', 0.4251968503937008),
 ('be', 0.4251968503937008),
 ('get', 0.4375),
 ('and', 0.4496124031007752),
 ('did', 0.46153846153846156),
 ('got', 0.4732824427480916),
 ('done', 0.48484848484848486),
 ('makes', 0.5074626865671642),
 ('made', 0.5074626865671642),
 ('had', 0.5074626865671642),
 ('airplane', 0.5294117647058824),
 ('was', 0.5401459854014599),
 ('has', 0.5401459854014599),
 ('were', 0.5611510791366907),
 ('gets', 0.5611510791366907),
 ('gotten', 0.5815602836879432),
 ('computer', 0.6206896551724138),
 ('walk', 0.6301369863013699),
 ('fame', 0.6394557823129252),
 ('keyboard', 0.6842105263157895),
 ('gun', 0.6928104575163399),
 ('sport', 0.7012987012987013),
 ('hotel', 0.7261146496815287),
 ('sex', 0.8505747126436781)]