## Load the embeddings

In [1]:
import numpy as np
import scipy
from scipy import spatial
import os

def glove_to_dict(embedding: str, embedding_length: int = 300) -> dict:
    """Load a GloVe-style text embedding file into a ``{token: vector}`` dict.

    Each non-empty line is expected to hold a token followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    embedding : str
        Path to the embedding text file.
    embedding_length : int, optional
        Currently unused; retained so existing callers keep working.

    Returns
    -------
    dict
        Maps each token (first field of a line) to a 1-D float64 ``np.ndarray``
        built from the remaining fields. If a token occurs more than once the
        last occurrence wins.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float.
    """
    result = dict()
    # Context manager closes the handle deterministically; explicit UTF-8 avoids
    # depending on the platform's default locale encoding.
    with open(embedding, 'r', encoding='utf-8') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # blank line: nothing to parse (indexing fields[0] would raise)
                continue
            # builtin float is what the removed np.float alias used to point to
            result[fields[0]] = np.array(fields[1:]).astype(float)
    return result

def cos_dist(x: str, y: str, emb: dict) -> float:
    """Return the cosine distance between the vectors stored for two words.

    Looks up ``x`` and ``y`` in ``emb`` (a missing word raises ``KeyError``)
    and passes both vectors to ``scipy.spatial.distance.cosine``.
    """
    vec_x, vec_y = emb[x], emb[y]
    return spatial.distance.cosine(vec_x, vec_y)

In [2]:
# Parse both embedding text files into {token: vector} dictionaries.
full_histo = glove_to_dict(embedding='./HistoGlove.txt')
full_glove = glove_to_dict(embedding='./glove.6B.300d.txt')

## Equalize vocabularies

In [3]:
import nltk

# Restrict both embeddings to the vocabulary they have in common.
shisto = set(full_histo)
sglove = set(full_glove)

# only words present in both embeddings survive
voc = shisto & sglove

# Both dicts are filled by iterating the same set, so their keys share one order.
redux_histo = {word: full_histo[word] for word in voc}
redux_glove = {word: full_glove[word] for word in voc}

## Rotate embeddings
Finding the rotation matrix that brings one embedding as close as possible to the other reduces to the [orthogonal Procrustes problem](https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem).

In [4]:
# Find the rotation matrix R that maps the Histo vectors onto the GloVe vectors.
# A and B stack the vectors row by row; redux_histo and redux_glove were built from
# the same vocabulary iteration, so row i of A and row i of B belong to the same word.
A = np.array([v for v in redux_histo.values()])
B = np.array([v for v in redux_glove.values()])
# orthogonal_procrustes returns a (matrix, scale) pair; only the matrix is kept.
R = scipy.linalg.orthogonal_procrustes(A,B)[0]
# NOTE(review): bare expression that is not the last statement of the cell, so nothing is displayed.
R.shape

# Manual SVD of A^T B. U, s and V are not used by any later cell visible in this
# notebook -- presumably a sanity check of the SVD behind the Procrustes solution; TODO confirm.
M = np.dot(A.T, B)
U, s, V = np.linalg.svd(M)
# NOTE(review): also a bare mid-cell expression, not displayed.
U.shape, V.shape, s.shape
# A determinant of +1 means R is a proper rotation (rotation only, no reflection).
# An orthogonal matrix may also have determinant -1, hence the explicit check.
print(np.linalg.det(R))

1.0000000000000469


In [5]:
#Rotate Histo into GloVe
# Row i of A was taken from the i-th entry of redux_histo, so pair the rotated rows
# with redux_histo's keys directly rather than trusting the set `voc` to iterate in
# the same order as when redux_histo was built.
rot_histo = dict(zip(redux_histo, A @ R))

In [22]:
# Word pairs to compare within each embedding and across the two embeddings.
ws = [('man', 'woman'), ('he', 'she'), ('king',  'queen')]
for w1, w2 in ws:
    # distance between the two words inside each embedding
    pair_histo = cos_dist(w1, w2, rot_histo)
    pair_glove = cos_dist(w1, w2, full_glove)
    print(w1, w2, 'Histo:', pair_histo, 'GloVe:', pair_glove)

    # distance between the Histo and GloVe vectors of w1, with and without the rotation
    self_rotated = spatial.distance.cosine(rot_histo[w1], full_glove[w1])
    self_original = spatial.distance.cosine(full_histo[w1], full_glove[w1])
    print(w1, 'after rot:', self_rotated,
          'before rot:', self_original)

man woman 0.337891757069398
man woman 0.3001336620380982
man 0.2560016223450823
woman 0.2736922747336825
he she 0.28377190534703023
he she 0.29290269811032577
he 0.1855459542373541
she 0.24730286487124697
king queen 0.3596912536175165
king queen 0.3663531298520035
king 0.2715579208495581
queen 0.2908447205370823
high higher 0.40981244227389446
high higher 0.37999040888827107
high 0.3063183898196824
higher 0.2797118381688637
smart smarter 0.7600848237140041
smart smarter 0.5538880489131943
smart 0.6448494608073473
smarter 0.6695391756504294
good better 0.4153298215744756
good better 0.2340416304234446
good 0.27218239337500183
better 0.2596502013181472
bad worse 0.45420851261634443
bad worse 0.37091165779275936
bad 0.35206396372315474
worse 0.29081367189400364
is bad 0.6698346208993016
is bad 0.6398420442055139
is 0.2037524676036765
bad 0.35206396372315474
not good 0.5258918695759116
not good 0.3979201915462419
not 0.21708159369699354
good 0.27218239337500183


In [None]:
# Words whose nearest neighbours are compared between the two embeddings.
analyzed_words = [
    # forms of "be"
    'is', 'be', 'was', 'were',
    # forms of "do"
    'do', 'does', 'did', 'done',
    # forms of "make"
    'make', 'makes', 'made',
    # forms of "get"
    'get', 'gets', 'got', 'gotten',
    # forms of "have"
    'have', 'has', 'had',
    # remaining probe words
    'sex', 'keyboard', 'walk',
    'computer', 'airplane', 'gun', 'hotel',
]

## NN search

In [59]:
# Scale every vector to unit length. For unit vectors the squared Euclidean distance
# equals twice the cosine distance, so Euclidean nearest neighbours rank the same
# as cosine nearest neighbours.
def normalize_embedding(emb: dict) -> dict:
    """Return a new dict mapping each word to its vector divided by its L2 norm.

    The input dict is not modified; keys keep their original order.
    """
    unit_vectors = {}
    for word, vector in emb.items():
        unit_vectors[word] = vector / np.linalg.norm(vector)
    return unit_vectors

# `histo` and `glove` are not defined anywhere in this notebook, so the original
# calls fail with NameError on a fresh kernel. The shared-vocabulary embeddings are
# used instead so both neighbour searches run over the same set of words.
# NOTE(review): confirm redux_* (rather than full_*) is the intended input.
nhisto = normalize_embedding(redux_histo)
nglove = normalize_embedding(redux_glove)

In [60]:
# Build one KD-tree per embedding for fast nearest-neighbour queries.
# words_* keeps the keys in dict order, so index i returned by a tree query
# corresponds to words_*[i].
words_histo = list(nhisto)
tree_histo = spatial.KDTree(list(nhisto.values()))

words_glove = list(nglove)
tree_glove = spatial.KDTree(list(nglove.values()))

In [None]:
# English stop words, minus any word that is itself under analysis.
stopwords = set(nltk.corpus.stopwords.words('english')).difference(analyzed_words)

In [69]:
#get K nn of the selected word
K=50

for word in analyzed_words+['happened']:

    # A word missing from either embedding cannot be queried; report it and continue
    # instead of aborting the whole loop with a KeyError.
    if word not in nhisto or word not in nglove:
        print(word, 'not in both vocabularies, skipped')
        continue

    # query() returns (distances, row indices); indices map back to words via words_*
    dist_histo, neigh_histo = tree_histo.query(nhisto[word], k=K)
    res_histo = [words_histo[i] for i in neigh_histo]
    #print(list(zip(res_histo, dist_histo)))

    dist_glove, neigh_glove = tree_glove.query(nglove[word], k=K)
    res_glove = [words_glove[i] for i in neigh_glove]
    #print(list(zip(res_glove, dist_glove)))

    sh = set(res_histo)
    sg = set(res_glove)

    # Jaccard similarity of the two neighbour sets. The query word is its own nearest
    # neighbour (distance 0) in both trees, so it always counts toward the intersection.
    print(word, len(sh.intersection(sg))/len(sh.union(sg)))

[('is', 0.0), ('.', 0.9301737290886262), ('seems', 0.9326651397914488), ('has', 0.9421574551134596), ('be', 0.9518956665799146), (',', 0.9531224040164639), ('fact', 0.9555405801024769), ('does', 0.9595135480783553), ('true', 0.9636766859333352), ('one', 0.965365923688058), ('also', 0.987558600424114), ('part', 0.9922341492991387), ('perhaps', 0.9953881054441825), ('indeed', 1.001720136316251), ('yet', 1.006330297678297), ('"', 1.0078439760078282), ('was', 1.0082793744604948), ('however', 1.0116866007574614), ('present', 1.01644950460444), ('world', 1.01807075919705), ('means', 1.0181496836632926), ('must', 1.021984924443583), ('becomes', 1.023146473693301), ('man', 1.0237206518225783), ('though', 1.0239231963629378), ('makes', 1.0247712332395433), ('thing', 1.0247797923972433), ('nothing', 1.0304187264410554), ('time', 1.0313678168144922), ('less', 1.0318129114684202), ('matter', 1.0322243556220172), ('known', 1.0334393227067136), ('may', 1.0364145085254268), ('even', 1.036648584704429

does 0.4084507042253521
[('did', 0.0), ('do', 0.6528210515101068), ("n't", 0.7560846922433037), ('know', 0.7595821598767325), ('think', 0.7871813784501498), ('say', 0.794919126779571), ('knew', 0.8233627523524626), ('would', 0.8309368500345157), ('tell', 0.8554439932981662), ('never', 0.8574980890701549), ('could', 0.8701496618292465), ('does', 0.8788519051393969), ('anything', 0.8897915322210351), ('thought', 0.8958092685673228), ('wish', 0.8984249184600581), ('mean', 0.9004891049946656), ('believe', 0.902686797839622), ('nothing', 0.9131195474083215), ('understand', 0.9169660874300616), ('wanted', 0.9192885602726805), ('want', 0.9207937845504356), ('see', 0.9264225570918827), ('even', 0.9288826938415744), ('ask', 0.9341673426049868), ('well', 0.9357642991960541), ('come', 0.9370243569066392), ('said', 0.9374617148468819), ('might', 0.9376937494927055), ('sure', 0.9406429383462297), ('certainly', 0.9414610044038435), ('really', 0.9424268369319777), ('suppose', 0.943663337738039), ('as

get 0.5384615384615384
[('gets', 0.0), ('takes', 0.8954573664100967), ('get', 0.8958311133103708), ('goes', 0.9064042951559641), ('got', 0.9099988871833966), ('getting', 0.9489002674057149), ('puts', 0.9513380490718121), ('comes', 0.9535046725749702), ('sees', 0.9794807202630252), ('thinks', 0.9795486796264622), ('finds', 0.9984891104059349), ('runs', 0.9993981676571179), ('gives', 1.0132348909222657), ('turns', 1.0165876930275906), ('looks', 1.0229656557172735), ('starts', 1.026612712981895), ('tries', 1.032120672568478), ("'ll", 1.0425964423974707), ('loses', 1.0432816255627575), ('wants', 1.0472133451281227), ('makes', 1.0489692060078768), ('sits', 1.0490734217807363), ('knows', 1.0530066443651978), ('rises', 1.0562359506920505), ('reaches', 1.0577784554995457), ('feels', 1.069321896769961), ('git', 1.0718558382224572), ('does', 1.0753129495427716), ('enters', 1.0769479193118332), ('brings', 1.077183764244511), ('pulls', 1.0791844303476523), ('holds', 1.0794369751990118), ('grows', 

had 0.4492753623188406
[('sex', 0.0), ('sexual', 0.9721428206808916), ('male', 0.9923340622035225), ('female', 0.9994294643832136), ('masculine', 1.051318200247057), ('feminine', 1.0597382303395622), ('sexes', 1.0868231323148534), ('instincts', 1.0877356223931618), ('instinct', 1.1026384185761122), ('women', 1.105693315378515), ('gentler', 1.107952756848456), ('weaker', 1.117799842481486), ('offspring', 1.1203038853687297), ('relationship', 1.1205362830031325), ('characteristics', 1.1339429954739764), ('impulses', 1.1369499307786526), ('personality', 1.1400400092974605), ('equality', 1.1405265204061275), ('vanity', 1.1412340900651843), ('sterner', 1.143002190246796), ('relation', 1.1436490963421846), ('human', 1.1441900703635641), ('secondary', 1.1455020506320648), ('race', 1.1463435268741018), ('organs', 1.1464213211556995), ('social', 1.148658618839509), ('education', 1.149214702672831), ('dominant', 1.1504413971141043), ('glands', 1.1557116078420198), ('nature', 1.1584664782933405),

In [None]:
'''
TODO
get some meaningful words for event extraction
equalize the vocabulary of both embeddings
calculate jaccard similarity of words
calculate mean of two words and jaccard similarity?
'''