## Test pre-trained word embeddings

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import numpy as np

### Load word embedding

In [7]:
with open("glove.6B.100d.txt", 'r', encoding='utf-8') as f:
    emb = f.readlines()

In [None]:
# sanity check
#emb[:20]

### Read glove format and produce word: vector map as dictionary

Note: If you load a different pre-trained embedding, you might need to change this accordingly.

In [26]:
emb_map = {word_vec.split(maxsplit=1)[0]: [float(x) for x in word_vec.split(maxsplit=1)[1].split()] for word_vec in emb}

### Produce test vector

In [153]:
test_vector = np.array(emb_map['king']) - np.array(emb_map['man']) + np.array(emb_map['woman'])
test_vector = np.array(emb_map['sushi']) - np.array(emb_map['japan']) + np.array(emb_map['spain'])
test_vector = np.array(emb_map['berlin']) - np.array(emb_map['germany']) + np.array(emb_map['france'])
test_vector = np.array(emb_map['bmw']) - np.array(emb_map['germany']) + np.array(emb_map['usa'])

test_vector = np.array(emb_map['man']) - np.array(emb_map['woman']) + np.array(emb_map['day'])
test_vector = np.array(emb_map['water']) + np.array(emb_map['wine'])

### Generate similariy matrix calculating the cos similarity between the test vector and all other word vectors

In [157]:
sim_matrix = []
for k, v in emb_map.items():
    v = np.array(v)
    sim_matrix.append((k, cosine_similarity(test_vector.reshape(1, -1), v.reshape(1, -1))))

In [158]:
# sort most similar word vectors

sim_matrix.sort(key=lambda x: x[1], reverse=True)

In [159]:
sim_matrix[:50]

[('day', array([[ 0.87684423]])),
 ('days', array([[ 0.76509387]])),
 ('time', array([[ 0.7613039]])),
 ('week', array([[ 0.73134851]])),
 ('last', array([[ 0.72871791]])),
 ('night', array([[ 0.72608259]])),
 ('here', array([[ 0.72429461]])),
 ('weekend', array([[ 0.71929217]])),
 ('next', array([[ 0.71897803]])),
 ('came', array([[ 0.71622974]])),
 ('end', array([[ 0.71520564]])),
 ('coming', array([[ 0.71163321]])),
 ('before', array([[ 0.70928444]])),
 ('this', array([[ 0.6989447]])),
 ('sunday', array([[ 0.69380595]])),
 ('start', array([[ 0.69304356]])),
 ('just', array([[ 0.69183953]])),
 ('one', array([[ 0.6916733]])),
 ('during', array([[ 0.69135084]])),
 ('on', array([[ 0.68946531]])),
 ('today', array([[ 0.68314928]])),
 ('saturday', array([[ 0.68131443]])),
 ('after', array([[ 0.68072571]])),
 ('hour', array([[ 0.68025739]])),
 ('year', array([[ 0.67850375]])),
 ('but', array([[ 0.67714291]])),
 ('morning', array([[ 0.67516307]])),
 ('hours', array([[ 0.67395039]])),
 ('whe

#### king - man + woman

In [130]:
sim_matrix_king[:50]

[('king', array([[ 0.85518372]])),
 ('queen', array([[ 0.78344143]])),
 ('monarch', array([[ 0.69338023]])),
 ('throne', array([[ 0.68331102]])),
 ('daughter', array([[ 0.68090825]])),
 ('prince', array([[ 0.67131421]])),
 ('princess', array([[ 0.66440829]])),
 ('mother', array([[ 0.65793253]])),
 ('elizabeth', array([[ 0.65633007]])),
 ('father', array([[ 0.63924191]])),
 ('wife', array([[ 0.63518652]])),
 ('son', array([[ 0.63402253]])),
 ('sister', array([[ 0.62044709]])),
 ('widow', array([[ 0.62006242]])),
 ('crown', array([[ 0.61989454]])),
 ('emperor', array([[ 0.61912293]])),
 ('cousin', array([[ 0.6139021]])),
 ('lady', array([[ 0.60536262]])),
 ('margaret', array([[ 0.60504502]])),
 ('married', array([[ 0.59957814]])),
 ('kingdom', array([[ 0.59918958]])),
 ('marriage', array([[ 0.59697579]])),
 ('brother', array([[ 0.59032062]])),
 ('mary', array([[ 0.58871532]])),
 ('marry', array([[ 0.58868639]])),
 ('birth', array([[ 0.5865192]])),
 ('eldest', array([[ 0.58310285]])),
 ('

#### sushi - japan + spain

In [137]:
sim_matrix_sushi[:5]

[('tapas', array([[ 0.61187109]])),
 ('provence', array([[ 0.52454662]])),
 ('paella', array([[ 0.51426447]])),
 ('wine', array([[ 0.49603065]])),
 ('bourbon', array([[ 0.49560689]]))]

#### berlin - germany + france

In [138]:
sim_matrix_berlin[:5]

[('paris', array([[ 0.88271442]])),
 ('france', array([[ 0.75580259]])),
 ('french', array([[ 0.70751649]])),
 ('prohertrib', array([[ 0.69431742]])),
 ('berlin', array([[ 0.66655615]]))]

#### bmw - germany + usa

In [140]:
sim_matrix_bmw[:50]

[('bmw', array([[ 0.64361948]])),
 ('chevrolet', array([[ 0.64332747]])),
 ('lexus', array([[ 0.59819163]])),
 ('buick', array([[ 0.58549216]])),
 ('x5', array([[ 0.5810536]])),
 ('chevy', array([[ 0.56564524]])),
 ('bmc', array([[ 0.56064426]])),
 ('infiniti', array([[ 0.55699883]])),
 ('usa', array([[ 0.55633616]])),
 ('mercedes', array([[ 0.55558025]])),
 ('cadillac', array([[ 0.55016506]])),
 ('irl', array([[ 0.54708922]])),
 ('reynard', array([[ 0.54537485]])),
 ('penske', array([[ 0.54285977]])),
 ('thunderbird', array([[ 0.54284841]])),
 ('honda', array([[ 0.53531258]])),
 ('dodge', array([[ 0.53368685]])),
 ('oldsmobile', array([[ 0.53189748]])),
 ('toyota', array([[ 0.53136199]])),
 ('nascar', array([[ 0.52928695]])),
 ('gt', array([[ 0.52010161]])),
 ('x6', array([[ 0.51657275]])),
 ('holden', array([[ 0.51657249]])),
 ('ford', array([[ 0.51360596]])),
 ('nissan', array([[ 0.51318477]])),
 ('cc', array([[ 0.5087336]])),
 ('4x4', array([[ 0.50333009]])),
 ('jaguar', array([[ 0