## Test pre-trained word embeddings

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import numpy as np

### Load word embedding

In [2]:
# Read the GloVe text file into memory as a list of raw lines
# (each line keeps its trailing newline).
with open("glove.6B.100d.txt", mode="r", encoding="utf-8") as glove_file:
    emb = list(glove_file)

In [3]:
# Sanity check: uncomment to preview the first 20 raw lines of the file.
# emb[:20]

### Parse the GloVe format into a word → vector dictionary

Note: If you load a different pre-trained embedding file, you may need to adapt this parsing step to that file's format.

In [4]:
# Parse every line of the form "<word> <v1> <v2> ..." into word -> list of floats.
# Each line is split on whitespace exactly once; lines that carry no vector
# values (e.g. a blank or trailing empty line) are skipped instead of raising
# IndexError. If a word occurs more than once, the last occurrence wins.
emb_map = {
    fields[0]: [float(x) for x in fields[1:]]
    for fields in (word_vec.split() for word_vec in emb)
    if len(fields) > 1
}

### Produce test vector

In [5]:
# Alternative query to try instead (an analogy-style combination):
# test_vector = np.array(emb_map['man']) - np.array(emb_map['woman']) + np.array(emb_map['day'])

# Query vector: element-wise sum of the 'water' and 'wine' embeddings.
test_vector = np.add(emb_map['water'], emb_map['wine'])

### Generate the similarity matrix by calculating the cosine similarity between the test vector and every word vector

In [6]:
def generate_sim_matrix(test_vector, embeddings=None):
    """Rank every word in an embedding map by cosine similarity to ``test_vector``.

    All word vectors are stacked into a single matrix and scored with one
    ``cosine_similarity`` call, rather than one call per vocabulary word.

    Parameters
    ----------
    test_vector : array-like
        Query vector. It is reshaped to a single row, so it must have as many
        components as each word vector.
    embeddings : dict, optional
        Mapping of word -> sequence of floats. When omitted, the notebook-level
        ``emb_map`` is used.

    Returns
    -------
    list of (str, numpy.ndarray)
        ``(word, score)`` pairs ordered from most to least similar. Each score
        is a ``(1, 1)`` array, matching what a pairwise ``cosine_similarity``
        call on two single-row inputs returns. Every word in the map is scored,
        including any words the query itself was built from. An empty map
        yields an empty list.
    """
    if embeddings is None:
        embeddings = emb_map
    if not embeddings:
        return []

    # Keys and values of a dict iterate in the same order, so row i of
    # `vectors` belongs to words[i].
    words = list(embeddings.keys())
    vectors = np.asarray(list(embeddings.values()))

    # Reshape the query once; the result has one row and one column per word.
    query = np.asarray(test_vector).reshape(1, -1)
    scores = cosine_similarity(query, vectors)

    # Slice with i:i + 1 (not i) so each score stays a (1, 1) array.
    sim_matrix = [(word, scores[:, i:i + 1]) for i, word in enumerate(words)]
    # list.sort is stable, so tied words keep their order from the map.
    sim_matrix.sort(key=lambda pair: pair[1][0, 0], reverse=True)
    return sim_matrix

In [7]:
# Score every word in emb_map against test_vector and sort the
# (word, similarity) pairs from most to least similar.

sim_matrix = generate_sim_matrix(test_vector)

In [8]:
# Five highest-ranked (word, similarity) pairs for the water + wine query.
sim_matrix[:5]

[('water', array([[0.85977544]])),
 ('wine', array([[0.84178378]])),
 ('drinking', array([[0.71567708]])),
 ('food', array([[0.70500096]])),
 ('dry', array([[0.70095471]]))]

#### king - man + woman

In [9]:
# Analogy query "king - man + woman". The input words are scored like any
# other vocabulary entry, so they can show up in the ranking themselves.
test_vector = (
    np.array(emb_map['king'])
    - np.array(emb_map['man'])
    + np.array(emb_map['woman'])
)
sim_matrix_king = generate_sim_matrix(test_vector)

In [10]:
# Five highest-ranked (word, similarity) pairs for king - man + woman.
sim_matrix_king[:5]

[('king', array([[0.85518372]])),
 ('queen', array([[0.78344143]])),
 ('monarch', array([[0.69338023]])),
 ('throne', array([[0.68331102]])),
 ('daughter', array([[0.68090825]]))]

#### sushi - japan + spain

In [11]:
# Analogy query "sushi - japan + spain". The input words are scored like any
# other vocabulary entry, so they can show up in the ranking themselves.
test_vector = (
    np.array(emb_map['sushi'])
    - np.array(emb_map['japan'])
    + np.array(emb_map['spain'])
)
sim_matrix_sushi = generate_sim_matrix(test_vector)

In [12]:
# Five highest-ranked (word, similarity) pairs for sushi - japan + spain.
sim_matrix_sushi[:5]

[('tapas', array([[0.61187109]])),
 ('provence', array([[0.52454662]])),
 ('paella', array([[0.51426447]])),
 ('wine', array([[0.49603065]])),
 ('bourbon', array([[0.49560689]]))]

#### berlin - germany + france

In [13]:
# Analogy query "berlin - germany + france". The input words are scored like
# any other vocabulary entry, so they can show up in the ranking themselves.
test_vector = (
    np.array(emb_map['berlin'])
    - np.array(emb_map['germany'])
    + np.array(emb_map['france'])
)
sim_matrix_berlin = generate_sim_matrix(test_vector)

In [14]:
# Five highest-ranked (word, similarity) pairs for berlin - germany + france.
sim_matrix_berlin[:5]

[('paris', array([[0.88271442]])),
 ('france', array([[0.75580259]])),
 ('french', array([[0.70751649]])),
 ('prohertrib', array([[0.69431742]])),
 ('berlin', array([[0.66655615]]))]

#### bmw - germany + usa

In [15]:
# Analogy query "bmw - germany + usa". The input words are scored like any
# other vocabulary entry, so they can show up in the ranking themselves.
test_vector = (
    np.array(emb_map['bmw'])
    - np.array(emb_map['germany'])
    + np.array(emb_map['usa'])
)
sim_matrix_bmw = generate_sim_matrix(test_vector)

In [16]:
# Five highest-ranked (word, similarity) pairs for bmw - germany + usa.
sim_matrix_bmw[:5]

[('bmw', array([[0.64361948]])),
 ('chevrolet', array([[0.64332747]])),
 ('lexus', array([[0.59819163]])),
 ('buick', array([[0.58549216]])),
 ('x5', array([[0.5810536]]))]