<a href="https://colab.research.google.com/github/patbaa/demo_notebooks/blob/master/play_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Explore word2vec embeddings

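- To run this notebook you first need a trained word2vec model; it is loaded below from the file `en_word2vec`.
- You can train the model locally rather than on Google Colab, then make the saved model file available to this notebook.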

In [0]:
import numpy as np
from gensim.models.word2vec import Word2Vec

%matplotlib inline

In [0]:
%%time
# Load the previously trained word2vec model from the file 'en_word2vec'.
# NOTE(review): gensim model files are pickle-based — only load files from a trusted source.
en_w2v = Word2Vec.load('en_word2vec')

CPU times: user 9.47 s, sys: 2.72 s, total: 12.2 s
Wall time: 12.2 s


Get word embeddings for a few words

In [0]:
# Look up the embedding vector of each example word in one pass.
apple, pear, dog = (
    en_w2v.wv.get_vector(word) for word in ('apple', 'pear', 'dog')
)
# Display the length of each embedding.
tuple(len(vec) for vec in (apple, pear, dog))

(200, 200, 200)

In [0]:
# Show the raw embedding vector looked up for 'apple'.
apple

array([ 5.03434062e-01,  2.16583014e+00,  7.37607300e-01,  1.97248554e+00,
       -2.28733587e+00, -3.53058553e+00, -4.56870317e-01,  1.62385142e+00,
        3.02450180e-01,  1.27949536e+00,  5.46707439e+00,  1.14164639e+00,
       -2.08161160e-01,  7.08680272e-01, -3.67628407e+00, -8.61007929e-01,
       -1.64131534e+00,  3.68233585e+00, -3.64759147e-01, -1.76412380e+00,
       -3.27543354e+00, -2.21811190e-01,  1.83110189e+00,  3.46247411e+00,
        1.15471900e+00, -3.60734224e-01,  9.19606149e-01, -2.15470600e+00,
        6.45661473e-01,  2.06053424e+00,  1.47143602e-01,  1.57057762e+00,
        2.38143015e+00,  1.04366207e+00, -6.41605139e-01,  1.93041682e-01,
        9.43868279e-01, -3.98410946e-01,  1.04741946e-01, -8.30831289e-01,
       -3.98071140e-01,  4.53355837e+00,  4.16923666e+00,  4.93524492e-01,
       -1.70531178e+00, -7.68735826e-01, -3.99773693e+00,  1.52821586e-01,
        2.18201828e+00,  4.03025389e+00,  7.38585889e-01, -3.51597834e+00,
        6.88978004e+00, -

In [0]:
def cosine_similarity(v1, v2, decimals=4):
    """Return the cosine similarity of two vectors, rounded for display.

    Parameters
    ----------
    v1, v2 : array-like
        One-dimensional vectors of equal length.
    decimals : int, optional
        Number of decimal places the result is rounded to (default 4).

    Returns
    -------
    numpy floating scalar
        ``dot(v1, v2) / (norm(v1) * norm(v2))`` rounded to ``decimals``
        places, or NaN when either vector has zero norm (the angle is
        undefined in that case).
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # A zero vector has no direction; return NaN explicitly instead of
        # letting 0/0 emit a RuntimeWarning.
        return np.float64(np.nan)
    sim = np.dot(v1, v2) / norm_product
    return np.round(sim, decimals)

In [0]:
# Pairwise similarities: apple-pear, apple-dog, pear-dog.
tuple(
    cosine_similarity(left, right)
    for left, right in ((apple, pear), (apple, dog), (pear, dog))
)

(0.4224, 0.1457, 0.1629)

In [0]:
# Print one labelled similarity per line.
print(
    *(
        f'{label} {cosine_similarity(left, right)!s}'
        for label, left, right in (
            ('apple-pear:', apple, pear),
            ('apple-dog:', apple, dog),
            ('pear-dog:', pear, dog),
        )
    ),
    sep='\n',
)

apple-pear: 0.4224
apple-dog: 0.1457
pear-dog: 0.1629


In [0]:
# Compare the direction of (king - queen) with the direction of (man - woman).
cosine_similarity(
    *(
        en_w2v.wv.get_vector(first) - en_w2v.wv.get_vector(second)
        for first, second in (('king', 'queen'), ('man', 'woman'))
    )
)

0.4969

Vector operations:

$$ \text{carrots} - \text{carrot} = \text{cats} - \text{cat} $$
$$ \text{carrots} = \text{carrot} + \text{cats} - \text{cat} $$

In [0]:
# Analogy cat : cats :: carrot : ?  ('cats' and 'carrot' count positively, 'cat' negatively).
en_w2v.wv.most_similar(
    positive=['cats', 'carrot'],
    negative=['cat'],
    topn=5,
)

[('carrots', 0.5424485802650452),
 ('thistles', 0.5246830582618713),
 ('parsnip', 0.5204790234565735),
 ('celery', 0.5110417604446411),
 ('blueberries', 0.5081753730773926)]

In [0]:
# Analogy cat : cats :: mouse : ?  (an irregular plural this time).
en_w2v.wv.most_similar(
    positive=['cats', 'mouse'],
    negative=['cat'],
    topn=5,
)

[('mice', 0.6760387420654297),
 ('rabbits', 0.5993174910545349),
 ('rats', 0.5820876955986023),
 ('hamsters', 0.5710359215736389),
 ('gerbils', 0.5586819052696228)]

In [0]:
# Five nearest neighbours of 'mozart' in the embedding space.
en_w2v.wv.most_similar(
    'mozart',
    topn=5,
)

[('beethoven', 0.7484350800514221),
 ('mozarts', 0.7354700565338135),
 ('haydn', 0.7320547103881836),
 ('brahms', 0.728082001209259),
 ('liszt', 0.7276102304458618)]

## Plural-singular is easy... is there anything else?

$$ X - \text{Mozart} = \text{footballer} - \text{Messi} $$
$$ X = \text{Mozart} + \text{footballer} - \text{Messi} $$

In [0]:
# Analogy messi : footballer :: mozart : ?
en_w2v.wv.most_similar(
    positive=['footballer', 'mozart'],
    negative=['messi'],
    topn=5,
)

[('operatic', 0.45616382360458374),
 ('violinist', 0.4509333670139313),
 ('cellist', 0.44413888454437256),
 ('composer', 0.4420720040798187),
 ('oboist', 0.43317335844039917)]

In [0]:
# Analogy microsoft : ballmer :: apple : ?
en_w2v.wv.most_similar(
    positive=['ballmer', 'apple'],
    negative=['microsoft'],
    topn=5,
)

[('sculley', 0.6320608258247375),
 ('wozniak', 0.6043099164962769),
 ('hertzfeld', 0.585370659828186),
 ('fadell', 0.5655703544616699),
 ('elop', 0.49531126022338867)]

Not Steve Jobs!

In [0]:
# Analogy japan : sushi :: italy : ?
en_w2v.wv.most_similar(
    positive=['sushi', 'italy'],
    negative=['japan'],
    topn=5,
)

[('ravioli', 0.6762843132019043),
 ('trattoria', 0.6757627725601196),
 ('risotto', 0.6740370988845825),
 ('bruschetta', 0.6610550880432129),
 ('ristorante', 0.6580170392990112)]

## Does not match

In [0]:
# Ask the model which of the listed words fits least with the rest.
en_w2v.wv.doesnt_match(
    'messi ronaldo einstein rivaldo buffon'.split()
)

'einstein'

In [0]:
# Odd one out among a list of scientists' names.
en_w2v.wv.doesnt_match(
    'einstein hawking darwin bohr curie tesla'.split()
)

'darwin'

In [0]:
# Odd one out among a list of academic subjects.
en_w2v.wv.doesnt_match(
    'physics mathematics chemistry biology history'.split()
)

'history'

In [0]:
# Odd one out among a list of country names.
en_w2v.wv.doesnt_match(
    'germany hungary slovakia usa france italy'.split()
)

'usa'