Reference: [RaRe Technologies word2vec tutorial](https://rare-technologies.com/word2vec-tutorial/)

In [1]:
import gensim

In [2]:
# Toy corpus: two tokenised sentences, one list of words per sentence.
sentences = [
    ['first', 'sentence'],
    ['second', 'sentence'],
]
# Train word2vec on the toy corpus. min_count=1 is passed explicitly; per the
# gensim docs it is the minimum word-frequency threshold, so it is presumably
# needed here to keep words that occur only once.
model = gensim.models.Word2Vec(sentences=sentences, min_count=1)

In [3]:
# Look the vector up through `model.wv`: indexing the Word2Vec model object
# itself is deprecated in gensim 3.x and no longer supported in gensim 4.
model.wv['second']

array([  2.48366501e-03,   4.07958403e-03,   1.22610573e-03,
         4.47610300e-03,   4.06977208e-03,   2.06202199e-03,
        -4.95104818e-03,   3.43400752e-03,  -4.59521136e-04,
         2.14280561e-03,   3.32460948e-03,  -3.63749708e-03,
        -1.01702334e-03,  -1.34130334e-03,  -2.17719749e-03,
         4.88163531e-03,   3.61668854e-03,   2.73034256e-03,
         1.62617653e-03,   3.39382864e-03,  -3.52383149e-03,
        -4.45839576e-03,   7.20713404e-04,  -3.38851532e-04,
         6.66516251e-04,  -4.15217859e-04,   2.02905503e-03,
        -4.35809743e-05,  -2.54914258e-03,  -3.48882895e-04,
        -3.19840014e-03,  -1.03120692e-03,  -1.12814957e-03,
         1.53161737e-03,  -4.97337477e-03,   4.97778505e-03,
        -2.66994117e-03,   3.11474153e-03,   4.69619036e-03,
         4.50434675e-03,   3.49453301e-03,  -1.98399578e-03,
         4.88768285e-03,  -4.21569450e-03,   2.64422107e-03,
        -4.46875347e-03,   3.20817647e-03,   7.62257667e-04,
        -4.29247506e-03,

In [6]:
# Round-trip the trained model through disk: write it to the file 'mymodel'
# and read it back into a separate variable, `new_model`.
model.save('mymodel')
new_model = gensim.models.Word2Vec.load('mymodel')

In [4]:
from gensim.models.keyedvectors import KeyedVectors
# Import the converter explicitly: `import gensim` on its own does not
# guarantee that the `gensim.scripts.glove2word2vec` submodule has been loaded,
# so reaching it by attribute access can fail on a fresh kernel.
from gensim.scripts.glove2word2vec import glove2word2vec

# Convert the GloVe text file into a word2vec-format text file. The call is
# left as the cell's last expression so that its return value is displayed.
glove2word2vec('glove.6B.50d.txt', 'word2vec_glove.txt')

(400000, 50)

In [5]:
# Load the converted 'word2vec_glove.txt' vectors; binary=False because the
# file is read in the text (non-binary) word2vec format.
glove_model = KeyedVectors.load_word2vec_format('word2vec_glove.txt', binary=False)

In [6]:
# Analogy query: king - man + woman.
# `glove_model` is already a KeyedVectors object, so it is queried directly;
# the extra `.wv` hop is deprecated in gensim 3.x and removed in gensim 4.
glove_model.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.8523603677749634),
 ('throne', 0.7664334177970886),
 ('prince', 0.759214460849762),
 ('daughter', 0.7473883032798767),
 ('elizabeth', 0.7460220456123352),
 ('princess', 0.7424569725990295),
 ('kingdom', 0.7337411642074585),
 ('monarch', 0.7214490175247192),
 ('eldest', 0.7184861898422241),
 ('widow', 0.7099430561065674)]

In [7]:
# Ask which of the four words fits the group least.
# `glove_model` is already a KeyedVectors object, so it is queried directly;
# the extra `.wv` hop is deprecated in gensim 3.x and removed in gensim 4.
glove_model.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [8]:
# Similarity score between 'woman' and 'man'.
# `glove_model` is already a KeyedVectors object, so it is queried directly;
# the extra `.wv` hop is deprecated in gensim 3.x and removed in gensim 4.
glove_model.similarity('woman', 'man')

0.88603376834235426

In [9]:
# Similarity score between 'woman' and 'girl'.
# `glove_model` is already a KeyedVectors object, so it is queried directly;
# the extra `.wv` hop is deprecated in gensim 3.x and removed in gensim 4.
glove_model.similarity('woman', 'girl')

0.90652807013092229

In [10]:
# Let's try loading the GloVe file itself, without going through gensim.
# Remove the gensim-loaded vectors from the namespace first.
del glove_model

In [12]:
import numpy as np
def loadGloveModel(gloveFile):
    """Parse a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace: the first token is used as the
    word and every remaining token is parsed as a float component of its
    vector. If a word appears more than once, the last occurrence wins.

    Args:
        gloveFile: Path of the text file to read.

    Returns:
        dict mapping each word (str) to a 1-D numpy array of its components.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a token after the word cannot be parsed as a float.
    """
    print ("Loading Glove Model")
    model = {}
    # `with` guarantees the handle is closed even if parsing fails part-way.
    # The encoding is pinned to UTF-8 (assumed to be how the GloVe files are
    # distributed) so that loading does not depend on the platform default.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print ("Done.",len(model)," words loaded!")
    return model

In [13]:
# Parse the raw GloVe text file with the hand-written loadGloveModel instead of gensim.
manual_load_model = loadGloveModel('glove.6B.50d.txt')

Loading Glove Model
Done. 400000  words loaded!


In [14]:
# Display the vector that loadGloveModel stored for 'man'.
manual_load_model['man']

array([-0.094386,  0.43007 , -0.17224 , -0.45529 ,  1.6447  ,  0.40335 ,
       -0.37263 ,  0.25071 , -0.10588 ,  0.10778 , -0.10848 ,  0.15181 ,
       -0.65396 ,  0.55054 ,  0.59591 , -0.46278 ,  0.11847 ,  0.64448 ,
       -0.70948 ,  0.23947 , -0.82905 ,  1.272   ,  0.033021,  0.2935  ,
        0.3911  , -2.8094  , -0.70745 ,  0.4106  ,  0.3894  , -0.2913  ,
        2.6124  , -0.34576 , -0.16832 ,  0.25154 ,  0.31216 ,  0.31639 ,
        0.12539 , -0.012646,  0.22297 , -0.56585 , -0.086264,  0.62549 ,
       -0.0576  ,  0.29375 ,  0.66005 , -0.53115 , -0.48233 , -0.97925 ,
        0.53135 , -0.11725 ])

In [15]:
# Display the vector that loadGloveModel stored for 'woman'.
manual_load_model['woman']

array([ -1.81530000e-01,   6.48270000e-01,  -5.82100000e-01,
        -4.94510000e-01,   1.54150000e+00,   1.34500000e+00,
        -4.33050000e-01,   5.80590000e-01,   3.55560000e-01,
        -2.51840000e-01,   2.02540000e-01,  -7.16430000e-01,
         3.06100000e-01,   5.61270000e-01,   8.39280000e-01,
        -3.80850000e-01,  -9.08750000e-01,   4.33260000e-01,
        -1.44360000e-02,   2.37250000e-01,  -5.37990000e-01,
         1.77730000e+00,  -6.64330000e-02,   6.97950000e-01,
         6.92910000e-01,  -2.67390000e+00,  -7.68050000e-01,
         3.39290000e-01,   1.96950000e-01,  -3.52450000e-01,
         2.29200000e+00,  -2.74110000e-01,  -3.01690000e-01,
         8.52860000e-04,   1.69230000e-01,   9.14330000e-02,
        -2.36100000e-02,   3.62360000e-02,   3.44880000e-01,
        -8.39470000e-01,  -2.51740000e-01,   4.21230000e-01,
         4.86160000e-01,   2.23250000e-02,   5.57600000e-01,
        -8.52230000e-01,  -2.30730000e-01,  -1.31380000e+00,
         4.87640000e-01,

In [16]:
import math

def dot_product(v1, v2):
    """Return the sum of the element-wise products of ``v1`` and ``v2``.

    Elements are paired with ``zip``, so pairing stops at the end of the
    shorter input; two empty inputs give 0.
    """
    return sum(a * b for a, b in zip(v1, v2))

def cosine_measure(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    Computed as ``dot_product(v1, v2)`` divided by the product of the two
    Euclidean norms, where each norm is the square root of a vector's dot
    product with itself.
    """
    numerator = dot_product(v1, v2)
    norm_a, norm_b = (math.sqrt(dot_product(vec, vec)) for vec in (v1, v2))
    return numerator / (norm_a * norm_b)


In [17]:
# Cross-check: cosine of the manually loaded 'man' and 'woman' vectors using the
# hand-written cosine_measure. In the recorded outputs this agrees with the
# gensim similarity('woman', 'man') result to within about 1e-8.
cosine_measure(manual_load_model['man'], manual_load_model['woman'])

0.8860337718495821