In [11]:
import os
import numpy as np
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from scipy.spatial.distance import euclidean
from sklearn.metrics import euclidean_distances
from gensim.corpora.dictionary import Dictionary
from time import time
import collections

In [2]:
from gensim.models import Word2Vec
start = time()
if not os.path.exists('GoogleNews-vectors-negative300.bin.gz'):
    raise ValueError("SKIP: You need to download the google news model")
    
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
print('Cell took %.2f seconds to run.' % (time() - start))

Cell took 387.18 seconds to run.


In [15]:
s1 = "Obama speaks to the media in Illinois".lower().split()
s2 = "The president greets the press in Chicago".lower().split()
s3 = 'Oranges are my favorite fruit'.lower().split()
s1 = [w for w in s1 if w not in STOPWORDS]
s2 = [w for w in s2 if w not in STOPWORDS]
s3 = [w for w in s3 if w not in STOPWORDS]
print s1, s2, s3

['obama', 'speaks', 'media', 'illinois'] ['president', 'greets', 'press', 'chicago'] ['oranges', 'favorite', 'fruit']


Defining the relaxed word mover distance

In [24]:
def wmDistance(d1,d2):
    d1 = [w for w in d1 if w not in STOPWORDS]
    d2 = [w for w in d2 if w not in STOPWORDS]
    # print d1, d2
    freq1 = collections.Counter(d1)
    freq2 = collections.Counter(d2)
    sum1 = 0
    sum2 = 0
    di = []
    dj = []
    for word in freq1:
        di.append(freq1[word])
        sum1 = sum1 + freq1[word]
    for word in freq2:
        dj.append(freq2[word])
        sum2 = sum2 + freq2[word]
    
    di = np.array(di)/float(sum1)
    dj = np.array(dj)/float(sum2)
    # print di, dj
    distance_matrix = []
    for word1 in d1:
        row = []
        for word2 in d2:
            a = euclidean_distances([model[word1]], [model[word2]])
            row.append(a[0][0])
        distance_matrix.append(row)
    distance_matrix = np.array(distance_matrix)
    flow_matrix = []
    for i in range(len(distance_matrix)):
        for j in range(len(distance_matrix)):
            # print distance_matrix[i][j]
            if np.argmin(distance_matrix[i]) == j:
                flow_matrix.append(di[i])
            else:
                flow_matrix.append(0)
    finalSum = 0
    x = 0
    for i in distance_matrix:
        for j in i:
            finalSum = finalSum + flow_matrix[x]*j
            x = x + 1
    return finalSum

Computing from existing wmdistance method, for performance benchmark

In [25]:
distance = model.wmdistance(s1, s2)
distance1 = model.wmdistance(s2, s3)
print distance, distance1

3.37412332147 4.18429791


Checking with wmDistance function.

In [26]:
distance0 = wmDistance(s1, s2)
distance01 = wmDistance(s2, s3)
print distance0, distance01

2.10030454397 3.12690079212


Normalizing word2vec vectors.

In [27]:
start = time()
model.init_sims(replace=True)
print 'Cell took %.2f seconds to run.' %(time() - start)

Cell took 61.41 seconds to run.


In [28]:
distance = model.wmdistance(s1, s2)
distance1 = model.wmdistance(s2, s3)
print distance, distance1

1.01746462593 1.33882517563


In [29]:
distance0 = wmDistance(s1, s2)
distance01 = wmDistance(s2, s3)
print distance0, distance01

0.353553384542 0.965925842524
