In [860]:
from collections import Counter, namedtuple
from gensim.models import KeyedVectors
from nltk.stem import WordNetLemmatizer
from pyemd import emd_with_flow
from sklearn.metrics import euclidean_distances

import time

In [861]:
from flow_wmd.modules import Document, DocPair

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [867]:
# Based on "Obama speaks to the media in Illinois"; the list additionally carries
# the tokens 'beautiful', 'morning' and 'Iraq', which are not part of that sentence.
sentence1_words = ['beautiful','obama', 'speaks', 'media', 'illinois', 'morning','Iraq']

# Based on "The President greets the press in Chicago."; the list additionally carries
# 'beautiful' (twice) and 'morning', which are not part of that sentence.
sentence2_words = ['beautiful','president', 'greets', 'press', 'chicago', 'beautiful','morning']

In [868]:
# Wrap each word list in a Document and call instantiate() with the embedding model.
# NOTE(review): `model` is only assigned in the KeyedVectors loading cell further down
# this notebook, so on a fresh kernel that cell has to run before this one.
# NOTE(review): `source` and `sink` are rebound to NumPy arrays by the later
# bag-of-words cell; re-run this cell before using them as Documents again.
source = Document(sentence1_words)
source.instantiate(model)

sink = Document(sentence2_words)
sink.instantiate(model)

In [869]:
# Pair the two documents and run the DocPair steps in sequence.
# NOTE(review): presumably each call relies on state set by the previous one
# (signature -> EMD -> cost); confirm in flow_wmd.modules before reordering.
dp = DocPair(source, sink)
dp.getsignature()
dp.emd()
dp.getCost()

In [870]:
# Display the per-word cost mapping stored on dp for the source document
# (presumably filled in by getCost() -- confirm in flow_wmd.modules).
dp.source_cost

{'beautiful': 0.0,
 'obama': 0.19391,
 'speaks': 0.13973,
 'media': 0.12756,
 'illinois': 0.12296,
 'morning': 0.0,
 'Iraq': 0.18869}

In [871]:
# Display the per-word cost mapping stored on dp for the sink document
# (presumably filled in by getCost() -- confirm in flow_wmd.modules).
dp.sink_cost

{'beautiful': 0.0,
 'president': 0.18869,
 'greets': 0.13973,
 'press': 0.12756,
 'chicago': 0.12296,
 'morning': 0.0}

In [874]:
# Call emd() again, this time keeping its two return values; w2v_flow is used
# below as a square matrix labelled by dp.vocab on both axes.
w2v_emd, w2v_flow = dp.emd()

In [875]:
# Sum the flow leaving each row selected by dp.idx1.
# NOTE(review): `np` is imported in a later cell of this notebook.
flow_matrix = np.array(w2v_flow)
selected_rows = flow_matrix[np.ix_(dp.idx1,)]
selected_rows.sum(axis=1)

array([0.142857, 0.142857, 0.142857, 0.142857, 0.142857, 0.142857,
       0.142857])

In [876]:
# Display the vocabulary list held by the DocPair; later cells use it as the
# row and column labels of the flow and distance data frames.
dp.vocab

['beautiful',
 'obama',
 'speaks',
 'media',
 'illinois',
 'morning',
 'Iraq',
 'beautiful',
 'president',
 'greets',
 'press',
 'chicago',
 'beautiful',
 'morning']

In [878]:
import pandas as pd

# Label both axes of the flow matrix with the pair vocabulary, then show the
# sub-table whose rows are picked by dp.idx1 and columns by dp.idx2 (positional).
vocab_labels = dp.vocab
flow_df = pd.DataFrame(data=w2v_flow, index=vocab_labels, columns=vocab_labels)
flow_df.iloc[dp.idx1, dp.idx2]

Unnamed: 0,beautiful,president,greets,press,chicago,beautiful.1,morning
beautiful,0.142857,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.142857,0.0,0.0,0.0,0.0,0.0,0.0
speaks,0.0,0.0,0.142857,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.142857,0.0,0.0,0.0
illinois,0.0,0.0,0.0,0.0,0.142857,0.0,0.0
morning,0.0,0.0,0.0,0.0,0.0,0.0,0.142857
Iraq,0.0,0.142857,0.0,0.0,0.0,0.0,0.0


In [879]:
# Distance matrix with the same vocabulary labels as flow_df, so the two frames
# can be multiplied element-wise to give the cost carried by each word pair.
dist_df = pd.DataFrame(data=dp.w2v_distances, index=dp.vocab, columns=dp.vocab)
cost_df = flow_df.mul(dist_df)
# Show the dp.idx1 rows / dp.idx2 columns, rounded to three decimals.
cost_df.iloc[dp.idx1, dp.idx2].round(3)

Unnamed: 0,beautiful,president,greets,press,chicago,beautiful.1,morning
beautiful,0.0,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.194,0.0,0.0,0.0,0.0,0.0,0.0
speaks,0.0,0.0,0.14,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.128,0.0,0.0,0.0
illinois,0.0,0.0,0.0,0.0,0.123,0.0,0.0
morning,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Iraq,0.0,0.189,0.0,0.0,0.0,0.0,0.0


In [542]:
def read_1w_corpus(name, sep="\t"):
    """Lazily yield the fields of each line in a delimiter-separated text file.

    Args:
        name: Path of the file to read.
        sep: Field delimiter; defaults to a tab.

    Yields:
        list[str]: The fields of one line, with the line terminator removed
        from the last field. Empty lines are skipped.
    """
    # The context manager closes the handle when the generator finishes or is
    # discarded, instead of leaving the file open.
    with open(name) as handle:
        for line in handle:
            # Drop the newline so the last field (e.g. the count) is clean.
            line = line.rstrip("\n")
            if not line:
                # A blank line has no fields and would break tuple unpacking
                # in callers that expect (word, count).
                continue
            yield line.split(sep)

In [6]:
print("Loading GoogleNews Vectors")
%time model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
vocabulary = set(model.vocab)
relevant_words = [word for (word, count) in read_1w_corpus('count_1w.txt') if word in vocabulary]
model_reduced = model[[w for w in relevant_words]]

Loading GoogleNews Vectors


In [7]:
# Timed call to init_sims(replace=True) on the loaded vectors.
# NOTE(review): presumably this L2-normalises the vectors in place -- confirm against
# the installed gensim version. model_reduced was extracted before this call.
%time model.init_sims(replace=True)

CPU times: user 3.25 s, sys: 4.25 s, total: 7.5 s
Wall time: 8.49 s


In [71]:
import nltk
# Download the NLTK 'stopwords' corpus; a later cell reads it via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mikaelbrunila/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [89]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

import numpy as np

# English stop words to exclude from the bag-of-words features.
sw = set(stopwords.words('english'))

# Re-join each word list into one space-separated string for the vectorizer.
doc1 = " ".join(sentence1_words)
doc2 = " ".join(sentence2_words)

# Fit the count vectorizer on the two documents.
nbow = CountVectorizer(stop_words=sw)
nbow.fit([doc1, doc2])

# Words known to the embedding, and the vectorizer's feature names by column.
vocabulary = set(model.index2word)
names = nbow.get_feature_names()

In [93]:
# Count rows for the two documents over the fitted vectorizer's features.
v1, v2 = nbow.transform([doc1, doc2])
# Feature columns that are non-zero in at least one of the two documents.
index = np.union1d(v1.indices, v2.indices)

# Dense 1-D count vectors, indexed by feature column.
v1 = v1.toarray().ravel()
v2 = v2.toarray().ravel()

n = len(index)
# (feature column, compact position) pairs, keeping only words the embedding knows;
# positions of unknown words stay all-zero in the arrays below.
index_map = [(index[i], i) for i in range(n) if names[index[i]] in vocabulary]
source = np.zeros(n)
sink = np.zeros(n)
# Take the embedding width from the model instead of hard-coding 300, so the cell
# also works with vectors of a different dimensionality.
vecs = np.zeros(shape=(n, model.vector_size))

for i, j in index_map:
    source[j] = v1[i]
    sink[j] = v2[i]
    vecs[j] = model[names[i]]

# Normalisation constants; fall back to 1 when a document has no counted words
# so that a later division does not divide by zero.
sum_source = source.sum() or 1
sum_sink = sink.sum() or 1


In [98]:
# Source counts divided by their total (sum_source is forced to 1 when the total is 0).
source / sum_source

array([0.  , 0.  , 0.25, 0.25, 0.25, 0.  , 0.  , 0.25])

In [100]:
# Shape of the embedding matrix built above: one row per entry of `index`.
vecs.shape

(8, 300)

In [87]:
# Union of the feature columns present in either document (from np.union1d).
index

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int32)