In [2]:
from collections import Counter, namedtuple
from gensim.models import KeyedVectors
from nltk.stem import WordNetLemmatizer
from pyemd import emd_with_flow
from sklearn.metrics import euclidean_distances

import numpy as np
import time

In [3]:
from flow_wmd.modules import Document, DocPair

%load_ext autoreload
%autoreload 2

## Load the word-vector (word2vec) model

In [4]:
def read_1w_corpus(name, sep="\t"):
    """Lazily yield the fields of each line of a delimited text file.

    Parameters
    ----------
    name : str or path-like
        Path of the file to read.
    sep : str, optional
        Field separator handed to ``str.split`` (a tab character by default).

    Yields
    ------
    list of str
        The fields of one line. The line terminator is not stripped, so the
        last field of every line keeps its trailing newline.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed; the previous bare ``open(name)`` leaked it.
    with open(name) as corpus_file:
        for line in corpus_file:
            yield line.split(sep)

print("Loading GoogleNews Vectors")
%time model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

Loading GoogleNews Vectors
CPU times: user 43 s, sys: 4.17 s, total: 47.2 s
Wall time: 50 s


In [6]:
%time model.init_sims(replace=True)

CPU times: user 4.36 s, sys: 12.1 s, total: 16.5 s
Wall time: 28.1 s


## First Try

In [None]:
# Obama speaks to the media in Illinois 
sentence1_words = ['obama', 'speaks', 'media', 'illinois']

# The President greets the press in Chicago. 
sentence2_words = ['president', 'greets', 'press', 'chicago']

In [125]:
# Sentence 1: tokens of "Obama speaks to the media in Illinois" plus the extra
# tokens 'beautiful', 'morning' and 'iraq'.
sentence1_words = ['beautiful','obama', 'speaks', 'media', 'illinois', 'morning','iraq']

# Sentence 2: tokens of "The President greets the press in Chicago." plus the
# extra tokens 'beautiful' (twice) and 'morning'.
sentence2_words = ['beautiful','president', 'greets', 'press', 'chicago', 'beautiful','morning']

In [126]:
%time
source = Document(sentence1_words)
source.instantiate(model)

sink = Document(sentence2_words)
sink.instantiate(model)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [127]:
%time dp = DocPair(source, sink)
%time dp.getsignature()
%time dp.emd()
%time dp.getCost()

CPU times: user 19 µs, sys: 0 ns, total: 19 µs
Wall time: 21.9 µs
CPU times: user 86 µs, sys: 26 µs, total: 112 µs
Wall time: 88.7 µs
CPU times: user 1.39 ms, sys: 1.35 ms, total: 2.74 ms
Wall time: 3.13 ms
CPU times: user 236 µs, sys: 20 µs, total: 256 µs
Wall time: 256 µs


In [83]:
dp.source_cost

{'beautiful': 0.0,
 'obama': 0.19391,
 'speaks': 0.13973,
 'media': 0.12756,
 'illinois': 0.12296,
 'morning': 0.0,
 'Iraq': 0.18869}

In [84]:
dp.sink_cost

{'beautiful': 0.0,
 'president': 0.18869,
 'greets': 0.13973,
 'press': 0.12756,
 'chicago': 0.12296,
 'morning': 0.0}

In [85]:
w2v_emd, w2v_flow = dp.emd()

In [86]:
np.sum(np.array(w2v_flow)[np.ix_(dp.idx1,)], axis=1)

array([0.142857, 0.142857, 0.142857, 0.142857, 0.142857, 0.142857,
       0.142857])

In [87]:
dp.vocab

['beautiful',
 'obama',
 'speaks',
 'media',
 'illinois',
 'morning',
 'Iraq',
 'beautiful',
 'president',
 'greets',
 'press',
 'chicago',
 'beautiful',
 'morning']

In [137]:
import pandas as pd

flow_df = pd.DataFrame(w2v_flow, index=dp.vocab, columns=dp.vocab)
flow_df.iloc[dp.idx1,dp.idx2]

Unnamed: 0,beautiful,president,greets,press,chicago,beautiful.1,morning
beautiful,0.142857,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.142857,0.0,0.0,0.0,0.0,0.0,0.0
speaks,0.0,0.0,0.142857,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.142857,0.0,0.0,0.0
illinois,0.0,0.0,0.0,0.0,0.142857,0.0,0.0
morning,0.0,0.0,0.0,0.0,0.0,0.0,0.142857
iraq,0.0,0.142857,0.0,0.0,0.0,0.0,0.0


In [370]:
dp.w2v_distances.shape

(14, 14)

In [138]:
dist_df = pd.DataFrame(dp.w2v_distances, index=dp.vocab, columns=dp.vocab)
cost_df = (flow_df * dist_df)
cost_df.iloc[dp.idx1,dp.idx2].round(3)

Unnamed: 0,beautiful,president,greets,press,chicago,beautiful.1,morning
beautiful,0.0,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.194,0.0,0.0,0.0,0.0,0.0,0.0
speaks,0.0,0.0,0.14,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.128,0.0,0.0,0.0
illinois,0.0,0.0,0.0,0.0,0.123,0.0,0.0
morning,0.0,0.0,0.0,0.0,0.0,0.0,0.0
iraq,0.0,0.202,0.0,0.0,0.0,0.0,0.0


## Using the LC-RWMD paper (Atasu et al.)

In [177]:
from sklearn.feature_extraction.text import TfidfVectorizer

X1 = [['beautiful','obama', 'speaks', 'media', 'illinois', 'morning','iraq'],
      ['beautiful','obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning'],
      ["which","team","will","lose"],
      ["is", "it", "gonna", "rain", "tomorrow"],
      ['obama', 'speaks', 'media', 'illinois']]
X2 = [['beautiful','president', 'greets', 'press', 'chicago', 'beautiful','morning'],
      ['president', 'greets', 'press', 'chicago', 'evening'],
      ["who", "is", "winning"],
      ["the", "forecast", "says", "snow"],
      ['president', 'greets', 'press', 'chicago'],
      ['hello','called','love','you']]

X1_sent = [" ".join(doc) for doc in X1]
X2_sent = [" ".join(doc) for doc in X2]

corpus = X1_sent + X2_sent

#vectorizer = TfidfVectorizer(use_idf=False, norm='l2')
vectorizer = TfidfVectorizer(use_idf=False, norm='l1')
vectorizer.fit(corpus)

TfidfVectorizer(norm='l1', use_idf=False)

In [178]:
# Query the fitted vocabulary once and derive both lookup tables from it
# (word -> column index and column index -> word).
features = vectorizer.get_feature_names()
word2idx = {word: idx for idx, word in enumerate(features)}
idx2word = dict(enumerate(features))

In [179]:
word2idx['chicago']

2

In [180]:
#X1_corpus = 
X1_nbow = vectorizer.transform(X1_sent)
X2_nbow = vectorizer.transform(X2_sent)

In [102]:
E = np.vstack([model.word_vec(word) for word in vectorizer.get_feature_names()])

### 1. WMD

In [103]:
#T1 = np.vstack([E[word2idx[word]] for doc in X1 for word in doc])
#T2 = np.vstack([E[word2idx[word]] for doc in X2 for word in doc])

#print(T1.shape)
#print(T2.shape)

In [104]:
E[[1,2],].shape

(2, 300)

In [312]:
%time

def lc_rmwd(X1, X2, E):
    """Compute a symmetric LC-RWMD (Atasu et al.) bound between two document sets.

    For every document of one set, each vocabulary word is assigned the
    Euclidean distance to its nearest word in that document; the nBOW weights
    of the other set are then multiplied with that distance vector. This is
    done in both directions and the element-wise maximum is returned.

    Relies on the notebook globals ``word2idx`` (word -> row of ``E``),
    ``X1_nbow`` and ``X2_nbow`` (vectorizer output; assumed to have been built
    from ``X1`` and ``X2`` respectively -- confirm when reusing).

    Parameters
    ----------
    X1, X2 : list of list of str
        Tokenized documents; every token must be a key of ``word2idx``.
    E : numpy.ndarray
        Embedding matrix with one row per vocabulary entry.

    Returns
    -------
    numpy.ndarray
        Matrix with one row per document in ``X2`` and one column per row of
        ``X1_nbow``.
    """
    # Densify the nBOW matrices once instead of on every loop iteration.
    X1_weights = X1_nbow.toarray()
    X2_weights = X2_nbow.toarray()

    def nearest_word_distances(doc):
        # Distance from every vocabulary word to its closest word in `doc`.
        doc_vectors = E[[word2idx[word] for word in doc]]
        return euclidean_distances(E, doc_vectors).min(axis=1)

    # One-to-many in both directions.
    D1 = [np.dot(X1_weights, nearest_word_distances(doc2)) for doc2 in X2]
    D2 = [np.dot(X2_weights, nearest_word_distances(doc1)) for doc1 in X1]

    # D1 stacks to (len(X2), n1); D2 stacks to (len(X1), n2) and is transposed
    # so both operands share the same orientation.
    return np.maximum(np.vstack(D1), np.vstack(D2).T)

D = lc_rmwd(X1, X2, E)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.05 µs


In [106]:
distance_df = pd.DataFrame(D, index=X2_sent, columns=X1_sent)
distance_df

Unnamed: 0,beautiful obama speaks media illinois morning iraq,beautiful obama speaks media illinois beautiful morning,which team will lose,is it gonna rain tomorrow,obama speaks media illinois
beautiful president greets press chicago beautiful morning,0.689295,0.577514,1.3403,1.221573,1.144354
president greets press chicago evening,0.982602,1.010237,1.321951,1.227445,1.080542
who is winning,1.340106,1.324983,1.148093,1.00634,1.317062
the forecast says snow,1.289052,1.272779,1.259401,1.113256,1.29005
president greets press chicago,1.055,1.082634,1.33366,1.281563,1.01065
hello called love you,1.234553,1.193422,1.282169,1.145928,1.281186


In [None]:
from pyemd import emd

In [167]:
def WMD(X1, X2, X1_nbow, X2_nbow, E):
    """Word Mover's Distance between two tokenized documents, with flow.

    Relies on the notebook globals ``word2idx``, ``euclidean_distances`` and
    ``emd_with_flow``.

    Parameters
    ----------
    X1, X2 : list of str
        Tokens of the source and sink document (keys of ``word2idx``).
    X1_nbow, X2_nbow : sparse matrix
        Single-row nBOW weights of the two documents (row 0 is read).
    E : numpy.ndarray
        Embedding matrix indexed by vocabulary position.

    Returns
    -------
    tuple
        ``(distance, flow, X1_idxs, X2_idxs)`` where ``flow`` is the block of
        the flow matrix going from the words of ``X1`` to the words of ``X2``
        and the index lists give the vocabulary positions of its rows/columns.
    """
    src_idxs = list(set(word2idx[token] for token in X1))
    dst_idxs = list(set(word2idx[token] for token in X2))
    n_src, n_dst = len(src_idxs), len(dst_idxs)

    # Ground-cost matrix over the concatenation [X1 words, X2 words].
    stacked_vectors = E[src_idxs + dst_idxs,]
    ground_cost = euclidean_distances(stacked_vectors, stacked_vectors)

    # Each signature carries its own weights and zeros in the other half.
    src_weights = X1_nbow.toarray()[0, src_idxs]
    dst_weights = X2_nbow.toarray()[0, dst_idxs]
    X1_sig = np.concatenate((src_weights, np.zeros(n_dst)))
    X2_sig = np.concatenate((np.zeros(n_src), dst_weights))
    print(X1_sig)
    print(X2_sig)

    result = emd_with_flow(np.array(X1_sig, dtype=np.double),
                           np.array(X2_sig, dtype=np.double),
                           np.array(ground_cost, dtype=np.double))

    # Keep only the X1 -> X2 block of the full flow matrix.
    cross_flow = np.array(result[1])[:n_src, n_src:]
    return (result[0], cross_flow, src_idxs, dst_idxs)

In [155]:
sentence2_words

['beautiful',
 'president',
 'greets',
 'press',
 'chicago',
 'beautiful',
 'morning']

In [168]:
WMD(sentence1_words, 
    sentence2_words, 
    vectorizer.transform([" ".join(sentence1_words)]), 
    vectorizer.transform([" ".join(sentence2_words)]),
   E)

[0.14285714 0.14285714 0.14285714 0.14285714 0.14285714 0.14285714
 0.14285714 0.         0.         0.         0.         0.
 0.        ]
[0.         0.         0.         0.         0.         0.
 0.         0.28571429 0.14285714 0.14285714 0.14285714 0.14285714
 0.14285714]


(0.7779423936888455,
 array([[0.142857, 0.      , 0.      , 0.      , 0.      , 0.      ],
        [0.      , 0.142857, 0.      , 0.      , 0.      , 0.      ],
        [0.142857, 0.      , 0.      , 0.      , 0.      , 0.      ],
        [0.      , 0.      , 0.      , 0.      , 0.      , 0.142857],
        [0.      , 0.      , 0.      , 0.142857, 0.      , 0.      ],
        [0.      , 0.      , 0.      , 0.      , 0.142857, 0.      ],
        [0.      , 0.      , 0.142857, 0.      , 0.      , 0.      ]]),
 [0, 8, 9, 14, 15, 16, 22],
 [0, 2, 6, 15, 17, 18])

In [132]:
import bottleneck as bn

%time

def get_L(X1, X2,
          X1_nbow, X2_nbow,
          D, E, n):
    """Compute, per X1 document, the largest exact WMD among selected candidates.

    For each document ``idx1`` of ``X1`` the row ``D[idx1]`` is partitioned
    with ``bn.argpartition`` and the last ``n`` positions of the partitioned
    index array are dropped (presumably the ``n`` largest values -- confirm
    against bottleneck's ``argpartition`` semantics). The exact WMD is then
    evaluated against the remaining candidates and the maximum is recorded.

    Relies on the notebook globals ``bn`` (bottleneck) and ``WMD``.

    NOTE(review): ``D[idx1]`` is read as "row of X1 document idx1" and its
    positions are used as indices into ``X2``. Elsewhere in this notebook the
    matrix returned by ``lc_rmwd`` is displayed with X2 documents as rows and
    X1 documents as columns -- confirm the orientation of ``D`` passed here.

    Parameters
    ----------
    X1, X2 : list of list of str
        Tokenized documents.
    X1_nbow, X2_nbow : sparse matrix
        nBOW weights; row ``i`` belongs to document ``i``.
    D : numpy.ndarray
        Matrix of distance bounds.
    E : numpy.ndarray
        Embedding matrix forwarded to ``WMD``.
    n : int
        Number of positions dropped from the end of each partitioned row.
        This argument used to be silently overwritten by ``n = 2``.

    Returns
    -------
    list of tuple
        ``(idx1, L)`` pairs, ``L`` being the maximum WMD for document ``idx1``.

    Raises
    ------
    ValueError
        If ``n`` is not in ``1 .. row.size - 1``, which would leave no
        candidate to evaluate.
    """
    L_values = []
    for idx1, doc1 in enumerate(X1):
        row = D[idx1]
        if not 0 < n < row.size:
            raise ValueError(
                "n must satisfy 0 < n < %d, got %r" % (row.size, n))
        indeces = bn.argpartition(row, row.size - n)[:-n]
        # Only the distance (first element of WMD's result) is needed here.
        WMD_s = [WMD(doc1, X2[idx2], X1_nbow[idx1], X2_nbow[idx2], E)[0]
                 for idx2 in indeces]
        L_values.append((idx1, max(WMD_s)))
    return L_values

L_values = get_L(X1, X2, X1_nbow, X2_nbow,D, E, 2)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


In [133]:
cost_df = pd.DataFrame(rmwds[1][1], 
                       index=[idx2word[idx] for idx in rmwds[1][2]], 
                       columns=[idx2word[idx] for idx in rmwds[1][3]])
cost_df

Unnamed: 0,obama,president,chicago,greets
beautiful,0.214285,0.0,0.0,0.071429
illinois,0.0,0.0,0.142857,0.0
love,0.0,0.142857,0.0,0.0
media,0.0,0.107143,0.0,0.035714
morning,0.035714,0.0,0.107143,0.0
snow,0.0,0.0,0.0,0.142857


In [143]:
def R_WMD(L_values, X1, X2, X1_nbow, X2_nbow):
    """Run the exact WMD for every pair whose bound in ``D`` is below its cut-off.

    Relies on the notebook globals ``D`` (bound matrix), ``E`` (embedding
    matrix) and ``WMD``.

    Parameters
    ----------
    L_values : iterable
        Entries whose first item is a row index into ``D`` / ``X1`` and whose
        second item is the cut-off for that row.
    X1, X2 : list of list of str
        Tokenized documents.
    X1_nbow, X2_nbow : sparse matrix
        nBOW weights; row ``i`` belongs to document ``i``.

    Returns
    -------
    list
        The results of ``WMD`` for all selected pairs, in row-major order.
    """
    results = []
    for entry in L_values:
        idx1, cutoff = entry[0], entry[1]
        # Positions in row idx1 of D whose bound is strictly below the cut-off.
        selected = (idx2 for idx2, bound in enumerate(D[idx1]) if bound < cutoff)
        results.extend(
            WMD(X1[idx1], X2[idx2], X1_nbow[idx1], X2_nbow[idx2], E)
            for idx2 in selected
        )
    return results

rmwds = R_WMD(L_values, X1, X2, X1_nbow, X2_nbow)

## Modularize LC-RWMD

In [229]:
from sklearn.feature_extraction.text import TfidfVectorizer

X1 = [['beautiful','obama', 'speaks', 'media', 'illinois', 'morning','iraq'],
      ['beautiful','obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning'],
      ["which","team","will","lose"],
      ["is", "it", "gonna", "rain", "tomorrow"],
      ['obama', 'speaks', 'media', 'illinois']]
X2 = [['beautiful','president', 'greets', 'press', 'chicago', 'beautiful','morning'],
      ['president', 'greets', 'press', 'chicago', 'evening'],
      ["who", "is", "winning"],
      ["the", "forecast", "says", "snow"],
      ['president', 'greets', 'press', 'chicago'],
      ['hello','called','love','you']]

X1_sent = [" ".join(doc) for doc in X1]
X2_sent = [" ".join(doc) for doc in X2]

corpus = X1_sent + X2_sent

#vectorizer = TfidfVectorizer(use_idf=False, norm='l2')
vectorizer = TfidfVectorizer(use_idf=False, norm='l1')
vectorizer.fit(corpus)

TfidfVectorizer(norm='l1', use_idf=False)

In [178]:
# Query the fitted vocabulary once and derive both lookup tables from it
# (word -> column index and column index -> word).
features = vectorizer.get_feature_names()
word2idx = {word: idx for idx, word in enumerate(features)}
idx2word = {idx: word for idx, word in enumerate(features)}

In [179]:
word2idx['chicago']

2

In [256]:
#X1_corpus = 
%time
X1_nbow = vectorizer.transform(X1_sent)
X2_nbow = vectorizer.transform(X2_sent)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


In [102]:
E = np.vstack([model.word_vec(word) for word in vectorizer.get_feature_names()])

In [642]:
from flow_wmd.modules_rwmd import Document, LC_RWMD, WMDCluster

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [208]:
X1_sent[0]

'beautiful obama speaks media illinois morning iraq'

In [239]:
X1_sent[1]

'beautiful obama speaks media illinois beautiful morning'

In [564]:
%time 

X1_docs, X2_docs = [], []

X1_nbow = vectorizer.transform(X1_sent)
X2_nbow = vectorizer.transform(X2_sent)

for idx, doc in enumerate(X1):
    X1_docs.append(Document(doc, X1_nbow[idx], word2idx, E))
    
for idx, doc in enumerate(X2):
    X2_docs.append(Document(doc, X2_nbow[idx], word2idx, E))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [565]:
%time

lc_rwmd = LC_RWMD(X1_docs, X2_docs,X1_nbow,X2_nbow,E)
lc_rwmd.get_D()
lc_rwmd.get_L(2)
lc_rwmd.get_rwmd()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


In [566]:
lc_rwmd.wmd_s

[0.7779423936888455,
 1.0577764527086124,
 0.5814076731150226,
 1.0601321450052137,
 1.1221033037203125,
 1.3456935750793113,
 1.33409258504228,
 1.1613731145914146,
 1.2665668245616257,
 1.3413949351295233,
 1.1287236809574008,
 1.1562737829503784,
 1.0927072101407647,
 1.0174646259300113]

In [569]:
X1_docs[0]

<flow_wmd.modules_rwmd.Document at 0x7f942b48d9a0>

In [649]:
X1_docs[0]

<flow_wmd.modules_rwmd.Document at 0x7f942b48d9a0>

In [655]:
%time

distances, wc_X1, wc_X2 = WMDCluster(X1_docs, X2_docs,E,idx2word).get_distances(return_flow = True)

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 2.86 µs


In [658]:
wc_X1

{'is': 1.27494,
 'iraq': 1.18948,
 'it': 1.4954100000000001,
 'beautiful': 2.7164,
 'which': 1.9112699999999998,
 'lose': 1.9683000000000002,
 'speaks': 3.5973,
 'obama': 4.23171,
 'will': 1.8686099999999999,
 'tomorrow': 1.46315,
 'media': 3.61458,
 'illinois': 3.6213599999999997,
 'rain': 1.4949000000000001,
 'gonna': 1.41928,
 'morning': 1.70016,
 'team': 1.9965}

In [578]:
%time

test = WMDCluster(X1_docs[0:2], X2_docs[0:2],E,idx2word).get_flow()[0]

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


In [632]:
test[2][:,0]

array([0.     , 0.     , 0.19653, 0.     , 0.     , 0.     , 0.     ])

In [580]:
pd.DataFrame(test[2], index=test[3],columns=test[4])

Unnamed: 0,beautiful,chicago,greets,morning,president,press
beautiful,0.0,0.0,0.0,0.0,0.0,0.0
illinois,0.0,0.12296,0.0,0.0,0.0,0.0
iraq,0.19653,0.0,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.0,0.0,0.12756
morning,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.0,0.0,0.0,0.0,0.19115,0.0
speaks,0.0,0.0,0.13973,0.0,0.0,0.0


In [521]:
cost_m = test[0]*test[1]
cost_m = cost_m[:len(test[2]),len(test[2]):].round(5)
pd.DataFrame(cost_m, index=w1,columns=w2)

Unnamed: 0,beautiful,chicago,greets,morning,president,press
beautiful,0.0,0.0,0.0,0.0,0.0,0.0
illinois,0.0,0.12296,0.0,0.0,0.0,0.0
iraq,0.19653,0.0,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.0,0.0,0.12756
morning,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.0,0.0,0.0,0.0,0.19115,0.0
speaks,0.0,0.0,0.13973,0.0,0.0,0.0


In [69]:
# Collects, per X2 document, the embedding rows of its unique words.
T2_full = []

# Atasu et al LC-RWMD: Many-to-many NOT WORKING
# (unfinished experiment: the loop only gathers word vectors; the actual
# many-to-many computation is still commented out below)
for idx2,doc2 in enumerate(X2):
    # Helper to quickly get word vectors for X2
    X2_indeces = list(set([word2idx[word] for word in doc2]))
    # Atasu et al. RWMD
    T2 = E[X2_indeces,]
    T2_full.append(T2)
    #ET2 = np.dot(E)
    #Z_mtm =

# Stacks all gathered word vectors row-wise and takes the minimum along
# axis 1, i.e. one value per stacked word vector (its smallest component).
# This is not a distance between documents.
np.vstack(T2_full).min(axis=1)

array([-0.15780888, -0.17197762, -0.14504768, -0.12993725, -0.16405022,
       -0.1421869 , -0.22472994, -0.15382984, -0.14625083, -0.13629027,
       -0.13017109, -0.17396213, -0.12993725, -0.16405022, -0.15780888,
       -0.14504768, -0.161765  , -0.16040245, -0.16454686, -0.1797309 ],
      dtype=float32)

In [70]:
%time

# For every (X1 document, X2 document) pair: compute the relaxed WMD bound and
# run the exact EMD. Only the RWMD values are collected in RWMD_s.
RWMD_s = []

for idx1, doc1 in enumerate(X1):
    # Vocabulary rows of the unique words of this X1 document
    X1_indeces = list(set([word2idx[word] for word in doc1]))

    # Atasu et al. RWMD: word vectors and nBOW weights of doc1
    T1 = E[X1_indeces,]
    F1 = X1_nbow[idx1].toarray()[0, X1_indeces]

    # One (doc1, idx2, doc2, RWMD) tuple per X2 document
    similarity = []
    for idx2, doc2 in enumerate(X2):
        # Vocabulary rows of the unique words of this X2 document
        X2_indeces = list(set([word2idx[word] for word in doc2]))

        # Atasu et al. RWMD: word vectors and nBOW weights of doc2
        T2 = E[X2_indeces,]
        F2 = X2_nbow[idx2].toarray()[0, X2_indeces]

        # Relaxed bound: the larger of the two one-sided nearest-word costs
        C = euclidean_distances(T1, T2)
        RWMD = np.maximum(np.dot(np.min(C, axis=1), F1),
                          np.dot(np.min(C, axis=0), F2))

        similarity.append((doc1, idx2, doc2, RWMD))

        # Required by pyemd, but not part of Atasu et al.: both signatures
        # live on the concatenated word list [doc1 words, doc2 words].
        T_emd = E[X1_indeces + X2_indeces,]
        C_emd = euclidean_distances(T_emd, T_emd)
        # Use the weights of the *current* documents (F1 / F2); the previous
        # code always read row 3 of both nBOW matrices.
        X1_sig = np.concatenate((F1, np.zeros(len(F2))))
        X2_sig = np.concatenate((np.zeros(len(F1)), F2))

        # Exact WMD. Bound to new names so that the notebook-level function
        # `WMD` is not overwritten by a result tuple.
        wmd_value, wmd_flow = emd_with_flow(np.array(X1_sig, dtype=np.double),
                                            np.array(X2_sig, dtype=np.double),
                                            np.array(C_emd, dtype=np.double))

        #print(wmd_value)
        #print(RWMD)
    RWMD_s.append(similarity)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


In [71]:
# Number of closest X2 documents to report per X1 document
k = 5
# For every X1 document keep the k tuples with the smallest RWMD (element 3)
k_RWMD = [sorted(rwmd, key=lambda t: t[3])[:k] for rwmd in RWMD_s]

# The loop variable is deliberately not called `k`, so the constant above
# keeps its value after the cell has run.
for top_matches in k_RWMD:
    for scores in top_matches:
        print(scores[0])
        print(scores[2])
        print(scores[3])
    print("\n")

['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['president', 'greets', 'press', 'chicago', 'evening']
1.0102365442684718
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['president', 'greets', 'press', 'chicago']
1.0826340743473597
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['hello', 'called', 'love', 'you']
1.1934219428471156
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['the', 'forecast', 'says', 'snow']
1.2727791241237094
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['who', 'is', 'winning']
1.3249833924429755


['which', 'team', 'will', 'lose']
['who', 'is', 'winning']
1.148093432188034
['which', 'team', 'will', 'lose']
['the', 'forecast', 'says', 'snow']
1.2594012022018433
['which', 'team', 'will', 'lose']
['hello', 'called', 'love', 'you']
1.2821693122386932
['which', 'team', 'will', 'lose']
['president', 'greets', 'press