In [1]:
from collections import Counter, namedtuple
from gensim.models import KeyedVectors
from nltk.stem import WordNetLemmatizer
from pyemd import emd_with_flow, emd
from sklearn.metrics import euclidean_distances

import numpy as np
import pandas as pd
import time

In [3]:
import os
import sys

root_dir = os.path.join(os.getcwd(), '..')
sys.path.append(root_dir)

In [4]:
from flow_wmd.modules import Document, DocPair

%load_ext autoreload
%autoreload 2

## Load WV model

In [6]:
def read_1w_corpus(name, sep="\t"):
    """Lazily yield the fields of every line of a delimited text file.

    Parameters
    ----------
    name : str or path-like
        Path of the file to read.
    sep : str, optional
        Field separator passed to ``str.split`` (tab by default).

    Yields
    ------
    list of str
        The fields of one line. Lines are not stripped, so the last field
        keeps its trailing newline.
    """
    # The context manager closes the file once the generator is exhausted
    # or discarded, instead of leaking the handle.
    with open(name) as handle:
        for line in handle:
            yield line.split(sep)

print("Loading GoogleNews Vectors")
%time model = KeyedVectors.load_word2vec_format('../embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True)

Loading GoogleNews Vectors
CPU times: user 47 s, sys: 3.03 s, total: 50 s
Wall time: 51 s


In [7]:
%time model.init_sims(replace=True)



CPU times: user 3.46 s, sys: 8.17 s, total: 11.6 s
Wall time: 16.5 s


## First Try

In [8]:
# Obama speaks to the media in Illinois 
sentence1_words = ['obama', 'speaks', 'media', 'illinois']

# The President greets the press in Chicago. 
sentence2_words = ['president', 'greets', 'press', 'chicago']

In [9]:
# "Obama speaks to the media in Illinois", extended with the extra tokens
# 'beautiful', 'morning' and 'iraq'.
sentence1_words = ['beautiful','obama', 'speaks', 'media', 'illinois', 'morning','iraq']

# "The President greets the press in Chicago", extended with 'beautiful'
# (twice) and 'morning', so both lists share some words.
sentence2_words = ['beautiful','president', 'greets', 'press', 'chicago', 'beautiful','morning']

In [126]:
%time
source = Document(sentence1_words)
source.instantiate(model)

sink = Document(sentence2_words)
sink.instantiate(model)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [127]:
%time dp = DocPair(source, sink)
%time dp.getsignature()
%time dp.emd()
%time dp.getCost()

CPU times: user 19 µs, sys: 0 ns, total: 19 µs
Wall time: 21.9 µs
CPU times: user 86 µs, sys: 26 µs, total: 112 µs
Wall time: 88.7 µs
CPU times: user 1.39 ms, sys: 1.35 ms, total: 2.74 ms
Wall time: 3.13 ms
CPU times: user 236 µs, sys: 20 µs, total: 256 µs
Wall time: 256 µs


In [83]:
dp.source_cost

{'beautiful': 0.0,
 'obama': 0.19391,
 'speaks': 0.13973,
 'media': 0.12756,
 'illinois': 0.12296,
 'morning': 0.0,
 'Iraq': 0.18869}

In [84]:
dp.sink_cost

{'beautiful': 0.0,
 'president': 0.18869,
 'greets': 0.13973,
 'press': 0.12756,
 'chicago': 0.12296,
 'morning': 0.0}

In [85]:
w2v_emd, w2v_flow = dp.emd()

In [137]:
flow_df = pd.DataFrame(w2v_flow, index=dp.vocab, columns=dp.vocab)
flow_df.iloc[dp.idx1,dp.idx2]

Unnamed: 0,beautiful,president,greets,press,chicago,beautiful.1,morning
beautiful,0.142857,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.142857,0.0,0.0,0.0,0.0,0.0,0.0
speaks,0.0,0.0,0.142857,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.142857,0.0,0.0,0.0
illinois,0.0,0.0,0.0,0.0,0.142857,0.0,0.0
morning,0.0,0.0,0.0,0.0,0.0,0.0,0.142857
iraq,0.0,0.142857,0.0,0.0,0.0,0.0,0.0


In [138]:
dist_df = pd.DataFrame(dp.w2v_distances, index=dp.vocab, columns=dp.vocab)
cost_df = (flow_df * dist_df)
cost_df.iloc[dp.idx1,dp.idx2].round(3)

Unnamed: 0,beautiful,president,greets,press,chicago,beautiful.1,morning
beautiful,0.0,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.194,0.0,0.0,0.0,0.0,0.0,0.0
speaks,0.0,0.0,0.14,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.128,0.0,0.0,0.0
illinois,0.0,0.0,0.0,0.0,0.123,0.0,0.0
morning,0.0,0.0,0.0,0.0,0.0,0.0,0.0
iraq,0.0,0.202,0.0,0.0,0.0,0.0,0.0


## Using the LC-RWMD Paper

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

X1 = [['beautiful','obama', 'speaks', 'media', 'illinois', 'morning','troops','return','iraq'],
      ['beautiful','obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning'],
      ["which","team","will","lose"],
      ['obama', 'speaks', 'media', 'illinois'],
      ["is", "it", "gonna", "rain", "tomorrow"],
      ['one','more','sentence','test','stuf'],
      ['you','shall','not','pass'],
      ['kennedy','was','here','suckers'],
      ['love','thy','neighbor'],
      ['eye','revenge','eye']]
X2 = [['beautiful','president', 'greets', 'press', 'chicago', 'beautiful','morning'],
      ['president', 'greets', 'press', 'chicago', 'evening'],
      ["who", "is", "winning"],
      ["the", "forecast", "says", "snow"],
      ['president', 'greets', 'press', 'chicago'],
      ['hello','called','love','you'],
      ['first','gang','die'],
      ['best','times','worst','times'],
      ['therefore','am'],
      ['two','people','couch']]

X1_sent = [" ".join(doc) for doc in X1]
X2_sent = [" ".join(doc) for doc in X2]

corpus = X1_sent + X2_sent

vectorizer = TfidfVectorizer(use_idf=False, norm='l1')
vectorizer.fit(corpus)

TfidfVectorizer(norm='l1', use_idf=False)

In [11]:
# Fetch the vocabulary once and derive both lookup tables from it.
features = vectorizer.get_feature_names()
word2idx = {word: idx for idx, word in enumerate(features)}
idx2word = dict(enumerate(features))

In [12]:
X1_nbow = vectorizer.transform(X1_sent)
X2_nbow = vectorizer.transform(X2_sent)

In [13]:
# One embedding row per vocabulary word, in vectorizer column order.
# `model[word]` replaces the deprecated `model.word_vec(word)`.
E = np.vstack([model[word] for word in vectorizer.get_feature_names()])

  E = np.vstack([model.word_vec(word) for word in vectorizer.get_feature_names()])


### 1. WMD

In [14]:
%time

def lc_rmwd(X1, X2, E):
    """Compute the symmetric LC-RWMD bound for every (X2, X1) document pair.

    For each document, the distance from every vocabulary word to its nearest
    word in that document is computed once and combined with the
    bag-of-words weights of all documents on the other side (the one-to-many
    scheme of Atasu et al.).

    Parameters
    ----------
    X1, X2 : list of list of str
        Tokenised documents; every token must be a key of the notebook-level
        ``word2idx`` mapping.
    E : numpy.ndarray
        Embedding matrix whose row ``word2idx[w]`` is the vector of word ``w``.

    Returns
    -------
    numpy.ndarray
        Element-wise maximum of the two one-sided bounds. Rows follow ``X2``
        and columns follow ``X1`` (shape ``(len(X2), len(X1))``), assuming
        ``X1_nbow`` / ``X2_nbow`` hold one row per document.

    Notes
    -----
    Document weights are read from the notebook-level ``X1_nbow`` and
    ``X2_nbow`` sparse matrices rather than from the arguments.
    """
    def nearest_word_distances(doc):
        # Distance from each vocabulary word to its closest word in `doc`.
        doc_vectors = [E[word2idx[word]] for word in doc]
        return euclidean_distances(E, doc_vectors).min(axis=1)

    # Densify once instead of on every loop iteration.
    X1_weights = X1_nbow.toarray()
    X2_weights = X2_nbow.toarray()

    # Atasu et al. LC-RWMD, one-to-many in both directions.
    D1 = [np.dot(X1_weights, nearest_word_distances(doc2)) for doc2 in X2]
    D2 = [np.dot(X2_weights, nearest_word_distances(doc1)) for doc1 in X1]

    # D1 stacks to (len(X2), len(X1)); D2 is transposed to match.
    return np.maximum(np.vstack(D1), np.vstack(D2).T)

D = lc_rmwd(X1, X2, E)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.29 µs


In [15]:
D

array([[0.82725746, 0.57751435, 1.34030032, 1.14435359, 1.22157277,
        1.28083788, 1.28982919, 1.21699172, 1.22740861, 1.3565355 ],
       [1.05410856, 1.01023654, 1.32195076, 1.08054153, 1.22744477,
        1.29915538, 1.31048636, 1.23490313, 1.32801049, 1.36230195],
       [1.336287  , 1.32498339, 1.14809343, 1.31706247, 1.00633972,
        1.29815297, 1.22959071, 1.19013703, 1.32402221, 1.33076803],
       [1.28945665, 1.27277912, 1.2594012 , 1.29004994, 1.11325568,
        1.24861984, 1.26217186, 1.26131362, 1.30896604, 1.34001368],
       [1.11587715, 1.08263407, 1.33366004, 1.01065011, 1.28156326,
        1.30140336, 1.30379188, 1.23090096, 1.32541144, 1.36797354],
       [1.25172512, 1.19342194, 1.28216931, 1.28118557, 1.14592826,
        1.24128144, 0.92043224, 1.17788664, 0.90561929, 1.32605949],
       [1.32288837, 1.33760999, 1.23821968, 1.34087273, 1.27764707,
        1.2329242 , 1.29380719, 1.2502839 , 1.27708765, 1.30551573],
       [1.33702754, 1.31617936, 1.2611624

In [16]:
# Rank the opposite side for every row / column of D, closest first.
# Keys and ranked ids are strings.
# NOTE(review): D is laid out with X2 documents as rows (see distance_df),
# so confirm that the X1_/X2_ prefixes name the intended side.
X1_pref = {str(idx): [str(v) for v in row.argsort()] for idx, row in enumerate(D)}
X2_pref = {str(idx): [str(v) for v in col.argsort()] for idx, col in enumerate(D.T)}

In [17]:
from flow_wmd.gale_shapeley import Matcher

matcher = Matcher(D)
engaged = matcher.matchmaker()
matcher.check()

True

In [18]:
engaged

{0: 1, 2: 4, 4: 3, 5: 8, 9: 5, 6: 2, 8: 6, 1: 0, 3: 7, 7: 9}

In [19]:
distance_df = pd.DataFrame(D, index=X2_sent, columns=X1_sent)
distance_df

Unnamed: 0,beautiful obama speaks media illinois morning troops return iraq,beautiful obama speaks media illinois beautiful morning,which team will lose,obama speaks media illinois,is it gonna rain tomorrow,one more sentence test stuf,you shall not pass,kennedy was here suckers,love thy neighbor,eye revenge eye
beautiful president greets press chicago beautiful morning,0.827257,0.577514,1.3403,1.144354,1.221573,1.280838,1.289829,1.216992,1.227409,1.356536
president greets press chicago evening,1.054109,1.010237,1.321951,1.080542,1.227445,1.299155,1.310486,1.234903,1.32801,1.362302
who is winning,1.336287,1.324983,1.148093,1.317062,1.00634,1.298153,1.229591,1.190137,1.324022,1.330768
the forecast says snow,1.289457,1.272779,1.259401,1.29005,1.113256,1.24862,1.262172,1.261314,1.308966,1.340014
president greets press chicago,1.115877,1.082634,1.33366,1.01065,1.281563,1.301403,1.303792,1.230901,1.325411,1.367974
hello called love you,1.251725,1.193422,1.282169,1.281186,1.145928,1.241281,0.920432,1.177887,0.905619,1.326059
first gang die,1.322888,1.33761,1.23822,1.340873,1.277647,1.232924,1.293807,1.250284,1.277088,1.305516
best times worst times,1.337028,1.316179,1.261162,1.376421,1.274439,1.267623,1.282566,1.316227,1.317455,1.351343
therefore am,1.296666,1.285259,1.218989,1.316114,1.197306,1.295485,1.180132,1.253469,1.302091,1.379033
two people couch,1.31208,1.317706,1.308014,1.337589,1.277699,1.214767,1.265152,1.265272,1.26977,1.359689


In [20]:
def WMD(X1, X2, X1_nbow, X2_nbow, E):
    """Word Mover's Distance between two tokenised documents via pyemd.

    Parameters
    ----------
    X1, X2 : list of str
        Tokens of the two documents; each must be a key of the notebook-level
        ``word2idx`` mapping.
    X1_nbow, X2_nbow : sparse row
        Single-row bag-of-words weights of the respective document (only row
        0 of the dense form is read).
    E : numpy.ndarray
        Embedding matrix indexed by ``word2idx`` values.

    Returns
    -------
    tuple
        ``(distance, flow, X1_idxs, X2_idxs)``: the EMD value, the block of
        the flow matrix going from X1 words (rows) to X2 words (columns), and
        the unique vocabulary indices of each document in that row/column
        order.
    """
    src_ids = list({word2idx[token] for token in X1})
    dst_ids = list({word2idx[token] for token in X2})
    n_src, n_dst = len(src_ids), len(dst_ids)

    # Both histograms are laid out over one shared support:
    # X1 words first, then X2 words.
    support = E[src_ids + dst_ids,]
    ground_dist = euclidean_distances(support, support)

    src_hist = np.concatenate((X1_nbow.toarray()[0, src_ids], np.zeros(n_dst)))
    dst_hist = np.concatenate((np.zeros(n_src), X2_nbow.toarray()[0, dst_ids]))

    distance, flow = emd_with_flow(
        np.array(src_hist, dtype=np.double),
        np.array(dst_hist, dtype=np.double),
        np.array(ground_dist, dtype=np.double),
    )

    # Keep only the X1 -> X2 block of the full flow matrix.
    cross_flow = np.array(flow)[:n_src, n_src:]
    return (distance, cross_flow, src_ids, dst_ids)

In [22]:
import bottleneck as bn

%time

def fast_get_L(X1, X2, X1_nbow, X2_nbow, D, E, n):
    """Compute a per-document WMD cut-off ``L`` from pre-selected candidates.

    For each document in ``X1`` a candidate set is picked from row ``D[idx1]``
    and the exact WMD to each candidate is computed; ``L`` is the largest of
    those WMD values.

    Parameters
    ----------
    X1, X2 : list of list of str
        Tokenised documents.
    X1_nbow, X2_nbow : sparse matrix
        Bag-of-words weights, one row per document.
    D : numpy.ndarray
        Lower-bound matrix. ``D[idx1]`` is read as the distances from
        ``X1[idx1]`` to every document of ``X2``.
        NOTE(review): ``lc_rmwd`` in this notebook returns the transposed
        layout (X2 rows, X1 columns) - confirm whether ``D.T`` should be passed.
    E : numpy.ndarray
        Embedding matrix forwarded to ``WMD``.
    n : int
        Number of trailing entries dropped from the partitioned row. It was
        previously overwritten with the constant 2 and is now honoured.

    Returns
    -------
    list of tuple
        ``(idx1, L)`` for every document of ``X1``.
    """
    L_values = []
    for idx1, doc1 in enumerate(X1):
        row = D[idx1]
        # Candidates: the partitioned index array minus its last n entries.
        # NOTE(review): per bottleneck's partition semantics this keeps the
        # row.size - n smallest entries, not the n smallest - confirm intent.
        candidates = bn.argpartition(row, row.size - n)[:-n]
        L = max(
            WMD(doc1, X2[idx2], X1_nbow[idx1], X2_nbow[idx2], E)[0]
            for idx2 in candidates
        )
        L_values.append((idx1, L))
    return L_values

L_values = fast_get_L(X1, X2, X1_nbow, X2_nbow,D, E, 2)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.05 µs


In [23]:
list(range(len(X1)))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [32]:
import bottleneck as bn

%time

def get_L(X1, X2, X1_nbow, X2_nbow, D, E, n):
    """Compute a per-document WMD cut-off ``L`` from pre-selected candidates.

    For each document in ``X1`` a candidate set is picked from row ``D[idx1]``
    and the exact WMD to each candidate is computed; ``L`` is the largest of
    those WMD values.

    Parameters
    ----------
    X1, X2 : list of list of str
        Tokenised documents.
    X1_nbow, X2_nbow : sparse matrix
        Bag-of-words weights, one row per document.
    D : numpy.ndarray
        Lower-bound matrix. ``D[idx1]`` is read as the distances from
        ``X1[idx1]`` to every document of ``X2``.
        NOTE(review): ``lc_rmwd`` in this notebook returns the transposed
        layout (X2 rows, X1 columns) - confirm whether ``D.T`` should be passed.
    E : numpy.ndarray
        Embedding matrix forwarded to ``WMD``.
    n : int
        Number of trailing entries dropped from the partitioned row.

    Returns
    -------
    list of tuple
        ``(idx1, L)`` for every document of ``X1``.
    """
    L_values = []
    for idx1, doc1 in enumerate(X1):
        row = D[idx1]
        # Only the candidate indices are needed; the partitioned values and
        # the per-candidate word-index list were computed but never used.
        # NOTE(review): per bottleneck's partition semantics this keeps the
        # row.size - n smallest entries, not the n smallest - confirm intent.
        candidates = bn.argpartition(row, row.size - n)[:-n]
        L = max(
            WMD(doc1, X2[idx2], X1_nbow[idx1], X2_nbow[idx2], E)[0]
            for idx2 in candidates
        )
        L_values.append((idx1, L))
    return L_values

L_values = get_L(X1, X2, X1_nbow, X2_nbow,D, E, 2)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [25]:
L_values

[(0, 1.3440824106096465),
 (1, 1.3435464752353057),
 (2, 1.3413949351295233),
 (3, 1.3834322830935122),
 (4, 1.30095842616584),
 (5, 1.3166037171496152),
 (6, 1.3219162567487956),
 (7, 1.3167128382524846),
 (8, 1.3334503746436128),
 (9, 1.3829850634988057)]

In [36]:
def R_WMD(L_values, X1, X2, X1_nbow, X2_nbow):
    """Run exact WMD only on the pairs whose lower bound beats the cut-off.

    Parameters
    ----------
    L_values : iterable
        Items whose element 0 is an index into ``X1`` and element 1 the
        cut-off for that document.
    X1, X2 : list of list of str
        Tokenised documents.
    X1_nbow, X2_nbow : sparse matrix
        Bag-of-words weights, one row per document.

    Returns
    -------
    list
        The full ``WMD(...)`` result for every pair with
        ``D[idx1][idx2] < cut-off``, in iteration order.

    Notes
    -----
    The lower-bound matrix ``D`` and the embedding matrix ``E`` are read from
    the notebook namespace, not from the arguments.
    """
    wmd_s = []
    for L in L_values:
        idx1, cutoff = L[0], L[1]
        for idx2, lower_bound in enumerate(D[idx1]):
            # Prune pairs whose lower bound does not fall below the cut-off.
            if not (lower_bound < cutoff):
                continue
            wmd_s.append(WMD(X1[idx1], X2[idx2], X1_nbow[idx1], X2_nbow[idx2], E))
    return wmd_s

rmwds = R_WMD(L_values, X1, X2, X1_nbow, X2_nbow)

In [42]:
D[0]

array([0.82725746, 0.57751435, 1.34030032, 1.14435359, 1.22157277,
       1.28083788, 1.28982919, 1.21699172, 1.22740861, 1.3565355 ])

In [41]:
L_values

[(0, 1.3440824106096465),
 (1, 1.3435464752353057),
 (2, 1.3413949351295233),
 (3, 1.3834322830935122),
 (4, 1.30095842616584),
 (5, 1.3166037171496152),
 (6, 1.3219162567487956),
 (7, 1.3167128382524846),
 (8, 1.3334503746436128),
 (9, 1.3829850634988057)]

In [28]:
cost_df = pd.DataFrame(rmwds[1][1], 
                       index=[idx2word[idx] for idx in rmwds[1][2]], 
                       columns=[idx2word[idx] for idx in rmwds[1][3]])
cost_df

Unnamed: 0,president,press,chicago,evening,greets
beautiful,0.022222,0.0,0.0,0.0,0.088889
return,0.0,0.022222,0.0,0.088889,0.0
speaks,0.0,0.0,0.0,0.0,0.111111
illinois,0.0,0.0,0.111111,0.0,0.0
iraq,0.044444,0.066667,0.0,0.0,0.0
troops,0.111111,0.0,0.0,0.0,0.0
media,0.0,0.111111,0.0,0.0,0.0
morning,0.0,0.0,0.0,0.111111,0.0
obama,0.022222,0.0,0.088889,0.0,0.0


## RWMD revisited (August 2021)

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

doc1 = " ".join(sentence1_words)
doc2 = " ".join(sentence2_words)
documents = [doc1, doc2]

stopwords_set = set(stopwords.words('english'))
# scikit-learn documents `stop_words` as 'english', a list or None, so the
# set is passed as a (sorted, hence reproducible) list.
nbow = CountVectorizer(stop_words=sorted(stopwords_set))
nbow.fit(documents)

CountVectorizer(stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...})

In [63]:
vocabulary = set(model.index_to_key)
names = nbow.get_feature_names()

In [70]:
def flow_graph_rwmd(doc1, doc2):
    """Build source/sink histograms and word vectors for two documents.

    Both documents are vectorised with the notebook-level ``nbow`` vectorizer.
    Only features that occur in at least one of them and whose name is in the
    notebook-level ``vocabulary`` set are kept.

    Parameters
    ----------
    doc1, doc2 : str
        Raw documents accepted by ``nbow.transform``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(source, sink, vecs)``. ``source`` and ``sink`` are the counts of
        ``doc1`` and ``doc2`` over the kept words, each divided by its own sum
        (left unscaled when that sum is zero). ``vecs`` holds the ``model``
        vector of each kept word, one row per word in the same order.
    """
    v1, v2 = nbow.transform([doc1, doc2])
    # Sorted union of the feature columns used by either document.
    index = np.union1d(v1.indices, v2.indices)

    v1 = v1.toarray().ravel()
    v2 = v2.toarray().ravel()

    # Drop features the embedding model has no vector for.
    index_map = [i for i in index if names[i] in vocabulary]

    source = np.array([v1[i] for i in index_map], dtype=float)
    sink = np.array([v2[i] for i in index_map], dtype=float)

    # Take the embedding width from the model instead of hard-coding 300.
    vecs = np.zeros(shape=(len(index_map), model.vector_size))
    for j, i in enumerate(index_map):
        vecs[j] = model[names[i]]

    # Guard against division by zero when a document has no kept words.
    sum_source = source.sum()
    if sum_source == 0:
        sum_source = 1
    sum_sink = sink.sum()
    if sum_sink == 0:
        sum_sink = 1
    return (source / sum_source, sink / sum_sink, vecs)


In [120]:
def rwmd_(doc1, doc2):
    """Relaxed Word Mover's Distance between two raw documents.

    Each word's mass is moved entirely to the closest word that carries mass
    in the other document. The two one-sided costs are computed and the
    larger one is returned.

    Parameters
    ----------
    doc1, doc2 : str
        Documents forwarded to ``flow_graph_rwmd``.

    Returns
    -------
    float
        ``max`` of the source-to-sink and sink-to-source relaxed costs.
    """
    source, sink, vecs = flow_graph_rwmd(doc1, doc2)
    weights = euclidean_distances(vecs)

    def cheapest_link(mass):
        # For every word, the distance to the nearest word with positive mass.
        active = [pos for pos, value in enumerate(mass) if value > 0]
        return [min(row[active]) for row in weights]

    cost_from_source = np.dot(cheapest_link(sink), source)
    cost_from_sink = np.dot(cheapest_link(source), sink)
    return max(cost_from_source, cost_from_sink)

In [121]:
doc1

'beautiful obama speaks media illinois morning iraq'

In [122]:
doc2

'beautiful president greets press chicago beautiful morning'

In [123]:
rwmd_(doc1, doc2)

0.6892952702116069

In [118]:
source, sink, vecs = flow_graph_rwmd(doc1, doc2)

In [100]:
print(source)
print(sink)
print(len(source))
print(len(sink))
print(len(vecs))

[0.14285714 0.         0.         0.14285714 0.14285714 0.14285714
 0.14285714 0.14285714 0.         0.         0.14285714]
[0.28571429 0.14285714 0.14285714 0.         0.         0.
 0.14285714 0.         0.14285714 0.14285714 0.        ]
11
11
11


In [102]:
weights = euclidean_distances(vecs)
print(len(weights))
print(weights)

11
[[0.         1.34256729 1.30131367 1.36322785 1.37574318 1.42906725
  1.35944544 1.35738904 1.42581627 1.39553837 1.31566228]
 [1.34256729 0.         1.37007087 0.86072855 1.10787424 1.38123171
  1.33345207 0.98541434 1.41867127 1.37537805 1.3499811 ]
 [1.30131367 1.37007087 0.         1.37085179 1.43199013 1.33151178
  1.26411535 1.37600482 1.33362547 1.28527545 0.9780999 ]
 [1.36322785 0.86072855 1.37085179 0.         1.10740923 1.40152814
  1.33655016 1.03601759 1.4259402  1.38934576 1.36625094]
 [1.37574318 1.10787424 1.43199013 1.10740923 0.         1.38972075
  1.41328485 0.94665669 1.41136532 1.37599085 1.37361004]
 [1.42906725 1.38123171 1.33151178 1.40152814 1.38972075 0.
  1.34366823 1.3406543  1.34258846 0.89294986 1.29293702]
 [1.35944544 1.33345207 1.26411535 1.33655016 1.41328485 1.34366823
  0.         1.35696713 1.41013729 1.2586186  1.36000256]
 [1.35738904 0.98541434 1.37600482 1.03601759 0.94665669 1.3406543
  1.35696713 0.         1.33808043 1.34161041 1.38691697

In [103]:
new_weights_dj = []
potential_dj = list(j for j, dj in enumerate(sink) if dj > 0)
print(potential_dj)

[0, 1, 2, 6, 8, 9]


In [105]:
new_weights_dj = list(min(weights[i, potential_dj]) for i in range(len(source)))
print(new_weights_dj)

[0.0, 0.0, 0.0, 0.8607285451074355, 1.1078742397408554, 0.89294986372621, 0.0, 0.9854143431632054, 0.0, 0.0, 0.9780998997435427]


In [106]:
potential_di = list(i for i, di in enumerate(source) if di > 0)
new_weights_di = list(min(weights[j, potential_di]) for j in range(len(sink)))

In [110]:
new_weights_dj

[0.0,
 0.0,
 0.0,
 0.8607285451074355,
 1.1078742397408554,
 0.89294986372621,
 0.0,
 0.9854143431632054,
 0.0,
 0.0,
 0.9780998997435427]

In [114]:
np.multiply(new_weights_dj, source)

array([0.        , 0.        , 0.        , 0.12296122, 0.15826775,
       0.12756427, 0.        , 0.14077348, 0.        , 0.        ,
       0.13972856])

In [109]:
np.dot(new_weights_dj, source)

0.6892952702116069

In [108]:
rwmd = max(np.dot(new_weights_dj, source), np.dot(new_weights_di, sink))
rwmd

0.6892952702116069

## Modularize LC-RWMD

In [229]:
from sklearn.feature_extraction.text import TfidfVectorizer

X1 = [['beautiful','obama', 'speaks', 'media', 'illinois', 'morning','iraq'],
      ['beautiful','obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning'],
      ["which","team","will","lose"],
      ["is", "it", "gonna", "rain", "tomorrow"],
      ['obama', 'speaks', 'media', 'illinois']]
X2 = [['beautiful','president', 'greets', 'press', 'chicago', 'beautiful','morning'],
      ['president', 'greets', 'press', 'chicago', 'evening'],
      ["who", "is", "winning"],
      ["the", "forecast", "says", "snow"],
      ['president', 'greets', 'press', 'chicago'],
      ['hello','called','love','you']]

X1_sent = [" ".join(doc) for doc in X1]
X2_sent = [" ".join(doc) for doc in X2]

corpus = X1_sent + X2_sent

#vectorizer = TfidfVectorizer(use_idf=False, norm='l2')
vectorizer = TfidfVectorizer(use_idf=False, norm='l1')
vectorizer.fit(corpus)

TfidfVectorizer(norm='l1', use_idf=False)

In [178]:
# Fetch the vocabulary once and derive both lookup tables from it.
features = vectorizer.get_feature_names()
word2idx = {word: idx for idx, word in enumerate(features)}
idx2word = {idx: word for idx, word in enumerate(features)}

In [179]:
word2idx['chicago']

2

In [256]:
#X1_corpus = 
%time
X1_nbow = vectorizer.transform(X1_sent)
X2_nbow = vectorizer.transform(X2_sent)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


In [102]:
# One embedding row per vocabulary word, in vectorizer column order.
# `model[word]` replaces the deprecated `model.word_vec(word)`.
E = np.vstack([model[word] for word in vectorizer.get_feature_names()])

In [642]:
from flow_wmd.modules_rwmd import Document, LC_RWMD, WMDCluster

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [208]:
X1_sent[0]

'beautiful obama speaks media illinois morning iraq'

In [239]:
X1_sent[1]

'beautiful obama speaks media illinois beautiful morning'

In [564]:
%time 

X1_docs, X2_docs = [], []

X1_nbow = vectorizer.transform(X1_sent)
X2_nbow = vectorizer.transform(X2_sent)

for idx, doc in enumerate(X1):
    X1_docs.append(Document(doc, X1_nbow[idx], word2idx, E))
    
for idx, doc in enumerate(X2):
    X2_docs.append(Document(doc, X2_nbow[idx], word2idx, E))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [565]:
%time

lc_rwmd = LC_RWMD(X1_docs, X2_docs,X1_nbow,X2_nbow,E)
lc_rwmd.get_D()
lc_rwmd.get_L(2)
lc_rwmd.get_rwmd()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


In [566]:
lc_rwmd.wmd_s

[0.7779423936888455,
 1.0577764527086124,
 0.5814076731150226,
 1.0601321450052137,
 1.1221033037203125,
 1.3456935750793113,
 1.33409258504228,
 1.1613731145914146,
 1.2665668245616257,
 1.3413949351295233,
 1.1287236809574008,
 1.1562737829503784,
 1.0927072101407647,
 1.0174646259300113]

In [569]:
X1_docs[0]

<flow_wmd.modules_rwmd.Document at 0x7f942b48d9a0>

In [649]:
X1_docs[0]

<flow_wmd.modules_rwmd.Document at 0x7f942b48d9a0>

In [655]:
%time

distances, wc_X1, wc_X2 = WMDCluster(X1_docs, X2_docs,E,idx2word).get_distances(return_flow = True)

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 2.86 µs


In [658]:
wc_X1

{'is': 1.27494,
 'iraq': 1.18948,
 'it': 1.4954100000000001,
 'beautiful': 2.7164,
 'which': 1.9112699999999998,
 'lose': 1.9683000000000002,
 'speaks': 3.5973,
 'obama': 4.23171,
 'will': 1.8686099999999999,
 'tomorrow': 1.46315,
 'media': 3.61458,
 'illinois': 3.6213599999999997,
 'rain': 1.4949000000000001,
 'gonna': 1.41928,
 'morning': 1.70016,
 'team': 1.9965}

In [578]:
%time

test = WMDCluster(X1_docs[0:2], X2_docs[0:2],E,idx2word).get_flow()[0]

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


In [632]:
test[2][:,0]

array([0.     , 0.     , 0.19653, 0.     , 0.     , 0.     , 0.     ])

In [580]:
pd.DataFrame(test[2], index=test[3],columns=test[4])

Unnamed: 0,beautiful,chicago,greets,morning,president,press
beautiful,0.0,0.0,0.0,0.0,0.0,0.0
illinois,0.0,0.12296,0.0,0.0,0.0,0.0
iraq,0.19653,0.0,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.0,0.0,0.12756
morning,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.0,0.0,0.0,0.0,0.19115,0.0
speaks,0.0,0.0,0.13973,0.0,0.0,0.0


In [521]:
# Element-wise product of the first two entries of `test`, cropped to its
# top-right block and rounded to 5 decimals.
# NOTE(review): `w1` and `w2` are not defined anywhere in the visible
# notebook; presumably they are the row/column labels held in test[3] and
# test[4] - confirm, otherwise this cell fails on a fresh kernel.
cost_m = test[0]*test[1]
cost_m = cost_m[:len(test[2]),len(test[2]):].round(5)
pd.DataFrame(cost_m, index=w1,columns=w2)

Unnamed: 0,beautiful,chicago,greets,morning,president,press
beautiful,0.0,0.0,0.0,0.0,0.0,0.0
illinois,0.0,0.12296,0.0,0.0,0.0,0.0
iraq,0.19653,0.0,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.0,0.0,0.12756
morning,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.0,0.0,0.0,0.0,0.19115,0.0
speaks,0.0,0.0,0.13973,0.0,0.0,0.0


In [69]:
T2_full = []

# Atasu et al LC-RWMD: Many-to-many NOT WORKING
for idx2,doc2 in enumerate(X2):
    # Helper to quickly get word vectors for X2
    X2_indeces = list(set([word2idx[word] for word in doc2]))
    #Atasu et al. RMWD
    T2 = E[X2_indeces,]
    T2_full.append(T2)
    #ET2 = np.dot(E)
    #Z_mtm =

np.vstack(T2_full).min(axis=1)

array([-0.15780888, -0.17197762, -0.14504768, -0.12993725, -0.16405022,
       -0.1421869 , -0.22472994, -0.15382984, -0.14625083, -0.13629027,
       -0.13017109, -0.17396213, -0.12993725, -0.16405022, -0.15780888,
       -0.14504768, -0.161765  , -0.16040245, -0.16454686, -0.1797309 ],
      dtype=float32)

In [70]:
%time

RWMD_s = []

for idx1,doc1 in enumerate(X1):
    # Helper to quickly get word vectors for X1
    X1_indeces = list(set([word2idx[word] for word in doc1]))

    # Atasu et al. RMWD
    T1 = E[X1_indeces,]
    F1 = X1_nbow[idx1].toarray()[0,X1_indeces]
    
    # Helper
    similarity = []
    for idx2,doc2 in enumerate(X2):
        # Helper to quickly get word vectors for X2
        X2_indeces = list(set([word2idx[word] for word in doc2]))
        
        #Atasu et al. RMWD
        T2 = E[X2_indeces,]
        F2 = X2_nbow[idx2].toarray()[0,X2_indeces]
        
        C = euclidean_distances(T1, T2)
        RWMD = np.maximum(np.dot(np.min(C, axis=1), F1),
                          np.dot(np.min(C, axis=0), F2))
        
        # Helper
        similarity.append((doc1, idx2, doc2,RWMD))
        
        # Required by pyemd, but not part of the Atasu et al.
        T_emd = E[X1_indeces + X2_indeces,]
        C_emd = euclidean_distances(T_emd, T_emd)
        X1_sig = np.concatenate((X1_nbow[3].toarray()[0,X1_indeces], np.zeros(len(F2))))
        X2_sig = np.concatenate((np.zeros(len(F1)), X2_nbow[3].toarray()[0,X2_indeces]))
        
        # Doing WMD
        WMD = emd_with_flow(np.array(X1_sig, dtype=np.double), 
                            np.array(X2_sig, dtype=np.double), 
                            np.array(C_emd, dtype=np.double))
        
        #print(WMD[0])
        #print(RWMD)
    RWMD_s.append(similarity)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


In [71]:
k = 5
# For every X1 document keep the k candidates with the smallest RWMD.
k_RWMD = [sorted(rwmd, key=lambda t: t[3])[:k] for rwmd in RWMD_s]

# The loop variable has its own name so it no longer overwrites `k`.
for top_matches in k_RWMD:
    for scores in top_matches:
        print(scores[0])
        print(scores[2])
        print(scores[3])
    print("\n")

['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['president', 'greets', 'press', 'chicago', 'evening']
1.0102365442684718
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['president', 'greets', 'press', 'chicago']
1.0826340743473597
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['hello', 'called', 'love', 'you']
1.1934219428471156
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['the', 'forecast', 'says', 'snow']
1.2727791241237094
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['who', 'is', 'winning']
1.3249833924429755


['which', 'team', 'will', 'lose']
['who', 'is', 'winning']
1.148093432188034
['which', 'team', 'will', 'lose']
['the', 'forecast', 'says', 'snow']
1.2594012022018433
['which', 'team', 'will', 'lose']
['hello', 'called', 'love', 'you']
1.2821693122386932
['which', 'team', 'will', 'lose']
['president', 'greets', 'press