In [15]:
from collections import Counter, namedtuple
from gensim.models import KeyedVectors
from nltk.stem import WordNetLemmatizer
from pyemd import emd_with_flow, emd
from sklearn.metrics import euclidean_distances

import numpy as np
import pandas as pd
import time

In [3]:
from flow_wmd.modules import Document, DocPair

%load_ext autoreload
%autoreload 2

## Load WV model

In [4]:
def read_1w_corpus(name, sep="\t"):
    """Lazily yield every line of the file ``name`` split on ``sep``.

    Parameters
    ----------
    name : path of the text file to read.
    sep : separator handed to ``str.split`` (a tab by default).

    Yields
    ------
    list of str
        The fields of one line. The line terminator is not stripped, so the
        last field of each line keeps its trailing newline.
    """
    # Use a context manager so the handle is closed when the generator
    # finishes (or is discarded) instead of being left open.
    with open(name) as handle:
        for line in handle:
            yield line.split(sep)

print("Loading GoogleNews Vectors")
%time model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

Loading GoogleNews Vectors
CPU times: user 42 s, sys: 3.89 s, total: 45.9 s
Wall time: 47.9 s


In [6]:
%time model.init_sims(replace=True)

CPU times: user 4.36 s, sys: 12.1 s, total: 16.5 s
Wall time: 28.1 s


## First Try

In [None]:
# Obama speaks to the media in Illinois 
sentence1_words = ['obama', 'speaks', 'media', 'illinois']

# The President greets the press in Chicago. 
sentence2_words = ['president', 'greets', 'press', 'chicago']

In [125]:
# Sentence 1: "Obama speaks to the media in Illinois", extended with the
# extra tokens 'beautiful', 'morning' and 'iraq'.
sentence1_words = ['beautiful','obama', 'speaks', 'media', 'illinois', 'morning','iraq']

# Sentence 2: "The President greets the press in Chicago", extended with the
# extra tokens 'beautiful' (twice) and 'morning'.
sentence2_words = ['beautiful','president', 'greets', 'press', 'chicago', 'beautiful','morning']

In [126]:
%time
source = Document(sentence1_words)
source.instantiate(model)

sink = Document(sentence2_words)
sink.instantiate(model)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [127]:
# Time each stage of the document-pair computation on its own with the
# `%time` line magic. The value returned by dp.emd() is discarded here; a
# later cell calls dp.emd() again to keep the distance and the flow.
%time dp = DocPair(source, sink)
%time dp.getsignature()
%time dp.emd()
%time dp.getCost()

CPU times: user 19 µs, sys: 0 ns, total: 19 µs
Wall time: 21.9 µs
CPU times: user 86 µs, sys: 26 µs, total: 112 µs
Wall time: 88.7 µs
CPU times: user 1.39 ms, sys: 1.35 ms, total: 2.74 ms
Wall time: 3.13 ms
CPU times: user 236 µs, sys: 20 µs, total: 256 µs
Wall time: 256 µs


In [83]:
dp.source_cost

{'beautiful': 0.0,
 'obama': 0.19391,
 'speaks': 0.13973,
 'media': 0.12756,
 'illinois': 0.12296,
 'morning': 0.0,
 'Iraq': 0.18869}

In [84]:
dp.sink_cost

{'beautiful': 0.0,
 'president': 0.18869,
 'greets': 0.13973,
 'press': 0.12756,
 'chicago': 0.12296,
 'morning': 0.0}

In [85]:
w2v_emd, w2v_flow = dp.emd()

In [137]:
flow_df = pd.DataFrame(w2v_flow, index=dp.vocab, columns=dp.vocab)
flow_df.iloc[dp.idx1,dp.idx2]

Unnamed: 0,beautiful,president,greets,press,chicago,beautiful.1,morning
beautiful,0.142857,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.142857,0.0,0.0,0.0,0.0,0.0,0.0
speaks,0.0,0.0,0.142857,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.142857,0.0,0.0,0.0
illinois,0.0,0.0,0.0,0.0,0.142857,0.0,0.0
morning,0.0,0.0,0.0,0.0,0.0,0.0,0.142857
iraq,0.0,0.142857,0.0,0.0,0.0,0.0,0.0


In [138]:
dist_df = pd.DataFrame(dp.w2v_distances, index=dp.vocab, columns=dp.vocab)
cost_df = (flow_df * dist_df)
cost_df.iloc[dp.idx1,dp.idx2].round(3)

Unnamed: 0,beautiful,president,greets,press,chicago,beautiful.1,morning
beautiful,0.0,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.194,0.0,0.0,0.0,0.0,0.0,0.0
speaks,0.0,0.0,0.14,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.128,0.0,0.0,0.0
illinois,0.0,0.0,0.0,0.0,0.123,0.0,0.0
morning,0.0,0.0,0.0,0.0,0.0,0.0,0.0
iraq,0.0,0.202,0.0,0.0,0.0,0.0,0.0


## Using the LC-RWMD Paper

In [163]:
from sklearn.feature_extraction.text import TfidfVectorizer

X1 = [['beautiful','obama', 'speaks', 'media', 'illinois', 'morning','troops','return','iraq'],
      ['beautiful','obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning'],
      ["which","team","will","lose"],
      ['obama', 'speaks', 'media', 'illinois'],
      ["is", "it", "gonna", "rain", "tomorrow"],
      ['one','more','sentence','test','stuf'],
      ['you','shall','not','pass'],
      ['kennedy','was','here','suckers'],
      ['love','thy','neighbor'],
      ['eye','revenge','eye']]
X2 = [['beautiful','president', 'greets', 'press', 'chicago', 'beautiful','morning'],
      ['president', 'greets', 'press', 'chicago', 'evening'],
      ["who", "is", "winning"],
      ["the", "forecast", "says", "snow"],
      ['president', 'greets', 'press', 'chicago'],
      ['hello','called','love','you'],
      ['first','gang','die'],
      ['best','times','worst','times'],
      ['therefore','am'],
      ['two','people','couch']]

X1_sent = [" ".join(doc) for doc in X1]
X2_sent = [" ".join(doc) for doc in X2]

corpus = X1_sent + X2_sent

vectorizer = TfidfVectorizer(use_idf=False, norm='l1')
vectorizer.fit(corpus)

TfidfVectorizer(norm='l1', use_idf=False)

In [164]:
# Ask the fitted vectorizer for its vocabulary once and derive both lookup
# directions (word -> column index, column index -> word) from that list.
features = vectorizer.get_feature_names()
word2idx = {word: idx for idx, word in enumerate(features)}
idx2word = dict(enumerate(features))

In [165]:
X1_nbow = vectorizer.transform(X1_sent)
X2_nbow = vectorizer.transform(X2_sent)

In [166]:
# Embedding matrix: one word vector per vocabulary entry, stacked in the
# vectorizer's feature order, so row i belongs to the word with index i.
# NOTE(review): presumably fails if a vocabulary word is missing from the
# word-vector model — confirm for misspelled tokens.
E = np.vstack([model.word_vec(word) for word in vectorizer.get_feature_names()])

### 1. WMD

In [167]:
%time

def lc_rmwd(X1, X2, E):
    """Pairwise LC-RWMD lower bounds between the documents of X1 and X2.

    For each document, the distance from every row of ``E`` to the nearest
    word of that document is computed and multiplied with the nBOW weights
    of the other collection. The result is the element-wise maximum of the
    two directions.

    Parameters
    ----------
    X1, X2 : iterables of tokenised documents (lists of words). Every word
        is looked up in the notebook-level ``word2idx``.
    E : embedding matrix indexed by ``word2idx`` values.

    Returns
    -------
    numpy.ndarray
        One row per document of ``X2`` and one column per row of ``X1_nbow``.

    Notes
    -----
    Reads the notebook-level ``X1_nbow``, ``X2_nbow`` and ``word2idx``; they
    must describe the same documents as ``X1`` and ``X2``.
    """
    # Densify the nBOW matrices once: they are loop-invariant.
    X1_dense = X1_nbow.toarray()
    X2_dense = X2_nbow.toarray()

    def _nearest_word_distance(doc):
        # Distance from every row of E to its closest word vector in `doc`.
        doc_vectors = [E[word2idx[word]] for word in doc]
        return euclidean_distances(E, doc_vectors).min(axis=1)

    # Atasu et al. LC-RWMD, one-to-many: X1 weights against each X2 document ...
    D1 = [np.dot(X1_dense, _nearest_word_distance(doc2)) for doc2 in X2]
    # ... and X2 weights against each X1 document.
    D2 = [np.dot(X2_dense, _nearest_word_distance(doc1)) for doc1 in X1]

    # D2 is transposed so both directions share the (X2 rows, X1 columns) layout.
    D = np.maximum(np.vstack(D1), np.vstack(np.transpose(D2)))
    return D

D = lc_rmwd(X1, X2, E)

CPU times: user 5 µs, sys: 13 µs, total: 18 µs
Wall time: 31 µs


In [168]:
D

array([[2.67603522, 1.81671817, 3.15135673, 3.21397148, 3.06027203,
        3.2460289 , 3.22378486, 3.1274727 , 3.51476356, 3.99722511],
       [3.19047783, 2.9649595 , 3.20790558, 3.1792568 , 3.18651533,
        3.21061282, 3.30887027, 3.15144973, 3.83652458, 4.09514952],
       [3.39355318, 3.26714052, 2.23206997, 3.5532071 , 2.37339551,
        2.69132566, 2.63692048, 2.63050377, 3.31641388, 3.43541694],
       [3.23360121, 3.26467776, 2.75446972, 3.48126554, 2.55466804,
        2.77692407, 2.89998084, 2.91078559, 3.58882058, 3.76675439],
       [3.42006524, 3.24036213, 3.39855665, 3.1792568 , 3.35985279,
        3.30258918, 3.47133595, 3.29440826, 3.96092331, 4.23187703],
       [3.42859077, 3.22802567, 2.88373244, 3.61224806, 2.81812515,
        2.89510646, 2.16551995, 2.86413258, 2.41843826, 3.74121982],
       [3.54173197, 3.51915455, 2.93727962, 3.77527827, 2.94058593,
        2.8077389 , 3.0192438 , 3.04689725, 3.55112974, 3.90595492],
       [3.66774962, 3.51606223, 2.7351279

In [245]:
# Preference lists keyed by the string form of a row (resp. column) index of
# D; each value lists the indices on the other axis by increasing distance,
# also as strings.
# NOTE(review): D is later displayed with X2 documents as rows and X1
# documents as columns, so the X1_/X2_ prefixes may be swapped — confirm.
X1_pref = {str(idx): [str(v) for v in row.argsort()] for idx, row in enumerate(D)}
X2_pref = {str(idx): [str(v) for v in col.argsort()] for idx, col in enumerate(D.T)}

In [260]:
from flow_wmd.gale_shapeley import Matcher

matcher = Matcher(D)
engaged = matcher.matchmaker()
matcher.check()

  0 and 1
  2 and 2
  3 and 4
  4 and 3
  5 and 6
  6 and 5
  1 and 7
  5 dumped 6 for 9
  7 dumped 1 for 8
  3 dumped 4 for 1
  7 and 8
  6 and 0
  0 dumped 6 for 4
  6 and 9


True

In [261]:
engaged

{1: 0, 2: 2, 4: 3, 3: 1, 6: 5, 5: 9, 7: 8, 8: 7, 0: 4, 9: 6}

In [251]:
distance_df = pd.DataFrame(D, index=X2_sent, columns=X1_sent)
distance_df

Unnamed: 0,beautiful obama speaks media illinois morning troops return iraq,beautiful obama speaks media illinois beautiful morning,which team will lose,obama speaks media illinois,is it gonna rain tomorrow,one more sentence test stuf,you shall not pass,kennedy was here suckers,love thy neighbor,eye revenge eye
beautiful president greets press chicago beautiful morning,2.676035,1.816718,3.151357,3.213971,3.060272,3.246029,3.223785,3.127473,3.514764,3.997225
president greets press chicago evening,3.190478,2.96496,3.207906,3.179257,3.186515,3.210613,3.30887,3.15145,3.836525,4.09515
who is winning,3.393553,3.267141,2.23207,3.553207,2.373396,2.691326,2.63692,2.630504,3.316414,3.435417
the forecast says snow,3.233601,3.264678,2.75447,3.481266,2.554668,2.776924,2.899981,2.910786,3.588821,3.766754
president greets press chicago,3.420065,3.240362,3.398557,3.179257,3.359853,3.302589,3.471336,3.294408,3.960923,4.231877
hello called love you,3.428591,3.228026,2.883732,3.612248,2.818125,2.895106,2.16552,2.864133,2.418438,3.74122
first gang die,3.541732,3.519155,2.93728,3.775278,2.940586,2.807739,3.019244,3.046897,3.55113,3.905955
best times worst times,3.66775,3.516062,2.735128,3.942256,3.084657,2.974307,3.014276,3.161405,3.546272,3.688971
therefore am,3.521831,3.431462,2.510591,3.717776,2.912109,2.828554,2.61733,2.964637,3.496952,3.693427
two people couch,3.497154,3.429147,2.619702,3.750738,3.001796,2.75437,2.852828,2.987996,3.470866,3.658853


In [26]:
def WMD(X1, X2, X1_nbow, X2_nbow, E):
    """Word Mover's Distance between two tokenised documents, with its flow.

    Parameters
    ----------
    X1, X2 : iterables of words; each word is looked up in the
        notebook-level ``word2idx``.
    X1_nbow, X2_nbow : objects exposing ``toarray()``; row 0 of the dense
        array is read as the word weights of the document.
    E : embedding matrix indexed by ``word2idx`` values.

    Returns
    -------
    tuple
        ``(distance, flow, X1_idxs, X2_idxs)``. ``flow`` is the block of the
        pyemd flow matrix whose rows follow ``X1_idxs`` and whose columns
        follow ``X2_idxs``.
    """
    # Sort the unique word indices so the row/column order of the returned
    # flow block is deterministic (set iteration order is not guaranteed).
    X1_idxs = sorted({word2idx[word] for word in X1})
    X2_idxs = sorted({word2idx[word] for word in X2})
    n_source = len(X1_idxs)

    # pyemd needs both signatures over one joint word list:
    # source words first, then sink words.
    T_emd = E[X1_idxs + X2_idxs,]
    C_emd = euclidean_distances(T_emd, T_emd)
    X1_sig = np.concatenate((X1_nbow.toarray()[0, X1_idxs], np.zeros(len(X2_idxs))))
    X2_sig = np.concatenate((np.zeros(n_source), X2_nbow.toarray()[0, X2_idxs]))

    # Unpack into distinct names instead of a local called `WMD`, which
    # shadowed this function's own name.
    distance, flow = emd_with_flow(np.array(X1_sig, dtype=np.double),
                                   np.array(X2_sig, dtype=np.double),
                                   np.array(C_emd, dtype=np.double))

    # Keep only the source-rows x sink-columns block of the joint flow matrix.
    m = np.array(flow)[:n_source, n_source:]
    return (distance, m, X1_idxs, X2_idxs)

In [None]:
import bottleneck as bn

%time

def fast_get_L(X1, X2,
               X1_nbow, X2_nbow,
               D, E, n):
    """Per-document WMD bound derived from candidates selected in ``D``.

    For each document of ``X1``, row ``idx1`` of ``D`` is partitioned around
    position ``size - n`` and the last ``n`` positions are dropped. WMD is
    computed against every remaining candidate of ``X2`` and the largest
    value is recorded.

    Parameters
    ----------
    X1, X2 : sequences of tokenised documents.
    X1_nbow, X2_nbow : nBOW matrices, indexed by document position.
    D : 2-D array of lower-bound distances, read row by row via ``D[idx1]``.
    E : embedding matrix forwarded to ``WMD``.
    n : number of trailing partition positions to drop. It must satisfy
        ``1 <= n < row size``, otherwise no candidate remains.

    Returns
    -------
    list of (int, float)
        ``(idx1, L)`` pairs, one per document of ``X1``.
    """
    # NOTE(review): this keeps every candidate except the n largest entries
    # of the row; a k-nearest cut-off would keep the n smallest — confirm.
    L_values = []
    for idx1, doc1 in enumerate(X1):
        row = D[idx1]
        indeces = bn.argpartition(row, row.size - n)[:-n]
        WMD_s = [WMD(doc1, X2[idx2], X1_nbow[idx1], X2_nbow[idx2], E)[0]
                 for idx2 in indeces]
        L_values.append((idx1, max(WMD_s)))
    return L_values

# Call the function defined in this cell; `get_L` only exists after a later cell has run.
L_values = fast_get_L(X1, X2, X1_nbow, X2_nbow, D, E, 2)

In [37]:
list(range(len(X1)))

[0, 1, 2, 3, 4]

In [35]:
import bottleneck as bn

%time

def get_L(X1, X2,
          X1_nbow, X2_nbow,
          D, E, n):
    """Per-document WMD bound derived from candidates selected in ``D``.

    For each document of ``X1``, row ``idx1`` of ``D`` is partitioned around
    position ``size - n`` and the last ``n`` positions are dropped. WMD is
    computed against every remaining candidate of ``X2`` and the largest
    value is recorded.

    Parameters
    ----------
    X1, X2 : sequences of tokenised documents.
    X1_nbow, X2_nbow : nBOW matrices, indexed by document position.
    D : 2-D array of lower-bound distances, read row by row via ``D[idx1]``.
    E : embedding matrix forwarded to ``WMD``.
    n : number of trailing partition positions to drop. It must satisfy
        ``1 <= n < row size``, otherwise no candidate remains. The argument
        is now honoured instead of being overwritten with 2.

    Returns
    -------
    list of (int, float)
        ``(idx1, L)`` pairs, one per document of ``X1``.
    """
    # NOTE(review): this keeps every candidate except the n largest entries
    # of the row; a k-nearest cut-off would keep the n smallest — confirm.
    L_values = []
    for idx1, doc1 in enumerate(X1):
        row = D[idx1]
        indeces = bn.argpartition(row, row.size - n)[:-n]
        WMD_s = [WMD(doc1, X2[idx2], X1_nbow[idx1], X2_nbow[idx2], E)[0]
                 for idx2 in indeces]
        L_values.append((idx1, max(WMD_s)))
    return L_values

L_values = get_L(X1, X2, X1_nbow, X2_nbow,D, E, 2)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


In [32]:
L_values

[(0, 3.8610839431781536),
 (1, 3.411543455814765),
 (2, 3.4405574227655413),
 (3, 3.445993450033188),
 (4, 3.6422676818179833)]

In [31]:
# Show the second entry of `rmwds` as a labelled table. Each entry is a WMD
# result tuple: element [1] is the source-by-sink flow block, [2] and [3] are
# the word indices for its rows and columns.
# NOTE(review): `rmwds` is assigned by the R_WMD cell further down, so on a
# fresh kernel this cell has to run after it. The table holds flows, not
# costs, despite the variable name.
cost_df = pd.DataFrame(rmwds[1][1], 
                       index=[idx2word[idx] for idx in rmwds[1][2]], 
                       columns=[idx2word[idx] for idx in rmwds[1][3]])
cost_df

Unnamed: 0,chicago,evening,greets,president,press
beautiful,0.0,0.057143,0.057142,0.0,0.028572
illinois,0.142857,0.0,0.0,0.0,0.0
iraq,0.0,0.0,0.0,0.114286,0.028571
media,0.0,0.0,0.0,0.0,0.142857
morning,0.0,0.142857,0.0,0.0,0.0
obama,0.057143,0.0,0.0,0.085714,0.0
speaks,0.0,0.0,0.142857,0.0,0.0


In [30]:
def R_WMD(L_values, X1, X2, X1_nbow, X2_nbow):
    """Run WMD for every pair whose entry in ``D`` lies strictly below its bound.

    Parameters
    ----------
    L_values : iterable of indexable entries; element 0 is a row index into
        ``D`` / ``X1`` / ``X1_nbow`` and element 1 is the bound for that row.
    X1, X2 : sequences of tokenised documents.
    X1_nbow, X2_nbow : nBOW matrices, indexed by document position.

    Returns
    -------
    list
        The values returned by ``WMD`` for the selected pairs, in row order.

    Notes
    -----
    Reads the notebook-level ``D`` and ``E``.
    """
    results = []
    for entry in L_values:
        idx1, bound = entry[0], entry[1]
        # Lazily pick the column positions whose lower bound beats the threshold.
        selected = (idx2 for idx2, lower in enumerate(D[idx1]) if lower < bound)
        results.extend(
            WMD(X1[idx1], X2[idx2], X1_nbow[idx1], X2_nbow[idx2], E)
            for idx2 in selected
        )
    return results

rmwds = R_WMD(L_values, X1, X2, X1_nbow, X2_nbow)

## Modularize LC-RWMD

In [229]:
from sklearn.feature_extraction.text import TfidfVectorizer

X1 = [['beautiful','obama', 'speaks', 'media', 'illinois', 'morning','iraq'],
      ['beautiful','obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning'],
      ["which","team","will","lose"],
      ["is", "it", "gonna", "rain", "tomorrow"],
      ['obama', 'speaks', 'media', 'illinois']]
X2 = [['beautiful','president', 'greets', 'press', 'chicago', 'beautiful','morning'],
      ['president', 'greets', 'press', 'chicago', 'evening'],
      ["who", "is", "winning"],
      ["the", "forecast", "says", "snow"],
      ['president', 'greets', 'press', 'chicago'],
      ['hello','called','love','you']]

X1_sent = [" ".join(doc) for doc in X1]
X2_sent = [" ".join(doc) for doc in X2]

corpus = X1_sent + X2_sent

#vectorizer = TfidfVectorizer(use_idf=False, norm='l2')
vectorizer = TfidfVectorizer(use_idf=False, norm='l1')
vectorizer.fit(corpus)

TfidfVectorizer(norm='l1', use_idf=False)

In [178]:
# Ask the fitted vectorizer for its vocabulary once and derive both lookup
# directions (word -> column index, column index -> word) from that list.
features = vectorizer.get_feature_names()
word2idx = {word: idx for idx, word in enumerate(features)}
idx2word = {idx: word for idx, word in enumerate(features)}

In [179]:
word2idx['chicago']

2

In [256]:
#X1_corpus = 
%time
X1_nbow = vectorizer.transform(X1_sent)
X2_nbow = vectorizer.transform(X2_sent)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


In [102]:
# Embedding matrix: one word vector per vocabulary entry, stacked in the
# vectorizer's feature order, so row i belongs to the word with index i.
E = np.vstack([model.word_vec(word) for word in vectorizer.get_feature_names()])

In [642]:
from flow_wmd.modules_rwmd import Document, LC_RWMD, WMDCluster

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [208]:
X1_sent[0]

'beautiful obama speaks media illinois morning iraq'

In [239]:
X1_sent[1]

'beautiful obama speaks media illinois beautiful morning'

In [564]:
%time 

X1_docs, X2_docs = [], []

X1_nbow = vectorizer.transform(X1_sent)
X2_nbow = vectorizer.transform(X2_sent)

for idx, doc in enumerate(X1):
    X1_docs.append(Document(doc, X1_nbow[idx], word2idx, E))
    
for idx, doc in enumerate(X2):
    X2_docs.append(Document(doc, X2_nbow[idx], word2idx, E))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [565]:
%time

lc_rwmd = LC_RWMD(X1_docs, X2_docs,X1_nbow,X2_nbow,E)
lc_rwmd.get_D()
lc_rwmd.get_L(2)
lc_rwmd.get_rwmd()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


In [566]:
lc_rwmd.wmd_s

[0.7779423936888455,
 1.0577764527086124,
 0.5814076731150226,
 1.0601321450052137,
 1.1221033037203125,
 1.3456935750793113,
 1.33409258504228,
 1.1613731145914146,
 1.2665668245616257,
 1.3413949351295233,
 1.1287236809574008,
 1.1562737829503784,
 1.0927072101407647,
 1.0174646259300113]

In [569]:
X1_docs[0]

<flow_wmd.modules_rwmd.Document at 0x7f942b48d9a0>

In [649]:
X1_docs[0]

<flow_wmd.modules_rwmd.Document at 0x7f942b48d9a0>

In [655]:
%time

distances, wc_X1, wc_X2 = WMDCluster(X1_docs, X2_docs,E,idx2word).get_distances(return_flow = True)

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 2.86 µs


In [658]:
wc_X1

{'is': 1.27494,
 'iraq': 1.18948,
 'it': 1.4954100000000001,
 'beautiful': 2.7164,
 'which': 1.9112699999999998,
 'lose': 1.9683000000000002,
 'speaks': 3.5973,
 'obama': 4.23171,
 'will': 1.8686099999999999,
 'tomorrow': 1.46315,
 'media': 3.61458,
 'illinois': 3.6213599999999997,
 'rain': 1.4949000000000001,
 'gonna': 1.41928,
 'morning': 1.70016,
 'team': 1.9965}

In [578]:
%time

test = WMDCluster(X1_docs[0:2], X2_docs[0:2],E,idx2word).get_flow()[0]

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


In [632]:
test[2][:,0]

array([0.     , 0.     , 0.19653, 0.     , 0.     , 0.     , 0.     ])

In [580]:
pd.DataFrame(test[2], index=test[3],columns=test[4])

Unnamed: 0,beautiful,chicago,greets,morning,president,press
beautiful,0.0,0.0,0.0,0.0,0.0,0.0
illinois,0.0,0.12296,0.0,0.0,0.0,0.0
iraq,0.19653,0.0,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.0,0.0,0.12756
morning,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.0,0.0,0.0,0.0,0.19115,0.0
speaks,0.0,0.0,0.13973,0.0,0.0,0.0


In [521]:
# Element-wise product of the first two entries of `test`, reduced to the
# block whose rows come before and whose columns come after len(test[2]).
cost_m = test[0]*test[1]
cost_m = cost_m[:len(test[2]),len(test[2]):].round(5)
# `w1` / `w2` are not assigned in any visible cell; test[3] / test[4] are the
# labels the neighbouring cell uses for the matching table of test[2].
# NOTE(review): confirm they are the intended row/column labels.
pd.DataFrame(cost_m, index=test[3],columns=test[4])

Unnamed: 0,beautiful,chicago,greets,morning,president,press
beautiful,0.0,0.0,0.0,0.0,0.0,0.0
illinois,0.0,0.12296,0.0,0.0,0.0,0.0
iraq,0.19653,0.0,0.0,0.0,0.0,0.0
media,0.0,0.0,0.0,0.0,0.0,0.12756
morning,0.0,0.0,0.0,0.0,0.0,0.0
obama,0.0,0.0,0.0,0.0,0.19115,0.0
speaks,0.0,0.0,0.13973,0.0,0.0,0.0


In [69]:
# Unfinished experiment: collects, for every X2 document, the embedding rows
# of its unique words.
T2_full = []

# Atasu et al. LC-RWMD: many-to-many variant — NOT WORKING
for idx2,doc2 in enumerate(X2):
    # Unique vocabulary indices of the words in this X2 document
    X2_indeces = list(set([word2idx[word] for word in doc2]))
    # Atasu et al. RWMD: word vectors of the document
    T2 = E[X2_indeces,]
    T2_full.append(T2)
    #ET2 = np.dot(E)
    #Z_mtm =

# Stacks all collected rows and takes the minimum along axis 1, i.e. the
# smallest component of each word vector; no distance is computed here.
np.vstack(T2_full).min(axis=1)

array([-0.15780888, -0.17197762, -0.14504768, -0.12993725, -0.16405022,
       -0.1421869 , -0.22472994, -0.15382984, -0.14625083, -0.13629027,
       -0.13017109, -0.17396213, -0.12993725, -0.16405022, -0.15780888,
       -0.14504768, -0.161765  , -0.16040245, -0.16454686, -0.1797309 ],
      dtype=float32)

In [70]:
%time

# For every (X1 document, X2 document) pair compute the RWMD lower bound and,
# for comparison, the exact WMD via pyemd. Only the RWMD tuples are stored.
RWMD_s = []

for idx1, doc1 in enumerate(X1):
    # Unique vocabulary indices of the words in this X1 document
    X1_indeces = list(set([word2idx[word] for word in doc1]))

    # Atasu et al. RWMD: word vectors and nBOW weights of the document
    T1 = E[X1_indeces,]
    F1 = X1_nbow[idx1].toarray()[0, X1_indeces]

    similarity = []
    for idx2, doc2 in enumerate(X2):
        # Unique vocabulary indices of the words in this X2 document
        X2_indeces = list(set([word2idx[word] for word in doc2]))

        # Atasu et al. RWMD
        T2 = E[X2_indeces,]
        F2 = X2_nbow[idx2].toarray()[0, X2_indeces]

        C = euclidean_distances(T1, T2)
        RWMD = np.maximum(np.dot(np.min(C, axis=1), F1),
                          np.dot(np.min(C, axis=0), F2))

        similarity.append((doc1, idx2, doc2, RWMD))

        # Required by pyemd, not part of Atasu et al.: both signatures over
        # the joint word list (source words first, then sink words).
        T_emd = E[X1_indeces + X2_indeces,]
        C_emd = euclidean_distances(T_emd, T_emd)
        # Use the weights of the current pair (F1 / F2); the signatures were
        # previously read from the hard-coded rows X1_nbow[3] / X2_nbow[3].
        X1_sig = np.concatenate((F1, np.zeros(len(F2))))
        X2_sig = np.concatenate((np.zeros(len(F1)), F2))

        # Exact WMD; stored under its own names so the notebook-level `WMD`
        # function is not overwritten by this cell.
        wmd_value, wmd_flow = emd_with_flow(np.array(X1_sig, dtype=np.double),
                                            np.array(X2_sig, dtype=np.double),
                                            np.array(C_emd, dtype=np.double))

    RWMD_s.append(similarity)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


In [71]:
# Keep, for every X1 document, the k candidates with the smallest RWMD.
k = 5
k_RWMD = [sorted(rwmd, key=lambda t: t[3])[:k] for rwmd in RWMD_s]

# The loop variable has its own name so the constant `k` is not overwritten
# while printing (source document, candidate document, RWMD).
for nearest in k_RWMD:
    for scores in nearest:
        print(scores[0])
        print(scores[2])
        print(scores[3])
    print("\n")

['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['president', 'greets', 'press', 'chicago', 'evening']
1.0102365442684718
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['president', 'greets', 'press', 'chicago']
1.0826340743473597
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['hello', 'called', 'love', 'you']
1.1934219428471156
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['the', 'forecast', 'says', 'snow']
1.2727791241237094
['beautiful', 'obama', 'speaks', 'media', 'illinois', 'beautiful', 'morning']
['who', 'is', 'winning']
1.3249833924429755


['which', 'team', 'will', 'lose']
['who', 'is', 'winning']
1.148093432188034
['which', 'team', 'will', 'lose']
['the', 'forecast', 'says', 'snow']
1.2594012022018433
['which', 'team', 'will', 'lose']
['hello', 'called', 'love', 'you']
1.2821693122386932
['which', 'team', 'will', 'lose']
['president', 'greets', 'press