In [149]:
from bs4 import BeautifulSoup
from flow_wmd.documents import Document
from flow_wmd.models import LC_RWMD, WMD, WMDManyToMany
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import pandas as pd
import re

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [150]:
# Load the IMDB review dataset from the working directory; later cells read its
# 'review' (text) and 'sentiment' ("positive"/"negative") columns.
imdb_data = pd.read_csv("IMDB Dataset.csv")

In [151]:
# English stopwords from the NLTK corpus; remove_stopwords reads this
# module-level name when filtering tokens.
stopword_list=stopwords.words('english')

In [152]:
#Removing the html strips
def strip_html(text):
    """Return the text content of ``text`` with markup stripped.

    Parses ``text`` with BeautifulSoup using the "html.parser" backend and
    returns the result of ``get_text()``.
    """
    return BeautifulSoup(text, "html.parser").get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span (brackets included) from ``text``.

    The pattern matches an opening ``[``, any run of characters other than
    ``]``, and the closing ``]``. Unmatched brackets are left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with all bracketed spans removed.
    """
    # Raw string: in a normal literal "\[" is an invalid escape sequence,
    # which newer Python versions warn about when the cell is compiled.
    return re.sub(r'\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    """Clean one raw review: strip HTML markup first, then drop ``[...]`` spans."""
    return remove_between_square_brackets(strip_html(text))
# Apply denoise_text to every review. The column is overwritten in place, so
# re-running this cell re-applies the cleaning to already-cleaned text.
imdb_data['review']=imdb_data['review'].apply(denoise_text)

In [153]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    """Strip everything except ASCII letters and whitespace from ``text``.

    Args:
        text: Input string.
        remove_digits: When True (the default) digits are removed as well;
            when False the digits 0-9 are kept.

    Returns:
        The filtered string.
    """
    # The range must be "A-Z": "A-z" also spans the ASCII characters that sit
    # between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept by mistake.
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
# Apply remove_special_characters (default arguments) to every review,
# overwriting the column in place.
imdb_data['review']=imdb_data['review'].apply(remove_special_characters)

In [154]:
%time

#Lemmatizing the text
def simple_lemmatizer(text):
    """Lemmatize each whitespace-separated word of ``text``.

    A fresh WordNetLemmatizer is created per call and ``lemmatize`` is invoked
    with its default arguments; the results are re-joined with single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = map(wordnet.lemmatize, text.split())
    return ' '.join(lemmas)
# Lemmatize every review, overwriting the column in place.
# NOTE(review): the sample outputs further down contain tokens such as 'wa' and
# 'ha', which suggests 'was'/'has' are truncated here before the stopword filter
# runs and therefore escape it; consider removing stopwords first. Confirm.
imdb_data['review']=imdb_data['review'].apply(simple_lemmatizer)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs


In [155]:
# Set of English stopwords.
# NOTE(review): `stop` is not referenced by any later cell visible here
# (remove_stopwords reads `stopword_list`); confirm whether it is still needed.
%time stop=set(stopwords.words('english'))

CPU times: user 401 µs, sys: 1.15 ms, total: 1.55 ms
Wall time: 1.97 ms


In [156]:
# Shared tokenizer instance used by remove_stopwords, tokenize and remove_oov.
tokenizer=ToktokTokenizer()

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Drop every token of ``text`` that appears in the global ``stopword_list``.

    Args:
        text: Input string; it is tokenised with the shared ``tokenizer``.
        is_lower_case: When True the tokens are compared and returned as they
            are. When False (the default) each token is lower-cased before the
            comparison and the lower-cased form is what is returned.

    Returns:
        The surviving tokens joined with single spaces.
    """
    # Build the lookup once per document: set membership replaces a linear scan
    # of the stopword list for every token.
    stopword_lookup = set(stopword_list)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if not is_lower_case:
        tokens = [token.lower() for token in tokens]
    return ' '.join(token for token in tokens if token not in stopword_lookup)
# Remove stopwords from every review (tokens are lower-cased by the default
# is_lower_case=False path), overwriting the column in place.
# NOTE(review): punctuation was stripped in an earlier cell, so contractions
# such as 'youll'/'wouldnt' (visible in the sample output) presumably no longer
# match the apostrophised stopword spellings; verify the intended order.
%time imdb_data['review']=imdb_data['review'].apply(remove_stopwords)

CPU times: user 30.5 s, sys: 246 ms, total: 30.7 s
Wall time: 31.1 s


In [157]:
# Inspect the first review after all preprocessing steps above.
imdb_data['review'][0]

'one reviewer ha mentioned watching oz episode youll hooked right exactly happened methe first thing struck oz wa brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home manyaryans muslims gangsta latinos christians italians irish moreso scuffle death stare dodgy dealing shady agreement never far awayi would say main appeal show due fact go show wouldnt dare forget pretty picture painted mainstream audience forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty wa surreal couldnt say wa ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard wholl sold nickel inmate wholl kill order get away well mannered middle class inmate t

In [323]:
# Split the cleaned reviews by sentiment label; each subset is re-indexed from 0.
pos = imdb_data.loc[imdb_data["sentiment"] == "positive"].reset_index(drop=True)
neg = imdb_data.loc[imdb_data["sentiment"] == "negative"].reset_index(drop=True)

In [324]:
# Keep only the review strings, as plain Python lists. The names are rebound
# from DataFrame to list, so this cell cannot be re-run on its own.
pos = pos["review"].tolist()
neg = neg["review"].tolist()

## Word Mover's Distance (WMD)

In [366]:
def tokenize(text):
    """Split ``text`` into tokens using the shared module-level ``tokenizer``."""
    return tokenizer.tokenize(text)

# Tokenise only the first 500 reviews of each class to keep later steps tractable.
pos_tok = [tokenize(review) for review in pos[:500]]
neg_tok = [tokenize(review) for review in neg[:500]]

In [367]:
# Re-join each token list into a single space-separated string.
pos_ = list(map(" ".join, pos_tok))
neg_ = list(map(" ".join, neg_tok))

In [360]:
def read_1w_corpus(name, sep="\t"):
    """Lazily yield each line of the file ``name`` split on ``sep``.

    Args:
        name: Path of the text file to read.
        sep: Field separator passed to ``str.split`` (tab by default).

    Yields:
        One list of fields per line. The line terminator is not stripped, so
        it stays attached to the last field.
    """
    # Context manager so the handle is closed once the generator is exhausted
    # or closed, instead of leaking an open file.
    with open(name) as handle:
        for line in handle:
            yield line.split(sep)

# Load the pretrained word2vec vectors (binary format, gzip-compressed file in
# the working directory). The recorded run took over a minute of wall time.
print("Loading GoogleNews Vectors")
%time model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

Loading GoogleNews Vectors
CPU times: user 45 s, sys: 13.3 s, total: 58.4 s
Wall time: 1min 23s


In [368]:
# Vocabulary pass: fit a vectorizer (IDF disabled) on the sampled positive and
# negative reviews; the next cell uses its feature names for the
# out-of-vocabulary check.
corpus = pos_ + neg_

%time vectorizer = TfidfVectorizer(use_idf=False, tokenizer=tokenize, norm='l1')
%time vectorizer.fit(corpus)

CPU times: user 27 µs, sys: 1e+03 ns, total: 28 µs
Wall time: 30 µs
CPU times: user 294 ms, sys: 6.38 ms, total: 300 ms
Wall time: 305 ms


TfidfVectorizer(norm='l1', tokenizer=<function tokenize at 0x7ffa97122af0>,
                use_idf=False)

In [369]:
# Corpus tokens that have no vector in the loaded word2vec model.
# NOTE(review): `KeyedVectors.vocab` and `get_feature_names()` belong to older
# gensim / scikit-learn releases; confirm the pinned versions before re-running.
%time oov = [word for word in vectorizer.get_feature_names() if word not in model.vocab.keys()]

CPU times: user 23.2 ms, sys: 1.1 ms, total: 24.3 ms
Wall time: 23.7 ms


In [370]:
# Number of out-of-vocabulary tokens found in the sampled corpus.
len(oov)

5426

In [371]:
#Removing out-of-vocabulary tokens
def remove_oov(text):
    """Return ``text`` with every token listed in the global ``oov`` removed.

    The text is tokenised with the shared ``tokenizer``, each token is stripped
    of surrounding whitespace, and the surviving tokens are re-joined with
    single spaces.
    """
    # Build the lookup once per document: set membership replaces a linear scan
    # of the whole ``oov`` list for every single token.
    oov_lookup = set(oov)
    tokens = (token.strip() for token in tokenizer.tokenize(text))
    return ' '.join(token for token in tokens if token not in oov_lookup)

# Drop out-of-vocabulary tokens from both document lists (rebinds pos_/neg_).
%time pos_ = list(map(remove_oov, pos_))
%time neg_ = list(map(remove_oov, neg_))

CPU times: user 4.66 s, sys: 38.8 ms, total: 4.7 s
Wall time: 4.76 s
CPU times: user 4.92 s, sys: 71.4 ms, total: 4.99 s
Wall time: 5.15 s


In [372]:
# Inspect the first positive review after out-of-vocabulary filtering.
pos_[0]

'one reviewer ha mentioned watching oz episode youll hooked right exactly happened first thing struck oz wa brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use called oz nickname given maximum security state focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home muslims gangsta latinos christians italians irish moreso scuffle death stare dodgy dealing shady agreement never far would say main appeal show due fact go show wouldnt dare forget pretty picture painted mainstream audience forget charm forget mess around first episode ever saw struck nasty wa surreal couldnt say wa ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison experience watching oz may bec

In [406]:
# Refit on the OOV-filtered documents, this time with IDF weighting enabled.
# `corpus` and `vectorizer` are rebound here, replacing the earlier
# vocabulary-only vectorizer.
corpus = pos_ + neg_

%time vectorizer = TfidfVectorizer(use_idf=True, tokenizer=tokenize,norm='l1')
%time vectorizer.fit(corpus)

CPU times: user 32 µs, sys: 8 µs, total: 40 µs
Wall time: 42.9 µs




CPU times: user 275 ms, sys: 27.7 ms, total: 303 ms
Wall time: 338 ms


TfidfVectorizer(norm='l1', tokenizer=<function tokenize at 0x7ffa97122af0>)

In [407]:
%time
pos_nbow = vectorizer.transform(pos_)
neg_nbow = vectorizer.transform(neg_)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [408]:
# Re-tokenise the OOV-filtered documents so the token lists match the text
# that was vectorised.
pos_tok = [tokenize(document) for document in pos_]
neg_tok = [tokenize(document) for document in neg_]

In [409]:
# First ten tokens of the first positive document.
pos_tok[0][:10]

['one',
 'reviewer',
 'ha',
 'mentioned',
 'watching',
 'oz',
 'episode',
 'youll',
 'hooked',
 'right']

In [410]:
# Sanity check: recompute the out-of-vocabulary list against the refitted
# vectorizer; the next cell displays its length.
%time oov_ = [word for word in vectorizer.get_feature_names() if word not in model.vocab.keys()]

CPU times: user 27.5 ms, sys: 16.2 ms, total: 43.7 ms
Wall time: 57.9 ms


In [411]:
# Number of out-of-vocabulary tokens left after filtering.
len(oov_)

0

In [412]:
# Fetch the feature list once and derive both lookup tables from it, so the
# two mappings are built from the same list.
features = vectorizer.get_feature_names()
word2idx = {word: idx for idx, word in enumerate(features)}
idx2word = dict(enumerate(features))

In [413]:
# Embedding matrix: one row per vectorizer feature, stacked in feature order.
E = np.vstack(list(map(model.word_vec, vectorizer.get_feature_names())))

In [414]:
%time 

pos_docs, neg_docs = [], []

for idx, doc in enumerate(pos_tok):
    pos_docs.append(Document(doc, pos_nbow[idx], word2idx, E))
    
for idx, doc in enumerate(neg_tok):
    neg_docs.append(Document(doc, neg_nbow[idx], word2idx, E))

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 2.86 µs


In [426]:
# Build the LC_RWMD model over both document sets and call get_D();
# presumably this populates `lc_rwmd.D`, which the matching cell reads.
# Confirm against flow_wmd.
%time lc_rwmd = LC_RWMD(pos_docs, neg_docs,pos_nbow,neg_nbow,E)
%time lc_rwmd.get_D()
# Disabled alternative calls, kept for reference:
#%time lc_rwmd.get_L(1)
#%time lc_rwmd.get_rwmd()

In [432]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so dependencies are visible in one place.
from flow_wmd.gale_shapeley import Matcher

# Pair documents using the matrix in lc_rwmd.D. Going by the module name this
# is presumably a Gale-Shapley stable matching; confirm against flow_wmd.
matcher = Matcher(lc_rwmd.D)
engaged = matcher.matchmaker()
matcher.check()  # return value is discarded here
pairs = engaged  # alias consumed by the WMDPairs cells

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top. This cell has no execution count, i.e. it was not run in the
# saved session.
from flow_wmd.models import WMDPairs

# Distances for the matched (positive, negative) pairs only.
wmd_pairs = WMDPairs(pos_docs,neg_docs,pairs,E,idx2word)
%time wmd_pairwise = wmd_pairs.get_distances()

In [449]:
# Same matched pairs, this time requesting flow information as well. The next
# cell indexes the result with [1] and treats that element as a word -> value
# mapping. `wmd_pairwise` is rebound here.
wmd_pairs_flow = WMDPairs(pos_docs,neg_docs,pairs,E,idx2word)
%time wmd_pairwise = wmd_pairs_flow.get_distances(return_flow = True)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [453]:
# Top 30 entries of wmd_pairwise[1], largest value first.
dict(sorted(wmd_pairwise[1].items(), key=lambda item: item[1], reverse=True)[:30])

{'film': 4.906220000000002,
 'wa': 4.775099999999999,
 'movie': 3.6967100000000013,
 'story': 3.62124,
 'ha': 3.5483599999999993,
 'great': 3.009169999999998,
 'love': 2.9116400000000007,
 'performance': 2.51636,
 'best': 2.4944900000000003,
 'character': 2.452740000000001,
 'well': 2.3896100000000002,
 'scene': 2.292920000000001,
 'life': 2.2134100000000005,
 'watch': 2.13619,
 'one': 2.12931,
 'still': 2.1001800000000013,
 'young': 2.08547,
 'show': 2.0658999999999996,
 'excellent': 2.04221,
 'comedy': 2.02044,
 'role': 1.97947,
 'people': 1.9748000000000006,
 'like': 1.9634199999999997,
 'good': 1.9615299999999998,
 'time': 1.9538799999999987,
 'actor': 1.9072100000000003,
 'dvd': 1.8441400000000001,
 'family': 1.8372299999999993,
 'fan': 1.8288100000000003,
 'see': 1.8005600000000004}

In [None]:
# Many-to-many distances on a 20 x 20 subset, without flow information.
# This cell has no execution count, i.e. it was not run in the saved session.
%time distances = WMDManyToMany(pos_docs[:20], neg_docs[:20],E,idx2word).get_distances(return_flow = False)

In [418]:
# Many-to-many distances on a 100 x 100 subset with flow information; also
# returns two word -> value mappings (wc_X1, wc_X2) inspected in the next cells.
# Expensive: the recorded run took about 31 minutes of wall time.
%time distances, wc_X1, wc_X2 = WMDManyToMany(pos_docs[:100],neg_docs[:100],E,idx2word).get_distances(return_flow = True)

CPU times: user 59min 8s, sys: 6min 21s, total: 1h 5min 30s
Wall time: 30min 51s


In [421]:
# Top 20 entries of wc_X1, largest value first.
dict(sorted(wc_X1.items(), key=lambda item: item[1], reverse=True)[:20])

{'movie': 133.19554999999986,
 'wa': 121.20958999999971,
 'film': 99.08134000000021,
 'ha': 98.73361000000021,
 'old': 71.79807000000012,
 'love': 59.92746,
 'work': 59.43450999999994,
 'character': 59.22272000000011,
 'well': 57.48055999999976,
 'story': 55.968279999999936,
 'masterpiece': 52.472030000000075,
 'fan': 52.02825999999998,
 'life': 50.626579999999905,
 'series': 50.07584999999988,
 'young': 50.036920000000045,
 'actor': 49.58609000000011,
 'like': 49.291179999999905,
 'original': 49.11823000000004,
 'gore': 48.63597000000007,
 'doe': 48.189729999999955}

In [422]:
# Top 20 entries of wc_X2, largest value first.
dict(sorted(wc_X2.items(), key=lambda item: item[1], reverse=True)[:20])

{'wa': 212.0496599999999,
 'movie': 186.80768999999995,
 'film': 134.59645999999975,
 'worst': 101.03730999999995,
 'bad': 93.02847999999994,
 'plot': 73.08126000000011,
 'acting': 70.24372000000012,
 'ha': 69.90302000000003,
 'actor': 69.22724000000014,
 'story': 67.40877000000003,
 'scene': 63.63788999999996,
 'funny': 62.16114999999998,
 'dont': 61.35071999999992,
 'like': 59.817919999999916,
 'boring': 58.63522000000005,
 'watch': 58.41860999999997,
 'ive': 58.01862999999999,
 'seen': 56.070369999999905,
 'terrible': 55.73976000000005,
 'made': 55.3686699999999}