In [58]:
from bs4 import BeautifulSoup
from flow_wmd.documents import Document
from flow_wmd.models import LC_RWMD, WMD, WMDManyToMany
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import pandas as pd
import re

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Prepare Reddit sports data

### 1.1 Load data and stopwords.

In [59]:
# Read the two subreddit dumps and stack them into one frame with a fresh 0..n-1 index.
nfl = pd.read_csv("/Users/jack/Desktop/benchmarking datasets/reddit_sports/nfl.csv")
nba = pd.read_csv("/Users/jack/Desktop/benchmarking datasets/reddit_sports/nba.csv")
df = pd.concat([nfl, nba], ignore_index=True)

# English stopwords from NLTK, used by remove_stopwords below.
stopword_list = stopwords.words('english')

### 1.2 Initialize cleanup functions 

Functions for removing noisy formatting, lemmatizing, and removing stopwords.

In [60]:
# Custom preprocessing functions
# Partly self-authored, partly from https://www.kaggle.com/lakshmi25npathi/sentiment-analysis-of-imdb-movie-reviews

#Removing the html strips
def strip_html(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span (brackets included) from ``text``.

    A span runs from ``[`` to the next ``]``; text without a closing
    bracket is left untouched.
    """
    # Raw string: '\[' and '\]' are invalid escape sequences in a normal
    # string literal and trigger a warning on recent Python versions.
    return re.sub(r'\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    """Replace doubled ``<br / >`` markers with a space, then strip HTML and ``[...]`` spans."""
    without_breaks = re.sub('<br / ><br / >', ' ', text)
    return remove_between_square_brackets(strip_html(without_breaks))
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    """Strip everything except ASCII letters and whitespace from ``text``.

    Parameters
    ----------
    text : str
        Input string.
    remove_digits : bool, default True
        When True (the default) digits are removed as well; when False
        the digits 0-9 are kept.

    Returns
    -------
    str
        ``text`` with all disallowed characters deleted (not replaced by spaces).
    """
    # The range must be 'A-Z', not 'A-z': the latter also covers the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive cleaning.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

#Lemmatizing the text
def simple_lemmatizer(text):
    """Lemmatize each whitespace-separated word of ``text`` and re-join with single spaces."""
    wnl = WordNetLemmatizer()
    lemmas = map(wnl.lemmatize, text.split())
    return ' '.join(lemmas)

#removing the stopwords
def remove_stopwords(text, stopword_list, is_lower_case=False):
    """Tokenize ``text`` and drop every token found in ``stopword_list``.

    Uses the module-level ``tokenizer``, which must be defined before the
    first call. When ``is_lower_case`` is False (the default) tokens are
    lower-cased before the lookup and are returned lower-cased; when True
    they are compared and returned as-is.

    Returns the surviving tokens joined by single spaces.
    """
    stripped = (tok.strip() for tok in tokenizer.tokenize(text))
    if is_lower_case:
        kept = [tok for tok in stripped if tok not in stopword_list]
    else:
        lowered = (tok.lower() for tok in stripped)
        kept = [tok for tok in lowered if tok not in stopword_list]
    return ' '.join(kept)

### 1.3 Remove special formatting and stopwords

Remove stopwords before denoising, lemmatizing and removing special characters.

In [61]:
%time 

tokenizer=ToktokTokenizer()
df['body_clean']= [remove_stopwords(r, stopword_list) for r in df['body']]

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.96 µs


Denoise, remove special characters, lemmatize.

In [62]:
%time

df['body_clean']=df['body_clean'].apply(denoise_text)
df['body_clean']=df['body_clean'].apply(remove_special_characters)
df['body_clean']=df['body_clean'].apply(simple_lemmatizer)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.2 µs


Remove stopwords again, after other preprocessing.

In [63]:
%time 

df['body_clean']= [remove_stopwords(r, stopword_list) for r in df['body_clean']]

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.2 µs


Remove reddit formatting.

In [64]:
from redditcleaner import clean

# Run redditcleaner's `clean` over every preprocessed comment body.
df['body_clean'] = df['body_clean'].apply(clean)


Data _before_ preprocessing.

In [65]:
# Raw text of the comment at row label 0.
df['body'][0]

'Yeah, and I think he got the offer (although never confirmed), but turned it down. Certainly gonna be a concern he leaves us in the coming years for the college game but that is a lot different (and less enticing) switch than being poached by an NFL team to be their HC. \n\nWhiz seriously would do well as a college coach though. I just hope he prefers the Chargers. We all bitch about how our offense is run sometimes, but he is at the worst an above average OC that offers unique consistency in staying with the team for long periods of time. A lot of our problems are unavoidable as well, considering our OL is a bunch of fucking goons that require an assistant OL coach position just to make sure everyone’s shoes are tied. It’s gotta be a pain in the ass scheming around Dan “fuck pass protection” Feeney when the Rams are shoving Suh and Donald down your throat every play.'

Data _after_ preprocessing.

In [66]:
# The same comment after the full cleaning pipeline.
df['body_clean'][0]

'yeah think got offer although never confirmed turned certainly gonna concern leaf u coming year college game lot different le enticing switch poached nfl team hc whiz seriously would well college coach though hope prefers charger bitch offense run sometimes worst average oc offer unique consistency staying team long period time lot problem unavoidable well considering ol bunch fucking goon require assistant ol coach position make sure everyone shoe tied gotta pain scheming around dan fuck pas protection feeney ram shoving suh donald throat every play'

### 1.4 Separate NFL and NBA comments

In [67]:
# One frame per subreddit, each re-indexed from 0.
nfl = df.loc[df["subreddit"] == "nfl"].reset_index(drop=True)
nba = df.loc[df["subreddit"] == "nba"].reset_index(drop=True)

In [68]:
# Keep only the cleaned text, as plain Python lists of strings.
# Note: this rebinds `nfl` / `nba` from DataFrames to lists, so the cell is not re-runnable on its own.
nfl = nfl["body_clean"].tolist()
nba = nba["body_clean"].tolist()

## 2. WMD

### 2.1 Tokenize and "sample" data

In [69]:
def tokenize(text):
    """Split ``text`` into tokens using the module-level ``tokenizer``."""
    return tokenizer.tokenize(text)

# Token lists for every document in each corpus.
nfl_tok = [tokenize(doc) for doc in nfl]
nba_tok = [tokenize(doc) for doc in nba]

In [70]:
# Re-join the tokens so every document is a single space-separated string.
nfl_sample = list(map(" ".join, nfl_tok))
nba_sample = list(map(" ".join, nba_tok))

### 2.2 Load pretrained Google News W2V model

In [71]:
def read_1w_corpus(name, sep="\t"):
    """Lazily yield each line of the file ``name`` split on ``sep``.

    Parameters
    ----------
    name : str or path-like
        Path of the text file to read.
    sep : str, default "\\t"
        Field separator passed to ``str.split``.

    Yields
    ------
    list of str
        The fields of one line; the trailing newline stays attached to the
        last field.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed; the bare open() used before never closed it explicitly.
    with open(name) as handle:
        for line in handle:
            yield line.split(sep)

# Load the pretrained word2vec vectors (binary format) from a machine-specific local path.
# The captured output shows this taking about a minute.
print("Loading GoogleNews Vectors")
%time model = KeyedVectors.load_word2vec_format('/Users/jack/Downloads/GoogleNews-vectors-negative300.bin.gz', binary=True)

Loading GoogleNews Vectors
CPU times: user 54.8 s, sys: 7.17 s, total: 1min 1s
Wall time: 1min 11s


### 2.3 Load corpus and remove OOV words

In [72]:
# First pass: fit on the combined corpus to enumerate its vocabulary, which the
# next cell compares against the embedding model to find OOV words.
corpus = nfl_sample + nba_sample

# use_idf=False: plain term frequencies, L1-normalized per document.
%time vectorizer = TfidfVectorizer(use_idf=False, tokenizer=tokenize, norm='l1')
%time vectorizer.fit(corpus)

CPU times: user 69 µs, sys: 758 µs, total: 827 µs
Wall time: 2.18 ms




CPU times: user 1.32 s, sys: 2.26 s, total: 3.58 s
Wall time: 8.63 s


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l1', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function tokenize at 0x1a18ba6b00>, use_idf=False,
                vocabulary=None)

In [73]:
%time oov = [word for word in vectorizer.get_feature_names() if word not in model.key_to_index.keys()]

CPU times: user 57.1 ms, sys: 157 ms, total: 215 ms
Wall time: 340 ms


In [74]:
# Number of vocabulary terms without a pretrained vector.
len(oov)

6508

In [75]:
#removing the oov words
def remove_oov(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in oov]
    #filtered_tokens = filter(lambda token: token not in oov, tokens)
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

%time nfl_sample = list(map(remove_oov, nfl_sample))
%time nba_sample = list(map(remove_oov, nba_sample))

CPU times: user 21.1 s, sys: 223 ms, total: 21.3 s
Wall time: 21.6 s
CPU times: user 19.6 s, sys: 180 ms, total: 19.8 s
Wall time: 20 s


In [78]:
# Remove documents that are empty after removing stopwords and OOV words.
# Build the filtered list in one pass: deleting from a list while enumerating it
# shifts the remaining items left, so the element right after each deletion is
# never inspected and consecutive empty documents would survive.
# Slice assignment keeps the same list objects, as the in-place `del` did.
nba_sample[:] = [doc for doc in nba_sample if doc.split()]
nfl_sample[:] = [doc for doc in nfl_sample if doc.split()]

In [81]:
# Second pass: refit on the corpus after OOV words and empty documents were removed.
corpus = nfl_sample + nba_sample

# use_idf=True this time: the document weights are TF-IDF values, L1-normalized per document.
%time vectorizer = TfidfVectorizer(use_idf=True, tokenizer=tokenize,norm='l1')
%time vectorizer.fit(corpus)

CPU times: user 111 µs, sys: 601 µs, total: 712 µs
Wall time: 1.37 ms
CPU times: user 1.03 s, sys: 52 ms, total: 1.08 s
Wall time: 1.14 s


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l1', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function tokenize at 0x1a18ba6b00>, use_idf=True,
                vocabulary=None)

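Transform both corpora into normalized bag-of-words (nBOW) vectors, using the TF-IDF weights of the vectorizer fitted above.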

In [82]:
%time
nfl_nbow = vectorizer.transform(nfl_sample)
nba_nbow = vectorizer.transform(nba_sample)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.2 µs


In [83]:
# Re-tokenize the filtered documents so the token lists line up with the nBOW rows.
nfl_tok = [tokenize(doc) for doc in nfl_sample]
nba_tok = [tokenize(doc) for doc in nba_sample]

In [84]:
# First 20 tokens of the first NFL document.
nfl_tok[0][:20]

['yeah',
 'think',
 'got',
 'offer',
 'although',
 'never',
 'confirmed',
 'turned',
 'certainly',
 'gonna',
 'concern',
 'leaf',
 'u',
 'coming',
 'year',
 'college',
 'game',
 'lot',
 'different',
 'le']

In [85]:
%time oov_ = [word for word in vectorizer.get_feature_names() if word not in model.key_to_index.keys()]

CPU times: user 54.8 ms, sys: 214 ms, total: 269 ms
Wall time: 490 ms


In [86]:
# Remaining OOV terms after filtering and refitting.
len(oov_)

0

### 2.4 Get features and embeddings

In [87]:
# Fetch the vocabulary once and derive both lookup tables from the same list,
# rather than calling get_feature_names() three times.
features = vectorizer.get_feature_names()
word2idx = {word: idx for idx, word in enumerate(features)}  # word -> feature index
idx2word = dict(enumerate(features))                         # feature index -> word

Get the embedding matrix "E" for all features.

In [88]:
# One row per vocabulary term, in feature-index order.
# get_vector replaces the deprecated word_vec call.
E = np.vstack([model.get_vector(word) for word in vectorizer.get_feature_names()])

  """Entry point for launching an IPython kernel.


### 2.5 Initialize documents

Transform all comments into "documents", each with a set of weights per word in the corpus ("nbow"), the sum of these weights ("weights_sum"), the indices of the words in the document ("idxs") and the word vectors corresponding to each word ("vecs").

In [89]:
%time 

nfl_docs, nba_docs = [], []

for idx, doc in enumerate(nfl_tok):
    nfl_docs.append(Document(doc, nfl_nbow[idx], word2idx, E))
    
for idx, doc in enumerate(nba_tok):
    nba_docs.append(Document(doc, nba_nbow[idx], word2idx, E))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.2 µs


In [90]:
# nBOW weights of the first NFL document.
nfl_docs[0].nbow

array([[0., 0., 0., ..., 0., 0., 0.]])

In [91]:
# Sum of that document's weights.
nfl_docs[0].weights_sum

0.9999999999999998

In [92]:
# First ten word indices of the document.
nfl_docs[0].idxs[:10]

[6670, 15889, 1049, 6684, 11804, 3106, 16422, 6695, 6698, 15922]

In [93]:
# First ten components of the document's first word vector.
nfl_docs[0].vecs[:1][0][:10]

array([ 0.26367188,  0.07568359,  0.16699219,  0.29101562,  0.04443359,
       -0.06835938, -0.06591797, -0.04614258,  0.13574219,  0.16113281],
      dtype=float32)

### 2.6 Linear-Complexity Relaxed WMD (LC-RWMD)

Run the [Linear-Complexity Relaxed WMD](https://arxiv.org/abs/1711.07227) to get the distances between all NFL and all NBA comments.

In [94]:
# Sanity check: report any documents that are still empty in either corpus.
# Printing the empty document itself would only emit a blank line, so report
# the corpus name and the offending positions instead.
for corpus_name, sample in (("nfl", nfl_sample), ("nba", nba_sample)):
    empty_idxs = [idx for idx, doc in enumerate(sample) if not doc.split()]
    if empty_idxs:
        print(f"{corpus_name}: {len(empty_idxs)} empty document(s), first indices: {empty_idxs[:10]}")
    

In [95]:
# Build the LC-RWMD model over both document sets and run get_D().
# Presumably get_D() populates `lc_rwmd.D`, which the matching step reads — TODO confirm.
# The captured output shows get_D() taking roughly an hour of wall time.
%time lc_rwmd = LC_RWMD(nfl_docs, nba_docs,nfl_nbow,nba_nbow,E)
%time lc_rwmd.get_D()
# NOTE(review): the two calls below are disabled; remove them if they are no longer needed.
#%time lc_rwmd.get_L(1)
#%time lc_rwmd.get_rwmd()

CPU times: user 21 µs, sys: 348 µs, total: 369 µs
Wall time: 1.66 ms
CPU times: user 2h 24min 11s, sys: 49min 46s, total: 3h 13min 58s
Wall time: 58min 7s


### 2.7 Gale–Shapley Pairing

Use the [Gale–Shapley matching algorithm](https://en.wikipedia.org/wiki/Gale%E2%80%93Shapley_algorithm) to find the optimal pairs between NFL and NBA comments. This iterates over all the comments and finds the set of matches that pairs each comment with its optimal match, given that all NFL comments have to be matched with an NBA comment and vice versa. The output is a dictionary of key-value pairs, where each pair represents an optimal match.

In [96]:
from flow_wmd.gale_shapeley import Matcher

# Match documents on the LC-RWMD result, then run the matcher's own check.
matcher = Matcher(lc_rwmd.D)
pairs = engaged = matcher.matchmaker()
matcher.check()

Let's look at the output of Gale–Shapley:

In [97]:
from itertools import islice

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = islice(iterable, n)
    return [item for item in head]


# First ten matched index pairs.
take(10, pairs.items())

[(4497, 0),
 (2364, 1),
 (1962, 3119),
 (4245, 100),
 (2007, 4184),
 (4000, 965),
 (3508, 2964),
 (977, 53),
 (533, 19),
 (3792, 3675)]

### 2.8 Pairwise WMD

Calculate the pairwise distances between the documents selected by the Gale–Shapley algorithm _without_ returning the flow between individual words.

In [98]:
from flow_wmd.models import WMDPairs

# Distances only (no word-level flow) for the pairs chosen by the matcher.
wmd_pairs = WMDPairs(nfl_docs,nba_docs,pairs,E,idx2word)
%time wmd_pairwise = wmd_pairs.get_distances()

Calculated distances between 0 documents.
Calculated distances between 100 documents.
Calculated distances between 200 documents.
Calculated distances between 300 documents.
Calculated distances between 400 documents.
Calculated distances between 500 documents.
Calculated distances between 600 documents.
Calculated distances between 700 documents.
Calculated distances between 800 documents.
Calculated distances between 900 documents.
Calculated distances between 1000 documents.
Calculated distances between 1100 documents.
Calculated distances between 1200 documents.
Calculated distances between 1300 documents.
Calculated distances between 1400 documents.
Calculated distances between 1500 documents.
Calculated distances between 1600 documents.
Calculated distances between 1700 documents.
Calculated distances between 1800 documents.
Calculated distances between 1900 documents.
Calculated distances between 2000 documents.
Calculated distances between 2100 documents.
Calculated distances b

The return value is a matrix of distances between the document pairs.

In [99]:
# Distance matrix returned by get_distances().
wmd_pairwise

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Calculate the pairwise distances between the documents selected by the Gale–Shapley algorithm, this time also returning the flow between individual words.

In [100]:
# Same pairs as before, but return_flow=True also returns per-word contributions
# (indexed below as elements 0, 1 and 2 of the result).
wmd_pairs_flow = WMDPairs(nfl_docs,nba_docs,pairs,E,idx2word)
%time wmd_pairwise_flow = wmd_pairs_flow.get_distances(return_flow = True)

Calculated distances between 0 documents.
Calculated distances between 100 documents.
Calculated distances between 200 documents.
Calculated distances between 300 documents.
Calculated distances between 400 documents.
Calculated distances between 500 documents.
Calculated distances between 600 documents.
Calculated distances between 700 documents.
Calculated distances between 800 documents.
Calculated distances between 900 documents.
Calculated distances between 1000 documents.
Calculated distances between 1100 documents.
Calculated distances between 1200 documents.
Calculated distances between 1300 documents.
Calculated distances between 1400 documents.
Calculated distances between 1500 documents.
Calculated distances between 1600 documents.
Calculated distances between 1700 documents.
Calculated distances between 1800 documents.
Calculated distances between 1900 documents.
Calculated distances between 2000 documents.
Calculated distances between 2100 documents.
Calculated distances b

Now we have three return values.

The first one is again a matrix of distances between the document pairs.

In [101]:
# First return value: the distance matrix.
wmd_pairwise_flow[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

The second return value is a dictionary mapping each word to its accumulated contribution to the distance from the NFL documents to the NBA ones. The entries are _not_ sorted from high to low or vice versa.

In [102]:
# Ten (word, contribution) entries from the second return value, in dictionary order.
take(10, wmd_pairwise_flow[1].items())

[('rife', 0.06325),
 ('trademark', 0.26766999999999996),
 ('boneheaded', 0.40813),
 ('piss', 1.5792100000000002),
 ('surprise', 2.8869199999999995),
 ('forgive', 0.8160400000000001),
 ('payday', 0.8864299999999999),
 ('approve', 0.15117),
 ('gifs', 0.35931),
 ('wilder', 0.00753)]

The third return value is a dictionary mapping each word to its accumulated contribution to the distance from the NBA documents to the NFL ones. Again, the entries are _not_ sorted from high to low or vice versa.

In [103]:
# Ten (word, contribution) entries from the third return value, in dictionary order.
take(10, wmd_pairwise_flow[2].items())

[('blurt', 0.13701),
 ('soooooooo', 0.11983000000000002),
 ('trademark', 0.14214),
 ('hooper', 0.10486999999999999),
 ('piss', 1.89673),
 ('forgive', 0.8594999999999999),
 ('surprise', 0.99889),
 ('lapse', 0.64412),
 ('payday', 0.18817),
 ('approve', 0.1886)]

### 2.9 Interpreting pairwise WMD flows

Now, let's sort the words by how much distance they contributed when moving from the NFL to the NBA comments, and show the top 30.

In [104]:
# Top 30 words by contribution, largest first.
dict(sorted(wmd_pairwise_flow[1].items(), key=lambda kv: kv[1], reverse=True)[:30])

{'qb': 76.03633000000005,
 'nfl': 58.08311000000005,
 'yard': 46.122809999999994,
 'year': 43.86639000000003,
 'game': 35.08158999999997,
 'brady': 30.907420000000023,
 'team': 30.748360000000037,
 'bowl': 30.237729999999996,
 'season': 29.27642999999998,
 'like': 29.181069999999945,
 'think': 27.45017000000002,
 'time': 27.15802999999999,
 'get': 26.595749999999992,
 'one': 26.522229999999976,
 'would': 26.352930000000008,
 'guy': 26.17762,
 'good': 25.27174,
 'people': 24.856300000000015,
 'rb': 24.610240000000005,
 'fan': 24.470229999999997,
 'play': 24.416509999999985,
 'last': 24.388399999999994,
 'make': 23.79841999999999,
 'wr': 23.184750000000005,
 'football': 22.71535999999999,
 'patriot': 22.631429999999998,
 'offense': 22.417170000000002,
 'better': 22.408880000000018,
 'pat': 22.271230000000006,
 'know': 21.930450000000015}

Next, let's see what added the most distance when moving from the NBA to the NFL comments.

In [105]:
# Top 30 words by contribution, largest first.
dict(sorted(wmd_pairwise_flow[2].items(), key=lambda kv: kv[1], reverse=True)[:30])

{'lebron': 76.02092999999998,
 'player': 70.39909,
 'harden': 64.42966999999996,
 'team': 58.53596000000008,
 'nba': 58.197859999999984,
 'lakers': 57.88405999999996,
 'game': 53.74635,
 'playoff': 43.73188000000001,
 'shot': 36.739230000000006,
 'like': 36.65150999999996,
 'season': 34.295340000000046,
 'guy': 33.92289000000004,
 'people': 33.804949999999984,
 'point': 32.97387999999997,
 'would': 32.75015999999999,
 'warrior': 32.534659999999995,
 'post': 31.95436,
 'get': 31.729529999999976,
 'kobe': 31.725779999999997,
 'better': 31.20185,
 'year': 31.03171999999998,
 'think': 30.280169999999988,
 'fan': 30.273599999999963,
 'good': 29.319509999999998,
 'kd': 29.26800999999999,
 'play': 28.874340000000014,
 'rocket': 27.734910000000003,
 'league': 27.60889000000002,
 'best': 27.60194000000001,
 'shooting': 27.596130000000006}

## Appendix: Many-to-many WMD

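This was a first attempt to compute the flows between words across many documents, without first filtering using Gale–Shapley. However, this proved too inefficient. As you can see from the CPU times, it is very slow even with extremely small samples, and the time complexity is quadratic (or worse?), meaning it rapidly gets even worse as the sample size increases. Note: the cells below carry earlier execution counts (49–52) than the rest of the notebook, and their word lists (e.g. "movie", "film", "actor") do not look like sports comments, so these outputs appear to be left over from a previous run on a different dataset — re-run to refresh them.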

In [49]:
# Many-to-many WMD on the first 20 documents of each corpus, distances only.
%time m2m_distances = WMDManyToMany(nfl_docs[:20], nba_docs[:20],E,idx2word).get_distances(return_flow = False)

CPU times: user 2min 17s, sys: 1.92 s, total: 2min 18s
Wall time: 54.3 s


In [50]:
# Same 20x20 sample with return_flow=True, which returns three values unpacked here.
%time m2m_distances_flow, wc_X1, wc_X2 = WMDManyToMany(nfl_docs[:20],nba_docs[:20],E,idx2word).get_distances(return_flow = True)

CPU times: user 2min 22s, sys: 1.9 s, total: 2min 24s
Wall time: 55.2 s


In [51]:
# Top 10 entries of wc_X1 by value, largest first.
dict(sorted(wc_X1.items(), key=lambda kv: kv[1], reverse=True)[:10])

{'karen': 8.69223,
 'wrenching': 8.31882,
 'carpenter': 7.468960000000001,
 'laughter': 7.467879999999999,
 'liked': 6.864090000000003,
 'mom': 6.791519999999999,
 'gut': 6.759419999999999,
 'love': 6.551409999999997,
 'camp': 6.533080000000001,
 'hr': 6.1393699999999995}

In [52]:
# Top 10 entries of wc_X2 by value, largest first.
dict(sorted(wc_X2.items(), key=lambda kv: kv[1], reverse=True)[:10])

{'hopper': 8.372459999999998,
 'jake': 7.63837,
 'movie': 7.267059999999995,
 'film': 6.936379999999998,
 'shakespeare': 5.99276,
 'oddness': 5.53033,
 'terrible': 4.943440000000001,
 'parent': 4.751790000000001,
 'actor': 4.672620000000001,
 'bad': 4.430020000000002}