# PPMI SVD
See https://medium.com/data-from-the-trenches/arithmetic-properties-of-word-embeddings-e918e3fda2ac

and

http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

and 

https://stackabuse.com/python-for-nlp-tokenization-stemming-and-lemmatization-with-spacy-library/

In [1]:
from collections import Counter
from itertools import combinations
from math import log
from pprint import pformat
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from string import punctuation
from time import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [6]:
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt

import pickle

In [3]:
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
# from sklearn.naive_bayes import MultinomialNB, GaussianNB
# # Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
# from sklearn.feature_extraction.text import CountVectorizer,\
#                                             TfidfVectorizer

In [4]:
# import nltk
# from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import RegexpTokenizer
# from nltk.stem.porter import PorterStemmer

In [7]:
def save_obj(obj, filename):
    """Pickle *obj* to the file ``filename + '.pkl'``.

    The ``.pkl`` suffix is always appended, so pass the path without an
    extension. An existing file at that path is overwritten. The highest
    pickle protocol available to the running interpreter is used.
    """
    target = filename + '.pkl'
    with open(target, mode='wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(filename):
    """Return the object unpickled from the file ``filename + '.pkl'``.

    The ``.pkl`` suffix is always appended, so pass the path without an
    extension.
    """
    source = filename + '.pkl'
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(source, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [8]:
# Directory holding the pickled data. The trailing separator matters: paths
# are built by plain string concatenation (DIR + FILE + suffix).
# NOTE(review): absolute, machine-specific Windows path -- adjust before
# running elsewhere.
DIR = "C:\\Users\\AzNsAnTaGiN\\DSI\\Projects\\project_3\\data\\"
# Base names of the data files; a suffix such as "_df_clean" (and the ".pkl"
# extension added by load_obj) is appended at load time.
FILE1 = "theonion"
FILE2 = "nottheonion"
FILE3 = "onionheadlines"

# Data Import

In [11]:
# Load the cleaned, pickled frames: one for FILE1 and one for FILE2.
X_theonion = load_obj(f"{DIR}{FILE1}_df_clean")
X_nottheonion = load_obj(f"{DIR}{FILE2}_df_clean")

In [12]:
# Label every row in place: +1 for the X_theonion frame, -1 for X_nottheonion.
for _frame, _label in ((X_theonion, 1), (X_nottheonion, -1)):
    _frame["is_onion"] = _label

## Generating our samples and holdout

In [13]:
# Number of rows drawn from each corpus for the balanced sample.
N = 4000

# Shuffle each corpus (a full-length sample without replacement), then split
# it positionally: the first N shuffled rows form the sample and everything
# after them forms the holdout.
# Positional slicing is used instead of head(N) / tail(len - N) because
# tail() with a negative argument returns "all rows except the first |n|",
# which would make the holdout overlap the sample whenever a corpus has fewer
# than N rows. With slicing the holdout is simply empty in that case.
X_theonion_shuffled = X_theonion.sample(len(X_theonion))
theonion_sample = X_theonion_shuffled.iloc[:N]
theonion_holdout = X_theonion_shuffled.iloc[N:]

X_nottheonion_shuffled = X_nottheonion.sample(len(X_nottheonion))
nottheonion_sample = X_nottheonion_shuffled.iloc[:N]
nottheonion_holdout = X_nottheonion_shuffled.iloc[N:]

# Balanced sample: up to N rows from each corpus.
X_sample = pd.concat([theonion_sample, nottheonion_sample])
# Full, unshuffled data from both corpora (this includes the holdout rows).
X = pd.concat([X_theonion, X_nottheonion])

# Construct the PPMI sparse matrix

The code we use in this section to build the PPMI SVD is lifted virtually verbatim from https://www.kaggle.com/alexklibisz/simple-word-vectors-with-co-occurrence-pmi-and-svd

In [14]:
# Translation table that deletes every character in string.punctuation.
punctrans = str.maketrans('', '', punctuation)


def tokenize(title):
    """Split a title into lowercase, ASCII-only, punctuation-free tokens.

    The title is lowercased, every non-ASCII character is dropped,
    punctuation is deleted (not replaced by a space, so "john's" becomes
    "johns"), and the remainder is split on whitespace.

    Returns a list of token strings; an empty or all-punctuation title
    gives an empty list.
    """
    ascii_only = title.lower().encode('ascii', 'ignore').decode()
    return ascii_only.translate(punctrans).split()

In [15]:
# Run tokenize() on every entry of the 'title' column of X, giving one token
# list per title.
texts_tokenized = X['title'].apply(tokenize)

In [16]:
# 2a. Compute unigram and bigram counts.
# A unigram is a single token (x). A "bigram" here is any pair of tokens
# (x, y) occurring in the same title -- the tokens need not be adjacent.
# For example, the title "Foo bar baz" has unigrams [foo, bar, baz]
# and bigrams [(bar, foo), (baz, foo), (bar, baz)].
cx = Counter()
cxy = Counter()
for tokens in texts_tokenized:
    # One count per token occurrence.
    cx.update(tokens)
    # Each pair is keyed in sorted order, so (a, b) and (b, a) share one key.
    # Every pair is counted, including repeats; a token that appears twice in
    # a title therefore also produces an (x, x) pair.
    cxy.update(tuple(sorted(pair)) for pair in combinations(tokens, 2))

In [18]:
# 2b. Remove frequent and infrequent unigrams.
# The thresholds are arbitrary fractions of the number of titles: a unigram is
# kept only if its count lies in [min_count, max_count]. This shrinks the
# vocabulary substantially.
min_count = (1 / 1000) * len(X)
max_count = (1 / 50) * len(X)
# Collect the keys first so the Counter is not mutated while being iterated.
_out_of_range = [
    word for word, count in cx.items()
    if not (min_count <= count <= max_count)
]
for word in _out_of_range:
    del cx[word]

In [19]:
# 2c. Remove frequent and infrequent bigrams.
# A bigram survives only if both of its unigrams are still present in cx.
# Collect the keys first so the Counter is not mutated while being iterated.
_orphaned = [
    pair for pair in cxy
    if not (pair[0] in cx and pair[1] in cx)
]
for pair in _orphaned:
    del cxy[pair]

In [20]:
# 3. Build unigram <-> index lookup.
# Indices follow the iteration order of cx: i2x maps index -> unigram and
# x2i is its inverse.
i2x = dict(enumerate(cx))
x2i = {word: index for index, word in i2x.items()}

In [21]:
# 4. Total unigram and bigram counts, used as denominators for probabilities,
# i.e. p(x) = count(x) / sum(all counts).
# sx is the total over cx and sxy the total over cxy, as they stand now.
sx, sxy = (sum(counts.values()) for counts in (cx, cxy))

In [26]:
# 5. Accumulate data, rows, and cols to build the sparse PMI matrix.
# Recall from the blog post that the PMI value for a bigram with tokens (x, y) is:
# PMI(x,y) = log(p(x,y) / p(x) / p(y)) = log(p(x,y) / (p(x) * p(y)))
# The probabilities are computed on the fly using the sums from above.
# NOTE(review): the stored values are raw PMI -- negative values are kept,
# nothing is clipped at zero -- and each pair is stored once, under its
# sorted (x, y) key, so the matrix is not symmetric.
t0 = time()
vocab_size = len(x2i)
pmi_samples = Counter()
data, rows, cols = [], [], []
for (x, y), n in cxy.items():
    rows.append(x2i[x])
    cols.append(x2i[y])
    data.append(log((n / sxy) / (cx[x] / sx) / (cx[y] / sx)))
    pmi_samples[(x, y)] = data[-1]
# Pin the shape to the vocabulary size. Left to itself, csc_matrix infers the
# shape from the largest row/column index it sees, which can yield a matrix
# with fewer rows than there are unigrams; the rows of the factorization
# would then no longer line up with the i2x / x2i lookups.
PMI = csc_matrix((data, (rows, cols)), shape=(vocab_size, vocab_size))
elapsed = time() - t0
# max(..., 1) keeps the per-iteration figure from dividing by zero when no
# bigrams survived filtering.
print('%.3lf seconds (%.5lf / iter)' % (elapsed, elapsed / max(len(cxy), 1)))
print('%d non-zero elements' % PMI.count_nonzero())
print('Sample PMI values\n', pformat(pmi_samples.most_common(30)))

2.244 seconds (0.00000 / iter)
858472 non-zero elements
Sample PMI values
 [(('hires', 'hitman'), 8.140560333991958),
 (('hitman', 'hitman'), 7.858863688555038),
 (('hong', 'kong'), 7.482190770366894),
 (('johns', 'papa'), 7.298524531355853),
 (('nobel', 'prize'), 6.984464302392243),
 (('elon', 'musk'), 6.931772124585697),
 (('ben', 'carson'), 6.8461965050086535),
 (('bieber', 'justin'), 6.808720346409335),
 (('hires', 'hires'), 6.786321554316943),
 (('bell', 'taco'), 6.632516281195759),
 (('biden', 'joe'), 6.6122186891864105),
 (('murdering', 'wrote'), 6.578223357062715),
 (('francisco', 'san'), 6.53800000840054),
 (('bernie', 'sanders'), 6.502903933290499),
 (('francis', 'pope'), 6.461349157531503),
 (('nobel', 'peace'), 6.4516270820200905),
 (('posing', 'undercover'), 6.389303431446813),
 (('cruz', 'ted'), 6.372277364180921),
 (('card', 'credit'), 6.353846318345864),
 (('parade', 'pride'), 6.3415614542873655),
 (('lives', 'matter'), 6.329011988445763),
 (('assaulted', 'sexually'), 6

In [23]:
# 6. Factorize the PMI matrix with a truncated sparse SVD, i.e. "learn the
# unigram/word vectors". This takes the place of the stochastic gradient
# descent used by Word2vec and related neural network formulations.
# The vector size k=20 is an arbitrary choice.
t0 = time()
# svds returns (U, s, Vt); only the left singular vectors are kept.
U = svds(PMI, k=20)[0]
print('%.3lf seconds' % (time() - t0))

0.295 seconds


In [24]:
# 7. Scale every row of U to unit Euclidean length, so that cosine similarity
# between two vectors reduces to a plain dot product.
# If confused see: https://en.wikipedia.org/wiki/Cosine_similarity#Definition
t0 = time()
norms = np.sqrt((U ** 2).sum(axis=1, keepdims=True))
# The 1e-7 floor prevents division by zero for all-zero rows; U is updated in
# place.
np.divide(U, np.maximum(norms, 1e-7), out=U)
print('%.3lf seconds' % (time() - t0))

0.000 seconds


If we wanted to use this for modelling, we'd ideally implement `fit()` and `transform()` methods so we could toss it into a pipeline. Unfortunately, we will have to defer that to a future date!