# Cleaning and LDA

This notebook focuses on the initial cleaning of showerthoughts data and topic modeling with LDA.

I also try a XGBoost classifier using the simple countvectorizer model.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import redditutils
import pickle


from string import punctuation

from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

from skopt import BayesSearchCV

from scipy.io import mmwrite, mmread

from gensim import corpora, models, similarities, matutils

from xgboost import XGBClassifier

%matplotlib inline
%load_ext autoreload
%autoreload 2

Suppress Pandas' automatic conversion of utc column to scientific notation:

In [2]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [3]:
df = pd.read_csv('showerthoughts.csv')

In [4]:
len(df[df.score > 1]) / len(df)

0.392702

39% of submissions get more than one upvote. This makes "greater than one upvote" a good candidate for our criteria for a good post.

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
author          1000000 non-null object
title           1000000 non-null object
subreddit       1000000 non-null object
score           1000000 non-null int64
num_comments    1000000 non-null int64
retrieved_on    1000000 non-null int64
id              1000000 non-null object
created_utc     1000000 non-null int64
selftext        591622 non-null object
dtypes: int64(4), object(5)
memory usage: 68.7+ MB


In [6]:
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [7]:
pd.set_option('display.max_colwidth', 999)

In [8]:
df.head()

Unnamed: 0,author,title,subreddit,score,num_comments,retrieved_on,id,created_utc,selftext
0,os_coxae,"Everyone that's ever said ""I'm speechless"" is a goddamn liar.",Showerthoughts,0,0,1466445837,4ixl2n,2016-05-11 22:38:48,
1,Calrizle,I wonder how many people I've talked to who have been doing kaigles during our conversation.,Showerthoughts,0,0,1466494743,4lfjxn,2016-05-28 12:03:56,
2,[deleted],"""Palindrome"" is a let down as a word. Rhinoplasty sure isn't.",Showerthoughts,0,0,1466493929,4ldt85,2016-05-28 01:21:03,[deleted]
3,[deleted],"When you tell someone to keep a secret, it is no longer a secret",Showerthoughts,0,0,1466423373,4hmcbg,2016-05-03 08:30:35,[deleted]
4,[deleted],"Someone had the the time to create the words, ""Dick"", ""Penis"", and ""Vagina""",Showerthoughts,0,0,1466474842,4k9ngz,2016-05-20 18:29:28,[deleted]


In [9]:
df.sample()

Unnamed: 0,author,title,subreddit,score,num_comments,retrieved_on,id,created_utc,selftext
116030,hmeggitt,Liberals are probably pretty glad they didn't get rid of the 2nd amendment yet,Showerthoughts,1,0,1484218827,5c1hcx,2016-11-09 14:49:48,[removed]


In [10]:
punctuation_prime = punctuation.replace('\'', '')

In [11]:
punctuation_prime

'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
def strip_punc(s):
    return ''.join(c for c in s if c not in punctuation_prime)

In [13]:
df['quality'] = df['score'].apply(lambda x: redditutils.make_labels(x))

In [14]:
X = df[['title', 'created_utc']]

In [15]:
y = df['quality']

In [16]:
train = df.iloc[:int(len(df)*.70), :]
test = df.iloc[int(len(df)*.70):, :]

## Train test split

In [17]:
feat = df[['title', 'created_utc']]
resp = df['quality']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(feat, resp, stratify = resp)

In [19]:
X_short_train = X_train.iloc[:int(len(X_train)*.1), :]
X_short_test = X_test.iloc[:int(len(X_test)*.1), :]

y_short_train = y_train[:int(len(y_train)*.1)]
y_short_test = y_test[:int(len(y_test)*.1)]

In [20]:
X_short_train.shape

(75000, 2)

In [21]:
stop = set(stopwords.words('english'))

## Word count vectorization

In [58]:
# English words
words_corpus = set(words.words())
# Stop words
stop = set(stopwords.words('english'))
# English words minus stop words
acceptable_words = words_corpus - stop
analyzer = CountVectorizer().build_analyzer()
stem = SnowballStemmer('english')

def english_corpus(doc, stemmer=stem):
    return [stemmer.stem(w) for w in analyzer(doc) if w in acceptable_words]

cv = CountVectorizer(stop_words='english', 
                     min_df = 2,
                     max_df = .15, 
                     tokenizer=english_corpus,
                     strip_accents='unicode',
                     encoding='utf-8', 
                     ngram_range=(1, 2))

In [59]:
X_short_train_dtm = cv.fit_transform(X_short_train['title'])

  sorted(inconsistent))


In [60]:
X_short_test_dtm = cv.transform(X_short_test['title'])

In [316]:
mmwrite('short_train_matrix', X_short_train_dtm)
mmwrite('short_test_matrix', X_short_test_dtm)

In [28]:
pickle.dump(cv, open('fitted_cv.pkl', 'wb'))

## sklearn implementation of LDA

In [61]:
lda_sk = LatentDirichletAllocation(n_components = 10, learning_method='online', learning_decay = 0.6,
                                   learning_offset = 1024, topic_word_prior = .056, n_jobs=5)

In [53]:
lda_sk.fit(X_short_train_dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.6,
             learning_method='online', learning_offset=1024,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=5, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=0.056,
             total_samples=1000000.0, verbose=0)

In [29]:
pickle.dump(lda_sk, open('fitted_lda_short.pkl', 'wb'))

In [54]:
short_train_features = lda_sk.transform(X_short_train_dtm)

In [55]:
short_test_features = lda_sk.transform(X_short_test_dtm)

In [56]:
lda_sk.perplexity(X_short_train_dtm)

13473.986481784577

In [57]:
for topic in range(10):
    print(f"TOPIC {topic}")
    for j in np.argsort(-lda_sk.components_, axis=1)[topic,:10]:
        print(cv.get_feature_names()[j])
    print()

TOPIC 0
trump
come
word
instead
mean
water
movi
presid
big
today

TOPIC 1
great
point
place
star
bad
run
noth
space
pretti
sure

TOPIC 2
feel
shower
dog
tri
human
thought
love
feel like
hair
million

TOPIC 3
differ
man
phone
basic
high
hear
drink
imagin
car
night

TOPIC 4
real
turn
equival
hard
real life
clean
close
smell
wash
anim

TOPIC 5
becom
year
live
start
old
talk
black
age
univers
histori

TOPIC 6
read
die
sex
music
anoth
stop
futur
liter
learn
weird

TOPIC 7
look
thing
new
work
everyon
post
game
right
earth
anyon

TOPIC 8
want
eat
tell
money
best
watch
drive
lot
food
seen

TOPIC 9
need
bodi
wish
white
exist
public
order
room
fall
color



## Model

In [36]:
rfc = RandomForestClassifier()

In [37]:
X_short_train_dtm.shape

(75000, 49826)

# Random Forest

## LDA features 

In [38]:
cross_val_score(rfc, short_train_features, y_short_train, cv=3, scoring='roc_auc')



array([0.52533501, 0.52770925, 0.52263515])

## Word count features

In [39]:
cross_val_score(rfc, X_short_train_dtm, y_short_train, cv=3, scoring='roc_auc')



array([0.58417798, 0.57776892, 0.58014498])

MVP = ~.59 AUC

# Naive Bayes

## LDA features

In [70]:
nb = GaussianNB()

cross_val_score(nb, short_train_features, y_short_train, cv=2, scoring='roc_auc')

array([0.52646529, 0.52597738])

## Word count features

In [None]:
cross_val_score(nb, X_short_train_dtm.toarray(), y_short_train, cv=3, scoring='roc_auc')

# XGBoost

## LDA features

In [40]:
xgb = XGBClassifier()

cross_val_score(xgb, X_short_train_dtm, y_short_train, cv=2, scoring='roc_auc', n_jobs=-1)

array([0.60383635, 0.60136271])

## Word count features

In [None]:
cross_val_score(xgb, X_short_train_dtm.toarray(), y_short_train, cv=2, scoring='roc_auc', n_jobs=-1)

## Set up a pipeline

In [76]:
def bayes_search(X, y):
    cv_search = CountVectorizer(stop_words='english', 
                             min_df = 2, 
                             tokenizer=english_corpus,
                             strip_accents='unicode',
                             encoding='utf-8', 
                             ngram_range=(1, 2))

    lda = LatentDirichletAllocation(learning_method='online', learning_decay = 0.6,
                                   learning_offset = 1024, topic_word_prior = .056, n_jobs=5)

    rfc = RandomForestClassifier()



    tuning_params = {'countvectorizer__max_df': [i*.01 for i in range(10, 95)],
                    'latentdirichletallocation__n_components': [5, 20],
                     'randomforestclassifier__n_estimators': [50, 100, 150, 200]}

    pipe = make_pipeline(cv, lda, rfc)
    
    print(pipe.get_params().keys())

    bs = BayesSearchCV(pipe, tuning_params, cv=3)
    
    bs.fit(X, y)
    
    return bs

In [77]:
bayes_search(X_short_train, y_short_train)

dict_keys(['memory', 'steps', 'countvectorizer', 'latentdirichletallocation', 'randomforestclassifier', 'countvectorizer__analyzer', 'countvectorizer__binary', 'countvectorizer__decode_error', 'countvectorizer__dtype', 'countvectorizer__encoding', 'countvectorizer__input', 'countvectorizer__lowercase', 'countvectorizer__max_df', 'countvectorizer__max_features', 'countvectorizer__min_df', 'countvectorizer__ngram_range', 'countvectorizer__preprocessor', 'countvectorizer__stop_words', 'countvectorizer__strip_accents', 'countvectorizer__token_pattern', 'countvectorizer__tokenizer', 'countvectorizer__vocabulary', 'latentdirichletallocation__batch_size', 'latentdirichletallocation__doc_topic_prior', 'latentdirichletallocation__evaluate_every', 'latentdirichletallocation__learning_decay', 'latentdirichletallocation__learning_method', 'latentdirichletallocation__learning_offset', 'latentdirichletallocation__max_doc_update_iter', 'latentdirichletallocation__max_iter', 'latentdirichletallocation

  sorted(inconsistent))


ValueError: max_df corresponds to < documents than min_df

In [67]:
# include below until https://github.com/scikit-optimize/scikit-optimize/issues/718 is resolved
class BayesSearchCV(BayesSearchCV):
    def _run_search(self, x): raise BaseException('Use newer skopt')