In [421]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import redditutils
import pickle


from string import punctuation

from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

from scipy.io import mmwrite, mmread

from gensim import corpora, models, similarities, matutils

from xgboost import XGBClassifier

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Suppress Pandas' automatic conversion of utc column to scientific notation:

In [2]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [43]:
df = pd.read_csv('showerthoughts.csv')

In [4]:
len(df[df.score > 1]) / len(df)

0.392702

39% of submissions get more than one upvote. This makes "greater than one upvote" a good candidate for our criteria for a good post.

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
author          1000000 non-null object
title           1000000 non-null object
subreddit       1000000 non-null object
score           1000000 non-null int64
num_comments    1000000 non-null int64
retrieved_on    1000000 non-null int64
id              1000000 non-null object
created_utc     1000000 non-null int64
selftext        591622 non-null object
dtypes: int64(4), object(5)
memory usage: 68.7+ MB


In [44]:
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [7]:
pd.set_option('display.max_colwidth', 999)

In [134]:
df.head()

Unnamed: 0,author,title,subreddit,score,num_comments,retrieved_on,id,created_utc,selftext,quality
0,os_coxae,"everyone that's ever said ""i'm speechless"" is a goddamn liar.",Showerthoughts,0,0,1466445837,4ixl2n,2016-05-11 22:38:48,,0
1,Calrizle,i wonder how many people i've talked to who have been doing kaigles during our conversation.,Showerthoughts,0,0,1466494743,4lfjxn,2016-05-28 12:03:56,,0
2,[deleted],"""palindrome"" is a let down as a word. rhinoplasty sure isn't.",Showerthoughts,0,0,1466493929,4ldt85,2016-05-28 01:21:03,[deleted],0
3,[deleted],"when you tell someone to keep a secret, it is no longer a secret",Showerthoughts,0,0,1466423373,4hmcbg,2016-05-03 08:30:35,[deleted],0
4,[deleted],"someone had the the time to create the words, ""dick"", ""penis"", and ""vagina""",Showerthoughts,0,0,1466474842,4k9ngz,2016-05-20 18:29:28,[deleted],0


In [135]:
df.sample()

Unnamed: 0,author,title,subreddit,score,num_comments,retrieved_on,id,created_utc,selftext,quality
990321,Jolkro,"tattoos and babies have the same tabu. no one says they are ugly to their owners, but you can be sure the ugliness is talked about to everyone else.",Showerthoughts,1077,49,1536660067,91qs2i,2018-07-25 10:51:13,[removed],1


In [196]:
punctuation_prime = punctuation.replace('\'', '')

In [197]:
punctuation_prime

'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

In [200]:
def strip_punc(s):
    return ''.join(c for c in s if c not in punctuation_prime)

In [128]:
df['quality'] = df['score'].apply(lambda x: redditutils.make_labels(x))

In [137]:
X = df[['title', 'created_utc']]

In [136]:
y = df['quality']

In [158]:
train = df.iloc[:int(len(df)*.70), :]
test = df.iloc[int(len(df)*.70):, :]

## Train test split

In [338]:
feat = df[['title', 'created_utc']]
resp = df['quality']

In [342]:
X_train, X_test, y_train, y_test = train_test_split(feat, resp, stratify = resp)

In [386]:
X_short_train = X_train.iloc[:int(len(X_train)*.1), :]
X_short_test = X_test.iloc[:int(len(X_test)*.1), :]

y_short_train = y_train[:int(len(y_train)*.1)]
y_short_test = y_test[:int(len(y_test)*.1)]

In [390]:
X_short_train.shape

(75000, 2)

In [300]:
stop = set(stopwords.words('english'))

## Word count vectorization

In [376]:
# English words
words_corpus = set(words.words())
# Stop words
stop = set(stopwords.words('english'))
# English words minus stop words
acceptable_words = words_corpus - stop
analyzer = CountVectorizer().build_analyzer()
stem = SnowballStemmer('english')

def english_corpus(doc, stemmer=stem):
    return [stemmer.stem(w) for w in analyzer(doc) if w in acceptable_words]

cv = CountVectorizer(stop_words='english', min_df = 2, max_df = .95, analyzer=english_corpus,
                     strip_accents='unicode', encoding='utf-8')

In [388]:
X_short_train_dtm = cv.fit_transform(X_short_train['title'])

In [389]:
X_short_test_dtm = cv.transform(X_short_test['title'])

In [410]:
pickle.dump(cv, open('fitted_cv.pkl', 'wb'))

## sklearn implementation of LDA

In [393]:
lda_sk = LatentDirichletAllocation(n_components = 10, learning_method='online', learning_decay = 0.6,
                                   learning_offset = 1024, topic_word_prior = .056)

In [394]:
lda_sk.fit(X_short_train_dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.6,
             learning_method='online', learning_offset=1024,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=0.056,
             total_samples=1000000.0, verbose=0)

In [413]:
pickle.dump(lda_sk, open('fitted_lda_short.pkl', 'wb'))

In [396]:
short_train_features = lda_sk.transform(X_short_train_dtm)

In [397]:
short_test_features = lda_sk.transform(X_short_test_dtm)

In [316]:
mmwrite('short_train_matrix.csv', X_short_train_dtm)
mmwrite('short_test_matrix.csv', X_short_test_dtm)

In [399]:
lda_sk.perplexity(X_short_train_dtm)

2236.0784312037654

In [400]:
for topic in range(10):
    print(f"TOPIC {topic}")
    for j in np.argsort(-lda_sk.components_, axis=1)[topic,:10]:
        print(cv.get_feature_names()[j])
    print()

TOPIC 0
way
first
say
peopl
come
like
someon
name
use
alway

TOPIC 1
world
realli
see
could
life
never
human
would
futur
peopl

TOPIC 2
post
made
read
word
find
tell
anyon
next
hear
toilet

TOPIC 3
good
everi
bad
die
phone
school
liter
high
realli
star

TOPIC 4
new
instead
long
like
movi
music
man
idea
countri
origin

TOPIC 5
time
like
day
one
would
us
peopl
go
trump
still

TOPIC 6
much
water
car
food
seen
noth
sleep
never
pretti
imagin

TOPIC 7
differ
earth
mean
would
realiz
sound
one
complet
top
media

TOPIC 8
get
peopl
like
wonder
mani
look
take
feel
year
shower

TOPIC 9
make
someth
would
live
thing
becom
ever
call
someon
best



## Model

In [401]:
rfc = RandomForestClassifier()

In [408]:
cross_val_score(rfc, X_short_train_cv, y_short_train, cv=3, scoring='roc_auc')

array([0.58950989, 0.5953642 , 0.5933468 ])

MVP = ~.59 AUC

In [415]:
nb = GaussianNB()

cross_val_score(nb, X_short_train_cv.toarray(), y_short_train, cv=3, scoring='roc_auc')

array([0.51727064, 0.51928188, 0.51481831])

In [None]:
xgb = XGBClassifier()

cross_val_score(xgb, X_short_train_cv.toarray(), y_short_train, cv=2, scoring='roc_auc', n_jobs=-1)

## Set up a pipeline

In [420]:
pipe = make_pipeline(cv, )

ValueError: not enough values to unpack (expected 2, got 0)