In [166]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import redditutils


from string import punctuation
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models, similarities, matutils

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Suppress Pandas' automatic conversion of utc column to scientific notation:

In [2]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [43]:
df = pd.read_csv('showerthoughts.csv')

In [4]:
len(df[df.score > 1]) / len(df)

0.392702

39% of submissions get more than one upvote. This makes "greater than one upvote" a good candidate for our criteria for a good post.

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
author          1000000 non-null object
title           1000000 non-null object
subreddit       1000000 non-null object
score           1000000 non-null int64
num_comments    1000000 non-null int64
retrieved_on    1000000 non-null int64
id              1000000 non-null object
created_utc     1000000 non-null int64
selftext        591622 non-null object
dtypes: int64(4), object(5)
memory usage: 68.7+ MB


In [44]:
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [7]:
pd.set_option('display.max_colwidth', 999)

In [134]:
df.head()

Unnamed: 0,author,title,subreddit,score,num_comments,retrieved_on,id,created_utc,selftext,quality
0,os_coxae,"everyone that's ever said ""i'm speechless"" is a goddamn liar.",Showerthoughts,0,0,1466445837,4ixl2n,2016-05-11 22:38:48,,0
1,Calrizle,i wonder how many people i've talked to who have been doing kaigles during our conversation.,Showerthoughts,0,0,1466494743,4lfjxn,2016-05-28 12:03:56,,0
2,[deleted],"""palindrome"" is a let down as a word. rhinoplasty sure isn't.",Showerthoughts,0,0,1466493929,4ldt85,2016-05-28 01:21:03,[deleted],0
3,[deleted],"when you tell someone to keep a secret, it is no longer a secret",Showerthoughts,0,0,1466423373,4hmcbg,2016-05-03 08:30:35,[deleted],0
4,[deleted],"someone had the the time to create the words, ""dick"", ""penis"", and ""vagina""",Showerthoughts,0,0,1466474842,4k9ngz,2016-05-20 18:29:28,[deleted],0


In [135]:
df.sample()

Unnamed: 0,author,title,subreddit,score,num_comments,retrieved_on,id,created_utc,selftext,quality
990321,Jolkro,"tattoos and babies have the same tabu. no one says they are ugly to their owners, but you can be sure the ugliness is talked about to everyone else.",Showerthoughts,1077,49,1536660067,91qs2i,2018-07-25 10:51:13,[removed],1


In [137]:
X = df[['title', 'created_utc']]

In [136]:
y = df['quality']

In [158]:
train = df.iloc[:int(len(df)*.70), :]
test = df.iloc[int(len(df)*.70):, :]

## Clean the data

In [149]:
df_short = df.iloc[:100000, :].copy()

In [150]:
df_short['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [151]:
df_short['title'] = df_short['title'].apply(lambda x: strip_punc(x))

In [152]:
df_short['title'] = df_short['title'].apply(lambda x: redditutils.lower_string(x))

In [153]:
df_short['title'] = df_short['title'].apply(lambda x: redditutils.strip_nums(x))

In [157]:
df_short_train = df.iloc[:int(len(df_short)*.70), :]
df_short_test = df.iloc[int(len(df_short)*.70):, :]

In [159]:
df_short_test.head()

Unnamed: 0,author,title,subreddit,score,num_comments,retrieved_on,id,created_utc,selftext,quality
70000,AmagicFish,"if sony comes out with a k playstation, they should call it ""psk""",Showerthoughts,1,0,1463484931,4b8o1a,2016-03-20 18:34:50,[removed],0
70001,Phileas_Fogg,they guy that set up clinton's email server probably asked for help here on reddit.,Showerthoughts,1,0,1463496352,4bwp2r,2016-03-25 14:35:59,[removed],0
70002,Thenotoriouscanadian,"today, march th, is pi day, nap day and steak &amp; bj day",Showerthoughts,1,0,1463470751,4aevlu,2016-03-14 20:16:49,[removed],0
70003,theredwillow,"now that facebook shows users pages similar to what they've commented on before, the pages get more hateful comments",Showerthoughts,1,0,1463483543,4b5p18,2016-03-20 01:11:19,,0
70004,[deleted],life hacks,Showerthoughts,1,0,1463457576,49n8my,2016-03-09 08:39:09,[deleted],0


In [160]:
words_corpus = set(words.words())
analyzer = CountVectorizer().build_analyzer()

def english_corpus(doc):
    return [w for w in analyzer(doc) if w in words_corpus]

cv = CountVectorizer(stop_words='english', analyzer=english_corpus, min_df = 2, max_df = .95, ngram_range=(1, 2),
                     strip_accents='unicode', encoding='utf-8')

In [162]:
X_short_train = cv.fit_transform(df_short_train['title'])

In [163]:
X_short_test = cv.fit_transform(df_short_test['title'])

In [165]:
df_short_train.head()

Unnamed: 0,author,title,subreddit,score,num_comments,retrieved_on,id,created_utc,selftext,quality
0,os_coxae,"everyone that's ever said ""i'm speechless"" is a goddamn liar.",Showerthoughts,0,0,1466445837,4ixl2n,2016-05-11 22:38:48,,0
1,Calrizle,i wonder how many people i've talked to who have been doing kaigles during our conversation.,Showerthoughts,0,0,1466494743,4lfjxn,2016-05-28 12:03:56,,0
2,[deleted],"""palindrome"" is a let down as a word. rhinoplasty sure isn't.",Showerthoughts,0,0,1466493929,4ldt85,2016-05-28 01:21:03,[deleted],0
3,[deleted],"when you tell someone to keep a secret, it is no longer a secret",Showerthoughts,0,0,1466423373,4hmcbg,2016-05-03 08:30:35,[deleted],0
4,[deleted],"someone had the the time to create the words, ""dick"", ""penis"", and ""vagina""",Showerthoughts,0,0,1466474842,4k9ngz,2016-05-20 18:29:28,[deleted],0


In [167]:
counts = X_short_train.transpose()

In [168]:
counts.shape

(11313, 70000)

In [169]:
corpus = matutils.Sparse2Corpus(counts)

In [175]:
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [183]:
lda = models.LdaModel(corpus=corpus, num_topics=20, minimum_probability=.03, id2word=id2word, passes=10)

In [184]:
lda.print_topics()

[(0,
  '0.117*"infertility" + 0.053*"indent" + 0.050*"haircut" + 0.042*"facetiously" + 0.038*"incorruptible" + 0.030*"antiquity" + 0.027*"afraid" + 0.024*"concussion" + 0.020*"cricketer" + 0.019*"inconveniently"'),
 (1,
  '0.034*"corps" + 0.026*"june" + 0.025*"hubby" + 0.024*"drivable" + 0.024*"adrenaline" + 0.023*"chat" + 0.021*"helix" + 0.019*"detach" + 0.019*"dissolve" + 0.018*"cutoff"'),
 (2,
  '0.106*"cycler" + 0.079*"allure" + 0.069*"leapfrog" + 0.046*"dotted" + 0.035*"electronics" + 0.030*"chick" + 0.027*"dating" + 0.020*"corporately" + 0.019*"dividing" + 0.016*"kopi"'),
 (3,
  '0.080*"dating" + 0.072*"lamp" + 0.071*"know" + 0.043*"incorporated" + 0.037*"craft" + 0.028*"lambda" + 0.028*"cricketer" + 0.028*"legality" + 0.026*"afraid" + 0.023*"comparable"'),
 (4,
  '0.045*"consumption" + 0.042*"canister" + 0.033*"endorse" + 0.026*"ka" + 0.023*"dissect" + 0.022*"feign" + 0.017*"coffee" + 0.015*"kissy" + 0.014*"fashionably" + 0.014*"create"'),
 (5,
  '0.073*"incorporated" + 0.043*"e

In [179]:
list(lda.get_document_topics(corpus))

[[(0, 0.48501182), (3, 0.1769079), (4, 0.28641376)],
 [(4, 0.9381339)],
 [(0, 0.5576467), (1, 0.20012663), (2, 0.19732082)],
 [(0, 0.70838916), (3, 0.24475144)],
 [(1, 0.34179717), (4, 0.61094755)],
 [(0, 0.18956083), (2, 0.4404718), (4, 0.33585298)],
 [(0, 0.4811795), (1, 0.33671126), (2, 0.14479505)],
 [(4, 0.9330567)],
 [(0, 0.6500862), (1, 0.28903052)],
 [(1, 0.3656871), (4, 0.59092015)],
 [(1, 0.8780542), (2, 0.08097604)],
 [(0, 0.32501397), (1, 0.20343865), (3, 0.22533333), (4, 0.23610726)],
 [(3, 0.58013403), (4, 0.35243762)],
 [(0, 0.2828749), (2, 0.5500578), (4, 0.13747178)],
 [(0, 0.26416692), (3, 0.31689236), (4, 0.38540933)],
 [(0, 0.5959086), (4, 0.33720866)],
 [(0, 0.12555292), (1, 0.84666663)],
 [(1, 0.5844485), (2, 0.32940057)],
 [(0, 0.12880693),
  (1, 0.3237193),
  (2, 0.478655),
  (3, 0.034180265),
  (4, 0.034638505)],
 [(2, 0.1526612), (3, 0.09608552), (4, 0.7270954)],
 [(0, 0.45419598), (1, 0.40472138), (4, 0.119470686)],
 [(1, 0.15457182), (3, 0.10799031), (4, 0.7

In [19]:
def strip_punc(s):
    return ''.join(c for c in s if c not in punctuation)

In [21]:
df['title'] = df['title'].apply(lambda x: strip_punc(x))

In [46]:
df['title'] = df['title'].apply(lambda x: redditutils.lower_string(x))

In [47]:
df['title'] = df['title'].apply(lambda x: redditutils.strip_nums(x))

In [128]:
df['quality'] = df['score'].apply(lambda x: redditutils.make_labels(x))