In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import redditutils


from string import punctuation
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline
%load_ext autoreload
%autoreload 2

Suppress Pandas' automatic conversion of utc column to scientific notation:

In [2]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [43]:
df = pd.read_csv('showerthoughts.csv')

In [4]:
len(df[df.score > 1]) / len(df)

0.392702

39% of submissions get more than one upvote. This makes "greater than one upvote" a good candidate for our criteria for a good post.

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
author          1000000 non-null object
title           1000000 non-null object
subreddit       1000000 non-null object
score           1000000 non-null int64
num_comments    1000000 non-null int64
retrieved_on    1000000 non-null int64
id              1000000 non-null object
created_utc     1000000 non-null int64
selftext        591622 non-null object
dtypes: int64(4), object(5)
memory usage: 68.7+ MB


In [44]:
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [7]:
pd.set_option('display.max_colwidth', 999)

## Clean the data

In [110]:
df_short = df.iloc[:100000, :].copy()

In [111]:
df_short['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

In [112]:
df_short['title'] = df_short['title'].apply(lambda x: strip_punc(x))

In [113]:
df_short['title'] = df_short['title'].apply(lambda x: redditutils.lower_string(x))

In [115]:
df_short['title'] = df_short['title'].apply(lambda x: redditutils.strip_nums(x))

In [19]:
def strip_punc(s):
    return ''.join(c for c in s if c not in punctuation)

In [21]:
df['title'] = df['title'].apply(lambda x: strip_punc(x))

In [46]:
df['title'] = df['title'].apply(lambda x: redditutils.lower_string(x))

In [47]:
df['title'] = df['title'].apply(lambda x: redditutils.strip_nums(x))

In [26]:
df.to_csv('cleaning.csv')

In [91]:
words_corpus = set(words.words())
analyzer = CountVectorizer().build_analyzer()

def english_corpus(doc):
    return [w for w in analyzer(doc) if w in words_corpus]

# min_df and max_df
cv = CountVectorizer(stop_words='english', analyzer=english_corpus, min_df = 2, max_df = .95, ngram_range=(1, 2),
                     strip_accents='unicode', encoding='utf-8')

In [124]:
short_cv_df = pd.DataFrame(short_cv.toarray(), columns=cv.get_feature_names())

In [104]:
cv_2gram = CountVectorizer(stop_words='english', analyzer=english_corpus, min_df = 2, max_df = .95, ngram_range=(2, 2),
                           strip_accents='unicode', encoding='utf-8')

In [92]:
X = cv.fit_transform(df['title'])

In [105]:
X2 = cv_2gram.fit_transform(df['title'])

In [106]:
cv_df2 = pd.DataFrame(X2.toarray(), columns = cv.get_feature_names())

In [107]:
cv_df2.head()

Unnamed: 0,aa,aal,aardvark,aback,abacus,abandon,abandoned,abandonment,abate,abbey,...,zoning,zoo,zoologist,zoology,zoom,zoophile,zoophilia,zorro,zucchini,zygote
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
cv_df = pd.DataFrame(X.toarray(), columns = cv.get_feature_names())

In [102]:
cv_df.columns.values[1000:2000]

array(['argon', 'arguable', 'argue', 'argument', 'argumentative', 'arid',
       'ariel', 'arise', 'arisen', 'aristocracy', 'aristocrat',
       'aristocratic', 'arithmetic', 'arithmetically', 'ark', 'arm',
       'armada', 'armadillo', 'armament', 'armchair', 'armed', 'arming',
       'armistice', 'armless', 'armor', 'armored', 'armpit', 'armrest',
       'arms', 'army', 'arn', 'aroma', 'aromatic', 'arose', 'around',
       'arousal', 'arouse', 'arrange', 'arrangement', 'array', 'arrest',
       'arresting', 'arrival', 'arrive', 'arrogance', 'arrogant',
       'arrogantly', 'arrow', 'arrowhead', 'arse', 'arsenal', 'arsenic',
       'arses', 'arson', 'arsonist', 'art', 'artefact', 'artery',
       'artesian', 'artful', 'artfully', 'arthritis', 'artichoke',
       'article', 'articulate', 'articulated', 'articulately',
       'articulation', 'artifact', 'artificial', 'artificially',
       'artillery', 'artisan', 'artist', 'artistic', 'artistically',
       'artistry', 'ary', 'as', 'asb

In [98]:
print('test')

test


Get rid of any misspellings and non-english words.

In [4]:
df['title'] = df['title'].apply(lambda x: redditutils.strip_nums(x))

## Word count vectorization

In [6]:
cv = CountVectorizer(stop_words = 'english', min_df = 2, max_df = .95, strip_accents = 'unicode',
                     encoding = 'ISO-8859-1')

In [7]:
X = cv.fit_transform(df['title'])

In [16]:
cv_df = pd.DataFrame(X.toarray(), columns = cv.get_feature_names())

In [17]:
cv_df.head()

Unnamed: 0,1닷,1닷com,1닷콤,1점,1점coм,1쫌컴,4ᆞcom,TM,__,___,...,毕业证代办,毕业证办理,永久存档,留信网认证,留学回国人员证明,績單,荜业證,落户购买免税车,诚招代理,靠谱
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
cv_df.sum(axis=1).order(ascending=False).head()