In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
from yellowbrick.regressor.residuals import ResidualsPlot, PredictionError
from yellowbrick.regressor.alphas import AlphaSelection
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion

In [2]:
# Load the pre-processed posts and display how many rows were read.
data = pd.read_csv(filepath_or_buffer='processed_posts.csv')
data.shape[0]

253815

Data Cleanup, Outlier Removal, Descriptive Statistics

In [3]:
# Keep only the first post for each distinct title. Rebinding `data` (rather than
# mutating with inplace=True) makes the lineage explicit and avoids modifying the
# frame object that the load cell produced.
data = data.drop_duplicates(subset='title')

In [4]:
# Boolean popularity labels: True when a post's 'ups' value exceeds the threshold.
data['gt20'] = data['ups'] > 20
data['gt50'] = data['ups'] > 50
data['gt100'] = data['ups'] > 100

# Take an explicit copy so data_sub is an independent frame. This replaces the
# former `data_sub.is_copy = False`, which set a deprecated pandas attribute
# just to silence SettingWithCopyWarning.
data_sub = data[['title', 'gt20', 'gt50', 'gt100']].copy()

# Hold out 20% of the titles for evaluation; the target is the 'gt20' label.
# random_state is fixed so the split is reproducible.
train_titles, test_titles, train_labels, test_labels = train_test_split(
    data_sub['title'],
    data_sub['gt20'],
    test_size=0.20,
    random_state=42,
)

In [5]:
# Descriptive statistics for post titles (character and whitespace-separated word counts).
# The per-title lengths are computed once with the pandas string accessor and reused,
# instead of re-looping over every title for each statistic.
title_char_counts = data_sub['title'].str.len()
title_word_counts = data_sub['title'].str.split().str.len()

num_of_titles = len(data_sub['title'])
max_len_title = int(title_char_counts.max())
avg_len_title = int(title_char_counts.mean())  # truncated to a whole number
max_word_count = int(title_word_counts.max())
avg_word_count = int(title_word_counts.mean())  # truncated to a whole number

print('Number of Titles: \t{0}'.format(num_of_titles))
print('Max Length of Title: \t{0} characters'.format(max_len_title))
print('Avg Length of Title: \t{0} characters'.format(avg_len_title))
print('Max words in Title: \t{0} words'.format(max_word_count))
print('Avg words in Title: \t{0} words'.format(avg_word_count))

Number of Titles: 	253815
Max Length of Title: 	300 characters
Avg Length of Title: 	61 characters
Max words in Title: 	67 words
Avg words in Title: 	10 words


In [6]:
# Text-classification pipeline: TF-IDF features over 1- to 3-grams with the
# built-in English stop-word list, feeding a multinomial naive Bayes classifier.
tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 3)))
classifier_step = ('clf', MultinomialNB())
estimators = [tfidf_step, classifier_step]

# fit() returns the fitted pipeline itself, so construction and training can be chained.
pipe = Pipeline(steps=estimators).fit(train_titles, train_labels)

# Sanity check: predicted 'gt20' label for a single hand-written headline.
pipe.predict(["Donald Trump attacks UN human rights council for including human rights abusers"])[0]

True

In [8]:
# Spot-check one held-out example: show its title, its true label,
# and the label the fitted pipeline predicts for it.
index = 6
sample_title = test_titles.iloc[index]
sample_label = test_labels.iloc[index]

print('Title: ', sample_title)
print('Greater than 20 Votes: ', sample_label)
print('Prediction: ', pipe.predict([sample_title])[0])


Title:  My Aunt's Husband posted on her Facebook these words of wisdom about Christmas...
Greater than 20 Votes:  False
Prediction:  False


In [9]:
# Accuracy of the fitted pipeline on the held-out test titles.
accuracy_score(test_labels, pipe.predict(test_titles))

0.64688848176821701