# Data Analysis

In [1]:
% matplotlib inline

import pandas as pd
import numpy as np

# pd.options.display.max_columns = None

In [2]:
df = pd.read_csv('processed_data/processed_data.csv')

df = df.drop_duplicates('post_id', keep='last')

In [3]:
df.shape

(2225549, 21)

In [4]:
df = df.sample(frac=.5)  # use 10% sample for build phase

In [5]:
df.shape

(1112774, 21)

In [6]:
data_large = True
if df.shape[0] > 50000:
    data_large = True
    
    
print("BIIIIIIG Data?: {}".format(data_large))

BIIIIIIG Data?: True


In [7]:
df.head(1)

Unnamed: 0,author,comment_karma,comments,content,crossposts,karma,link,post_id,post_time,score,...,subreddit,subreddit_rank,subscriptions,title,has_emoji,outside_content,is_question,elapsed_time,minutes_since_post,default_subreddit
215057,ChEJobSearch,,6,/r/personalfinance/comments/79jufh/should_i_op...,0,,/r/personalfinance/comments/79jufh/should_i_op...,t3_79jufh,2017-10-29 19:05:08.000000,18.0,...,r/personalfinance,41.0,12382010.0,Retirementshould I open up my vanguard roth ir...,0,0,1,1 days 13:54:11.565483000,2274.192758,1


In [8]:
df = df.drop(['karma', 'comment_karma'], axis=1)

### Feature Engineering

#### Generate the response vector

In [9]:
comment_threshold = df.groupby('subreddit').quantile(.9).reset_index()[['subreddit', 'comments']].set_index('subreddit').transpose()

comment_threshold.head()

subreddit,r/100yearsago,r/1200isjerky,r/13ReasonsWhy,r/13or30,r/195,r/19KidsandCounting,r/2007scape,r/2juicy4bones,r/2meirl42meirl4meirl,r/30ROCK,...,r/xmen,r/yesyesyesno,r/yesyesyesyesno,r/yorku,r/youdontsurf,r/youseeingthisshit,r/youtube,r/youtubehaiku,r/yuruyuri,r/zen
0.9,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
comments,3.0,14.2,23.0,32.7,2.0,33.2,34.0,10.2,18.0,25.0,...,23.0,26.0,31.0,11.1,82.4,112.2,11.2,29.1,13.0,106.7


In [10]:
%%time

# response

response = []

for i, row in df.iterrows():
    subreddit = row['subreddit']
    comments = row['comments']
    
    is_greater = 0
    if comments > comment_threshold[subreddit].values:
        is_greater = 1
        
    response.append(is_greater)
    

df['response'] = response

CPU times: user 1min 15s, sys: 90.5 ms, total: 1min 15s
Wall time: 1min 15s


### Generate dummy variables

In [11]:
df['title']
df = df.dropna()

In [12]:
df.shape

(1095770, 20)

In [13]:
subreddit_dummies = pd.get_dummies(df['subreddit'])

## Sentiment Analysis on the Title

In [14]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sia = SIA()

sentiment = df['title'].apply(sia.polarity_scores)

sent = pd.DataFrame(list(sentiment))

df = df.join(sent)

## Setup Data

In [15]:
y = df['response']


to_drop = ['comments', 'response', 'author', 'content', 'link', 'post_id', 'subreddit', 'scrape_time', 'post_time', 'elapsed_time']

x = df.drop(to_drop, axis=1)

In [16]:
x = x.join(subreddit_dummies)

In [17]:
x.shape

(1095770, 2575)

## Split Data

In [18]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

## NLP - TFIDF Vectorization

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english',
                        strip_accents='unicode',
                        max_features=df.shape[0]//10)

tfidf.fit(x_train)

tfidf_train = pd.DataFrame(tfidf.transform(x_train['title']).todense(), columns=tfidf.get_feature_names())
tfidf_test = pd.DataFrame(tfidf.transform(x_test['title']).todense(), columns=tfidf.get_feature_names())

In [20]:
# x_train.head(1)

In [21]:
x_train = x_train.drop('title', axis=1)
x_test = x_test.drop('title', axis=1)

In [22]:
x_train = x_train.join(tfidf_train, lsuffix='_df')
x_test = x_test.join(tfidf_test, lsuffix='_df')

In [23]:
x_train.fillna(0.0, inplace=True)
x_test.fillna(0.0, inplace=True)

# Analyze Data

### Build Models

In [24]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

forest = RandomForestClassifier(n_jobs=-1)
if not data_large:
    boost = GradientBoostingClassifier()
bag = BaggingClassifier(n_jobs=-1)
knn = KNeighborsClassifier(n_jobs=-1)

### Train

In [25]:
%%time

forest.fit(x_train, y_train);

CPU times: user 29min 11s, sys: 1h 28min 46s, total: 1h 57min 57s
Wall time: 43min 6s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
%%time

if not data_large:
    boost.fit(x_train, y_train);

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 10 µs


In [None]:
%%time

bag.fit(x_train, y_train)

In [None]:
%%time

knn.fit(x_train, y_train);

### Test

In [None]:
from sklearn.metrics import classification_report

forest_predictions = forest.predict(x_test)
if not data_large: boost_predictions = boost.predict(x_test)
bag_predictions = bag.predict(x_test)
knn_predictions = knn.predict(x_test)

In [None]:
print("Random Forest:\n", classification_report(y_test, forest_predictions))

In [None]:
if not data_large: print("Gradient Boost:\n", classification_report(y_test, boost_predictions))

In [None]:
print("Bagging:\n", classification_report(y_test, bag_predictions))

In [None]:
print("kNN:\n", classification_report(y_test, knn_predictions))