# Process Data

Pipeline for building training and testing data.

In [1]:
% matplotlib inline

import os

import numpy as np
import pandas as pd

# Build-phase switch: when True, a later cell replaces df with a 10% random sample.
sampling = False

In [2]:
# Load the cleaned posts and, where a post_id occurs more than once,
# keep only its last-occurring row.
df = (
    pd.read_csv('cleaned_data/processed_data.csv')
      .drop_duplicates(subset='post_id', keep='last')
)

In [4]:
if sampling:
    # Use a 10% sample for the build phase. The seed is fixed so the same
    # rows are selected on every Restart & Run All.
    df = df.sample(frac=.1, random_state=42)

In [5]:
# (rows, columns) after de-duplication and the optional sampling step.
df.shape

(2225549, 21)

In [8]:
# Discard the two karma columns.
df = df.drop(labels=['karma', 'comment_karma'], axis='columns')

### Feature Engineering

#### Generate the response vector

In [9]:
# Per-subreddit 90th percentile of the comment count, laid out as a one-row
# frame (row label 'comments', one column per subreddit) so that
# comment_threshold[name].values yields that subreddit's threshold.
# 'comments' is selected *before* quantile() so that quantiles are not
# computed for every other column (which is wasted work, and raises on
# non-numeric columns in recent pandas versions).
comment_threshold = (
    df.groupby('subreddit')['comments']
      .quantile(.9)
      .to_frame()
      .transpose()
)

# comment_threshold.head()

subreddit,r/100yearsago,r/1200isjerky,r/13ReasonsWhy,r/13or30,r/195,r/19KidsandCounting,r/2007scape,r/2juicy4bones,r/2meirl42meirl4meirl,r/30ROCK,...,r/xmen,r/yesyesyesno,r/yesyesyesyesno,r/yorku,r/youdontsurf,r/youseeingthisshit,r/youtube,r/youtubehaiku,r/yuruyuri,r/zen
0.9,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
comments,3.0,15.0,23.0,30.1,2.0,33.0,31.0,12.1,20.9,25.0,...,23.0,26.0,31.0,13.0,84.8,118.0,12.0,30.0,13.0,102.4


In [10]:
%%time

# response

response = []

for i, row in df.iterrows():
    subreddit = row['subreddit']
    comments = row['comments']
    
    is_greater = 0
    if comments > comment_threshold[subreddit].values:
        is_greater = 1
        
    response.append(is_greater)
    

df['response'] = response

CPU times: user 2min 44s, sys: 571 ms, total: 2min 45s
Wall time: 2min 45s


### Generate dummy variables

In [11]:
# Drop every row that has a missing value in any column.
# (A stray `df['title']` expression that had no effect was removed.)
df = df.dropna()

In [12]:
# (rows, columns) after rows with missing values were dropped.
df.shape

(2191475, 20)

In [13]:
# One indicator column per distinct subreddit, indexed like df.
subreddit_dummies = pd.get_dummies(data=df['subreddit'])

## Sentiment Analysis on the Title

In [14]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sia = SIA()

# One dict of VADER polarity scores per title.
sentiment = df['title'].apply(sia.polarity_scores)

# Expand the dicts into columns. The frame must reuse df's index: df's index
# has gaps after drop_duplicates()/dropna(), and DataFrame.join aligns on index
# labels, so a default 0..n-1 index would attach scores to the wrong rows
# (and leave NaN for the rest).
sent = pd.DataFrame(list(sentiment), index=df.index)

df = df.join(sent)

## Setup Data

In [15]:
# Columns removed from the feature frame before the subreddit dummies are added.
# NOTE(review): 'response' (built earlier in this notebook) is dropped here and
# is not written out by any cell visible in this notebook -- confirm the target
# is persisted elsewhere.
to_drop = [
    'comments',
    'response',
    'author',
    'content',
    'link',
    'post_id',
    'subreddit',
    'scrape_time',
    'post_time',
    'elapsed_time',
]

df = df.drop(labels=to_drop, axis='columns')

In [16]:
# Attach the subreddit indicator columns; rows are matched on index labels.
df = df.join(subreddit_dummies, how='left')

In [17]:
# (rows, columns) after the subreddit dummy columns were joined on.
df.shape

(2191475, 2575)

## Split Data

In [18]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split. The seed is fixed so the partition is reproducible;
# the save cells at the end of this notebook append to existing CSV files, so
# a different split on each run would mix rows between train and test.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

## NLP - TFIDF Vectorization

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary cap: at most len(df) // tfidf_percent terms are kept.
tfidf_percent = 10


tfidf = TfidfVectorizer(stop_words='english',
                        strip_accents='unicode',
                        max_features=df.shape[0]//tfidf_percent)

# Fit on the training titles only. Passing the whole DataFrame would iterate
# over its column labels, so the vocabulary would be learned from column names
# instead of from the post titles.
tfidf.fit(df_train['title'])

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); support both.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_columns = list(tfidf.get_feature_names_out())
else:
    tfidf_columns = tfidf.get_feature_names()

# Reuse the split frames' own indices so that the later index-based join
# attaches each TF-IDF row to the post it was computed from.
# NOTE(review): the dense conversion allocates rows x vocabulary floats; on the
# full data set this is likely to be very large -- consider keeping the
# matrices sparse.
tfidf_train = pd.DataFrame(tfidf.transform(df_train['title']).toarray(),
                           index=df_train.index,
                           columns=tfidf_columns)
tfidf_test = pd.DataFrame(tfidf.transform(df_test['title']).toarray(),
                          index=df_test.index,
                          columns=tfidf_columns)

In [20]:
# The raw title text is no longer needed in either split.
df_train, df_test = (
    frame.drop(labels='title', axis='columns')
    for frame in (df_train, df_test)
)

In [None]:
# tfidf_train / tfidf_test were produced by transforming df_train['title'] /
# df_test['title'], so their rows are in the same order as the split frames.
# DataFrame.join matches on index labels, and the split frames carry shuffled
# original row labels, so give the TF-IDF frames the matching index first
# (positional alignment) and only then join.
# Columns of the left frame whose names collide with a TF-IDF term get the
# '_df' suffix.
df_train = df_train.join(tfidf_train.set_index(df_train.index), lsuffix='_df')
df_test = df_test.join(tfidf_test.set_index(df_test.index), lsuffix='_df')

In [None]:
# Replace any remaining missing values with 0.0 in both splits.
df_train, df_test = df_train.fillna(0.0), df_test.fillna(0.0)

In [None]:
# (rows, columns) of the final training frame. `shape` is an attribute, not a
# method; calling it raises TypeError.
df_train.shape

In [None]:
# (rows, columns) of the final test frame. `shape` is an attribute, not a
# method; calling it raises TypeError.
df_test.shape

In [None]:
# Write the training split. The first run creates the file with a header row;
# if the file already exists, rows are appended without repeating the header.
train_path = 'processed_data/train_data.csv'
train_exists = os.path.isfile(train_path)

df_train.to_csv(train_path,
                mode='a' if train_exists else 'w',
                index=False,
                header=not train_exists)

In [None]:
# Write the test split. This must be df_test: writing `df` would dump the
# whole unsplit frame (training rows included) into the test file.
# The first run creates the file with a header row; if the file already
# exists, rows are appended without repeating the header.
if not os.path.isfile('processed_data/test_data.csv'):
    df_test.to_csv('processed_data/test_data.csv', index=False)
else:
    df_test.to_csv('processed_data/test_data.csv', mode='a', index=False, header=False)