# Analyze Reddit data

### Import packages

In [1]:
import numpy as np
import pandas as pd
import re
import praw

from reddit_utils import *
from secrets import *


### Load top comments from the most active subreddits

In [4]:
%time reddit = praw.Reddit(client_id=CLIENT_ID, \
                           client_secret=CLIENT_SECRET, \
                           user_agent=USER_AGENT)
print()

%time subreddits = subredditList(ranking='activity', nsubs=5)
print(subreddits)
print()

%time data = load_data(reddit, subreddits, columns=[])
print(data.head())
print()
print(data.loc[:,'subreddit'].drop_duplicates())


Wall time: 3.01 ms

Wall time: 8.18 s
['AskReddit', 'politics', 'The_Donald', 'news', 'nba']

Wall time: 4min 17s
                                                body  subreddit
0  I have a great uncle that is a walking, talkin...  AskReddit
1  The one who’s serving life for murdering her t...  AskReddit
2  My biological uncle Troy. I've only met him a ...  AskReddit
3  I have this one uncle that always slaps mine a...  AskReddit
4  My great aunt by marriage is a heinous, social...  AskReddit

0     AskReddit
0      politics
0    The_Donald
0          news
0           nba
Name: subreddit, dtype: object


### Classification
- Features: TF-IDF weights of the comment text
- Classifier: Multinomial Naive Bayes
- Evaluation: 10-fold cross-validation

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Inputs are the comment bodies; targets are the subreddit labels encoded as integers.
X = data.body
le = LabelEncoder()
y = le.fit_transform(data.subreddit)

# TF-IDF features (accents folded to ASCII, English stop words removed) feeding a
# multinomial Naive Bayes classifier. Both stages live in one pipeline so the
# vectorizer is re-fitted on the training part of every fold.
vectorizer = TfidfVectorizer(strip_accents='ascii', stop_words='english')
clf = MultinomialNB()
tfidf_multinomialNB = Pipeline(steps=[('tfidf', vectorizer), ('MultiNB', clf)])

# 10-fold cross-validation; only fit time, score time and test score are collected.
scores = cross_validate(tfidf_multinomialNB, X, y, cv=10, return_train_score=False)

# Report each metric as "mean (+/- 2 * standard deviation)" across the folds.
report_rows = (
    ("Train time", 'fit_time'),
    ("Score time", 'score_time'),
    ("Accuracy  ", 'test_score'),
)
for label, key in report_rows:
    fold_values = scores[key]
    print("{}: {:0.2f} (+/- {:0.2f})".format(label, fold_values.mean(), fold_values.std() * 2))

Train time: 1.42 (+/- 0.09)
Score time: 0.15 (+/- 0.05)
Accuracy  : 0.67 (+/- 0.05)
