# Analyze Reddit data

### Import packages

In [None]:
import numpy as np
import pandas as pd
import re
import praw

from reddit_utils import *
from secrets import *


### Load top comments from the most active subreddits

In [None]:
# Build the praw Reddit client from CLIENT_ID / CLIENT_SECRET / USER_AGENT
# (presumably provided by the star import of the local `secrets` module --
# confirm). `%time` is the IPython line magic that prints the wall time of the
# statement; the trailing backslashes keep the whole call on one logical line
# so the magic sees the complete statement.
%time reddit = praw.Reddit(client_id=CLIENT_ID, \
                           client_secret=CLIENT_SECRET, \
                           user_agent=USER_AGENT)
print()

# Ask the project helper for 5 subreddit names ranked by 'activity'
# (helper presumably comes from reddit_utils -- confirm), then show them.
%time subreddits = subredditList(ranking='activity', nsubs=5)
print(subreddits)
print()

# Load the comment data for those subreddits through the Reddit client.
# NOTE(review): the meaning of columns=[] is defined by load_data, which is
# not visible here -- confirm what an empty list selects.
%time data = load_data(reddit, subreddits, columns=[])
print(data.head())
print()
# Sanity check: list the distinct subreddit labels present in the loaded data.
print(data.loc[:,'subreddit'].drop_duplicates())


Version 5.2.0 of praw is outdated. Version 5.3.0 was released Sunday December 17, 2017.
Wall time: 140 ms

Wall time: 8.32 s
['AskReddit', 'politics', 'The_Donald', 'worldnews', 'nba']



### Classification
- TF-IDF
- Multinomial Naive Bayes
- 10-fold cross validation

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Features are the raw comment texts; targets are the subreddit names encoded
# as integer class labels.
X = data['body']
le = LabelEncoder()
y = le.fit_transform(data['subreddit'])

# TF-IDF features (accents folded to ASCII, English stop words removed; both
# options default to None) feeding a multinomial Naive Bayes classifier.
vectorizer = TfidfVectorizer(strip_accents='ascii', stop_words='english')
clf = MultinomialNB()
tfidf_multinomialNB = Pipeline(steps=[('tfidf', vectorizer), ('MultiNB', clf)])

# 10-fold cross validation; only test-fold scores and timings are collected.
scores = cross_validate(
    tfidf_multinomialNB,
    X,
    y,
    cv=10,
    return_train_score=False,
)

# Report each metric as "mean (+/- two standard deviations)" across the folds.
for template, key in (
    ("Train time: %0.2f (+/- %0.2f)", 'fit_time'),
    ("Score time: %0.2f (+/- %0.2f)", 'score_time'),
    ("Accuracy  : %0.2f (+/- %0.2f)", 'test_score'),
):
    fold_values = scores[key]
    print(template % (fold_values.mean(), fold_values.std() * 2))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Unsupervised view of the same comments: TF-IDF -> truncated SVD -> k-means,
# with the clustering summarised by its silhouette score.
X = data.body
# Reuses the `vectorizer` configured in the classification cell, so that cell
# must have been run first; fit_transform fits it on the full corpus here.
X_tfidf = vectorizer.fit_transform(X)

# Reduce the sparse TF-IDF matrix to 1000 dense components. TruncatedSVD uses
# a randomized solver by default, so it is seeded like KMeans below; without a
# seed the embedding (and therefore the clusters and the score) changes from
# run to run.
svd = TruncatedSVD(n_components=1000, random_state=0)
X_svd = svd.fit_transform(X_tfidf)

# fit_predict fits the model and returns the cluster index of every sample in
# one step, avoiding a second pass over the training data.
clusterer = KMeans(n_clusters=10, random_state=0)
preds = clusterer.fit_predict(X_svd)

# Silhouette lies in [-1, 1]; higher means tighter, better separated clusters.
score = silhouette_score(X_svd, preds)
# A trailing assignment displays nothing in a notebook cell, so print the
# result explicitly.
print("Silhouette score: %0.2f" % score)
