In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import redditutils as ru
import word2vecReader as wvr

from gensim.models import Word2Vec
from gensim.models import KeyedVectors


from nltk.corpus import words
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

from skopt import BayesSearchCV
import pickle


%load_ext autoreload
%autoreload 2

In [2]:
!ls

100k_response.pkl             reddit-ETL.ipynb
100k_train.pkl                redditutils.py
1k_features.pkl               results.csv
1k_response.pkl               [34mseinfeld-chronicles[m[m
25k_test.pkl                  short_test_matrix.csv.mtx
Model_tuning.ipynb            short_train_matrix.csv.mtx
Project Fletcher Proposal.pdf shower_clean.csv
README.md                     showerthoughts-clean.ipynb
[34m__pycache__[m[m                   showerthoughts.csv
bayes_search.pkl              test.csv
classification.ipynb          test_csv.csv.
cleaned_shower.csv            tfidf.ipynb
cleaning.csv                  tokenized.csv
count-vec-models.ipynb        train.csv
darkweb-EDA.ipynb             vectorized_df
first_5k_response.pkl         vectorized_df.csv
first_5k_words.pkl            word2vec.ipynb
fitted_cv.pkl                 word2vecReader.py
fitted_lda_short.pkl          word2vecReaderUtils.py
[34mflask[m[m                         [34mword2vec_twitter

In [3]:
df = pd.read_csv('cleaned_shower.csv', index_col = 0)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df['quality'] = df['score'].apply(lambda x: ru.make_labels(x))

In [5]:
df = df[df['title'].notnull()]

In [6]:
X = df['title']
y = df['quality']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 2325)

In [8]:
X_train_short = X_train[:100000]
y_train_short = y_train[:100000]

In [9]:
# English words
words_corpus = set(words.words())
# Stop words
stop = set(stopwords.words('english'))
# English words minus stop words
acceptable_words = words_corpus - stop
analyzer = CountVectorizer().build_analyzer()
stem = SnowballStemmer('english')

def english_corpus(doc, stemmer=stem):
    return [stemmer.stem(w) for w in analyzer(doc) if w in acceptable_words]

cv = CountVectorizer(stop_words='english', 
                     min_df = 2,
                     max_df = .15, 
                     tokenizer=english_corpus,
                     strip_accents='unicode',
                     encoding='utf-8', 
                     ngram_range=(1, 2))

In [10]:
X_train_short_dtm = cv.fit_transform(X_train_short)

  sorted(inconsistent))


In [45]:
rfc = RandomForestClassifier(n_estimators = 100)

rfc_scores = cross_val_score(rfc, X_train_short_dtm, y_train_short, cv=3, scoring='roc_auc')

In [49]:
rfc.fit(X_train_short_dtm, y_train_short)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [50]:
pickle.dump(rfc, open('random_forest.pkl', 'wb'))

## TFIDF Clustering

In [11]:
# English words
words_corpus = set(words.words())
# Stop words
stop = set(stopwords.words('english'))
# English words minus stop words
acceptable_words = words_corpus - stop
analyzer = CountVectorizer().build_analyzer()
stem = SnowballStemmer('english')

def english_corpus(doc, stemmer=stem):
    return [stemmer.stem(w) for w in analyzer(doc) if w in acceptable_words]

tfidf = TfidfVectorizer(stop_words = "english",
                        strip_accents = 'ascii',
                        max_df = .10,
                        min_df = 3, 
                        tokenizer = english_corpus,
                        ngram_range=(1, 2))

In [12]:
X_train_short_tf = tfidf.fit_transform(X_train_short)

  sorted(inconsistent))


In [56]:
rfc = RandomForestClassifier(n_estimators = 100)

rfc_scores = cross_val_score(rfc, X_train_short_tf, y_train_short, cv=3, scoring='roc_auc')

In [13]:
nb = GaussianNB()

nb_scores = cross_val_score(nb, X_train_short_tf.toarray(), y_train_short, cv=3, scoring='roc_auc')

In [14]:
nb_scores

array([0.52520644, 0.52275651, 0.51909955])

In [57]:
rfc_scores

array([0.59729186, 0.5928255 , 0.58642293])

## Try clustering TF-IDF with KMeans / DBSCAN

In [33]:
km = KMeans(n_clusters = 14)

km.fit(X_train_short_tf)

features = pd.get_dummies(km.labels_)

rfc = RandomForestClassifier()

In [34]:
rfc_scores = cross_val_score(rfc, features, y_train_short, cv=3, scoring='roc_auc')



In [35]:
rfc_scores

array([0.53619818, 0.53488184, 0.53566778])

In [36]:
db = DBSCAN()

db.fit(X_train_short_tf)

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=5, n_jobs=None, p=None)

In [38]:
rfcdb = RandomForestClassifier()

cross_val_score()

array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
       129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
       142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
       155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
       168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 17