In [255]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import redditutils as ru
import word2vecReader as wvr

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

from skopt import BayesSearchCV


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [256]:
df = pd.read_csv('cleaned_shower.csv', index_col = 0)

  interactivity=interactivity, compiler=compiler, result=result)


In [259]:
df['quality'] = df['score'].apply(lambda x: ru.make_labels(x))

In [262]:
df.to_csv('shower_clean.csv')

## Train test split

In [263]:
df = df[df['title'].notna()]

In [265]:
X = df['title']
y = df['quality']

In [276]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 2325)

In [277]:
X_train_short = X_train[:100000]
y_train_short = y_train[:100000]

In [278]:
X_train_shorter = X_train[:5000]
y_train_shorter = y_train[:5000]

In [279]:
X_test_shorter = X_test[:2500]
y_test_shorter = y_test[:2500]

## Word embedding with word2vec

In [270]:
model_path = "./word2vec_twitter_model.bin"
model = KeyedVectors.load_word2vec_format(model_path, binary=True, unicode_errors='ignore')

In [271]:
def make_features(words, model, num_features):
    features = np.zeros(num_features)
    
    model_vocab = set(model.index2word)
    
    num_words = 0
    
    # Loop over words in documents. If the word is in model's vocabulary,
    # generate its feature vector
    for w in words:
        if w in model_vocab:
            num_words += 1
            features = np.add(features, model[w])
            
    # Normalize the feature vector
    features = np.divide(features, num_words)
    
    return features

In [272]:
def document_vecs(docs, model, num_features):
    # Get the average feature vector for each showerthought based on the words it's comprised of
    counter = 0
    
    doc_vector = np.zeros((len(docs), num_features))
    
    for d in docs:
        if counter%100 == 0:
            print(f'Finished document number {counter}')
            
        # Add this document's feature vector to doc_vector
        doc_vector[counter] = make_features(d, model, num_features)
            
        counter += 1
    return doc_vector

In [81]:
features_test = document_vecs(X_test_shorter, model, 400)

Finished document number 0
Finished document number 100
Finished document number 200
Finished document number 300
Finished document number 400
Finished document number 500
Finished document number 600
Finished document number 700
Finished document number 800
Finished document number 900
Finished document number 1000
Finished document number 1100
Finished document number 1200
Finished document number 1300
Finished document number 1400
Finished document number 1500
Finished document number 1600
Finished document number 1700
Finished document number 1800
Finished document number 1900
Finished document number 2000
Finished document number 2100
Finished document number 2200
Finished document number 2300
Finished document number 2400


In [None]:
features_train = document_vecs(X_train_shorter[:1000], model, 400)

Finished document number 0
Finished document number 100
Finished document number 200
Finished document number 300
Finished document number 400
Finished document number 500
Finished document number 600
Finished document number 700
Finished document number 800


In [None]:
response_train = y_train[:1000]

In [None]:
def remove_bad_indices(features, response):
    # Remove document-response pairs that failed to be embedded in vector space
    bad_indices = list(np.unique(np.where(np.isnan(features, axis=0)[0])))
    
    features = np.delete(features, bad_indices, axis=0)
    response = np.delete(np.array(response), bad_indices)

## Throw it in to a Random Forest

Cluster if this doesn't work.

In [125]:
features_train.shape

(5000, 400)

In [131]:
rfc = RandomForestClassifier()

score = cross_val_score(rfc, features_train_dropna, y_train_shorter, cv=3)

print(score)



[0.55728854 0.56422569 0.56816817]


Definitely some improvement with word2vec! ~.56 ROC AUC!

## Clustering

In [None]:
km = KMeans()

rfc = RandomForestClassifier()

pipe = make_pipeline(km, rfc)

tuning_params = {'randomforestclassifier__n_estimators': [50, 100, 150, 200],
                 'Kmeans__n_clusters': [i for i in range(1, 20)]}

bs = BayesSearchCV(pipe, tuning_params, cv=3, scoring='roc_auc')

bs.fit(features, y_train_shorter)