In [132]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import redditutils as ru
import word2vecReader as wvr

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

from skopt import BayesSearchCV
import pickle


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [139]:
# Temporary shim: keep until
# https://github.com/scikit-optimize/scikit-optimize/issues/718 is resolved.
# The imported skopt class is shadowed by a subclass that supplies its own
# `_run_search`.
# NOTE(review): presumably this exists so the class can be instantiated with
# the installed scikit-learn -- confirm against the linked issue.
class BayesSearchCV(BayesSearchCV):
    def _run_search(self, x):
        # Never expected to be reached; fail loudly if it is.
        raise BaseException('Use newer skopt')

In [12]:
!ls word2vec_twitter_model

README                     word2vecReaderUtils.py
[34m__pycache__[m[m                word2vec_twitter_model.bin
word2vecReader.py


In [5]:
# Load the cleaned data set; the first CSV column is used as the row index.
df = pd.read_csv(
    'cleaned_shower.csv',
    index_col=0,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [30]:
# Derive the `quality` label from each row's score with the project helper
# `ru.make_labels` (applied element by element).
df['quality'] = df['score'].apply(ru.make_labels)

In [32]:
# Mean of the `quality` column (bracket access, consistent with the cell
# that creates the column).
df['quality'].mean()

0.3927000364998175

## Train test split

In [57]:
# Discard rows that have no title; those cannot be embedded.
df = df.dropna(subset=['title'])

In [59]:
# Model input is the title column; the target is the derived quality label.
X, y = df['title'], df['quality']

In [60]:
# Stratified split: both partitions keep the class balance of `y`.
# The seed is fixed so that re-running the notebook reproduces the same rows.
# Later cells take positional slices of the training set, so an unseeded
# split would silently change which documents end up in those subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [61]:
# Smaller working set: the first 100,000 training rows, taken by position.
X_train_short, y_train_short = X_train.iloc[:100000], y_train.iloc[:100000]

In [74]:
# Even smaller working set: the first 5,000 training titles, by position.
X_train_shorter = X_train.iloc[:5000]

In [177]:
# Labels for the first 5,000 training rows, by position.
y_train_shorter = y_train.iloc[:5000]

In [80]:
# Matching small evaluation set: the first 2,500 test rows, by position.
X_test_shorter, y_test_shorter = X_test.iloc[:2500], y_test.iloc[:2500]

## Word embedding with word2vec

In [22]:
# Path of the pre-trained word2vec binary, relative to the notebook.
model_path = "./word2vec_twitter_model.bin"

# The file is in binary word2vec format; decoding errors are ignored
# (unicode_errors='ignore') rather than raised.
model = KeyedVectors.load_word2vec_format(
    model_path,
    binary=True,
    unicode_errors='ignore',
)

In [45]:
def make_features(words, model, num_features):
    """Average the embedding vectors of a document's in-vocabulary words.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document. A plain string is iterated character by
        character, so raw text must be tokenized before it is passed in.
    model : word-vector lookup
        Must support ``word in model`` and ``model[word]``; the latter must
        return a vector of length ``num_features``.
    num_features : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    numpy.ndarray
        float64 vector of shape ``(num_features,)`` holding the mean of the
        matched word vectors. If no word is in the vocabulary the vector is
        all NaN, so callers can detect and drop such documents.
    """
    total = np.zeros(num_features)
    matched = 0

    for w in words:
        # Membership is tested on the model itself. Rebuilding
        # ``set(model.index2word)`` on every call costs time proportional to
        # the vocabulary size for each document, and the attribute is not
        # available on newer gensim KeyedVectors.
        if w in model:
            total += model[w]
            matched += 1

    if matched == 0:
        # Same result as the former 0/0 division, without the RuntimeWarning.
        return np.full(num_features, np.nan)

    return total / matched

In [144]:
def document_vecs(docs, model, num_features):
    """Build one averaged word-embedding row per document.

    Parameters
    ----------
    docs : sized iterable
        The documents; each item is passed unchanged to ``make_features`` as
        its ``words`` argument. ``len(docs)`` must be defined.
    model : word-vector lookup
        Forwarded to ``make_features``.
    num_features : int
        Dimensionality of the embedding vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(docs), num_features)``; row ``i`` is the value
        ``make_features`` returned for the ``i``-th document.
    """
    doc_vector = np.zeros((len(docs), num_features))

    for row, doc in enumerate(docs):
        # Progress message every 100 documents.
        if row % 100 == 0:
            print(f'Finished document number {row}')

        doc_vector[row] = make_features(doc, model, num_features)

    return doc_vector

## Throw it into a Random Forest

If this doesn't work, try clustering the embeddings first (see the Clustering section below).

In [168]:
# Keep only the rows (documents) that contain no NaN.
# The mask has to be reduced to one boolean per row: indexing a 2-D array
# with an element-wise boolean mask of the same shape returns a flattened
# 1-D array of the selected elements instead of dropping rows.
features_train_dropna = features_train[~np.isnan(features_train).any(axis=1)]

In [176]:
len(y_train_shorter)

4996

In [170]:
# Drop the documents whose feature row contains NaN. The rows are located
# from the data instead of being hard-coded, because their positions depend
# on which documents the train/test split happened to select.
features_train_dropna = features_train[~np.isnan(features_train).any(axis=1)]

In [178]:
# Materialize the labels as a NumPy array so they can be handled by position.
y_train_shorter = np.array(y_train_shorter)

In [179]:
# Remove the labels of the documents whose feature row contains NaN, so the
# labels stay aligned with the filtered feature matrix.
nan_row_mask = np.isnan(features_train).any(axis=1)

# Only filter while the labels still line up one-to-one with the unfiltered
# features. This makes the cell safe to re-run: a second execution no longer
# deletes further (valid) labels.
if len(y_train_shorter) == len(nan_row_mask):
    y_train_shorter = np.asarray(y_train_shorter)[~nan_row_mask]

assert len(y_train_shorter) == int((~nan_row_mask).sum()), (
    'labels and features are out of sync; re-create y_train_shorter '
    'from y_train and run this cell once'
)

In [151]:
# Baseline: random forest on the averaged word2vec document vectors.
# The forest is seeded so the cross-validated score below is reproducible.
rfc = RandomForestClassifier(random_state=42)

# Candidate values for the hyper-parameter search.
tuning_params = {
    'n_estimators': [50, 100, 150, 200],
    'max_features': [2, 4, 6],
    'max_depth': [10, 30, 50],
}

bs = BayesSearchCV(rfc, tuning_params, cv=3, scoring='roc_auc')

# The search itself is currently disabled; only the untuned forest is scored.
#bs.fit(features_train_dropna, y_train_shorter)

# Mean ROC AUC over 3 cross-validation folds for the untuned forest.
scores = np.mean(
    cross_val_score(
        rfc,
        features_train_dropna,
        y_train_shorter,
        cv=3,
        scoring='roc_auc',
    )
)

print(scores)

opt = bs



0.4977906476937834


In [166]:
# Reload the cached feature matrix and labels.
# Context managers close the file handles deterministically.
# NOTE: pickle.load can execute arbitrary code -- only load files that were
# produced by this notebook.
with open('first_5k_words.pkl', 'rb') as fh:
    features = pickle.load(fh)

with open('first_5k_response.pkl', 'rb') as fh:
    response = pickle.load(fh)

In [180]:
# Untuned random forest, seeded so the score is reproducible between runs.
rfc = RandomForestClassifier(random_state=42)

# Mean ROC AUC over 3 cross-validation folds.
scores = np.mean(
    cross_val_score(
        rfc,
        features_train_dropna,
        y_train_shorter,
        cv=3,
        scoring='roc_auc',
    )
)

print(scores)



0.5017668723907978


In [191]:
features_train_dropna.shape

(4998, 400)

In [155]:
y_train_shorter

array([0, 1, 1, ..., 1, 0, 1])

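Definitely some improvement with word2vec: ~0.56 ROC AUC! (Note: the cross-validation outputs currently shown above are ≈0.50, i.e. chance level, so the 0.56 figure comes from a different run and should be re-verified.)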

In [133]:
# Cache the filtered feature matrix and its labels on disk.
# Context managers guarantee the files are flushed and closed; a handle left
# open by a bare open() call may leave a truncated pickle behind.
with open('first_5k_words.pkl', 'wb') as fh:
    pickle.dump(features_train_dropna, fh)

with open('first_5k_response.pkl', 'wb') as fh:
    pickle.dump(y_train_shorter, fh)

## Clustering

In [None]:
# K-means followed by a random forest. Inside a pipeline the k-means step
# acts as a transformer, so the forest is trained on its transformed output.
# Both estimators are seeded so the search is repeatable.
km = KMeans(random_state=42)

rfc = RandomForestClassifier(random_state=42)

pipe = make_pipeline(km, rfc)

# make_pipeline names every step after its lower-cased class name, so the
# parameter keys must start with 'kmeans__' / 'randomforestclassifier__'.
# A key such as 'Kmeans__...' does not match any step.
tuning_params = {
    'randomforestclassifier__n_estimators': [50, 100, 150, 200],
    'kmeans__n_clusters': list(range(1, 20)),
}

bs = BayesSearchCV(pipe, tuning_params, cv=3, scoring='roc_auc')

# `features` and `response` were pickled and reloaded as a pair, so they are
# guaranteed to be aligned; the in-memory `y_train_shorter` may come from a
# different split.
bs.fit(features, response)