In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import redditutils as ru
import word2vecReader as wvr

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

from skopt import BayesSearchCV


%load_ext autoreload
%autoreload 2

In [12]:
!ls word2vec_twitter_model

README                     word2vecReaderUtils.py
[34m__pycache__[m[m                word2vec_twitter_model.bin
word2vecReader.py


In [5]:
df = pd.read_csv('cleaned_shower.csv', index_col = 0)

  interactivity=interactivity, compiler=compiler, result=result)


In [30]:
df['quality'] = df['score'].apply(lambda x: ru.make_labels(x))

In [32]:
df.quality.mean()

0.3927000364998175

## Train test split

In [57]:
df = df[df['title'].notna()]

In [59]:
X = df['title']
y = df['quality']

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

In [61]:
X_train_short = X_train[:100000]
y_train_short = y_train[:100000]

In [74]:
X_train_shorter = X_train[:5000]
y_train_shorter = y_train[:5000]

In [80]:
X_test_shorter = X_test[:2500]
y_test_shorter = y_test[:2500]

## Word embedding with word2vec

In [22]:
model_path = "./word2vec_twitter_model.bin"
model = KeyedVectors.load_word2vec_format(model_path, binary=True, unicode_errors='ignore')

In [45]:
def make_features(words, model, num_features):
    features = np.zeros(num_features)
    
    model_vocab = set(model.index2word)
    
    num_words = 0
    
    # Loop over words in documents. If the word is in model's vocabulary,
    # generate its feature vector
    for w in words:
        if w in model_vocab:
            num_words += 1
            features = np.add(features, model[w])
            
    # Normalize the feature vector
    features = np.divide(features, num_words)
    
    return features

In [72]:
def document_vecs(docs, model, num_features):
    # Get the average feature vector for each showerthought based on the words it's comprised of
    counter = 0
    
    doc_vector = np.zeros((len(docs), num_features))
    
    for d in docs:
        if counter%100 == 0:
            print(f'Finished document number {counter}')
            
        # Add this document's feature vector to doc_vector
        doc_vector[counter] = make_features(d, model, num_features)
            
        counter += 1
    return doc_vector

In [81]:
features_test = document_vecs(X_test_shorter, model, 400)

Finished document number 0
Finished document number 100
Finished document number 200
Finished document number 300
Finished document number 400
Finished document number 500
Finished document number 600
Finished document number 700
Finished document number 800
Finished document number 900
Finished document number 1000
Finished document number 1100
Finished document number 1200
Finished document number 1300
Finished document number 1400
Finished document number 1500
Finished document number 1600
Finished document number 1700
Finished document number 1800
Finished document number 1900
Finished document number 2000
Finished document number 2100
Finished document number 2200
Finished document number 2300
Finished document number 2400


In [85]:
features_test

array([[-0.04595001,  0.11741913,  0.02367049, ..., -0.05944299,
         0.0996727 ,  0.1051121 ],
       [-0.05437983,  0.06098557,  0.00839084, ..., -0.09163244,
         0.12625733,  0.12745151],
       [-0.06536687,  0.04458294,  0.03854551, ..., -0.10071999,
         0.11806172,  0.18819486],
       ...,
       [ 0.01586874,  0.08002475,  0.03917049, ..., -0.06246087,
         0.10881239,  0.21049194],
       [-0.02163505,  0.00665951, -0.01421123, ..., -0.13325126,
         0.13243245,  0.2009986 ],
       [ 0.00686216,  0.06225939, -0.04909481, ..., -0.00733361,
         0.11381253,  0.14644653]])

In [88]:
features_train = document_vecs(X_train_shorter, model, 400)

Finished document number 0
Finished document number 100
Finished document number 200
Finished document number 300
Finished document number 400
Finished document number 500
Finished document number 600
Finished document number 700


  app.launch_new_instance()


Finished document number 800
Finished document number 900
Finished document number 1000
Finished document number 1100
Finished document number 1200
Finished document number 1300
Finished document number 1400
Finished document number 1500
Finished document number 1600
Finished document number 1700
Finished document number 1800
Finished document number 1900
Finished document number 2000
Finished document number 2100
Finished document number 2200
Finished document number 2300
Finished document number 2400
Finished document number 2500
Finished document number 2600
Finished document number 2700
Finished document number 2800
Finished document number 2900
Finished document number 3000
Finished document number 3100
Finished document number 3200
Finished document number 3300
Finished document number 3400
Finished document number 3500
Finished document number 3600
Finished document number 3700
Finished document number 3800
Finished document number 3900
Finished document number 4000
Finished doc

In [65]:
features.shape

(100000, 400)

## Throw it in to a Random Forest

Cluster if this doesn't work.

In [84]:
features_train

array([[-0.12027476,  0.09294142,  0.0002563 , ..., -0.0215239 ,
         0.11527485,  0.11532648],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [91]:
features_train.dtype

dtype('float64')

In [102]:
features_train[739]

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [109]:
features_train[3705]

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [111]:
len(features_train[~np.isnan(features_train)])

1999200

In [112]:
features_train_dropna = features_train[~np.isnan(features_train)]

In [118]:
len(y_train_shorter)

5000

In [130]:
features_train_dropna = np.delete(features_train, [739, 3705], axis=0)

In [121]:
y_train_shorter = np.array(y_train_shorter)

In [123]:
y_train_shorter = np.delete(y_train_shorter, [739, 3705])

In [104]:
np.where(np.isnan(features_train))

(array([ 739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739,  739,  739,  739,  739,  739,  739,
         739,  739,  739,  739,  739, 

In [125]:
features_train.shape

(5000, 400)

In [131]:
rfc = RandomForestClassifier()

score = cross_val_score(rfc, features_train_dropna, y_train_shorter, cv=3)

print(score)



[0.55728854 0.56422569 0.56816817]


Definitely some improvement with word2vec! ~.56 ROC AUC!

## Clustering

In [None]:
km = KMeans()

rfc = RandomForestClassifier()

pipe = make_pipeline(km, rfc)

tuning_params = {'randomforestclassifier__n_estimators': [50, 100, 150, 200],
                 'Kmeans__n_clusters': [i for i in range(1, 20)]}

bs = BayesSearchCV(pipe, tuning_params, cv=3, scoring='roc_auc')

bs.fit(features, y_train_shorter)