In [1]:
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/lemm_training.csv')
testing = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/lemm_testing.csv')

In [3]:
data.drop('Unnamed: 0', 1, inplace=True)
testing.drop('Unnamed: 0', 1, inplace=True)

In [4]:
data = data.dropna()
testing = testing.dropna()

In [5]:
data_y = data['sentiment']
data_x = data['lemm_text']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=.05, random_state=232)

In [6]:
n_dim = 300
#Initialize model and build vocab
w2v = Word2Vec(size=n_dim, min_count=10)
w2v.build_vocab(x_train)

In [7]:
w2v.train(x_train, total_examples=w2v.corpus_count, epochs=w2v.iter)

  """Entry point for launching an IPython kernel.


(49622377, 285825085)

In [8]:
#Build word vector for training set by using the average value of all word vectors in the tweet, then scale
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [9]:
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_train])
train_vecs = scale(train_vecs)

  import sys


In [10]:
w2v.train(x_test, total_examples=w2v.corpus_count, epochs=w2v.iter)

  """Entry point for launching an IPython kernel.


(2614364, 15066655)

In [11]:
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
test_vecs = scale(test_vecs)

  import sys


In [12]:
param_grid = {'max_depth': [10,15,20],
              'max_features': ['auto','sqrt','log2'],
              'min_samples_leaf': [25,50,75]}

rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf_cv = RandomizedSearchCV(rf, param_grid, n_iter=5, cv=3)
rf_cv.fit(train_vecs, y_train)
print(rf_cv)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=5, n_jobs=1,
          param_distributions={'max_depth': [10, 15, 20], 'max_features': ['auto', 'sqrt', 'log2'], 'min_samples_leaf': [25, 50, 75]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)


In [13]:
pred = rf_cv.predict(test_vecs)

In [14]:
print(rf_cv.score(test_vecs, y_test))

0.524940886452


In [15]:
print(classification_report(y_test, pred,
     target_names=['neg','pos']))

             precision    recall  f1-score   support

        neg       0.53      0.39      0.45     39579
        pos       0.52      0.65      0.58     39929

avg / total       0.53      0.52      0.52     79508



In [16]:
print(confusion_matrix(y_test, pred))

[[15609 23970]
 [13801 26128]]


In [17]:
print(roc_auc_score(y_test, pred))

0.524368648499


In [18]:
from sklearn.externals import joblib
joblib.dump(rf_cv, 'word2vec_rf_lemm.pkl') 

['word2vec_rf_lemm.pkl']