In [1]:
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/stop_training.csv')
testing = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/stop_testing.csv')

In [3]:
data.drop('Unnamed: 0', 1, inplace=True)
testing.drop('Unnamed: 0', 1, inplace=True)

In [4]:
data = data.dropna()
testing = testing.dropna()

In [5]:
data_y = data['sentiment']
data_x = data['stop_text']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=.05, random_state=232)

In [6]:
n_dim = 300
#Initialize model and build vocab
w2v = Word2Vec(size=n_dim, min_count=10)
w2v.build_vocab(x_train)

In [7]:
w2v.train(x_train, total_examples=w2v.corpus_count, epochs=w2v.iter)

  """Entry point for launching an IPython kernel.


(54522777, 314722155)

In [8]:
#Build word vector for training set by using the average value of all word vectors in the tweet, then scale
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [9]:
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_train])
train_vecs = scale(train_vecs)

  import sys


In [10]:
w2v.train(x_test, total_examples=w2v.corpus_count, epochs=w2v.iter)

  """Entry point for launching an IPython kernel.


(2871311, 16573585)

In [11]:
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
test_vecs = scale(test_vecs)

  import sys


In [13]:
nb_smoothing = np.linspace(0.001, 30, 20)
param_grid = {'C': nb_smoothing, 'max_iter':[100,125,150]}

lr = LogisticRegression(n_jobs=-1, penalty='l1',solver='saga')
lr_cv = RandomizedSearchCV(lr, param_grid, n_iter=15, cv=3)
lr_cv.fit(train_vecs, y_train)



RandomizedSearchCV(cv=3, error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=None, solver='saga', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=15, n_jobs=1,
          param_distributions={'C': array([  1.00000e-03,   1.57989e+00,   3.15879e+00,   4.73768e+00,
         6.31658e+00,   7.89547e+00,   9.47437e+00,   1.10533e+01,
         1.26322e+01,   1.42111e+01,   1.57899e+01,   1.73688e+01,
         1.89477e+01,   2.05266e+01,   2.21055e+01,   2.36844e+01,
         2.52633e+01,   2.68422e+01,   2.84211e+01,   3.00000e+01]), 'max_iter': [100, 125, 150]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [14]:
pred = lr_cv.predict(test_vecs)

In [15]:
lr_cv.score(test_vecs, y_test)

0.49919555544382715

In [16]:
print(classification_report(y_test, pred,
     target_names=['neg','pos']))

             precision    recall  f1-score   support

        neg       0.50      0.48      0.49     39718
        pos       0.50      0.52      0.51     39840

avg / total       0.50      0.50      0.50     79558



In [17]:
confusion_matrix(y_test, pred)

array([[19103, 20615],
       [19228, 20612]])

In [18]:
roc_auc_score(y_test, pred)

0.49916764343238301

In [19]:
from sklearn.externals import joblib
joblib.dump(lr_cv, 'word2vec_lr_l1_stop.pkl') 

['word2vec_lr_l1_stop.pkl']