In [19]:
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import scale
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/stop_training.csv')
testing = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/stop_testing.csv')

In [3]:
data.drop('Unnamed: 0', 1, inplace=True)
testing.drop('Unnamed: 0', 1, inplace=True)

In [4]:
data = data.dropna()
testing = testing.dropna()

In [5]:
data_y = data['sentiment']
data_x = data['stop_text']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=.05, random_state=232)

In [6]:
n_dim = 300
#Initialize model and build vocab
w2v = Word2Vec(size=n_dim, min_count=10)
w2v.build_vocab(x_train)

In [7]:
w2v.train(x_train, total_examples=w2v.corpus_count, epochs=w2v.iter)

  """Entry point for launching an IPython kernel.


(54525461, 314722155)

In [8]:
#Build word vector for training set by using the average value of all word vectors in the tweet, then scale
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [9]:
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_train])
train_vecs = scale(train_vecs)

  import sys


In [10]:
w2v.train(x_test, total_examples=w2v.corpus_count, epochs=w2v.iter)

  """Entry point for launching an IPython kernel.


(2871547, 16573585)

In [11]:
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
test_vecs = scale(test_vecs)

  import sys


In [17]:
loss = ['log','hinge']
penalty = ['l1','l2']
alpha = np.linspace(.00001, 1, 10)
param_grid = {'loss':loss,'penalty':penalty,'alpha':alpha}

sgd = SGDClassifier(n_jobs=-1)
sgd_cv = RandomizedSearchCV(sgd, param_grid, n_iter=10, cv=3)
sgd_cv.fit(train_vecs, y_train)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=-1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'loss': ['log', 'hinge'], 'penalty': ['l1', 'l2'], 'alpha': array([  1.00000e-05,   1.11120e-01,   2.22230e-01,   3.33340e-01,
         4.44450e-01,   5.55560e-01,   6.66670e-01,   7.77780e-01,
         8.88890e-01,   1.00000e+00])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [20]:
pred = sgd_cv.predict(test_vecs)

In [21]:
sgd_cv.score(test_vecs, y_test)

0.51543527992156668

In [22]:
print(classification_report(y_test, pred,
     target_names=['neg','pos']))

             precision    recall  f1-score   support

        neg       0.51      0.51      0.51     39718
        pos       0.52      0.52      0.52     39840

avg / total       0.52      0.52      0.52     79558



In [23]:
confusion_matrix(y_test, pred)

array([[20318, 19400],
       [19151, 20689]])

In [24]:
roc_auc_score(y_test, pred)

0.51542934098547377

In [25]:
from sklearn.externals import joblib
joblib.dump(sgd_cv, 'word2vec_sgd_stop.pkl') 

['word2vec_sgd_stop.pkl']