In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/lemm_training.csv')
testing = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/lemm_testing.csv')

In [3]:
data.drop('Unnamed: 0', 1, inplace=True)
testing.drop('Unnamed: 0', 1, inplace=True)

In [4]:
data = data.dropna()
testing = testing.dropna()

In [5]:
data.dtypes, testing.dtypes

(sentiment     int64
 lemm_text    object
 dtype: object, sentiment     int64
 lemm_text    object
 dtype: object)

In [6]:
data_y = data['sentiment']
data_x = data['lemm_text']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=.05, random_state=232)

In [7]:
nb_smoothing = np.linspace(0.001, 30, 20)
param_grid = {'lr__C': nb_smoothing,
             'tf_vec__min_df':[20,30,40],
             'tf_vec__max_df':[.8,.9],
             'tf_vec__ngram_range':[(1,1),(1,2),(1,3)]}

In [8]:
tf_vec = TfidfVectorizer(max_features=100000)
train_x_tf = tf_vec.fit_transform(x_train)

lr = LogisticRegression()
text_lr = Pipeline([('tf_vec', TfidfVectorizer()),
                    ('lr', LogisticRegression()),
])

lr_gs = RandomizedSearchCV(text_lr, param_grid, n_iter=15, cv=4, n_jobs=-1)

lr_gs.fit(x_train, y_train)

print(lr_gs)

RandomizedSearchCV(cv=4, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('tf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=15, n_jobs=-1,
          param_distributions={'lr__C': array([  1.00000e-03,   1.57989e+00,   3.15879e+00,   4.73768e+00,
         6.31658e+00,   7.89547e+00,   9.47437e+00,   1.10533e+01,
         1.26322e+01,   1.42111e+01,   1.57899e+01,   1.73688e+01,
         1.89477e+01,   2.05266e+01,   2.21055e+01,   2.36844e+01,
         2.52633e+01,   2.68422e+01,   2.84211e+01,   3.00000e+01]), 'tf_vec__min_df': [20, 30, 40], 'tf_vec__max_df': [0.8, 0.9], 'tf

In [9]:
pred = lr_gs.predict(x_test)

In [10]:
print(lr_gs.score(x_test, y_test))

0.780912372496


In [11]:
print(classification_report(y_test, pred,
     target_names=['neg','pos']))

             precision    recall  f1-score   support

        neg       0.79      0.76      0.78     39718
        pos       0.77      0.80      0.79     39789

avg / total       0.78      0.78      0.78     79507



In [12]:
print(confusion_matrix(y_test, pred))

[[30236  9482]
 [ 7937 31852]]


In [13]:
print(roc_auc_score(y_test, pred))

0.780894844708


In [14]:
from sklearn.externals import joblib
joblib.dump(lr_gs, 'lr_lemm_tfidf_vec_ngrams_limited.pkl') 

['lr_lemm_tfidf_vec_ngrams_limited.pkl']