In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/lemm_training.csv')
testing = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/lemm_testing.csv')

In [3]:
data.drop('Unnamed: 0', 1, inplace=True)
testing.drop('Unnamed: 0', 1, inplace=True)

In [4]:
data = data.dropna()
testing = testing.dropna()

In [5]:
data.dtypes, testing.dtypes

(sentiment     int64
 lemm_text    object
 dtype: object, sentiment     int64
 lemm_text    object
 dtype: object)

In [6]:
data_y = data['sentiment']
data_x = data['lemm_text']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=.05, random_state=232)

In [7]:
nb_smoothing = np.linspace(0, 1, 10)
param_grid = {'nb__alpha': nb_smoothing}

In [8]:
count_vec = CountVectorizer()
train_x_count = count_vec.fit_transform(x_train)

tf_tran = TfidfTransformer()
train_x_tfidf = tf_tran.fit_transform(train_x_count)

nb = MultinomialNB()
text_nb = Pipeline([('count_vec', CountVectorizer()),
                      ('tf_tran', TfidfTransformer()),
                      ('nb', MultinomialNB()),
])

nb_gs = GridSearchCV(text_nb, param_grid, cv=4, n_jobs=-1)

nb_gs.fit(x_train, y_train)

print(nb_gs)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...linear_tf=False, use_idf=True)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'nb__alpha': array([ 0.     ,  0.11111,  0.22222,  0.33333,  0.44444,  0.55556,
        0.66667,  0.77778,  0.88889,  1.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)


In [9]:
pred = nb_gs.predict(x_test)

In [10]:
print(nb_gs.score(x_test, y_test))

0.754499603808


In [11]:
print(classification_report(y_test, pred,
     target_names=['neg','pos']))

             precision    recall  f1-score   support

        neg       0.75      0.76      0.76     39718
        pos       0.76      0.74      0.75     39789

avg / total       0.75      0.75      0.75     79507



In [12]:
print(confusion_matrix(y_test, pred))

[[30377  9341]
 [10178 29611]]


In [13]:
print(roc_auc_score(y_test, pred))

0.754508809019


In [14]:
from sklearn.externals import joblib
joblib.dump(nb_gs, 'nb_lemm_count_tfidf.pkl')

['nb_lemm_count_tfidf.pkl']