In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/stop_training.csv')
testing = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/stop_testing.csv')

In [3]:
data.drop('Unnamed: 0', 1, inplace=True)
testing.drop('Unnamed: 0', 1, inplace=True)

In [4]:
data = data.dropna()
testing = testing.dropna()

In [5]:
data.dtypes, testing.dtypes

(sentiment     int64
 stop_text    object
 dtype: object, sentiment     int64
 stop_text    object
 dtype: object)

In [6]:
data_y = data['sentiment']
data_x = data['stop_text']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=.05, random_state=232)

In [7]:
nb_smoothing = np.linspace(0, 1, 9)
param_grid = {'nb__alpha': nb_smoothing,
             'tf_vec__min_df':[20,30,40],
             'tf_vec__max_df':[.8,.9],
             'tf_vec__ngram_range': [(1,1),(1,2)]}

In [8]:
tf_vec = TfidfVectorizer()
train_x_tf = tf_vec.fit_transform(x_train)

nb = MultinomialNB()
text_nb = Pipeline([('tf_vec', TfidfVectorizer()),
                    ('nb', MultinomialNB()),
])

nb_gs = RandomizedSearchCV(text_nb, param_grid, n_iter=15, cv=4, n_jobs=-1)

nb_gs.fit(x_train, y_train)

print(nb_gs)

RandomizedSearchCV(cv=4, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('tf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...True,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
          fit_params=None, iid=True, n_iter=15, n_jobs=-1,
          param_distributions={'nb__alpha': array([ 0.   ,  0.125,  0.25 ,  0.375,  0.5  ,  0.625,  0.75 ,  0.875,  1.   ]), 'tf_vec__min_df': [20, 30, 40], 'tf_vec__max_df': [0.8, 0.9], 'tf_vec__ngram_range': [(1, 1), (1, 2)]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)


In [9]:
pred = nb_gs.predict(x_test)

In [10]:
print(nb_gs.score(x_test, y_test))

0.772053093341


In [11]:
print(classification_report(y_test, pred,
     target_names=['neg','pos']))

             precision    recall  f1-score   support

        neg       0.77      0.77      0.77     39701
        pos       0.77      0.77      0.77     39857

avg / total       0.77      0.77      0.77     79558



In [12]:
print(confusion_matrix(y_test, pred))

[[30654  9047]
 [ 9088 30769]]


In [13]:
print(roc_auc_score(y_test, pred))

0.772053227426


In [14]:
from sklearn.externals import joblib
joblib.dump(nb_gs, 'nb_stop_tfidf_vec_ngrams.pkl')

['nb_stop_tfidf_vec_ngrams.pkl']