In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/lemm_training.csv')
testing = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/lemm_testing.csv')

In [3]:
data.drop('Unnamed: 0', 1, inplace=True)
testing.drop('Unnamed: 0', 1, inplace=True)

In [4]:
data = data.dropna()
testing = testing.dropna()

In [5]:
data.dtypes, testing.dtypes

(sentiment     int64
 lemm_text    object
 dtype: object, sentiment     int64
 lemm_text    object
 dtype: object)

In [6]:
data_y = data['sentiment']
data_x = data['lemm_text']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=.05, random_state=232)

In [7]:
param_grid = {'rf__max_features': ['auto','sqrt','log2'],
              'rf__min_samples_leaf': [25,50,75],
             'tf_vec__min_df':[20,30,40],
             'tf_vec__max_df':[.8,.9],
             'tf_vec__ngram_range':[(1,1),(1,2),(1,3)]}

In [8]:
tf_vec = TfidfVectorizer()
train_x_tf = tf_vec.fit_transform(x_train)

rf = RandomForestClassifier(n_estimators=500)
text_rf = Pipeline([('tf_vec', TfidfVectorizer()),
                    ('rf', RandomForestClassifier()),
])

rf_gs = RandomizedSearchCV(text_rf, param_grid, n_iter=8, cv=4, n_jobs=-1)

rf_gs.fit(x_train, y_train)

print(rf_gs)

RandomizedSearchCV(cv=4, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('tf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
          fit_params=None, iid=True, n_iter=8, n_jobs=-1,
          param_distributions={'rf__max_features': ['auto', 'sqrt', 'log2'], 'rf__min_samples_leaf': [25, 50, 75], 'tf_vec__min_df': [20, 30, 40], 'tf_vec__max_df': [0.8, 0.9], 'tf_vec__ngram_range': [(1, 1), (1, 2), (1, 3)]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)


In [9]:
pred = rf_gs.predict(x_test)

In [10]:
print(rf_gs.score(x_test, y_test))

0.737104909002


In [11]:
print(classification_report(y_test, pred,
     target_names=['neg','pos']))

             precision    recall  f1-score   support

        neg       0.74      0.73      0.73     39718
        pos       0.73      0.75      0.74     39789

avg / total       0.74      0.74      0.74     79507



In [12]:
print(confusion_matrix(y_test, pred))

[[28915 10803]
 [10099 29690]]


In [13]:
print(roc_auc_score(y_test, pred))

0.737096792193


In [14]:
from sklearn.externals import joblib
joblib.dump(rf_gs, 'rf_lemm_tfidf_vec_ngrams_limited.pkl') 

['rf_lemm_tfidf_vec_ngrams_limited.pkl']