In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/stop_training.csv')
testing = pd.read_csv('/Users/matthewcassi/Documents/nhl_sentiment_working/sentiment_data/none/stop_testing.csv')

In [3]:
data.drop('Unnamed: 0', 1, inplace=True)
testing.drop('Unnamed: 0', 1, inplace=True)

In [4]:
data = data.dropna()
testing = testing.dropna()

In [5]:
data.dtypes, testing.dtypes

(sentiment     int64
 stop_text    object
 dtype: object, sentiment     int64
 stop_text    object
 dtype: object)

In [6]:
data_y = data['sentiment']
data_x = data['stop_text']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=.05, random_state=232)

In [7]:
tf_vec = TfidfVectorizer(min_df=20, max_df=.8)
train_x_tf = tf_vec.fit_transform(x_train)

svc = LinearSVC(C=3)
text_svc = Pipeline([('tf_vec', TfidfVectorizer()),
                    ('svc', LinearSVC()),
])

text_svc.fit(x_train, y_train)

print(text_svc)

Pipeline(memory=None,
     steps=[('tf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])


In [8]:
pred = text_svc.predict(x_test)

In [9]:
print(text_svc.score(x_test, y_test))

0.772354760049


In [10]:
print(classification_report(y_test, pred,
     target_names=['neg','pos']))

             precision    recall  f1-score   support

        neg       0.78      0.76      0.77     39701
        pos       0.77      0.78      0.78     39857

avg / total       0.77      0.77      0.77     79558



In [11]:
print(confusion_matrix(y_test, pred))

[[30205  9496]
 [ 8615 31242]]


In [12]:
print(roc_auc_score(y_test, pred))

0.772332171048


In [13]:
from sklearn.externals import joblib
joblib.dump(text_svc, 'svc_stop_tfidf_vec_nocv.pkl') 

['svc_stop_tfidf_vec_nocv.pkl']