In [49]:
#
# Imports

import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from sklearn.feature_extraction.text import *
import common
from importlib import reload
%matplotlib inline


In [47]:
#
# Load the dataset
#
reload(common)

(train_sentences, train_categories, test_sentences, test_categories) = common.load_dataset('../dataset/redis_dataset.csv', split=0.6)
print(train_sentences.shape)

#
# Vectorizers
#
cv = CountVectorizer()
train_cv = cv.fit_transform(train_sentences)
test_cv = cv.transform(test_sentences)

tf = TfidfVectorizer()
train_tf = tf.fit_transform(train_sentences)
test_tf = tf.transform(test_sentences)


9849
(5909,)


In [59]:
#
# NaiveBayes
#

#
# Bernoulli NB Model
#
alphas = {'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
grid = GridSearchCV(BernoulliNB(),alphas)
grid.fit(train_cv, train_categories)
print('Best BernoulliNB score with CV:', grid.best_score_, 'with alpha= ', grid.best_params_)

grid.fit(train_tf, train_categories)
print('Best BernoulliNB score with TF:', grid.best_score_, 'with alpha= ', grid.best_params_)

b_clf = BernoulliNB(alpha=grid.best_params_['alpha'])
b_clf.fit(train_tf, train_categories)
pred = b_clf.predict(test_tf)
print('\nAccuracy:', np.mean(pred == test_categories))
report = metrics.classification_report(pred, test_categories)
print(report)

Best BernoulliNB score with CV: 0.7134878998138433 with alpha=  {'alpha': 2.0}
Best BernoulliNB score with TF: 0.7134878998138433 with alpha=  {'alpha': 2.0}

Accuracy: 0.7368020304568528
             precision    recall  f1-score   support

          0       0.88      0.77      0.82      3070
          1       0.43      0.62      0.51       870

avg / total       0.78      0.74      0.75      3940



In [56]:
#
# Logistic Regression 
#
Cs = {'C': [.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 1]}
lg_clf = LogisticRegression(penalty='l2')
grid_search = GridSearchCV(lg_clf, Cs)
grid_search.fit(train_tf, train_categories)
print('Best Logistic regression score:', grid_search.best_score_, 'with C= ', grid_search.best_params_)

lg_clf = LogisticRegression(penalty='l2', C=1)
lg_clf.fit(train_tf, train_categories)
pred = lg_clf.predict(test_tf)
report = metrics.classification_report(pred, test_categories)
print(report)


Best Logistic regression score: 0.7656117786427483 with C=  {'C': 1}
             precision    recall  f1-score   support

          0       0.90      0.81      0.85      3003
          1       0.54      0.73      0.62       937

avg / total       0.82      0.79      0.80      3940

