# testing_classifiers

In [3]:
import numpy as np
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics

import pandas as pd
import connect_aws_db as cadb


In [18]:
engine = cadb.connect_aws_db(write_unicode=True)

In [19]:
categories = ['dogs', 'general']

### Restore BF Reviews and Ratings

In [20]:
cmd = "SELECT review_rating, review_text FROM bf_reviews"

In [21]:
bfdf = pd.read_sql_query(cmd, engine)

In [22]:
print(len(bfdf))
bfdf.head(5)

1750


Unnamed: 0,review_rating,review_text
0,2,When you first go to your room you notice the ...
1,2,We were going to the Pre-Westminster event tha...
2,2,While the room was not the fanciest that one c...
3,1,I stayed at the Hotel Penn in Manhattan becaus...
4,5,We loved this hotel! The are very friendly and...


### Restore Yelp Reviews and Ratings

In [23]:
cmd = "SELECT review_rating, review_text FROM yelp_reviews"

In [24]:
yelpdf = pd.read_sql_query(cmd, engine)

In [25]:
print(len(yelpdf))
yelpdf.head(5)

6263


Unnamed: 0,review_rating,review_text
0,4,I stayed at the hotel for several months durin...
1,5,I have been coming to Pittsburgh for quite a w...
2,4,This is a very good hotel and with a corporate...
3,4,"i got upgraded to a junior suite, and then aga..."
4,5,This is a fantastic hotel. I went to a conven...


### Create a training and test data sets

In [55]:
train_data = np.hstack((bfdf['review_text'].values[:1000],
                        yelpdf['review_text'].values[:1000]))

In [59]:
len(train_data)

2000

In [73]:
labels = ['dog'] * 1000
labels.extend(['general'] * 1000)

In [74]:
len(labels)

2000

In [68]:
y_train = labels

In [57]:
test_data = np.hstack((bfdf['review_text'].values[1000:],
                       yelpdf['review_text'].values[1000:1750]))

In [58]:
len(test_data)

1500

In [75]:
labels = ['dog'] * 750
labels.extend(['general'] * 750)

In [76]:
y_test = labels

In [77]:
len(y_test)

1500

In [28]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')

In [56]:
X_train = vectorizer.fit_transform(train_data)

In [60]:
X_test = vectorizer.transform(test_data)

In [61]:
feature_names = np.asarray(vectorizer.get_feature_names())

### Try RidgeClassifier

In [82]:
clf = RidgeClassifier(tol=1e-2, solver="lsqr")

In [83]:
print(clf)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, solver='lsqr', tol=0.01)


In [84]:
clf.fit(X_train, y_train)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, solver='lsqr', tol=0.01)

In [85]:
pred = clf.predict(X_test)

In [93]:
score = metrics.f1_score(y_test, pred, labels=None, pos_label=None)

  sample_weight=sample_weight)


In [92]:
len(y_test) == len(pred)

True

In [87]:
help(metrics.f1_score)

Help on function f1_score in module sklearn.metrics.classification:

f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None)
    Compute the F1 score, also known as balanced F-score or F-measure
    
    The F1 score can be interpreted as a weighted average of the precision and
    recall, where an F1 score reaches its best value at 1 and worst score at 0.
    The relative contribution of precision and recall to the F1 score are
    equal. The formula for the F1 score is::
    
        F1 = 2 * (precision * recall) / (precision + recall)
    
    In the multi-class and multi-label case, this is the weighted average of
    the F1 score of each class.
    
    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    
    labels : array
      

In [99]:
def benchmark(clf, pos_label=None):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred, pos_label=pos_label)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

#         if opts.print_top10 and feature_names is not None:
#             print("top 10 keywords per class:")
#             for i, category in enumerate(categories):
#                 top10 = np.argsort(clf.coef_[i])[-10:]
#                 print(trim("%s: %s"
#                       % (category, " ".join(feature_names[top10]))))
        print()

#     if opts.print_report:
#         print("classification report:")
#         print(metrics.classification_report(y_test, pred,
#                                             target_names=categories))

#     if opts.print_cm:
#         print("confusion matrix:")
#         print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

In [100]:
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, solver='lsqr', tol=0.01)
train time: 0.011s
test time:  0.000s
f1-score:   0.909
dimensionality: 10427
density: 1.000000
()
()
Perceptron
________________________________________________________________________________
Training: 
Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=50, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)
train time: 0.019s
test time:  0.001s
f1-score:   0.883
dimensionality: 10427
density: 0.409514
()
()
Passive-Aggressive
________________________________________________________________________________
Training: 
PassiveAggressiveClassifier(C=1.0, fit_intercept=True, loss='hinge',
              n_iter=50, n_jobs=1, random_state=None, shuffle=True,
            

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)



f1-score:   0.823
()


In [101]:
for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                            dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty=penalty)))



L2 penalty
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.001, verbose=0)
train time: 0.052s

  sample_weight=sample_weight)



test time:  0.000s
f1-score:   0.911
dimensionality: 10427
density: 1.000000
()
()
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
train time: 0.012s
test time:  0.000s
f1-score:   0.894
dimensionality: 10427
density: 0.675937
()
()
L1 penalty
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l1', random_state=None, tol=0.001, verbose=0)
train time: 0.034s
test time:  0.000s
f1-score:   0.898
dimensionality: 10427
density: 0.033087
()
()
_____________________

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)



test time:  0.001s
f1-score:   0.889
dimensionality: 10427
density: 0.081423
()
()


In [102]:
# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                       penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))


Elastic-Net penalty
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
train time: 0.056s
test time:  0.000s
f1-score:   0.890
dimensionality: 10427
density: 0.389086
()
()
NearestCentroid (aka Rocchio classifier)
________________________________________________________________________________
Training: 
NearestCentroid(metric='euclidean', shrink_threshold=None)
train time: 0.006s
test time:  0.001s
f1-score:   0.902
()
Naive Bayes
________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
train time: 0.003s
test time:  0.000s
f1-score:   0.873
dimensiona

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
