# NORMALIZED COUNTS VS TF-IDF

In [1]:
import glob
import os
from time import time

import numpy as np
from IPython import display
from scipy.sparse.construct import diags
from sklearn.feature_extraction.text import CountVectorizer

from classifiers import TransparentLogisticRegression
from utils import load_imdb, ColoredDoc, ColoredWeightedDoc, TopInstances

In [2]:
print("Loading the data")

# Root directory of the extracted aclImdb corpus. Set the IMDB_DATA_DIR
# environment variable to run on another machine; the fallback is the path
# the notebook was originally executed with.
imdb_dir = os.environ.get("IMDB_DATA_DIR", "C:\\Users\\Mustafa\\Desktop\\aclImdb")

t0 = time()

# Raw (non-binary) unigram counts; min_df=5 drops terms that occur in fewer
# than five documents, max_df=1.0 applies no upper document-frequency cut.
vect = CountVectorizer(min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1))
X_train, y_train, X_test, y_test, train_corpus, test_corpus = load_imdb(imdb_dir, shuffle=True, vectorizer=vect)
# Vocabulary terms of the fitted vectorizer; passed to ColoredWeightedDoc
# together with each model's coefficients further down.
feature_names = vect.get_feature_names()

duration = time() - t0

# Single-argument print(...) behaves the same as the Python 2 print statement.
print("")
print("Loading took %0.2fs." % duration)
print("")

Loading the data
Loading the imdb reviews data
Data loaded.
Extracting features from the training dataset using a sparse vectorizer
Feature extraction technique is CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None).
done in 5.994000s
n_samples: 25000, n_features: 27272

Extracting features from the test dataset using the same vectorizer
done in 5.377000s
n_samples: 25000, n_features: 27272


Loading took 14.77s.



In [3]:
# Empty containers that the following cells fill in: classifiers, their
# display names, the per-classifier train/test matrices, and the
# TopInstances objects built from the predicted evidences.
clfs, names, X_trains, X_tests, tis = [], [], [], [], []

In [4]:
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer

# Entries at the same index of names / clfs / X_trains / X_tests belong
# together: index 0 is the normalized-count model, index 1 the tf-idf model.
names.extend(["Normalized Counts", "Tf-idf"])

# Both models use the same L1-penalized configuration.
clfs.extend([
    TransparentLogisticRegression(penalty='l1', C=1.),
    TransparentLogisticRegression(penalty='l1', C=1.),
])

# The tf-idf transformer is fitted on the training counts only and then
# re-applied unchanged to the test counts.
tfidftransformer = TfidfTransformer()

X_trains.extend([
    normalize(csr_matrix(X_train, dtype=float)),  # l2-norm; normalize instances
    tfidftransformer.fit_transform(X_train),  # by default, take l2-norm of the instances
])
X_tests.extend([
    normalize(csr_matrix(X_test, dtype=float)),  # l2-norm; normalize instances
    tfidftransformer.transform(X_test),  # by default, take l2-norm of the instances
])

In [5]:
print("Fitting the classifiers")

t0 = time()

# Each classifier is trained on the training matrix stored at its own index;
# the labels are shared by every representation.
for idx, clf in enumerate(clfs):
    clf.fit(X_trains[idx], y_train)

duration = time() - t0

print("Fitting %d classifiers took %0.2fs." % (len(clfs), duration))

Fitting the classifiers
Fitting 2 classifiers took 4.61s.


In [6]:
print("Accuracies")

from sklearn import metrics

# Score every classifier on the test matrix that matches its representation.
for idx, clf in enumerate(clfs):
    y_pred = clf.predict(X_tests[idx])
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("%s \t accuracy %0.2f" % (names[idx], accuracy))

Accuracies
Normalized Counts 	 accuracy 0.87
Tf-idf 	 accuracy 0.88


In [7]:
print("Predicting the evidences")

t0 = time()

# For every classifier, obtain the negative/positive evidence returned by
# predict_evidences on its test matrix and wrap them, together with the
# model intercept, in a TopInstances object appended to tis.
# NOTE(review): re-running this cell appends again, so tis grows beyond
# len(clfs); later cells only read the first len(clfs) entries.
for idx, clf in enumerate(clfs):
    neg_evi, pos_evi = clf.predict_evidences(X_tests[idx])
    tis.append(TopInstances(neg_evi, pos_evi, clf.intercept_))

duration = time() - t0

print("Predicting evidences took %0.2fs." % duration)

Predicting the evidences
Predicting evidences took 0.70s.


## Most Negative

In [8]:
# For each model, show the test document listed first by most_negatives(),
# rendered with that model's coefficient vector.
for i in range(len(clfs)):
    # Caption with the model name; the closing tag is </b> so the bold
    # element is terminated instead of opening a second one.
    display.display_html("<b>" + names[i] + "</b>", raw=True)
    j = tis[i].most_negatives()[0]
    # The raw review text is decoded from UTF-8 before being rendered.
    display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clfs[i].coef_[0]))

## Most Positive

In [9]:
# For each model, show the test document listed first by most_positives(),
# rendered with that model's coefficient vector.
for i in range(len(clfs)):
    # Caption with the model name; the closing tag is </b> so the bold
    # element is terminated instead of opening a second one.
    display.display_html("<b>" + names[i] + "</b>", raw=True)
    j = tis[i].most_positives()[0]
    # The raw review text is decoded from UTF-8 before being rendered.
    display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clfs[i].coef_[0]))

## Most opinionated

In [10]:
# For each model, show the test document listed first by most_opinionateds(),
# rendered with that model's coefficient vector.
for i in range(len(clfs)):
    # Caption with the model name; the closing tag is </b> so the bold
    # element is terminated instead of opening a second one.
    display.display_html("<b>" + names[i] + "</b>", raw=True)
    j = tis[i].most_opinionateds()[0]
    # The raw review text is decoded from UTF-8 before being rendered.
    display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clfs[i].coef_[0]))

## Least opinionated

In [11]:
# For each model, show the test document listed first by least_opinionateds(),
# rendered with that model's coefficient vector.
for i in range(len(clfs)):
    # Caption with the model name; the closing tag is </b> so the bold
    # element is terminated instead of opening a second one.
    display.display_html("<b>" + names[i] + "</b>", raw=True)
    j = tis[i].least_opinionateds()[0]
    # The raw review text is decoded from UTF-8 before being rendered.
    display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clfs[i].coef_[0]))

## Most conflicted

In [12]:
# For each model, show the test document listed first by most_conflicteds(),
# rendered with that model's coefficient vector.
for i in range(len(clfs)):
    # Caption with the model name; the closing tag is </b> so the bold
    # element is terminated instead of opening a second one.
    display.display_html("<b>" + names[i] + "</b>", raw=True)
    j = tis[i].most_conflicteds()[0]
    # The raw review text is decoded from UTF-8 before being rendered.
    display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clfs[i].coef_[0]))

## Least conflicted

In [13]:
# For each model, show the test document listed first by least_conflicteds(),
# rendered with that model's coefficient vector.
for i in range(len(clfs)):
    # Caption with the model name; the closing tag is </b> so the bold
    # element is terminated instead of opening a second one.
    display.display_html("<b>" + names[i] + "</b>", raw=True)
    j = tis[i].least_conflicteds()[0]
    # The raw review text is decoded from UTF-8 before being rendered.
    display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clfs[i].coef_[0]))