# COUNTS VS. TF-IDF

In this notebook, we explore the effect of counts vs tf-idf representation on interpretability and accuracy.

In [1]:
from classifiers import TransparentLogisticRegression
import numpy as np
import glob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from time import time
from scipy.sparse.construct import diags
from utils import load_imdb, ColoredDoc, ColoredWeightedDoc, TopInstances
from IPython import display

In [2]:
print "Loading the data"
    
t0 = time()

vect = CountVectorizer(min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1))
X_train, y_train, X_test, y_test, train_corpus, test_corpus = load_imdb("C:\\Users\\mbilgic\\Desktop\\aclImdb", shuffle=True, vectorizer=vect)
feature_names = vect.get_feature_names()

duration = time() - t0

print
print "Loading took %0.2fs." % duration
print

Loading the data
Loading the imdb reviews data
Data loaded.
Extracting features from the training dataset using a sparse vectorizer
Feature extraction technique is CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None).
done in 5.531000s
n_samples: 25000, n_features: 27272

Extracting features from the test dataset using the same vectorizer
done in 5.030000s
n_samples: 25000, n_features: 27272


Loading took 301.40s.



In [3]:
X_train_counts = X_train
X_test_counts = X_test

In [4]:
tfidftransformer = TfidfTransformer()
X_train_tfidf = tfidftransformer.fit_transform(X_train)
X_test_tfidf = tfidftransformer.transform(X_test)

In [7]:
print "Fitting the classifiers"

t0 = time()

c=1
clf_counts = TransparentLogisticRegression(penalty='l1', C=c)
clf_tfidf = TransparentLogisticRegression(penalty='l1', C=c)

clf_counts.fit(X_train_counts, y_train)
clf_tfidf.fit(X_train_tfidf, y_train)

print "Fitting the classifiers took %0.2fs." % (duration)

Fitting the classifiers
Fitting the classifiers took 301.40s.


In [8]:
print "Accuracies"

from sklearn import metrics

y_pred = clf_counts.predict(X_test_counts)
print "Counts accuracy=%0.2f" %(metrics.accuracy_score(y_test, y_pred))
y_pred = clf_tfidf.predict(X_test_tfidf)
print "Tf-idf accuracy=%0.2f" %(metrics.accuracy_score(y_test, y_pred))


Accuracies
Counts accuracy=0.86
Tf-idf accuracy=0.88


In [14]:
print "Predicting the evidences"

t0 = time()

neg_evi, pos_evi = clf_counts.predict_evidences(X_test_counts)
tis_counts = TopInstances(neg_evi, pos_evi, clf_counts.intercept_)

neg_evi, pos_evi = clf_tfidf.predict_evidences(X_test_tfidf)
tis_tfidf = TopInstances(neg_evi, pos_evi, clf_tfidf.intercept_)


duration = time() - t0

print "Predicting evidences took %0.2fs." % duration

Predicting the evidences
Predicting evidences took 0.47s.


## Most Negative

In [15]:
display.display_html("<b>Counts<b>", raw=True)
j=tis_counts.most_negatives()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_counts.coef_[0]))

display.display_html("<b>Tf-idf<b>", raw=True)
j=tis_tfidf.most_negatives()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_tfidf.coef_[0]))

## Most Positive

In [16]:
display.display_html("<b>Counts<b>", raw=True)
j=tis_counts.most_positives()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_counts.coef_[0]))

display.display_html("<b>Tf-idf<b>", raw=True)
j=tis_tfidf.most_positives()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_tfidf.coef_[0]))

## Most opinionated

In [18]:
display.display_html("<b>Counts<b>", raw=True)
j=tis_counts.most_opinionateds()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_counts.coef_[0]))

display.display_html("<b>Tf-idf<b>", raw=True)
j=tis_tfidf.most_opinionateds()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_tfidf.coef_[0]))

## Least opinionated

In [19]:
display.display_html("<b>Counts<b>", raw=True)
j=tis_counts.least_opinionateds()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_counts.coef_[0]))

display.display_html("<b>Tf-idf<b>", raw=True)
j=tis_tfidf.least_opinionateds()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_tfidf.coef_[0]))

## Most conflicted

In [20]:
display.display_html("<b>Counts<b>", raw=True)
j=tis_counts.most_conflicteds()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_counts.coef_[0]))

display.display_html("<b>Tf-idf<b>", raw=True)
j=tis_tfidf.most_conflicteds()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_tfidf.coef_[0]))

## Least conflicted

In [21]:
display.display_html("<b>Counts<b>", raw=True)
j=tis_counts.least_conflicteds()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_counts.coef_[0]))

display.display_html("<b>Tf-idf<b>", raw=True)
j=tis_tfidf.least_conflicteds()[0]
display.display(ColoredWeightedDoc(test_corpus[j].decode('utf-8'), feature_names, clf_tfidf.coef_[0]))