In [1]:
import glob
import os
from time import time

import numpy as np
from scipy.sparse.construct import diags
from sklearn.feature_extraction.text import CountVectorizer

from classifiers import TransparentLogisticRegression
from utils import load_imdb, ColoredDoc

In [3]:
# Load the IMDB reviews, vectorize them, and report how long it took.
# Single-argument parenthesised prints behave the same under the Python 2
# print statement and the Python 3 print function.
print("Loading the data")

# Corpus location: taken from the IMDB_DIR environment variable when it is set,
# otherwise the original hard-coded path is used, so existing setups keep working.
imdb_dir = os.environ.get("IMDB_DIR", "C:\\Users\\mbilgic\\Desktop\\aclImdb")

t0 = time()

# Binary unigram features; terms appearing in fewer than 5 documents are dropped.
vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
# NOTE(review): shuffle=True is passed with no seed visible in this cell --
# confirm how load_imdb seeds its shuffle if reproducible ordering matters.
X_train, y_train, X_test, y_test, train_corpus, test_corpus = load_imdb(
    imdb_dir, shuffle=True, vectorizer=vect)
# Column index -> token, used later to map coefficients back to words.
feature_names = vect.get_feature_names()

duration = time() - t0

# Blank line, timing message, blank line.
print("\nLoading took %0.2fs.\n" % duration)

Loading the data
Loading the imdb reviews data
Data loaded.
Extracting features from the training dataset using a sparse vectorizer
Feature extraction technique is CountVectorizer(analyzer=u'word', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None).
done in 5.536000s
n_samples: 25000, n_features: 27272

Extracting features from the test dataset using the same vectorizer
done in 5.059000s
n_samples: 25000, n_features: 27272


Loading took 123.01s.



In [5]:
# Train the transparent logistic regression model and time the fit.
print("Fitting the classifier")

t0 = time()
# L1-penalised model with C=1, fitted on the vectorized training split.
clf = TransparentLogisticRegression(penalty='l1', C=1)
clf.fit(X_train, y_train)
duration = time() - t0

# Blank line, timing message, blank line.
print("\nFitting took %0.2fs.\n" % duration)

Fitting the classifier

Fitting took 1.41s.



In [6]:
# Compute the per-document negative and positive evidence on the test split
# and time the call.
print("Predicting the evidences")

t0 = time()
neg_evi, pos_evi = clf.predict_evidences(X_test)
duration = time() - t0

# Blank line, timing message, blank line.
print("\nPredicting evidences took %0.2fs." % duration + "\n")

Predicting the evidences

Predicting evidences took 0.25s.



In [7]:
# Compute class probabilities for the test split and time the call.
print("Predicting the probs")

t0 = time()
probs = clf.predict_proba(X_test)
duration = time() - t0

# Blank line, timing message, blank line.
print("\nPredicting probs took %0.2fs." % duration + "\n")

Predicting the probs

Predicting probs took 0.03s.



In [8]:
# Total evidence per test document: negative part + positive part + intercept.
total_evi = (neg_evi + pos_evi) + clf.intercept_[0]

# Document indices ordered from lowest to highest total evidence.
evi_sorted = total_evi.argsort()

# Sparse matrix carrying the model coefficients on its main diagonal.
coef_diags = diags(clf.coef_[0], offsets=0)

In [9]:
# Build the coefficient vector used for highlighting: keep only the top_k
# lowest and top_k highest coefficients, and zero everything in between.
top_k = 500
highlight_coef = clf.coef_[0].copy()  # copy so the fitted model is not modified
coef_sorted = highlight_coef.argsort()  # feature indices, ascending by coefficient
highlight_coef[coef_sorted[top_k:-top_k]] = 0

In [10]:
# Show the test document with the lowest total evidence.
print("\nMost negative\n")

# evi_sorted is ascending, so its first entry has the smallest total evidence.
i = evi_sorted[0]
# Space-separated: total evidence, negative evidence, positive evidence, probabilities.
print(" ".join(str(value) for value in (total_evi[i], neg_evi[i], pos_evi[i], probs[i])))
# Last expression of the cell: rendered as the cell's rich output.
ColoredDoc(test_corpus[i].decode('utf-8'), feature_names, highlight_coef)


Most negative

-35.6438027503 -61.0991928595 25.2798031237 [  1.00000000e+00   3.31202153e-16]


In [11]:
# Show the test document with the highest total evidence.
print("\nMost positive\n")

# evi_sorted is ascending, so its last entry has the largest total evidence.
i = evi_sorted[-1]
# Space-separated: total evidence, negative evidence, positive evidence, probabilities.
print(" ".join(str(value) for value in (total_evi[i], neg_evi[i], pos_evi[i], probs[i])))
# Last expression of the cell: rendered as the cell's rich output.
ColoredDoc(test_corpus[i].decode('utf-8'), feature_names, highlight_coef)


Most positive

28.6445732852 -32.8606960322 61.3296823319 [  3.62820884e-13   1.00000000e+00]


In [12]:
# Show the test document with the smallest total absolute evidence.
print("\nLeast opiniated (minimum total absolute evidence)\n")

# |negative evidence| + positive evidence + |intercept| per document.
# NOTE(review): pos_evi is added without abs(), so it is presumably non-negative -- confirm.
total_abs_evi = np.abs(neg_evi) + pos_evi + abs(clf.intercept_[0])
# Document indices ordered from smallest to largest total absolute evidence.
abs_evi_sorted = total_abs_evi.argsort()

i = abs_evi_sorted[0]
# Space-separated: total evidence, negative evidence, positive evidence, probabilities.
print(" ".join(str(value) for value in (total_evi[i], neg_evi[i], pos_evi[i], probs[i])))
# Last expression of the cell: rendered as the cell's rich output.
ColoredDoc(test_corpus[i].decode('utf-8'), feature_names, highlight_coef)


Least opiniated (minimum total absolute evidence)

0.0618314494431 -0.239005181654 0.125249645542 [ 0.48454706  0.51545294]


In [13]:
# Show the test document with the largest total absolute evidence.
print("\nMost opiniated (maximum total absolute evidence)\n")

# abs_evi_sorted is ascending, so its last entry has the largest absolute evidence.
i = abs_evi_sorted[-1]
# Space-separated: total evidence, negative evidence, positive evidence, probabilities.
print(" ".join(str(value) for value in (total_evi[i], neg_evi[i], pos_evi[i], probs[i])))
# Last expression of the cell: rendered as the cell's rich output.
ColoredDoc(test_corpus[i].decode('utf-8'), feature_names, highlight_coef)


Most opiniated (maximum total absolute evidence)

-7.28048101812 -76.660883936 69.2048159323 [  9.99311620e-01   6.88379939e-04]


In [14]:
# Find the test document with the most negative total evidence whose true
# label is 1, i.e. the most confident mistake in the negative direction.
print("\nColossal false negative failure\n")

# `i` stays at the -1 sentinel unless a matching document is found. Setting it
# up front means that running off the end of the scan, or scanning an empty
# ordering, can no longer leave a non-matching or stale index in `i`.
i = -1
for doc in evi_sorted:  # ascending total evidence
    if total_evi[doc] > 0:
        # Evidence has turned positive: no remaining document is scored negative.
        break
    if y_test[doc] == 1:
        i = doc
        break

cd = None
if i != -1:
    # Space-separated: total evidence, negative evidence, positive evidence, probabilities.
    print(" ".join(str(value) for value in (total_evi[i], neg_evi[i], pos_evi[i], probs[i])))
    cd = ColoredDoc(test_corpus[i].decode('utf-8'), feature_names, highlight_coef)
# Last expression of the cell: the highlighted document, or None when nothing matched.
cd


Colossal false negative failure

-16.4873227176 -40.6188012959 23.9558915928 [  9.99999931e-01   6.91268381e-08]


In [15]:
# Find the test document with the most positive total evidence whose true
# label is 0, i.e. the most confident mistake in the positive direction.
print("\nColossal false positive failure\n")

# `i` stays at the -1 sentinel unless a matching document is found. Setting it
# up front means that running off the end of the scan, or scanning an empty
# ordering, can no longer leave a non-matching or stale index in `i`.
i = -1
for doc in evi_sorted[::-1]:  # descending total evidence
    if total_evi[doc] < 0:
        # Evidence has turned negative: no remaining document is scored positive.
        break
    if y_test[doc] == 0:
        i = doc
        break

cd = None
if i != -1:
    # Space-separated: total evidence, negative evidence, positive evidence, probabilities.
    print(" ".join(str(value) for value in (total_evi[i], neg_evi[i], pos_evi[i], probs[i])))
    cd = ColoredDoc(test_corpus[i].decode('utf-8'), feature_names, highlight_coef)
# Last expression of the cell: the highlighted document, or None when nothing matched.
cd


Colossal false positive failure

15.343606131 -12.6495115483 27.8175306936 [  2.16948389e-07   9.99999783e-01]
