In [14]:
import glob
import os
from time import time

import numpy as np
from scipy.sparse.construct import diags
from sklearn.feature_extraction.text import CountVectorizer

from classifiers import TransparentLogisticRegression
from utils import load_imdb, ColoredDoc, ColoredWeightedDoc

In [15]:
# Load the IMDB reviews, vectorize them with a bag-of-words CountVectorizer
# and report how long the whole step took.
print("Loading the data")

# Corpus location. The original machine-specific path stays as the default so
# existing runs are unchanged; set the IMDB_DATA_DIR environment variable to
# point the notebook at a different copy of the dataset.
imdb_dir = os.environ.get("IMDB_DATA_DIR", "C:\\Users\\Mustafa\\Desktop\\aclImdb")

t0 = time()

# Unigram counts; terms appearing in fewer than 5 documents are dropped.
vect = CountVectorizer(min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1))
X_train, y_train, X_test, y_test, train_corpus, test_corpus = load_imdb(
    imdb_dir, shuffle=True, vectorizer=vect)
# Vocabulary terms, reused by the later cells when rendering documents.
feature_names = vect.get_feature_names()

duration = time() - t0

# print("") emits the same blank line as a bare Python 2 `print`.
print("")
print("Loading took %0.2fs." % duration)
print("")

Loading the data
Loading the imdb reviews data
Data loaded.
Extracting features from the training dataset using a sparse vectorizer
Feature extraction technique is CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None).
done in 6.120000s
n_samples: 25000, n_features: 27272

Extracting features from the test dataset using the same vectorizer
done in 5.640000s
n_samples: 25000, n_features: 27272


Loading took 15.18s.



In [16]:
# Train the L1-penalised transparent logistic regression and time the fit.
print("Fitting the classifier")

t0 = time()
clf = TransparentLogisticRegression(C=1, penalty='l1')
clf.fit(X_train, y_train)
duration = time() - t0

# Blank line, timing summary, blank line.
print("")
print("Fitting took %0.2fs." % duration)
print("")

Fitting the classifier

Fitting took 3.55s.



In [17]:
# Compute the per-document evidence values on the test set and time the call.
# NOTE(review): neg_evi / pos_evi are presumably the summed negative and
# positive feature contributions per document; confirm against
# TransparentLogisticRegression.predict_evidences.
print("Predicting the evidences")

t0 = time()
neg_evi, pos_evi = clf.predict_evidences(X_test)
duration = time() - t0

print("")
print("Predicting evidences took %0.2fs." % duration)
print("")

Predicting the evidences

Predicting evidences took 0.29s.



In [18]:
# Compute class probabilities for every test document and time the call.
print("Predicting the probs")

t0 = time()
probs = clf.predict_proba(X_test)
duration = time() - t0

print("")
print("Predicting probs took %0.2fs." % duration)
print("")

Predicting the probs

Predicting probs took 0.02s.



In [19]:
# Total evidence per document: negative part + positive part + model intercept.
total_evi = neg_evi + pos_evi + clf.intercept_[0]

# Document indices ordered from the lowest to the highest total evidence.
evi_sorted = np.argsort(total_evi)

# Diagonal sparse matrix holding the model coefficients on its main diagonal.
coef_diags = diags(clf.coef_[0], 0)

In [20]:
# Show the test document with the lowest total evidence.
print("")
print("Most negative")
print("")

# evi_sorted is ascending, so its first entry is the minimum.
i = evi_sorted[0]
# Space-separated: total evidence, negative evidence, positive evidence, probabilities.
print("%s %s %s %s" % (total_evi[i], neg_evi[i], pos_evi[i], probs[i]))
# Last expression of the cell, so the notebook displays the rendered document.
ColoredWeightedDoc(test_corpus[i].decode('utf-8'), feature_names, clf.coef_[0])


Most negative

-52.7933845929 -99.4634012816 46.6943337096 [  1.00000000e+00   1.18065875e-23]


In [21]:
# Show the test document with the highest total evidence.
print("")
print("Most positive")
print("")

# evi_sorted is ascending, so its last entry is the maximum.
i = evi_sorted[-1]
# Space-separated: total evidence, negative evidence, positive evidence, probabilities.
print("%s %s %s %s" % (total_evi[i], neg_evi[i], pos_evi[i], probs[i]))
# Last expression of the cell, so the notebook displays the rendered document.
ColoredWeightedDoc(test_corpus[i].decode('utf-8'), feature_names, clf.coef_[0])


Most positive

45.2409779101 -21.6684833186 66.9337782496 [ 0.  1.]


In [22]:
# Show the test document with the smallest total absolute evidence.
print("")
print("Least opiniated (minimum total absolute evidence)")
print("")

# Magnitude of the negative evidence + positive evidence + |intercept|.
total_abs_evi = abs(neg_evi) + pos_evi + abs(clf.intercept_[0])
# Ascending order; also reused by the following "most opinionated" cell.
abs_evi_sorted = np.argsort(total_abs_evi)

i = abs_evi_sorted[0]
print("%s %s %s %s" % (total_evi[i], neg_evi[i], pos_evi[i], probs[i]))
# Last expression of the cell, so the notebook displays the rendered document.
ColoredWeightedDoc(test_corpus[i].decode('utf-8'), feature_names, clf.coef_[0])


Least opiniated (minimum total absolute evidence)

-0.418909810117 -0.468221027907 0.0736282387224 [ 0.60322235  0.39677765]


In [23]:
# Show the test document with the largest total absolute evidence.
print("")
print("Most opiniated (maximum total absolute evidence)")
print("")

# abs_evi_sorted is ascending, so its last entry is the maximum.
i = abs_evi_sorted[-1]
print("%s %s %s %s" % (total_evi[i], neg_evi[i], pos_evi[i], probs[i]))
# Last expression of the cell, so the notebook displays the rendered document.
ColoredWeightedDoc(test_corpus[i].decode('utf-8'), feature_names, clf.coef_[0])


Most opiniated (maximum total absolute evidence)

23.1992288439 -97.5471171853 120.77066305 [  8.40820746e-11   1.00000000e+00]


In [24]:
# Find the worst false negative: the document with the lowest total evidence
# whose label in y_test is 1.
print("")
print("Colossal false negative failure")
print("")

# Start from the "not found" sentinel so that neither an exhausted loop nor an
# empty evi_sorted can leave `i` pointing at an unrelated document.
i = -1
for candidate in evi_sorted:  # ascending total evidence
    if total_evi[candidate] > 0:
        # Evidence has turned positive: no false negative exists further on.
        break
    if y_test[candidate] == 1:
        i = candidate
        break

cd = None
if i != -1:
    # Space-separated: total evidence, negative evidence, positive evidence, probabilities.
    print("%s %s %s %s" % (total_evi[i], neg_evi[i], pos_evi[i], probs[i]))
    cd = ColoredWeightedDoc(test_corpus[i].decode('utf-8'), feature_names, clf.coef_[0])
# Last expression of the cell: displays the document, or nothing when cd is None.
cd


Colossal false negative failure

-20.1112229172 -51.1619349182 31.0750290219 [  9.99999998e-01   1.84419509e-09]


In [25]:
# Find the worst false positive: the document with the highest total evidence
# whose label in y_test is 0.
print("")
print("Colossal false positive failure")
print("")

# Start from the "not found" sentinel so that neither an exhausted loop nor an
# empty evi_sorted can leave `i` pointing at an unrelated document.
i = -1
for candidate in evi_sorted[::-1]:  # descending total evidence
    if total_evi[candidate] < 0:
        # Evidence has turned negative: no false positive exists further on.
        break
    if y_test[candidate] == 0:
        i = candidate
        break

cd = None
if i != -1:
    # Space-separated: total evidence, negative evidence, positive evidence, probabilities.
    print("%s %s %s %s" % (total_evi[i], neg_evi[i], pos_evi[i], probs[i]))
    cd = ColoredWeightedDoc(test_corpus[i].decode('utf-8'), feature_names, clf.coef_[0])
# Last expression of the cell: displays the document, or nothing when cd is None.
cd


Colossal false positive failure

22.8121406271 -20.0921101602 42.9285678082 [  1.23826283e-10   1.00000000e+00]


In [26]:
# Length of each test document as seen by the vectorizer: the sum of every
# row of X_test, flattened to a 1-D array.
document_lengths = X_test.sum(axis=1).A1

In [36]:
# Show the test document with the smallest absolute evidence per unit of length.
print("")
print("Least opiniated (minimum total absolute evidence) --- normalized by length")
print("")

# NOTE(review): a document whose document_lengths entry is 0 would be divided
# by zero here; confirm no such document exists in the test set.
norm_total_abs_evi = total_abs_evi / document_lengths
# Ascending order; also reused by the following "most opinionated" cell.
norm_abs_evi_sorted = np.argsort(norm_total_abs_evi)

i = norm_abs_evi_sorted[0]
summary = (total_abs_evi[i], document_lengths[i], norm_total_abs_evi[i])
print("Total abs evi: %0.3f; document length: %d; normalized abs evi: %0.3f" % summary)
# Last expression of the cell, so the notebook displays the rendered document.
ColoredWeightedDoc(test_corpus[i].decode('utf-8'), feature_names, clf.coef_[0])


Least opiniated (minimum total absolute evidence) --- normalized by length

Total abs evi: 1.737; document length: 38; normalized abs evi: 0.046


In [37]:
# Show the test document with the largest absolute evidence per unit of length.
print("")
print("Most opiniated (maximum total absolute evidence) --- normalized by length")
print("")

# norm_abs_evi_sorted is ascending, so its last entry is the maximum.
i = norm_abs_evi_sorted[-1]
summary = (total_abs_evi[i], document_lengths[i], norm_total_abs_evi[i])
print("Total abs evi: %0.3f; document length: %d; normalized abs evi: %0.3f" % summary)
# Last expression of the cell, so the notebook displays the rendered document.
ColoredWeightedDoc(test_corpus[i].decode('utf-8'), feature_names, clf.coef_[0])


Most opiniated (maximum total absolute evidence) --- normalized by length

Total abs evi: 2.985; document length: 6; normalized abs evi: 0.498
