In [2]:
import glob
import os
from time import time

import numpy as np
from scipy.sparse.construct import diags
from sklearn.feature_extraction.text import CountVectorizer

from classifiers import TransparentLogisticRegression

In [3]:
def _read_labeled_reviews(directory, label):
    """Read every ``*.txt`` file in `directory` and pair each one with `label`.

    Returns a ``(texts, labels)`` tuple of two equal-length lists.
    """
    texts = []
    # NOTE: glob returns matches in arbitrary (OS-dependent) order.
    for file_name in glob.glob(os.path.join(directory, "*.txt")):
        # `with` closes the handle even if read() raises.
        with open(file_name, 'r') as review_file:
            texts.append(review_file.read())
    return texts, [label] * len(texts)


def _read_split(path, split):
    """Load one split ("train" or "test"): negatives (0) first, then positives (1)."""
    neg_texts, neg_labels = _read_labeled_reviews(os.path.join(path, split, "neg"), 0)
    pos_texts, pos_labels = _read_labeled_reviews(os.path.join(path, split, "pos"), 1)
    return neg_texts + pos_texts, neg_labels + pos_labels


def load_imdb(path, shuffle=True, random_state=42, vectorizer=None):
    """Load the IMDB review corpus from disk and vectorize it.

    The directory layout read is ``<path>/{train,test}/{neg,pos}/*.txt``;
    files under ``neg`` get label 0 and files under ``pos`` get label 1.

    Parameters
    ----------
    path : str
        Root directory of the dataset.
    shuffle : bool
        When true, the rows of both splits (features, labels and raw texts)
        are permuted. The global NumPy RNG is seeded with `random_state`
        before the permutations are drawn.
    random_state : int
        Seed passed to ``np.random.seed`` when `shuffle` is true.
    vectorizer : object or None
        Object exposing ``fit_transform`` and ``transform``. It is fit on the
        training texts and then applied to the test texts. When ``None``, a
        fresh ``CountVectorizer(min_df=5, max_df=1.0, binary=True)`` is built
        for this call, so no fitted state is shared between calls.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, train_corpus, test_corpus)``
        where the two corpora are lists of raw texts in the same row order as
        the matrices and label arrays (shuffled only if `shuffle` is true).
    """
    if vectorizer is None:
        vectorizer = CountVectorizer(min_df=5, max_df=1.0, binary=True)

    print("Loading the imdb reviews data")

    train_corpus, y_train = _read_split(path, "train")
    test_corpus, y_test = _read_split(path, "test")

    print("Data loaded.")

    print("Extracting features from the training dataset using a sparse vectorizer")
    print("Feature extraction technique is %s." % vectorizer)
    t0 = time()

    X_train = vectorizer.fit_transform(train_corpus)

    duration = time() - t0
    print("done in %fs" % (duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print("")

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()

    X_test = vectorizer.transform(test_corpus)

    duration = time() - t0
    print("done in %fs" % (duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print("")

    y_train = np.array(y_train)
    y_test = np.array(y_test)

    # Without shuffling the corpora are returned in load order; binding the
    # names here keeps the return statement valid for shuffle=False.
    train_corpus_shuffled = train_corpus
    test_corpus_shuffled = test_corpus

    if shuffle:
        np.random.seed(random_state)

        indices = np.random.permutation(len(y_train))
        X_train = X_train.tocsr()[indices]
        y_train = y_train[indices]
        train_corpus_shuffled = [train_corpus[i] for i in indices]

        # Second draw from the same seeded stream, as before.
        indices = np.random.permutation(len(y_test))
        X_test = X_test.tocsr()[indices]
        y_test = y_test[indices]
        test_corpus_shuffled = [test_corpus[i] for i in indices]

    return X_train, y_train, X_test, y_test, train_corpus_shuffled, test_corpus_shuffled

In [64]:
import re

try:  # Python 3
    from html import escape as _escape_html
except ImportError:  # Python 2
    from cgi import escape as _escape_html


class ColoredDoc(object):
    """Render a document as HTML with each word colored by its coefficient sign.

    The document is split on single spaces. For every piece, the first match
    of `token_pattern` in the lower-cased piece is looked up in
    `feature_names`; the piece is drawn blue when the matching coefficient is
    positive, red when it is negative, and gray otherwise (zero coefficient,
    word not in the vocabulary, or no match at all).

    Parameters
    ----------
    doc : str
        Raw text to display.
    feature_names : iterable of str
        Vocabulary; the position of a name is the index used into `coefs`.
        If a name occurs more than once, its first position is used.
    coefs : sequence of numbers
        One coefficient per vocabulary entry.

    Raises
    ------
    IndexError
        From `_repr_html_` if `coefs` is shorter than the vocabulary position
        of a word that occurs in the document.
    """

    def __init__(self, doc, feature_names, coefs):
        self.doc = doc
        self.feature_names = feature_names
        self.coefs = coefs
        self.token_pattern = re.compile(r"(?u)\b\w\w+\b")
        # Name -> position map built once, so rendering does a constant-time
        # lookup per token instead of a linear scan of the vocabulary.
        # setdefault keeps the first position of a repeated name.
        self._feature_index = {}
        for position, name in enumerate(feature_names):
            self._feature_index.setdefault(name, position)

    def _color_for(self, token):
        """Return the font color ("blue", "red" or "gray") for one raw token."""
        matches = self.token_pattern.findall(token.lower())
        if not matches:
            return "gray"
        position = self._feature_index.get(matches[0])
        if position is None:
            return "gray"
        coef = self.coefs[position]
        if coef > 0:
            return "blue"
        if coef < 0:
            return "red"
        return "gray"

    def _repr_html_(self):
        """Build the HTML string: one ``<font>`` element per space-separated token."""
        pieces = []
        for token in self.doc.split(" "):
            # Escape &, < and > so markup inside the text (e.g. "<br") is shown
            # literally instead of corrupting the surrounding <font> tags.
            pieces.append("<font color=%s> %s </font>"
                          % (self._color_for(token), _escape_html(token, quote=False)))
        return "".join(pieces)

In [5]:
# Load and vectorize the IMDB reviews, timing the whole step.
print("Loading the data")

t0 = time()

# Binary unigram bag-of-words; terms must appear in at least 5 documents.
vect = CountVectorizer(binary=True, ngram_range=(1, 1), min_df=5, max_df=1.0)
imdb_root = "C:\\Users\\mbilgic\\Desktop\\aclImdb"
(X_train, y_train,
 X_test, y_test,
 train_corpus, test_corpus) = load_imdb(imdb_root, vectorizer=vect, shuffle=True)
feature_names = vect.get_feature_names()

duration = time() - t0

print("")
print("Loading took %0.2fs." % duration)
print("")

Loading the data
Loading the imdb reviews data
Data loaded.
Extracting features from the training dataset using a sparse vectorizer
Feature extraction technique is CountVectorizer(analyzer=u'word', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None).
done in 5.672000s
n_samples: 25000, n_features: 27272

Extracting features from the test dataset using the same vectorizer
done in 5.062000s
n_samples: 25000, n_features: 27272


Loading took 327.68s.



In [65]:
# Fit an L1-penalized transparent logistic regression and report the wall time.
print("Fitting the classifier")

t0 = time()
clf = TransparentLogisticRegression(C=0.01, penalty='l1')
clf.fit(X_train, y_train)
duration = time() - t0

print("")
print("Fitting took %0.2fs." % duration)
print("")

Fitting the classifier

Fitting took 0.53s.



In [66]:
# Compute the negative and positive evidence for the test set and time the call.
print("Predicting the evidences")

t0 = time()
neg_evi, pos_evi = clf.predict_evidences(X_test)
duration = time() - t0

print("")
print("Predicting evidences took %0.2fs." % duration)
print("")

Predicting the evidences

Predicting evidences took 0.10s.



In [67]:
# Compute class probabilities for the test set and time the call.
print("Predicting the probs")

t0 = time()
probs = clf.predict_proba(X_test)
duration = time() - t0

print("")
print("Predicting probs took %0.2fs." % duration)
print("")

Predicting the probs

Predicting probs took 0.03s.



In [68]:
# Sum of the two evidence values returned by predict_evidences.
# NOTE(review): presumably one value per test document — confirm against
# TransparentLogisticRegression.predict_evidences.
total_evi = neg_evi + pos_evi

# Indices that would sort total_evi in ascending order.
evi_sorted = np.argsort(total_evi)

# Sparse matrix with the first row of clf.coef_ on its main diagonal (offset 0).
coef_diags = diags(clf.coef_[0], 0)

In [69]:
# Display the raw text of the first test review (after load_imdb's shuffle).
test_corpus[0]

"This was an excellent show. It came on PBS back home in Chicago and I remember Cindy Herron (From EnVogue) played the teen aged daughter. The show dealt with subjects such as sex, peer pressure and puberty. IT was about a middle class black family who had a teen aged daughter and son who moved to a middle class neighborhood from Oakland or somewhere (I can't remember). I remember several episodes but the one I remember most was when their cousin got her period for the first time. I was probably 7-8 when I first watched it and I was able to keep up with the program. This was a great show. I can't remember the name of the guy who played the son on the show, but I always got him confused with Kevin Hooks."

In [71]:
# Render the same review with each word colored by the sign of its coefficient
# in the first row of clf.coef_ (blue > 0, red < 0, gray otherwise).
ColoredDoc(test_corpus[0], feature_names, clf.coef_[0])