In [1]:
import os
import random

import numpy as np
from sklearn.cross_validation import *
from sklearn.decomposition import *
from sklearn.ensemble import *
from sklearn.feature_extraction.text import *
from sklearn.linear_model import *
from sklearn.naive_bayes import *
from sklearn.metrics import *
from sklearn.pipeline import *

In [2]:
# Root folder of the review corpus; each sentiment class ('pos' / 'neg') is read
# from its own sub-folder below this path.
DATA_PATH_PREFIX = './txt_sentoken/'

In [3]:
# Seed Python's built-in `random` module, which is the generator that
# shuffle_in_unison draws from.
# NOTE(review): this does not seed NumPy or scikit-learn; the train/validation
# split passes its own random_state instead.
random.seed(5001)

In [4]:
def shuffle_in_unison(X, Y):
    """Return X and Y reordered by one shared random permutation.

    Both inputs must have the same length along their first axis; row i of
    the shuffled X still corresponds to row i of the shuffled Y. The inputs
    themselves are not modified: new, index-selected arrays are returned.

    The permutation is drawn from the stdlib `random` module, so it is
    controlled by `random.seed(...)`.
    """
    assert X.shape[0] == Y.shape[0]
    order = np.arange(X.shape[0])
    # In-place shuffle of the index vector; the data is gathered afterwards.
    random.shuffle(order)
    return X[order], Y[order]

In [5]:
def target_to_int(target):
    """Translate a sentiment class name into its integer label.

    'pos' maps to 1 and 'neg' maps to 0. Any other value raises ValueError.
    """
    if target == 'pos':
        label = 1
    elif target == 'neg':
        label = 0
    else:
        raise ValueError('Invalid label')
    return label

def load_reviews_for_target(target):
    """Read every review file stored under the folder of one sentiment class.

    The folder `DATA_PATH_PREFIX/<target>` is walked recursively and each file
    found is read in full.

    Parameters
    ----------
    target : str
        Sentiment class name, 'pos' or 'neg'; also the sub-folder name.

    Returns
    -------
    (X, Y) : tuple of lists
        X holds the text of each file, Y holds the integer label of `target`
        (as given by target_to_int) repeated once per file.

    Raises
    ------
    ValueError
        If `target` is not a recognised class name. The check happens up
        front, before any file is touched.
    """
    # The label is the same for every file, so resolve (and validate) it once.
    label = target_to_int(target)
    data_dir = os.path.join(DATA_PATH_PREFIX, target)

    X = []
    Y = []
    for folder, subfolders, filenames in os.walk(data_dir):
        # os.walk yields entries in arbitrary filesystem order; sorting both
        # the descent order and the file names makes the sample order (and
        # therefore any seeded split downstream) reproducible across machines.
        subfolders.sort()
        for filename in sorted(filenames):
            path = os.path.join(folder, filename)
            # Context manager closes the handle promptly instead of leaking it.
            with open(path, 'r') as review_file:
                X.append(review_file.read())
            Y.append(label)
    return X, Y

def load_reviews():
    """Load both sentiment classes and stack them into parallel NumPy arrays.

    Positive reviews come first, followed by the negative ones.

    Returns
    -------
    (X, Y) : tuple of numpy arrays
        X contains the review texts, Y the matching integer labels; both have
        the same length along the first axis.
    """
    pos_texts, pos_labels = load_reviews_for_target('pos')
    neg_texts, neg_labels = load_reviews_for_target('neg')

    # Each class must come back with exactly one label per text.
    assert len(pos_texts) == len(pos_labels)
    assert len(neg_texts) == len(neg_labels)

    X = np.hstack((pos_texts, neg_texts))
    Y = np.hstack((pos_labels, neg_labels))
    assert X.shape[0] == Y.shape[0]
    return X, Y

Load the raw movie-review data: `X` holds the review texts and `Y` the labels (1 = positive, 0 = negative).

In [6]:
# X: review texts, Y: integer labels (1 = 'pos', 0 = 'neg'), both NumPy arrays.
X, Y = load_reviews()

In [7]:
# Report how many samples were loaded. The parenthesised single-argument form
# prints the same text on Python 2 and is also valid Python 3.
print('X.shape = %s' % str(X.shape))
print('Y.shape = %s' % str(Y.shape))

X.shape = (2000,)
Y.shape = (2000,)


In [8]:
# Peek at sample 10: its label and the first 1000 characters of its text.
print('Target [%s]. Text : [%s]' % (Y[10], X[10][:1000]))

Target [1]. Text : [after watching " rat race " last week , i noticed my cheeks were sore and realized that , when not laughing aloud , i had held a grin for virtually all of the film's 112 minutes . 
saturday night , i attended another sneak preview for the movie and damned if i didn't enjoy it as much the second time as the first . 
 " rat race " is a great goofy delight , a dandy mix of energetic performances , inspired sight gags and flat-out silliness . 
hands down , this is the most fun film of the summer . 
the movie begins with zippy retro-style opening credits that look like they were torn straight out of a '60s slapstick comedy , featuring animated photos of the cast attached to herky-jerky bodies bounding around the screen . 
then comes the setup . 
donald sinclair ( john cleese ) , the extremely rich owner of the venetian hotel and casino in las vegas , enjoys concocting unusual bets for his high rolling clients . 
to that end , he places a half dozen very special tokens in

In [9]:
# Peek at sample 50: its label and the first 1000 characters of its text.
print('Target [%s]. Text : [%s]' % (Y[50], X[50][:1000]))

Target [1]. Text : [national lampoon's animal house , made in 1978 and set in 1962 , remains one of the -- no , fuck that noise -- * the * funniest movie ever made . 
and this isn't just my opinion , either ; everybody knows this , and that's why about a gazillion inferior rip-offs have been made , trying to duplicate its success . 
 ( pcu anyone ? 
and the first person to bring up glory daze gets decked . ) 
animal house takes place at the fictional faber college , circa 1962 , where the omega frat calls the shots . 
these guys are wholesome , clean-cut , model-citizens . . . 
i . e . 
a bunch of assholes . 
greg , their leader , is going out with mandy pepperidge , but since the silly bastard doesn't believe in pre-marital sex , their relationship never goes further than a quick jack-off under the stars . 
neidermeyer is the supreme-bozo of the bunch , walking around with his dick out kicking freshman ass and trying to impress the muff . 
also hanging around these losers is babs , fu

In [10]:
# Turn the raw texts into TF-IDF features over unigrams and bigrams, keeping at
# most 5000 features that occur in at least two documents.
# NOTE(review): the vectorizer is fitted on every review before the
# train/validation split further down, so validation documents contribute to
# the vocabulary and IDF weights.
vec = TfidfVectorizer(
    min_df=2,
    max_features=5000,
    ngram_range=(1, 2),
    binary=False,
    sublinear_tf=True,
    smooth_idf=True,
    norm='l2',
    strip_accents='unicode',
)
# X is rebound from the raw texts to the sparse feature matrix here, so this
# cell is not safe to re-run on its own without reloading the reviews first.
X = vec.fit_transform(X)

In [11]:
# Show the non-zero TF-IDF entries of the first review.
print(X[0])

  (0, 2097)	0.0539521213271
  (0, 1936)	0.0612564806551
  (0, 2182)	0.0623279500936
  (0, 650)	0.0291614645782
  (0, 377)	0.0620517670827
  (0, 392)	0.0617812789746
  (0, 1468)	0.0515358461756
  (0, 1834)	0.0529589178657
  (0, 4024)	0.0404558916491
  (0, 656)	0.0293183024144
  (0, 4894)	0.0206958547527
  (0, 4004)	0.0445677443688
  (0, 4153)	0.0426905176693
  (0, 2004)	0.0453090833243
  (0, 247)	0.0631932004655
  (0, 4026)	0.0631932004655
  (0, 3974)	0.0535172237602
  (0, 4461)	0.05568277434
  (0, 4072)	0.0291818136668
  (0, 4347)	0.051054389994
  (0, 1261)	0.0438660686487
  (0, 2542)	0.0467591946777
  (0, 3533)	0.0536605935858
  (0, 2226)	0.0430066343746
  (0, 387)	0.0470172661181
  :	:
  (0, 4270)	0.0169336679531
  (0, 632)	0.0405653412896
  (0, 4934)	0.0487793446691
  (0, 1617)	0.064775623557
  (0, 943)	0.056732791266
  (0, 4003)	0.0633747881269
  (0, 2326)	0.0420802584061
  (0, 4504)	0.0541003512485
  (0, 3015)	0.0375802299035
  (0, 3723)	0.0749184897
  (0, 453)	0.0565518475907
  (

Split (X, Y) into a training set (80%) and a validation set (20%).

In [12]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_cv, Y_train, Y_cv = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=5005,
)

In [13]:
# Baseline model: Bernoulli naive Bayes with smoothing parameter alpha = 1.
model = BernoulliNB(alpha=1)
model.fit(X_train, Y_train)
Y_cv_pred = model.predict(X_cv)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# Accuracy is symmetric, so the value is unchanged; the call now matches the API.
score = accuracy_score(Y_cv, Y_cv_pred)

In [14]:
# Validation accuracy of the naive Bayes baseline.
print(score)

0.8025


In [15]:
# First 50 true validation labels, for eyeballing against the predictions.
print(Y_cv[:50])

[0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 1 1 1 0 1 0 1 0 0 1 1 1 1
 1 0 1 1 0 0 1 0 1 1 0 0 0]


In [16]:
# First 50 predicted validation labels.
print(Y_cv_pred[:50])

[0 1 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 1 1 1
 1 0 0 1 0 1 1 0 1 0 0 0 0]


In [17]:
# Second model: logistic regression with the library's default settings.
# Note that `model` is rebound here, replacing the naive Bayes estimator.
model = LogisticRegression()

In [18]:
# Fit on the training split, predict the validation split, and score it.
model.fit(X_train, Y_train)
Y_cv_pred = model.predict(X_cv)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# Accuracy is symmetric, so the value is unchanged; the call now matches the API.
score = accuracy_score(Y_cv, Y_cv_pred)

In [19]:
# Validation accuracy of the logistic regression model.
print(score)

0.8625


In [20]:
# First 50 true validation labels (same values as shown for the previous model).
print(Y_cv[:50])

[0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 1 1 1 0 1 0 1 0 0 1 1 1 1
 1 0 1 1 0 0 1 0 1 1 0 0 0]


In [21]:
# First 50 validation labels predicted by the logistic regression model.
print(Y_cv_pred[:50])

[0 1 1 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 1 1 1 1
 1 0 1 1 0 1 1 0 1 0 0 0 0]
