In [1]:
from theano.sandbox import cuda

In [1]:
%matplotlib inline
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

Using Theano backend.


In [73]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [3]:
# Presumably the directory where trained models are saved -- TODO confirm;
# it is not referenced by any of the cells shown in this section.
model_path = 'data/imdb/models/'

## Setup data

We're going to look at the IMDB dataset, which contains movie reviews from IMDB, along with their sentiment. Keras comes with some helpers for this dataset.

In [3]:
from keras.datasets import imdb
# Lookup from word (str) to its integer id; inverted later to decode reviews.
idx = imdb.get_word_index()

This is the word list:

In [4]:
# Vocabulary ordered by ascending id; display the ten lowest-id words.
idx_arr = sorted(idx.keys(), key=lambda word: idx[word])
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

...and this is the mapping from id to word

In [8]:
# Invert the word -> id lookup so that reviews (lists of ids) can be decoded
# back into text. `.items()` replaces the Python-2-only `.iteritems()` so the
# cell runs unchanged on both Python 2 and Python 3.
idx2word = {v: k for k, v in idx.items()}

We download the reviews using code copied from keras.datasets:

In [2]:
# Fetch the full pickled IMDB dataset (get_file returns the local path).
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
# NOTE(security): pickle.load can execute arbitrary code from the file it reads.
# Presumably the md5_hash passed to get_file guards the download's integrity --
# confirm before pointing this at any other source.
# The context manager closes the file handle, which the bare open() never did.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [9]:
# Number of training reviews.
len(x_train)

25000

Here's the 1st review. As you see, the words have been replaced by ids. The ids can be looked up in idx2word.

In [5]:
# Render the first training review's raw ids as one comma-separated string.
', '.join(str(word_id) for word_id in x_train[0])

'23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1, 169, 55, 14, 46, 82, 5869, 41, 393, 110, 138, 14, 5359, 58, 4477, 150, 8, 1, 5032, 5948, 482, 69, 5, 261, 12, 23022, 73935, 2003, 6, 73, 2436, 5, 632, 71, 6, 5359, 1, 25279, 5, 2004, 10471, 1, 5941, 1534, 34, 67, 64, 205, 140, 65, 1232, 63526, 21145, 1, 49265, 4, 1, 223, 901, 29, 3024, 69, 4, 1, 5863, 10, 694, 2, 65, 1534, 51, 10, 216, 1, 387, 8, 60, 3, 1472, 3724, 802, 5, 3521, 177, 1, 393, 10, 1238, 14030, 30, 309, 3, 353, 344, 2989, 143, 130, 5, 7804, 28, 4, 126, 5359, 1472, 2375, 5, 23022, 309, 10, 532, 12, 108, 1470, 4, 58, 556, 101, 12, 23022, 309, 6, 227, 4187, 48, 3, 2237, 12, 9, 215'

The first word of the first review is 23022. Let's see what that is.

In [10]:
# Decode a single id: 23022 is the first id of the first training review.
idx2word[23022]

'bromwell'

Here's the whole review, mapped from ids to words.

In [11]:
# Decode the first training review id-by-id into space-separated text.
' '.join(idx2word[word_id] for word_id in x_train[0])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

The labels are 1 for positive, 0 for negative.

In [20]:
def _to_text(review):
    """Return the words for a sequence of ids, joined by single spaces."""
    return ' '.join(idx2word[word_id] for word_id in review)


# Decoded plain-text reviews; these are what the vectorizer cells consume.
train = list(map(_to_text, x_train))
test = list(map(_to_text, x_test))

In [51]:
# Sanity check: first 20 characters of the decoded first review.
train[0][:20]

'bromwell high is a c'

In [13]:
# Labels of the first ten training reviews.
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

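Optionally, reduce the vocabulary size by mapping every rare word to the maximum index. (This step is currently disabled: the code in the next cell is commented out.)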

In [13]:
# Disabled on purpose: the vectorizer cells further down consume the decoded
# text held in `train` / `test`. Uncommenting this block would rebind `test`
# to arrays of ids and break those cells.
# vocab_size = 5000

# trn = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_train]
# test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_test]

In [42]:
# str.split matches the substring 'to' anywhere, including inside words
# ('cartoon' is cut into 'car' + 'on'), so this is not a split on the word "to".
train[0].split('to')

['bromwell high is a car',
 'on comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me ',
 " believe that bromwell high's satire is much closer ",
 ' reality than is teachers the scramble ',
 " survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried ",
 ' burn down the school i immediately recalled at high a classic line inspec',
 "r i'm here ",
 ' sack one of your teachers student welcome ',
 " bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"]

In [89]:
%%time
count_vect = CountVectorizer(lowercase=True,
                             stop_words='english', 
                             ngram_range=(1, 1), analyzer='word', 
                             max_df=1.0, min_df=1, max_features=5000,
                             binary=False)
X_train_counts = count_vect.fit_transform(train)
print (X_train_counts.shape)
X_test_counts = count_vect.transform(test)

for c in [.01, .05, .1, .2]:
    clf = LinearSVC(C=c, max_iter=5000)
    clf.fit(X_train_counts, labels_train)

    pred = clf.predict(X_test_counts)

    print (c, accuracy_score(pred, labels_test))

(25000, 5000)
0.1 0.84036
0.5 0.82284
1 0.8142
2 0.80708
5 0.80212
10 0.79896
CPU times: user 1min 14s, sys: 332 ms, total: 1min 15s
Wall time: 1min 15s



## Results for 1-gram + linear SVM

In [93]:
# Unigram bag-of-words counts (lower-cased, English stop words removed,
# vocabulary limited by max_features=5000) + linear SVM, small-C sweep.
count_vect = CountVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(1, 1), analyzer='word',
                             max_df=1.0, min_df=1, max_features=5000,
                             binary=False)
# Learn the vocabulary from the training text only, then reuse it for test.
X_train_counts = count_vect.fit_transform(train)
print(X_train_counts.shape)
X_test_counts = count_vect.transform(test)

for c in [0.0005, .001, .002, .005, .01]:
    # random_state fixes the solver's internal shuffling so reruns match.
    clf = LinearSVC(C=c, max_iter=5000, random_state=0)
    clf.fit(X_train_counts, labels_train)

    pred = clf.predict(X_test_counts)

    # accuracy_score is documented as (y_true, y_pred).
    print('c =', c, ' acc = ', accuracy_score(labels_test, pred))

c = 0.0005  acc =  0.86852
c = 0.001  acc =  0.87272
c = 0.002  acc =  0.875
c = 0.005  acc =  0.8702
c = 0.01  acc =  0.86676


## Results for 1,2-gram + linear SVM

In [94]:
# Unigrams and bigrams together (ngram_range=(1, 2)), still limited by
# max_features=5000, + linear SVM.
count_vect = CountVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(1, 2), analyzer='word',
                             max_df=1.0, min_df=1, max_features=5000,
                             binary=False)
# Learn the vocabulary from the training text only, then reuse it for test.
X_train_counts = count_vect.fit_transform(train)
print(X_train_counts.shape)
X_test_counts = count_vect.transform(test)

# NOTE(review): the saved output under this cell also lists c = 10, which is
# not in this grid -- re-run the cell to refresh it.
for c in [.001, .01, .1, 1]:
    # random_state fixes the solver's internal shuffling so reruns match.
    clf = LinearSVC(C=c, max_iter=5000, random_state=0)
    clf.fit(X_train_counts, labels_train)

    pred = clf.predict(X_test_counts)

    # accuracy_score is documented as (y_true, y_pred).
    print('c =', c, ' acc = ', accuracy_score(labels_test, pred))

(25000, 5000)
c = 0.001  acc =  0.87292
c = 0.01  acc =  0.86832
c = 0.1  acc =  0.84452
c = 1  acc =  0.8218
c = 10  acc =  0.80688


In [96]:
# Bigrams only (ngram_range=(2, 2)), with the vocabulary limit raised to
# max_features=10000, + linear SVM.
count_vect = CountVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(2, 2), analyzer='word',
                             max_df=1.0, min_df=1, max_features=10000,
                             binary=False)
# Learn the vocabulary from the training text only, then reuse it for test.
X_train_counts = count_vect.fit_transform(train)
print(X_train_counts.shape)
X_test_counts = count_vect.transform(test)

for c in [.001, .01, .1, 1]:
    # random_state fixes the solver's internal shuffling so reruns match.
    clf = LinearSVC(C=c, max_iter=5000, random_state=0)
    clf.fit(X_train_counts, labels_train)

    pred = clf.predict(X_test_counts)

    # accuracy_score is documented as (y_true, y_pred).
    print('c =', c, ' acc = ', accuracy_score(labels_test, pred))

(25000, 10000)
c = 0.001  acc =  0.78232
c = 0.01  acc =  0.80488
c = 0.1  acc =  0.78404
c = 1  acc =  0.75516


In [97]:
# Trigrams only (ngram_range=(3, 3)), vocabulary limited by
# max_features=10000, + linear SVM.
count_vect = CountVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(3, 3), analyzer='word',
                             max_df=1.0, min_df=1, max_features=10000,
                             binary=False)
# Learn the vocabulary from the training text only, then reuse it for test.
X_train_counts = count_vect.fit_transform(train)
print(X_train_counts.shape)
X_test_counts = count_vect.transform(test)

for c in [.001, .01, .1, 1]:
    # random_state fixes the solver's internal shuffling so reruns match.
    clf = LinearSVC(C=c, max_iter=5000, random_state=0)
    clf.fit(X_train_counts, labels_train)

    pred = clf.predict(X_test_counts)

    # accuracy_score is documented as (y_true, y_pred).
    print('c =', c, ' acc = ', accuracy_score(labels_test, pred))

(25000, 10000)
c = 0.001  acc =  0.64868
c = 0.01  acc =  0.68476
c = 0.1  acc =  0.68008
c = 1  acc =  0.66184
