## Setup

In [69]:
%matplotlib inline
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, Convolution1D, MaxPooling1D
from keras.datasets import imdb
from keras.utils.data_utils import get_file
from keras.preprocessing import sequence
from keras.optimizers import Adam
import pickle

In [3]:
model_path = 'data/imdb/models/'
%mkdir -p $model_path

In [5]:
# List the contents of the current working directory.
%ls

Activations.ipynb  README.md          cnn/               img/
CNNs.ipynb         RNNs.ipynb         data/
Embeddings.ipynb   Untitled.ipynb     date/


### Setup IMDB data

In [6]:
# Fetch the IMDB word index: a mapping from word to integer id.
idx = imdb.get_word_index()

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [11]:
# Number of entries (distinct words) in the word index.
len(idx)

88584

In [12]:
# Word list ordered by ascending id; show the ten words with the smallest ids.
idx_arr = sorted(idx, key=lambda word: idx[word])
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

In [16]:
# Inverse lookup table: integer id -> word.
idx2word = dict(zip(idx.values(), idx.keys()))

In [21]:
# Download the pickled IMDB reviews; the expected MD5 hash is passed to get_file.
path = get_file(
    fname='imdb_full.pkl',
    origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
    md5_hash='d091312047c43cf9e4e38fef92437263',
)

# Binary file handle; it is read by pickle.load(f) in a later cell.
f = open(path, 'rb')

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_full.pkl


In [None]:
# Deserialize the train/test reviews and their labels from the downloaded pickle.
# The file is reopened inside a context manager so the handle is always closed
# and the cell can be re-run; the name `f` is rebound to this new handle.
# NOTE(security): pickle.load can execute arbitrary code -- only load files that
# come from a trusted source.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [34]:
# Number of reviews in the training split.
len(x_train)

25000

In [28]:
# Show the first training review as its comma-separated integer ids.
', '.join(str(token_id) for token_id in x_train[0])

'23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1, 169, 55, 14, 46, 82, 5869, 41, 393, 110, 138, 14, 5359, 58, 4477, 150, 8, 1, 5032, 5948, 482, 69, 5, 261, 12, 23022, 73935, 2003, 6, 73, 2436, 5, 632, 71, 6, 5359, 1, 25279, 5, 2004, 10471, 1, 5941, 1534, 34, 67, 64, 205, 140, 65, 1232, 63526, 21145, 1, 49265, 4, 1, 223, 901, 29, 3024, 69, 4, 1, 5863, 10, 694, 2, 65, 1534, 51, 10, 216, 1, 387, 8, 60, 3, 1472, 3724, 802, 5, 3521, 177, 1, 393, 10, 1238, 14030, 30, 309, 3, 353, 344, 2989, 143, 130, 5, 7804, 28, 4, 126, 5359, 1472, 2375, 5, 23022, 309, 10, 532, 12, 108, 1470, 4, 58, 556, 101, 12, 23022, 309, 6, 227, 4187, 48, 3, 2237, 12, 9, 215'

In [29]:
# Look up the word for id 23022, the first id of the review printed above.
idx2word[23022]

'bromwell'

In [30]:
# Decode the first training review back into words via the id -> word table.
' '.join(idx2word[token_id] for token_id in x_train[0])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

In [43]:
# Labels are 1 for positive, 0 for negative sentiment.
# Peek at the first ten training labels.
labels_train[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [46]:
# Reduce vocab size by mapping every rare word (id >= vocab_size - 1) to the
# max index, vocab_size - 1.
vocab_size = 5000


def cap_vocab(seqs, size=vocab_size):
    """Clip the ids of each sequence so that none exceeds ``size - 1``.

    Args:
        seqs: iterable of sequences of integer word ids.
        size: vocabulary size; ids of ``size - 1`` or more become ``size - 1``.

    Returns:
        A list with one NumPy array per input sequence.
    """
    # np.minimum clips a whole review at once instead of looping per token.
    return [np.minimum(s, size - 1) for s in seqs]


train = cap_vocab(x_train)
test = cap_vocab(x_test)

In [57]:
# Review-length statistics as (longest, shortest, mean).
lens = np.array([len(review) for review in train])
lens.max(), lens.min(), lens.mean()

(2493, 10, 237.71364)

In [59]:
# Bring every review to exactly seq_len ids, using 0 as the fill value.
# Reviews longer than seq_len end up cut down to it as well (the longest
# review above has 2493 ids, yet the resulting array is seq_len wide).
seq_len = 500
train, test = (
    sequence.pad_sequences(split, maxlen=seq_len, value=0)
    for split in (train, test)
)

In [60]:
# Check the padded training array's shape: (number of reviews, seq_len).
train.shape

(25000, 500)

## 1. Simple model: 1 hidden layer

In [66]:
# Baseline network: 32-dimensional embedding of each id, flattened into one
# vector, a 100-unit ReLU hidden layer with 50% dropout, and a single sigmoid
# output unit.
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=seq_len))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [67]:
# Configure training: Adam with its default settings, binary cross-entropy
# loss, and accuracy reported as a metric. Then print the layer summary.
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               1600100   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 1,760,201
Trainable params: 1,760,201
Non-trainable params: 0
_________________________________________________________________


In [68]:
# Train for two passes over the training data in batches of 64, validating
# against the test split.
# `epochs` is the Keras 2 name for this argument; the Keras 1 spelling
# `nb_epoch` is deprecated and no longer accepted by later releases.
model.fit(train,
          labels_train,
          validation_data=(test, labels_test),
          epochs=2,
          batch_size=64)



Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x12f6a89e8>

## 2. Simple CNN: 1 conv layer with max pooling

In [70]:
# CNN variant: embedding -> 1D convolution (64 filters, width 5, ReLU) ->
# max pooling -> 100-unit ReLU dense layer -> sigmoid output, with dropout
# after the embedding, after the convolution and after the dense layer.
# `padding='same'` is the Keras 2 name for the deprecated Keras 1 keyword
# `border_mode='same'`.
conv = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Dropout(0.2),
    Convolution1D(64, 5, padding='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])



In [72]:
# Configure the CNN for training: Adam with its default settings, binary
# cross-entropy loss, and accuracy reported as a metric.
conv.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)