In [1]:
import numpy as np
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Embedding, Conv1D, Dense, Dropout, GlobalMaxPool1D, MaxPooling1D, Activation, Flatten
from keras.models import Model

Using TensorFlow backend.


In [2]:
# Fixed sequence length, in characters: used as `maxlen` when the encoded
# reviews are padded/truncated and as the length of the model's input.
PAD_SEQUENCES = 1000

# Create char_to_index, index_to_char dicts

In [3]:
# Vocabulary: the 52 ASCII letters, lowercase 'a'-'z' followed by uppercase
# 'A'-'Z'. Built with range() so the builtin `id` is not shadowed.
valid_chars = [chr(code) for code in range(ord('a'), ord('z') + 1)]
valid_chars += [chr(code) for code in range(ord('A'), ord('Z') + 1)]

print(valid_chars)

# Forward and reverse lookup tables; a character's index is its position in
# valid_chars.
# NOTE(review): 'a' is assigned index 0, which is also the default fill value
# of keras' pad_sequences, so downstream padding is indistinguishable from the
# letter 'a'. Reserving 0 for padding would also require a larger Embedding
# input_dim - confirm before changing.
char_to_index = {ch: idx for idx, ch in enumerate(valid_chars)}
index_to_char = {idx: ch for idx, ch in enumerate(valid_chars)}

print("Len of chars ", len(valid_chars))

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
Len of chars  52


# Load data

In [4]:
# Fetch the IMDB reviews as sequences of word ids. The num_words cap is so
# large that it presumably keeps the whole vocabulary.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000000)

# get_word_index() ranks are shifted by 3 so they line up with the ids used
# in the loaded sequences (presumably load_data's default index_from=3).
word_index = {word: rank + 3 for word, rank in imdb.get_word_index().items()}

# Reverse lookup; the reserved ids 0, 1 and 2 decode to the empty string.
index_to_word = {rank: word for word, rank in word_index.items()}
index_to_word.update({0: "", 1: "", 2: ""})

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print(type(x_train))

25000 train sequences
25000 test sequences
<class 'numpy.ndarray'>


In [5]:
# One-hot encode the labels into two columns to match the model's
# 2-unit softmax output.
y_train, y_test = (
    to_categorical(labels, num_classes=2) for labels in (y_train, y_test)
)
print("shape ", y_train.shape)

shape  (25000, 2)


# Convert word-level reviews to character-level reviews

In [6]:
def encode_reviews_as_chars(reviews):
    """Turn word-id reviews into lists of character indexes.

    Each review is decoded to text by looking its word ids up in
    ``index_to_word`` and joining the words with spaces. Every character
    found in ``char_to_index`` is replaced by its index; all other
    characters (e.g. the joining spaces) are dropped.

    Args:
        reviews: iterable of reviews, each an iterable of word ids.

    Returns:
        A list with one list of character indexes per review.
    """
    encoded = []
    for review in reviews:
        sentence = ' '.join(index_to_word[word_id] for word_id in review)
        encoded.append(
            [char_to_index[ch] for ch in sentence if ch in char_to_index]
        )
    return encoded


# The encoded reviews have different lengths. dtype=object is needed because
# NumPy >= 1.24 raises ValueError when asked to build an array from ragged
# nested lists without it (older versions only emitted a deprecation warning).
x_train_char = np.array(encode_reviews_as_chars(x_train), dtype=object)
x_test_char = np.array(encode_reviews_as_chars(x_test), dtype=object)
print("Shape", x_train_char.shape)

Shape (25000,)


In [7]:
# Bring every encoded review to exactly PAD_SEQUENCES entries. pad_sequences
# is called with its default padding/truncating/value settings.
# NOTE(review): per the Keras docs the default fill value is 0, which is also
# the index assigned to 'a' in char_to_index - confirm this is acceptable.
x_train_char, x_test_char = (
    pad_sequences(sequences, maxlen=PAD_SEQUENCES)
    for sequences in (x_train_char, x_test_char)
)
print("Shape", x_train_char.shape)

Shape (25000, 1000)


# Construct model

In [8]:
def define_model():
    """Build and compile the character-level CNN classifier.

    Layers, in order: a 10-dimensional embedding over ``len(valid_chars)``
    indexes, dropout (0.2), a 1-D convolution with 200 filters of width 3,
    global max pooling, a 256-unit dense layer followed by dropout (0.2)
    and a ReLU activation, and a 2-unit softmax output.

    Returns:
        A Keras ``Model`` taking inputs of length ``PAD_SEQUENCES``,
        compiled with the Adam optimizer, categorical cross-entropy loss
        and the accuracy metric.
    """
    char_ids = Input(shape=(PAD_SEQUENCES,))

    # Character embedding followed by a single convolutional feature extractor.
    embedded = Embedding(input_dim=len(valid_chars), output_dim=10)(char_ids)
    embedded = Dropout(rate=0.2)(embedded)
    features = Conv1D(filters=200, kernel_size=3)(embedded)
    features = GlobalMaxPool1D()(features)

    # Fully connected head; dropout is applied before the ReLU, as in the
    # original layer order.
    hidden = Dense(units=256)(features)
    hidden = Dropout(rate=0.2)(hidden)
    hidden = Activation('relu')(hidden)
    pred = Dense(units=2, activation='softmax')(hidden)

    model = Model(inputs=char_ids, outputs=pred)
    model.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return model

In [9]:
# Instantiate the compiled model returned by define_model().
model = define_model()

In [10]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 10)          520       
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000, 10)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 998, 200)          6200      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               51456     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
__________

In [11]:
# Train for 10 epochs in mini-batches of 200 reviews, holding out 20% of the
# training data for validation (20000 train / 5000 validation samples).
model.fit(
    x=x_train_char,
    y=y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=200,
    verbose=1,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f272ead0550>

In [12]:
# Evaluate on the test set without progress output. The printed list holds
# the loss followed by the accuracy metric the model was compiled with.
score = model.evaluate(x=x_test_char, y=y_test, verbose=0)
print(score)

[0.46683025493621827, 0.77728]
