In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
pwd()

'/home/martinpella/Projects/imdb-review'

In [3]:
# Resolve every project directory relative to the notebook's working directory.
# os.path.join is used instead of hand-concatenating '/' so the separators are
# always correct for the platform the notebook runs on.
current_path = os.getcwd()
DATA_PATH = os.path.join(current_path, 'data')
train_path = os.path.join(DATA_PATH, 'train')
test_path = os.path.join(DATA_PATH, 'test')
results_path = os.path.join(current_path, 'results')

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, Dense, Dropout, Convolution1D, MaxPooling1D, SpatialDropout1D, Input, concatenate
from keras.optimizers import Adam

Using TensorFlow backend.


<h2>Setup data</h2>

In [5]:
# Load the pre-built train/test tables from disk.
# Later cells read the 'review' column (raw text) and the 'sentiment' column (label).
train_df = pd.read_csv(train_path + '/train_df.csv')
test_df = pd.read_csv(test_path + '/test_df.csv')

In [6]:
def shuffle_df(df, random_state=None):
    """Return the rows of ``df`` in random order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shuffle. It is not modified.
    random_state : int or None, optional
        Seed for a dedicated ``np.random.RandomState``, making the shuffle
        reproducible. When ``None`` (the default) NumPy's global generator is
        used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the same rows (with their original index labels)
        in permuted order.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Permute row *positions* (not labels) so any index type is supported.
    return df.iloc[rng.permutation(len(df))]

In [7]:
# shuffle train and test data
# Seed NumPy's global generator first: Keras' validation_split takes the *last*
# fraction of the rows, so an unseeded shuffle would change the train/validation
# split (and every reported metric) on each run.
np.random.seed(42)
train_df = shuffle_df(train_df)
test_df = shuffle_df(test_df)

Neural networks don't take plain text as input; they only work with numeric tensors. So the first thing we need to do is transform our reviews. One way is to apply word segmentation (tokenization), where the text is split into its component words. Each word is then transformed into a vector.
Keras provides built-in utilities to do this.

In [8]:
# Restrict the vocabulary to the most frequent words of the training reviews.
# NOTE: Keras only keeps word indices < num_words, i.e. the (max_words - 1) most
# common words; index 0 is never assigned to a word (it is used for padding later).
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
# Fit on the training reviews only, so the vocabulary is not influenced by the test set.
tokenizer.fit_on_texts(train_df['review'])
# Convert each review to a list of word indices; words outside the kept vocabulary are dropped.
sequences_train = tokenizer.texts_to_sequences(train_df['review'])
sequences_test = tokenizer.texts_to_sequences(test_df['review'])

In [9]:
# dictionary mapping words (str) to their index (int)
# NOTE: per the Keras Tokenizer docs this covers every word seen during fitting,
# not just the top `max_words`; the num_words cut-off is applied only when
# texts are converted to sequences.
word_index = tokenizer.word_index

In [10]:
# Reverse lookup table: word index (int) -> word (str), used to decode sequences back to text.
index2word = {idx: word for word, idx in word_index.items()}

In [11]:
# this is our first training review
', '.join(map(str, sequences_train[0]))

'685, 5, 1, 4, 3, 1, 2103, 1883, 1141, 23, 835, 463, 1135, 2, 410, 1063, 4, 921, 187, 33, 3108, 1, 348, 4939, 5, 145, 578, 8, 175, 3050, 389, 134, 1141, 184, 8, 1, 1355, 3249, 8, 1, 510, 4, 1747, 1624, 1534, 14, 4716, 453, 16, 843, 3866, 2154, 2, 3366, 14, 3169, 2, 985, 4716, 2932, 1, 23, 1, 61, 220, 20, 65, 887, 34, 78, 21, 202, 3, 862, 65, 159, 4717, 1, 25, 1442, 35, 5, 398, 53, 1, 862, 1509, 14, 7, 7, 469, 440, 6, 31, 32, 151, 4043, 2, 26, 38, 92, 159, 2, 1141, 1, 510, 2078, 182, 1534, 44, 2064, 3432, 5, 1, 427, 2, 24, 862, 23, 37, 2, 2, 1, 4716, 220, 166, 9, 875, 5, 16, 1, 7, 7, 149, 137, 227, 192, 80, 91, 202, 1768, 1355, 3249, 18, 9, 6, 3, 3210, 2997, 2, 1574, 5, 862, 19, 1534, 2, 1, 174, 3108, 70, 16, 862, 14, 2, 2052, 1751, 4201, 3230, 14, 1, 88, 903, 2155, 164, 3264, 2, 1048, 583, 2095, 3815, 1588, 1376, 1555, 225, 2, 588, 1588, 1196, 2124, 7, 7, 2933, 3264, 1534, 3866, 2154, 1509, 3366'

In [12]:
' '.join(index2word[o] for o in sequences_train[0])

"due to the of a the flesh eating zombies are brought under control and become members of society however they perform the dead attend to those living in us 1950s small while zombies around in the wild zone in the town of pre teen ray as robinson lives with parents carrie anne and baker as helen and bill robinson alas the are the only family on their street who do not own a zombie their new neighbors the have six so to keep up the zombie billy as br br unfortunately mr is by an old walker and he her then new and zombies the town meanwhile young ray has grown attached to the boy and his zombie are like and and the robinson family find it difficult to with the br br doesn't go far enough into its own intriguing wild zone but it is a colorful stylish and addition to zombie film ray and the cast perform well with zombie as and owner tim blake nelson as the most memorable pair director andrew and crew including rob gray design photography don music and james design won awards br br 2006 and

Let's look at the distribution of the sequence lengths.

In [13]:
# One integer array per review. np.array(s) already copies the token-id list,
# so the extra element-by-element comprehension is unnecessary.
x_train = [np.array(s) for s in sequences_train]
x_test = [np.array(s) for s in sequences_test]

In [14]:
# Number of tokens in each training review.
lens = np.array([len(seq) for seq in x_train])

In [15]:
(lens.max(), lens.min(), lens.mean())

(1982, 9, 214.02860000000001)

In [16]:
# Maximum sequence length: longer sequences will be truncated and shorter
# sequences will be padded with zeros at the beginning.
max_len = 500

In [17]:
# Turn the ragged lists of sequences into fixed-width 2-D integer arrays.
# pad_sequences defaults (padding='pre', truncating='pre') apply: zeros are
# inserted at the front and over-long reviews keep their last `max_len` tokens.
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)

In [18]:
x_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [19]:
x_train.shape

(25000, 500)

In [20]:
# Sentiment labels as column vectors of shape (n_samples, 1), matching the
# single-unit output layer of the models below.
labels_train = np.array(train_df['sentiment'])[:, np.newaxis]
labels_test = np.array(test_df['sentiment'])[:, np.newaxis]

In [21]:
labels_train.shape

(25000, 1)

<h2>Modeling</h2>

<h3>Simple linear model</h3>

"The Embedding layer is a dictionary mapping integer indices (that stand for specific words) to dense vectors. It takes as input integers, it looks up these integers into an internal dictionary, and it returns the associated vectors." From the book "Deep Learning with Python" of Francois Chollet.

In [22]:
# Baseline: 8-dimensional word embeddings learned from scratch, flattened into
# one long feature vector and fed to a single sigmoid unit.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=8, input_length=max_len))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [23]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 8)            40000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 4000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 4001      
Total params: 44,001
Trainable params: 44,001
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.fit(x_train, labels_train, validation_split=0.2, epochs=4, batch_size=64)

Train on 20000 samples, validate on 5000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f4f83502c18>

In [25]:
def save_model(model, fname):
    """Write the architecture of ``model`` to ``fname`` as JSON.

    Only the string returned by ``model.to_json()`` is stored; weights are not
    part of it and must be saved separately.

    Parameters
    ----------
    model : object
        Any object exposing ``to_json()`` that returns a string.
    fname : str
        Destination file path. Missing parent directories are created so a
        fresh checkout without the output folder does not fail.
    """
    # Serialise first: if this raises, nothing is created on disk.
    model_json = model.to_json()
    out_dir = os.path.dirname(fname)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(fname, 'w') as json_file:
        json_file.write(model_json)

In [26]:
save_model(model, results_path + '/models/linear1.json')

In [27]:
model.save_weights(results_path + '/models/linear1.h5')

<h3>Simple convolutional neural network</h3>

In [32]:
# Small 1-D CNN on top of 16-dimensional embeddings learned from scratch.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=16, input_length=max_len))
# Regularise the embedded sequence before the convolution.
model.add(SpatialDropout1D(0.2))
model.add(Dropout(0.2))
# 64 filters spanning 5 consecutive tokens; 'same' padding keeps the sequence length.
model.add(Convolution1D(64, 5, padding='same', activation='relu'))
model.add(Dropout(0.2))
model.add(MaxPooling1D())
model.add(Flatten())
# Classifier head.
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [33]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 16)           80000     
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 500, 16)           0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 16)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 64)           5184      
_________________________________________________________________
dropout_2 (Dropout)          (None, 500, 64)           0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 64)           0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 16000)             0         
__________

In [34]:
model.fit(x_train, labels_train, validation_split=0.2, epochs=4, batch_size=64)

Train on 20000 samples, validate on 5000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f4f5de96048>

In [35]:
save_model(model, results_path + '/models/conv1.json')
model.save_weights(results_path + '/models/conv1.h5')

<h3>Pre-trained word embeddings</h3>

In the previous cases, we started with randomly initialized word vectors. Let's try pre-trained word vectors instead. This is similar to using pre-trained convolutional neural networks (such as VGG16, ResNet, Inception, etc.) in computer vision: we want to take advantage of features that have already been learned.

Word2Vec, GloVe and fastText are some of the best-known pre-trained word vectors. In this case I will use GloVe, specifically the 50-dimensional "6B" vectors trained on English Wikipedia 2014 (plus Gigaword 5): https://nlp.stanford.edu/projects/glove/

In [36]:
# Directory holding the downloaded GloVe files. Set the GLOVE_DIR environment
# variable to point elsewhere instead of editing the notebook; the fallback is
# the original author's local download location.
glove_dir = os.environ.get('GLOVE_DIR', '/home/martinpella/Downloads/GloVe/Wikipedia2014')

In [37]:
# Parse the GloVe text file into {word: vector}. Each line is the word followed
# by its coefficients, separated by whitespace.
embeddings_index = {}
# The context manager closes the file even if parsing raises. The encoding is
# explicit because GloVe vocabularies include non-ASCII tokens and the platform
# default encoding is not guaranteed to be UTF-8.
with open(glove_dir + '/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [38]:
len(embeddings_index)

400000

The IMDB word_index created during tokenization and the GloVe embeddings_index use different indices. We need to build a matrix whose row i holds the GloVe vector of the word with IMDB index i (when GloVe has a vector for that word).

In [39]:
import re

In [41]:
def create_emb_matrix(max_words, embedding_dim):
    """Build an initial weight matrix for an Embedding layer.

    Reads the notebook-level ``word_index`` (word -> index) and
    ``embeddings_index`` (word -> pre-trained vector).

    Parameters
    ----------
    max_words : int
        Number of rows; words whose index is ``>= max_words`` are ignored.
    embedding_dim : int
        Number of columns, i.e. the length of each word vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_words, embedding_dim)``. Row ``i`` is the
        pre-trained vector of the word with index ``i`` when one exists,
        otherwise a draw from a normal distribution with standard deviation
        0.6. Rows whose index does not occur in ``word_index`` stay all-zero.
    """
    matrix = np.zeros((max_words, embedding_dim))
    for token, row in word_index.items():
        # Outside the kept vocabulary: nothing to fill in.
        if row >= max_words:
            continue
        vector = embeddings_index.get(token)
        if vector is None:
            # No pre-trained vector for this word: start from random noise.
            vector = np.random.normal(scale=0.6, size=(embedding_dim,))
        matrix[row] = vector
    return matrix

In [42]:
embedding_matrix = create_emb_matrix(max_words, 50)

In [50]:
# Same CNN layout as before, but with a 50-dimensional Embedding layer so it
# can hold the 50-d GloVe vectors, and heavier dropout.
model = Sequential()
model.add(Embedding(max_words, 50, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(Dropout(0.25))
model.add(Convolution1D(64, 5, padding='same', activation='relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.85))
model.add(Dense(1, activation='sigmoid'))

In [51]:
# Load the GloVe-initialised matrix into the Embedding layer.
# set_weights is a method and must be *called*: assigning a list to it merely
# overwrites the attribute and leaves the layer's random initial weights in place.
model.layers[0].set_weights([embedding_matrix])
# Keep the embeddings trainable for the first training stage (fine-tuning).
model.layers[0].trainable = True

In [52]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [53]:
model.fit(x_train, labels_train, validation_split=0.2, epochs=4, batch_size=64)

Train on 20000 samples, validate on 5000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f4f30405b38>

In [55]:
# Second stage: freeze the embeddings and continue with a much smaller learning rate.
model.layers[0].trainable = False
# Recompile so both changes take effect: Keras only honours a changed
# `trainable` flag after compile() is called again, and rebinding
# `model.optimizer.lr` to a Python float does not update the optimizer's
# learning-rate variable. Note that this also resets Adam's internal state.
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-5), metrics=['accuracy'])

In [57]:
model.fit(x_train, labels_train, validation_split=0.2, epochs=2, batch_size=64)

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4f0a1b6978>

In [54]:
save_model(model, results_path + '/models/conv2.json')
model.save_weights(results_path + '/models/conv2.h5')

<h3>Multi-size convolutional neural network</h3>

https://quid.com/feed/how-quid-uses-deep-learning-with-small-data

In [64]:
# Input of the multi-branch sub-model: one embedded review, i.e. max_len time
# steps of 50-dimensional vectors. The time axis is the sequence length
# (max_len), not the vocabulary size (max_words).
graph_in = Input(shape=(max_len, 50))

In [94]:
# Parallel convolution branches with kernel widths 3, 4 and 5 over the same
# input; each branch is max-pooled and flattened, then all are concatenated
# along the feature axis into one sub-model.
convs = []
for kernel_width in (3, 4, 5):
    branch = Convolution1D(64, kernel_width, padding='same', activation='relu')(graph_in)
    branch = MaxPooling1D()(branch)
    convs.append(Flatten()(branch))
graph_out = concatenate(convs, axis=1)
graph = Model(graph_in, graph_out)

In [115]:
# Full classifier: embeddings -> dropout -> multi-size convolution sub-model
# (`graph`) -> dense head with a sigmoid output.
model = Sequential()
model.add(Embedding(max_words, 50, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(Dropout(0.2))
model.add(graph)
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.85))
model.add(Dense(1, activation='sigmoid'))

In [116]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [117]:
model.fit(x_train, labels_train, validation_split=0.2, epochs=2, batch_size=64)

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4efbd67748>

In [118]:
save_model(model, results_path + '/models/conv3.json')
model.save_weights(results_path + '/models/conv3.h5')