### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Hide FutureWarning messages raised after this point. append=1 puts the
# filter at the end of the filter list, so filters that are already
# registered keep their priority.
warnings.filterwarnings("ignore", category=FutureWarning, append=1)

### Import the IMDB data

In [2]:
from keras.datasets import imdb

Using TensorFlow backend.


In [3]:
# Load the IMDB reviews without a `num_words` cap, i.e. with the full
# vocabulary; the loader returns separate train and test splits of
# (reviews, labels).
(X_train, y_train), (X_test, y_test) = imdb.load_data()

### Exploring the data

In [4]:
# Merge the train and test splits so the exploration below covers every review.
X = np.concatenate([X_train, X_test])
y = np.concatenate([y_train, y_test])

In [5]:
# Put the reviews into a single-column DataFrame and count its non-null entries.
X_df = pd.DataFrame(X)
X_df.count()

0    50000
dtype: int64

In [6]:
# Count the labels. The frame has to be built from `y`; building it from `X`
# only repeats the review count and says nothing about the labels.
y_df = pd.DataFrame(y)
y_df.count()

0    50000
dtype: int64

In [7]:
# Distinct label values present in the combined data set
print(np.unique(y))

[0 1]


In [8]:
# Vocabulary size: flatten all reviews into one array and count the distinct word indices
print(np.unique(np.hstack(X)).size)

88585


In [9]:
# Number of word indices in each review
rev_len = list(map(len, X))
rev_len

[218,
 189,
 141,
 550,
 147,
 43,
 123,
 562,
 233,
 130,
 450,
 99,
 117,
 238,
 109,
 129,
 163,
 752,
 212,
 177,
 129,
 140,
 256,
 888,
 93,
 142,
 220,
 193,
 171,
 221,
 174,
 647,
 233,
 162,
 597,
 234,
 51,
 336,
 139,
 231,
 704,
 142,
 861,
 132,
 122,
 570,
 55,
 214,
 103,
 186,
 113,
 169,
 469,
 138,
 302,
 766,
 351,
 146,
 59,
 206,
 107,
 152,
 186,
 431,
 147,
 684,
 383,
 324,
 252,
 263,
 787,
 211,
 314,
 118,
 390,
 132,
 710,
 306,
 167,
 115,
 95,
 158,
 156,
 82,
 502,
 314,
 190,
 174,
 60,
 145,
 214,
 659,
 408,
 515,
 461,
 202,
 238,
 170,
 107,
 171,
 158,
 145,
 790,
 258,
 287,
 67,
 123,
 975,
 775,
 236,
 195,
 274,
 214,
 91,
 1038,
 815,
 183,
 206,
 50,
 118,
 147,
 141,
 60,
 56,
 439,
 439,
 213,
 144,
 533,
 303,
 203,
 563,
 129,
 153,
 55,
 92,
 174,
 187,
 183,
 165,
 78,
 198,
 156,
 223,
 127,
 61,
 362,
 84,
 57,
 176,
 159,
 57,
 159,
 165,
 213,
 194,
 149,
 130,
 203,
 19,
 98,
 466,
 525,
 130,
 322,
 153,
 408,
 215,
 472,
 143,
 1

In [10]:
# Mean review length (number of word indices per review)
np.mean(rev_len)

234.75891999999999

In [11]:
# Standard deviation of the review lengths (np.std defaults to the population form, ddof=0)
np.std(rev_len)

172.91149458735703

### Multi-layer Perceptron

In [12]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [13]:
# Reload the dataset keeping only the 10000 most frequent words. Rarer words
# are not set to zero: Keras replaces them with its out-of-vocabulary index
# (`oov_char`, 2 by default). This rebinds the train/test splits loaded earlier.
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [14]:
# Force every review to exactly 450 word indices. With the pad_sequences
# defaults, shorter reviews are zero-padded at the front and longer ones are
# truncated from the front (the last 450 indices are kept).
X_train = sequence.pad_sequences(X_train, maxlen=450)
X_test = sequence.pad_sequences(X_test, maxlen=450)

In [15]:
# MLP baseline: 32-dimensional word embedding -> flatten -> 250-unit ReLU layer -> sigmoid output
mlp = Sequential([
    Embedding(top_words, 32, input_length=450),
    Flatten(),
    Dense(250, activation='relu'),
    Dense(1, activation='sigmoid'),
])
mlp.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [16]:
# Train for 2 epochs in batches of 128. NOTE: the test split is passed as the
# validation data, so val_acc is measured on the same samples that are used
# for the evaluation of this model afterwards.
mlp.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
 - 51s - loss: 0.4990 - acc: 0.7141 - val_loss: 0.3001 - val_acc: 0.8701
Epoch 2/2
 - 43s - loss: 0.1639 - acc: 0.9388 - val_loss: 0.3088 - val_acc: 0.8724


<keras.callbacks.History at 0x23dd6e04518>

In [17]:
# The model was compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy]; show the test accuracy as a percentage.
score = mlp.evaluate(X_test, y_test, verbose=0)
score[1]*100

87.244

### Convolutional Neural Network

In [18]:
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [19]:
# CNN: word embedding -> 1D convolution (32 filters, width 3) -> max pooling
# -> flatten -> 250-unit ReLU layer -> sigmoid output
cnn = Sequential([
    Embedding(top_words, 32, input_length=450),
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(250, activation='relu'),
    Dense(1, activation='sigmoid'),
])
cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [20]:
# Train for 2 epochs in batches of 128. NOTE: the test split is passed as the
# validation data, so val_acc is measured on the same samples that are used
# for the evaluation of this model afterwards.
cnn.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
 - 53s - loss: 0.4736 - acc: 0.7395 - val_loss: 0.2719 - val_acc: 0.8874
Epoch 2/2
 - 52s - loss: 0.1926 - acc: 0.9264 - val_loss: 0.2740 - val_acc: 0.8873


<keras.callbacks.History at 0x23d82d08b70>

In [21]:
# The model was compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy]; show the test accuracy as a percentage.
scores = cnn.evaluate(X_test, y_test, verbose=0)
scores[1]*100

88.727999999999994

### LSTM

In [22]:
from keras.layers import LSTM

In [23]:
# LSTM: word embedding -> 100-unit LSTM layer -> sigmoid output
lstm = Sequential([
    Embedding(top_words, 32, input_length=450),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [24]:
# Train for 3 epochs in batches of 64. verbose=2 matches the MLP and CNN
# training cells: one summary line per epoch (loss/accuracy on both splits)
# instead of a progress bar. NOTE: the test split is passed as the validation
# data, so val_acc is measured on the same samples that are used for the
# evaluation of this model afterwards.
lstm.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64, verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x23d85a6b780>

In [25]:
# The model was compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy]; show the test accuracy as a percentage.
# This rebinds `scores`, which held the CNN result until now.
scores = lstm.evaluate(X_test, y_test, verbose=0)
scores[1]*100

86.343999999999994

## Conclusions:

##### The multi-layer perceptron reached an accuracy of 87.24% on the test set, whereas the LSTM reached 86.34%.

##### The CNN with word embedding gave the best accuracy score, 88.73%.

##### The best accuracy of 88.73% is achieved by the CNN, although the LSTM and the multi-layer perceptron are not far behind.