# IMDB

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every Python warning for the rest of the session so library
# deprecation notices do not clutter the cell outputs.
# NOTE(review): this also hides warnings that may matter; narrow it with a
# `category=` argument if a specific warning needs to stay visible.
warnings.filterwarnings('ignore')


In [17]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence


Load the IMDB dataset, restricting the vocabulary to the 5,000 most frequent words.
The dataset comes pre-split into train (50%) and test (50%) sets of 25,000 reviews each.

In [4]:
# Load the reviews with the vocabulary capped at the 5000 most frequent words.
# Rarer words are not dropped: the loader substitutes its out-of-vocabulary
# index for them (`oov_char`, 2 by default), so review lengths are unchanged.
top_words = 5000
train_split, test_split = imdb.load_data(num_words=top_words)
X_train, y_train = train_split
X_test, y_test = test_split

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [6]:
# Sanity check: show the shape of each split, features first, then labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(25000,)
(25000,)
(25000,)
(25000,)


In [7]:
# Force every review to exactly `max_review_len` tokens so the batches are
# rectangular: longer reviews are truncated and shorter ones are zero-padded
# (with the Keras defaults both happen at the start of the sequence).
max_review_len = 500
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_len)
    for reviews in (X_train, X_test)
)

Compile and fit the LSTM model:
- The first layer is an Embedding layer (vectors of length 32)
- The second layer is an LSTM layer (100 units)
- The output layer is a Dense layer with a single neuron (one output is enough for binary classification)
- Sigmoid activation function on the output (since it's a binary classification problem)
- Since this is a binary classification problem, the loss function is binary cross-entropy
- Adam optimization algorithm

In [13]:
# Baseline model: 32-dimensional word embeddings -> LSTM with 100 units ->
# a single sigmoid unit for the binary label.
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_len))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

# NOTE(review): the test split is passed as validation data, so the per-epoch
# validation metrics are test-set metrics, not an independent hold-out.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x134a9174f08>

In [15]:
# Score the trained baseline model on the test split.
# evaluate() returns the loss first, followed by the compiled metrics, so
# index 1 is the value reported below as accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

Accuracy: 87.54%


# Using Dropout 

We can add Dropout layers between the Embedding and LSTM layers, and between the LSTM and Dense output layers.

In [19]:
# Same architecture as the baseline, with a 20% Dropout layer after the
# embedding and another after the LSTM. This cell only builds and compiles
# the model; it does not train it.
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_len))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()


Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
dropout_3 (Dropout)          (None, 500, 32)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
# Train the current `model` for 3 epochs in mini-batches of 64; no validation
# data is passed here, so only training metrics are reported.
model.fit(x=X_train, y=y_train, batch_size=64, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x134a9116788>

In [21]:
# Score the current `model` on the test split.
# evaluate() returns the loss first, followed by the compiled metrics, so
# index 1 is the value reported below as accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy score %.2f%%" % accuracy_pct)

Accuracy score 86.68%


Keras LSTM layers also accept dropout parameters directly: `dropout` configures dropout on the layer's inputs,
and `recurrent_dropout` configures dropout on the recurrent connections.

In [23]:
# Variant that moves dropout inside the LSTM layer instead of using separate
# Dropout layers: `dropout` and `recurrent_dropout` are both set to 0.2.
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_len))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

# Train for 3 epochs in mini-batches of 64 (no validation data passed).
model.fit(X_train, y_train, epochs=3, batch_size=64)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x134a3b11f08>

In [24]:
# Score the current `model` on the test split.
# evaluate() returns the loss first, followed by the compiled metrics, so
# index 1 is the value reported below as accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy score %.2f%%" % accuracy_pct)

Accuracy score 84.17%
