### Recurrent Neural Networks with IMDb Dataset
Build an RNN for binary text classification: predict whether an IMDb movie review is positive (rating ≥ 7 out of 10) or negative (rating ≤ 4 out of 10).

In [1]:
# Imports
import numpy as np
from tensorflow.keras.datasets import imdb

#### Preprocess data

In [2]:
# Vocabulary cap: only the 20,000 most frequent words keep their own index.
# No review is dropped; per the Keras docs, rarer words are replaced by the
# out-of-vocabulary index inside each review.
num_words = 20000

# Fixed length (in tokens) that every review is brought to before it is fed to
# the RNN, so that all inputs share the same shape.
max_len = 100

# load_data returns ((train reviews, train labels), (test reviews, test labels)).
train_split, test_split = imdb.load_data(num_words=num_words)
X_train, y_train = train_split
X_test, y_test = test_split

In [3]:
# Bring every review to exactly `max_len` tokens: shorter reviews are padded,
# longer ones are truncated, so both splits become rectangular arrays.
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train, X_test = (
    pad_sequences(reviews, maxlen=max_len) for reviews in (X_train, X_test)
)

#### Defining the RNN

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [5]:
rnn = Sequential()

# Embedding layer: looks up a trainable dense vector for every integer word
# index in the input sequence (the reviews arrive already integer-encoded; the
# layer does not assign the integers itself).
#   input_dim   - vocabulary size, i.e. number of distinct word indices
#   output_dim  - length of each word vector (embedding size)
#   input_shape - shape of one padded review: (sequence length,)
rnn.add(
    Embedding(
        input_dim=num_words,
        output_dim=128,
        input_shape=(X_train.shape[1],),
    )
)

In [6]:
# Recurrent layer: 128 LSTM units. Only the final output of the sequence is
# passed on, which is why the model summary reports shape (None, 128).
rnn.add(LSTM(units=128, activation='tanh'))

# Output layer: a single sigmoid unit producing the probability of the
# positive class.
# NOTE: nothing may follow this layer. If dropout is wanted for regularisation,
# place it before this Dense layer (or use the LSTM's `dropout=` argument);
# applied after the sigmoid it would perturb the predicted probability itself
# during training.
rnn.add(Dense(units=1, activation='sigmoid'))

In [7]:
# The target is binary, so the sigmoid output is trained with binary
# cross-entropy; accuracy is tracked as the reporting metric.
rnn.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# Print the layer-by-layer architecture and parameter counts.
rnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 128)          2560000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


#### Training the RNN

In [8]:
# Train on the full training split: 20 passes over the data, 128 reviews per
# gradient step. No validation data is supplied, so only training metrics are
# reported per epoch.
rnn.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=20,
)

Train on 25000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1740cda0f60>

In [9]:
# Score the trained model on the held-out test split. evaluate() returns the
# loss followed by each compiled metric (here: accuracy).
test_loss, test_accuracy = rnn.evaluate(x=X_test, y=y_test)

print(f'Test accuracy: {test_accuracy}')

Test accuracy: 0.8118000030517578
