#### RNN for Sequences; Sentiment Analysis with the IMDb Dataset

In [1]:
from tensorflow.keras.datasets import imdb

In [2]:
# Vocabulary size passed to imdb.load_data(num_words=...) and to the
# Embedding layer's input_dim further down.
number_of_words = 10_000

In [3]:
# Load the pre-encoded IMDb reviews (downloaded on first use). Per the Keras
# imdb.load_data docs, num_words keeps only the most frequent words.
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    num_words=number_of_words
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [4]:
# Shape check: one entry per training review.
X_train.shape

(25000,)

In [5]:
# Shape check: one label per training review.
y_train.shape

(25000,)

In [6]:
# Shape check: one entry per test review.
X_test.shape

(25000,)

In [7]:
# Shape check: one label per test review.
y_test.shape

(25000,)

In [8]:
# Turn IPython pretty printing OFF so long lists are shown as wrapped text
# rather than being laid out by the pretty printer.
# The %pprint magic is a toggle: re-running the cell would switch pretty
# printing back ON. Setting the formatter flag explicitly (the same flag
# %pprint flips) makes this cell idempotent under repeated execution.
get_ipython().display_formatter.formatters['text/plain'].pprint = False

Pretty printing has been turned OFF


In [9]:
# Inspect one encoded review: a list of integer word indices.
X_train[123]

[1, 307, 5, 1301, 20, 1026, 2511, 87, 2775, 52, 116, 5, 31, 7, 4, 91, 1220, 102, 13, 28, 110, 11, 6, 137, 13, 115, 219, 141, 35, 221, 956, 54, 13, 16, 11, 2714, 61, 322, 423, 12, 38, 76, 59, 1803, 72, 8, 2, 23, 5, 967, 12, 38, 85, 62, 358, 99]

In [10]:
# Dictionary mapping each word to its integer index (downloaded on first use).
word_to_index = imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [11]:
# Look up the index assigned to a single word.
word_to_index['great']

84

In [12]:
# Invert the word -> index dictionary so indices can be decoded back to words.
index_to_word = {idx: token for token, idx in word_to_index.items()}

In [13]:
# List the words assigned to indices 1 through 50.
[index_to_word[i] for i in range(1, 51)]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i', 'this', 'that', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'on', 'not', 'you', 'are', 'his', 'have', 'he', 'be', 'one', 'all', 'at', 'by', 'an', 'they', 'who', 'so', 'from', 'like', 'her', 'or', 'just', 'about', "it's", 'out', 'has', 'if', 'some', 'there', 'what', 'good', 'more']

In [14]:
# Decode review 123 back into text. Per the Keras imdb.load_data docs, the
# encoded reviews are offset by 3 relative to the word index (default
# index_from=3; 0, 1 and 2 are reserved for padding, start-of-sequence and
# out-of-vocabulary words), so subtract 3 before the lookup. Any value with no
# dictionary entry is rendered as '?'.
' '.join([index_to_word.get(i - 3, '?') for i in X_train[123]])

'? beautiful and touching movie rich colors great settings good acting and one of the most charming movies i have seen in a while i never saw such an interesting setting when i was in china my wife liked it so much she asked me to ? on and rate it so other would enjoy too'

In [15]:
# Sentiment label stored for the review decoded above.
y_train[123]

1

In [16]:
# Fixed sequence length: used as maxlen for pad_sequences and as the
# Embedding layer's input_length.
words_per_review = 200

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
# Force every training review to exactly words_per_review indices, turning the
# 1-D array of lists into a 2-D array. Per the Keras pad_sequences docs, the
# defaults pad and truncate at the start of each sequence.
X_train = pad_sequences(X_train, maxlen=words_per_review)

In [19]:
# Confirm the padded training data is now (reviews, words_per_review).
X_train.shape

(25000, 200)

In [20]:
# Apply the same fixed-length padding/truncation to the test reviews.
X_test = pad_sequences(X_test, maxlen=words_per_review)

In [21]:
# Confirm the padded test data is now (reviews, words_per_review).
X_test.shape

(25000, 200)

In [22]:
from sklearn.model_selection import train_test_split
# Carve a validation set out of the test data: 20% becomes X_val/y_val and the
# remaining 80% stays as X_test/y_test. random_state fixes the shuffle.
# NOTE(review): this cell overwrites its own inputs (X_test, y_test), so it is
# not idempotent. Re-running it without first re-running the load and padding
# cells splits the already-reduced test set again.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, random_state=11, test_size=0.20) 

In [23]:
# Size of the test set that remains after the split.
X_test.shape

(20000, 200)

In [24]:
# Size of the validation set produced by the split.
X_val.shape

(5000, 200)

In [25]:
from tensorflow.keras.models import Sequential

In [26]:
# Start from an empty Sequential model; the following cells add its layers.
rnn = Sequential()

In [27]:
from tensorflow.keras.layers import Dense, LSTM

In [28]:
from tensorflow.keras.layers import Embedding

In [29]:
# Embedding layer: maps each of the number_of_words word indices to a
# 128-element vector, for sequences of words_per_review indices.
# NOTE(review): `input_length` is deprecated in Keras 3; drop it if this
# notebook is moved to a newer Keras release.
rnn.add(
    Embedding(
        input_dim=number_of_words,
        output_dim=128,
        input_length=words_per_review,
    )
)

In [30]:
# Recurrent layer: 128 LSTM units with 20% dropout on the inputs and 20%
# dropout on the recurrent state.
# NOTE(review): per the Keras LSTM docs, a non-zero recurrent_dropout prevents
# use of the fast cuDNN kernel on GPU.
rnn.add(
    LSTM(
        units=128,
        dropout=0.2,
        recurrent_dropout=0.2,
    )
)

In [31]:
# Output layer: a single sigmoid unit, giving one value between 0 and 1 per
# review to match the binary (0/1) labels.
rnn.add(Dense(units=1, activation='sigmoid'))

In [32]:
# Configure training: Adam optimizer, binary cross-entropy loss for the
# two-class problem, and accuracy reported as the metric.
rnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [33]:
# Print the layer-by-layer output shapes and parameter counts.
rnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Train for 5 epochs in mini-batches of 32, reporting loss/accuracy on the
# validation set after each epoch. The call's return value (a History object)
# is the cell's displayed result.
rnn.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_val, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History object at 0x00000279F9FEBF10>

In [35]:
# Score the trained model on the held-out test set. With the compile settings
# above, the result is [loss, accuracy].
results = rnn.evaluate(X_test, y_test)



In [36]:
# Display the test-set [loss, accuracy] pair.
results

[0.4489673972129822, 0.8644499778747559]