# Recurrent Networks for Textual Data Processing <br>
Loading the necessary packages <br>
Loading the data (IMDB dataset)<br>
Development of a Vanilla RNN model<br>
Development of a Bidirectional LSTM RNN model<br>

In [1]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.layers import SimpleRNN, Dense, Activation, Embedding, LSTM, Bidirectional
from keras.utils import pad_sequences
from keras.datasets import imdb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Loading the data (IMDB)

Reference: [`tf.keras.datasets.imdb.load_data` documentation](https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/load_data)






In [2]:
# Load the IMDB reviews as lists of integer word ids, with every option
# spelled out explicitly (see the Keras `imdb.load_data` documentation).
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(
    path="imdb.npz",
    num_words=None,   # no vocabulary cap at this stage
    skip_top=0,       # do not drop the most frequent words
    maxlen=None,      # no length filter on the reviews
    seed=113,
    start_char=1,     # id that marks the beginning of a review
    oov_char=2,       # id used for out-of-vocabulary words
    index_from=3,     # real word ids are shifted by this offset
)

In [3]:
# Report the shape of every split, then the label values found in training.
for label, split in (
    ("X_train: ", X_train),
    ("Y_train: ", Y_train),
    ("X_test: ", X_test),
    ("Y_test: ", Y_test),
):
    print(label, split.shape)

print("Existing classes: ", np.unique(Y_train))

X_train:  (25000,)
Y_train:  (25000,)
X_test:  (25000,)
Y_test:  (25000,)
Existing classes:  [0 1]


In [4]:
# Count how many training reviews carry each label.
unique, cc = np.unique(Y_train, return_counts=True)
# .tolist() converts the NumPy scalars to plain Python ints so the dict prints
# as {0: ..., 1: ...} on every NumPy version (NumPy >= 2 would otherwise show
# scalar wrappers such as np.int64(0) in the output).
print("Class distribution (training): ", dict(zip(unique.tolist(), cc.tolist())))

Class distribution (training):  {0: 12500, 1: 12500}


In [5]:
# Count how many test reviews carry each label.
unique, cc = np.unique(Y_test, return_counts=True)
# .tolist() converts the NumPy scalars to plain Python ints so the dict prints
# as {0: ..., 1: ...} on every NumPy version (NumPy >= 2 would otherwise show
# scalar wrappers such as np.int64(0) in the output).
print("Class distribution (test): ", dict(zip(unique.tolist(), cc.tolist())))

Class distribution (test):  {0: 12500, 1: 12500}


# Examples of Reviews (Sentences)

In [6]:
# Obtaining the word index: a dictionary mapping word --> id
# (its items are (word, id) pairs, not id --> word)
dictionary = imdb.get_word_index() 
# Vocabulary size = number of distinct words in the index
num_words = len(dictionary) 
print("Total number of words in the dictionary: ", num_words)

Total number of words in the dictionary:  88584


In [7]:
# Some words (10 most frequent): those whose id is between 1 and 10.
# They are printed in the dictionary's own iteration order, not sorted by id.
for word, word_id in dictionary.items():
    # Named `word_id` rather than `id` so the builtin id() is not shadowed
    # for the rest of the kernel session.
    if word_id <= 10:
        print(word_id, word)

10 i
9 it
6 is
8 in
4 of
3 a
7 br
1 the
2 and
5 to


In [8]:
def Decode(idx = 1, sequences = None, word_index = None):
    """Turn one integer-encoded review back into a space-separated sentence.

    Parameters
    ----------
    idx : int, default 1
        Position of the review inside ``sequences``.
    sequences : sequence of integer sequences, optional
        Encoded reviews to read from. When omitted, the notebook-level
        ``X_train`` is used as it exists at call time (so the result changes
        after ``X_train`` is reloaded or padded).
    word_index : dict, optional
        Mapping word -> id. When omitted, the notebook-level ``dictionary``
        is used.

    Returns
    -------
    str
        The decoded words joined by single spaces. Every id that has no entry
        in the word index after removing the offset of 3 is rendered as "!".
    """
    # Resolve the defaults lazily so the function keeps working exactly as
    # before when called as Decode(idx), while allowing explicit inputs.
    if sequences is None:
        sequences = X_train
    if word_index is None:
        word_index = dictionary

    # Invert word -> id into id -> word (avoids shadowing the builtin `id`).
    id_to_word = {word_id: word for word, word_id in word_index.items()}

    # Encoded ids are shifted by 3 relative to the word index, hence `- 3`.
    return " ".join(id_to_word.get(token - 3, "!") for token in sequences[idx])

In [9]:
# Show one training review as raw ids and as decoded text
idx = 5
sentence = Decode(idx)
encoded_review = X_train[idx]

# The ids are shifted back by 3 (the offset of the initial encoding)
print("Ids: ", np.asarray(encoded_review) - 3)
print("Sentence: ", sentence)
print("Length: ", len(encoded_review))
print("Class: ", Y_train[idx])

Ids:  [   -2   775   125    71     9   627   160    12     1  1763  7979  1048
 43219    29    82   153    42    37   145   136   118   661   662     7
     7  1358   170     1   746 86585    13  3801     5     1   223    62
     9    40   124    21 15341     7     7]
Sentence:  ! begins better than it ends funny that the russian submarine crew outperforms all other actors it's like those scenes where documentary shots br br spoiler part the message dechifered was contrary to the whole story it just does not mesh br br
Length:  43
Class:  0


# Preprocessing
Padding, dictionary reduction, and embedding dimension

In [10]:
# Preprocessing hyperparameters
num_words = 5000   # reduced dictionary size -> fewer model parameters
maxlen = 100       # maximum sentence length used for padding
embedding = 16     # embedding dimension

# Reload the dataset restricted to the reduced dictionary
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=num_words)

In [11]:
# Bring every review in both splits to the maximum sentence length (maxlen)
X_train, X_test = (
    pad_sequences(split, maxlen=maxlen) for split in (X_train, X_test)
)

# Words outside the dictionary (absent | >num_words)
# are replaced by oov_char (2)
print(X_train[5])
decoded_review = Decode(5)

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    1  778  128   74   12  630  163   15    4 1766    2 1051    2
   32   85  156   45   40  148  139  121  664  665   10   10 1361  173
    4  749    2   16 3804    8    4  226   65   12   43  127   24    2
   10   10]
