In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import spacy 

### 1) Data preparation

In [2]:
word2index = tf.keras.datasets.imdb.get_word_index()
VOCAB_SIZE = len(word2index)

In [3]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data()

In [4]:
# #training exambles
print(X_train.shape)
# #testing set
print(X_test.shape)

(25000,)
(25000,)


In [5]:
print(len(X_train[0]))
print(len(X_train[1]))

218
189


In [6]:
idx2word = {v : k for k, v in word2index.items()}
print(len(idx2word))

88584


In [7]:
def text_reconstruction(tokens):
  text = []
  for token in tokens:
    text.append(idx2word[token])
  return " ".join(text)

In [8]:
text_reconstruction(X_train[0])

"the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room titillate it so heart shows to years of every never going villaronga help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but pratfalls to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other tricky in of seen over landed for anyone of gilmore's br show's to whether from than out themselves history he name half some br of 'n odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but wh

In [9]:
np.unique(y_train)

array([0, 1])

In [10]:
# Make padding according to 100 length for all sequences insted of the longest_sequence because it's too large
def padding(dataset, seq_length):
  padded = np.zeros((len(dataset), seq_length))
  for i, tokens in enumerate(dataset):
    padded[i, :len(tokens)] = tokens[:seq_length]
  return padded

In [11]:
X_train_padded = padding(X_train, 100)
X_test_padded  = padding(X_test, 100)
print(X_train_padded.shape, X_test_padded.shape)

(25000, 100) (25000, 100)


### 2) Build and train the model

In [12]:
# Build SimpleRNN model
model = tf.keras.models.Sequential([
                    tf.keras.layers.Embedding(VOCAB_SIZE, 64),
                    tf.keras.layers.SimpleRNN(64),
                    tf.keras.layers.Dense(64, activation='relu'),
                    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [13]:
model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
histoey = model.fit(X_train_padded, y_train, epochs=5, batch_size=128, validation_split=0.3)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
test_loss, test_acc = model.evaluate(X_test_padded, y_test)
print('Loss: ', test_loss)
print('Accuracy: ', test_acc)

Loss:  0.954551637172699
Accuracy:  0.7381200194358826


### Try Bidirectional RNN

In [15]:
model = tf.keras.models.Sequential([
                      tf.keras.layers.Embedding(VOCAB_SIZE, 64),
                      # return_sequences --> to return the result on each time stamp to be used by the next layer
                      tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(64, return_sequences=True)),
                      tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(64)),
                      tf.keras.layers.Dense(64, activation='relu'),
                      tf.keras.layers.Dense(1, activation='sigmoid')
])

In [16]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

In [17]:
history = model.fit(X_train_padded, y_train, epochs=5, batch_size=128,
                    validation_data=(X_test_padded, y_test), 
                    validation_steps=30)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
test_loss, test_acc = model.evaluate(X_test_padded, y_test)
print(test_loss, test_acc)

0.9103752970695496 0.7619199752807617


### Build GRU

In [19]:
Bidir_GRU = tf.keras.models.Sequential([
                        tf.keras.layers.Embedding(VOCAB_SIZE, 64),
                        tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)),
                        tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
                        tf.keras.layers.Dense(64, activation='relu'),
                        tf.keras.layers.Dense(1, activation='sigmoid')
])

In [20]:
Bidir_GRU.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=['accuracy'])
Bidir_GRU.fit(X_train_padded, y_train, epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa5da401ad0>

In [21]:
test_loss, test_acc = Bidir_GRU.evaluate(X_test_padded, y_test)



### LSTM
**For long sequences**

In [22]:
# Here we will not shrink text length,and will make padding according to the longest sequence in the set
longest_sequence = max([len(i) for i in X_train])
print(longest_sequence)
train = padding(X_train, longest_sequence)
test = padding(X_test, longest_sequence)
print(train.shape)
print(test.shape)

2494
(25000, 2494)
(25000, 2494)


In [23]:
Bidir_LSTM = tf.keras.models.Sequential([
                              tf.keras.layers.Embedding(VOCAB_SIZE, 64),
                              tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
                              tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
                              tf.keras.layers.Dense(64, activation='relu'),
                              tf.keras.layers.Dense(1, activation='sigmoid')
])

In [24]:
Bidir_LSTM.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

In [25]:
Bidir_LSTM.fit(X_train_padded, y_train, epochs=5,
                    validation_data=(X_test_padded, y_test), 
                    validation_steps=30)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa5d8cbd7d0>

In [26]:
test_loss, test_acc = Bidir_LSTM.evaluate(X_test_padded, y_test)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

Test Loss: 1.0926005840301514
Test Accuracy: 0.7802799940109253
