<a href="https://colab.research.google.com/github/nrjcs/iitpbse/blob/master/dl/day4/bse_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Dataset**



> Large Movie Review Dataset or IMDB Dataset

> A dataset for binary sentiment classification

> A set of 25,000 highly polar movie reviews for training, and 25,000 for testing

> Labeled by sentiment (positive/negative)

> Additional details @ https://ai.stanford.edu/%7Eamaas/data/sentiment/


In [None]:
from keras.datasets import imdb  # IMDB reviews bundled with Keras ... details @ https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/load_data

# How the reviews are encoded
#   - the reviews are already preprocessed: each one is a list of integer word indexes (a vector)
#   - indexes follow overall word frequency in the dataset: smaller index => more frequent word
#   - with load_data's documented defaults a few low integers are reserved instead of naming words:
#       0 = padding, 1 = start-of-review marker, 2 = out-of-vocabulary word
#     and the indexes of real words are shifted up by index_from (3 by default)

# Arguments of imdb.load_data() used below
#   num_words => integer or None. Only the num_words most frequent words are kept;
#                None (the default) keeps every word
#   skip_top  => skip the top N most frequently occurring words (which may not be informative)
#   words removed by either setting appear in the data as the out-of-vocabulary index

num_words = 5000  # vocabulary size; reused later as the Embedding input dimension
skip_top = 5      # ignore the five most frequent words

train_split, test_split = imdb.load_data(num_words=num_words, skip_top=skip_top)
X_train, Y_train = train_split
X_test, Y_test = test_split

print("....Done....")

In [None]:
import numpy as np

# about dataset: shapes of the review and label arrays for both splits
for split in (X_train, Y_train, X_test, Y_test):
    print(split.shape)

# one encoded review, its shape, and the first ten training labels
first_review = X_train[0]
print(first_review)
print(np.shape(first_review))
print(Y_train[0:10])

print("....Done....")

In [None]:
# encoded training reviews as they stand at this point in the notebook
print(X_train)

In [None]:
from keras.preprocessing import sequence

# Reviews can be of any length, but Keras needs every input vector to have the same length.
max_len = 500  # maximum length kept for each review

# sequence.pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0)
#   details @ https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences
#   - pads sequences to the same length
#   - turns a list (of length num_samples) of integer sequences into a 2D Numpy array
#     of shape (num_samples, num_timesteps)
#   - num_timesteps is maxlen when it is provided, otherwise the length of the longest sequence
#   - padding / truncating accept 'pre' or 'post'; the defaults are used here
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_len) for reviews in (X_train, X_test)
)

# shapes after padding
for split in (X_train, Y_train, X_test):
    print(split.shape)

# the first review again, now padded, plus the first ten labels
first_review = X_train[0]
print(first_review)
print(np.shape(first_review))
print(Y_train[0:10])

print("....Done....")

In [None]:
# training reviews as they stand at this point in the notebook (after the padding cell)
print(X_train)

In [13]:
# define model: building blocks used by the networks below
from keras.models import Sequential
# Embedding is taken from the public keras.layers namespace; the
# keras.layers.embeddings submodule is an internal path that newer Keras
# releases no longer provide, so importing from it breaks on upgrade.
from keras.layers import Dense, Embedding, LSTM, SimpleRNN


In [28]:
# Word embeddings and the Keras Embedding layer
#   bag-of-words model => sparse representation of the words in a text
#   word embeddings    => dense representation: every word becomes a dense vector,
#                         and words with the same meaning are likely to get similar vectors

# The Embedding layer turns positive integers (indexes) into dense vectors of fixed size,
#   e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
# It can only be used as the first layer in a model ... details @ https://keras.io/api/layers/core_layers/embedding/

# tf.keras.layers.Embedding(input_dim, output_dim, embeddings_initializer="uniform", ..., input_length=None)
#   input_dim    : integer, size of the vocabulary, i.e. maximum integer index + 1
#   output_dim   : integer, dimension of the dense embedding
#   input_length : length of the input sequences

embed_vc_len = 100  # length of each word vector

# start an empty layer stack and put the embedding lookup first
model = Sequential()
model.add(
    Embedding(input_dim=num_words, output_dim=embed_vc_len, input_length=max_len)
)


In [29]:

# Layers stacked on top of the embedding, in order:
#   SimpleRNN with 16 units ... details @ https://keras.io/api/layers/recurrent_layers/simple_rnn/
#   Dense with a single sigmoid neuron, because this is a two-class classification problem
recurrent_layer = SimpleRNN(units=16)
output_layer = Dense(units=1, activation='sigmoid')

for layer in (recurrent_layer, output_layer):
    model.add(layer)



In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# training settings (the alternative models further down reuse these two names)
batch_size = 200
no_epochs = 5

# compile model: binary cross-entropy loss, Adam optimizer, accuracy reported as the metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# train model
# NOTE(review): the test split is also passed as validation data, so the validation
# numbers reported during training are computed on the same data used for the final evaluation.
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=no_epochs,
    validation_data=(X_test, Y_test),
)

In [None]:
# about training: names of the quantities recorded in the history object returned by fit()
history.history.keys()

In [None]:
# evaluate model on the test split; index 1 of the result is read as the accuracy
# (the model was compiled with metrics=['accuracy'])
scores = model.evaluate(X_test, Y_test)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
# Training accuracy per epoch
import matplotlib.pyplot as plt  # plotting library

# explicit figure/axes objects instead of the implicit pyplot state machine
accuracy_fig, accuracy_ax = plt.subplots()
accuracy_ax.plot(history.history['accuracy'], 'r')  # red line
accuracy_ax.set_title('model accuracy')
accuracy_ax.set_ylabel('accuracy')
accuracy_ax.set_xlabel('epoch')
accuracy_ax.legend(['training'], loc='center right')
plt.show()

In [None]:
# Training loss per epoch
# (relies on matplotlib.pyplot already being imported as plt by an earlier cell)
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history['loss'], 'g')  # green line
loss_ax.set_title('model loss')
loss_ax.set_ylabel('loss')
loss_ax.set_xlabel('epoch')
loss_ax.legend(['training'], loc='upper right')
plt.show()

In [None]:
############################
# Alternative: an LSTM with 16 units in place of the SimpleRNN
# LSTM layer in Keras .... details @ https://keras.io/api/layers/recurrent_layers/lstm/
# signature: LSTM(units, activation="tanh", recurrent_activation="sigmoid", ...)
model = Sequential([
    Embedding(num_words, embed_vc_len, input_length=max_len),
    LSTM(16),  # n cells ... dimensionality of the output space
    Dense(1, activation='sigmoid'),
])

# compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# train model (same epochs / batch size as the first model)
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=no_epochs,
    validation_data=(X_test, Y_test),
)

# evaluate model
scores = model.evaluate(X_test, Y_test)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
############################
# Alternative: a SimpleRNN with 128 units
# SimpleRNN layer details @ https://keras.io/api/layers/recurrent_layers/simple_rnn/
model = Sequential()
for layer in (
    Embedding(num_words, embed_vc_len, input_length=max_len),
    SimpleRNN(128),
    Dense(1, activation='sigmoid'),
):
    model.add(layer)

# compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# train model
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=no_epochs,
    validation_data=(X_test, Y_test),
)

# evaluate model
scores = model.evaluate(X_test, Y_test)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
############################
# Alternative: an LSTM with 128 units
model = Sequential([
    Embedding(num_words, embed_vc_len, input_length=max_len),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

# compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# train model
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=no_epochs,
    validation_data=(X_test, Y_test),
)

# evaluate model
scores = model.evaluate(X_test, Y_test)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)


In [None]:
############################
# Alternative: two stacked LSTM layers of 16 units each
model = Sequential()
for layer in (
    Embedding(num_words, embed_vc_len, input_length=max_len),
    LSTM(16, return_sequences=True),  # return the full sequence so the next LSTM gets one input per timestep
    LSTM(16),
    Dense(1, activation='sigmoid'),
):
    model.add(layer)

# compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# train model
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=no_epochs,
    validation_data=(X_test, Y_test),
)

# evaluate model
scores = model.evaluate(X_test, Y_test)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
############################
from keras.layers import Dropout

# Alternative: two stacked LSTM layers of 64 units, with Dropout(0.2) after the
# embedding and after each LSTM
model = Sequential([
    Embedding(num_words, embed_vc_len, input_length=max_len),
    Dropout(0.2),
    LSTM(64, return_sequences=True),  # full sequence output feeds the second LSTM
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# train model
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=no_epochs,
    validation_data=(X_test, Y_test),
)

# evaluate model
scores = model.evaluate(X_test, Y_test)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)
