In [None]:
ls

aclImdb_v1.tar.gz            NLP_Assignment-1_Malhar.ipynb  train_data.csv
Assignment-3_Imdb_RNN.ipynb  NLP_Assignment-2.ipynb
[0m[01;34mdata[0m/                        test_data.csv


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.utils import shuffle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Pre-Processing the Data
Each file is read and its review is added to a dataframe.<br>
There are two dataframes - train_data and test_data - each holding 25000 reviews, with the first 12500 of them being negative reviews (that is the order in which they were appended).<br>
The dataframes were built in my local Python environment instead of Colab because Colab took roughly 10x longer than local Python did. The resulting CSV files were then uploaded to Colab and are read here.<br><br>
Below is the code for making the dataframe from individual files.

In [None]:
# Disabled one-off script: it is wrapped in a string literal, so none of it runs.
# It collects the review files under train/{neg,pos} and test/{neg,pos}
# (negative files are appended first), reads each one into a single-column
# 'review' dataframe, and writes train_data.csv / test_data.csv.
# NOTE(review): `glob` is not imported in the visible import cell, and the loops
# rely on DataFrame.append -- confirm both against the installed environment
# before re-enabling this code.
'''
train, test = [], []

for filename in glob.glob("train/neg/*.txt"):
  train.append(filename)
for filename in glob.glob("train/pos/*.txt"):
  train.append(filename)
for filename in glob.glob("test/neg/*.txt"):
  test.append(filename)
for filename in glob.glob("test/pos/*.txt"):
  test.append(filename)

train_data = pd.DataFrame(columns=['review'])
for i in range(len(train)):
  with open(train[i],'r',encoding='utf-8') as fil:
    data = fil.read()
  train_data = train_data.append({'review':data},ignore_index=True)
  if i%1000 == 0:
    print('Iteration:',i)
train_data.to_csv('train_data.csv',index=False)

test_data = pd.DataFrame(columns=['review'])
for j in range(len(test)):
  with open(test[j],'r',encoding='utf-8') as fil_1:
    data_1 = fil_1.read()
  test_data = test_data.append({'review':data_1},ignore_index=True)
  if j%1000 == 0:
    print('Iteration:',j)
test_data.to_csv('test_data.csv',index=False)
'''

Once the dataframes are ready, a new column 'target' is defined that indicates whether the review is negative (0) or positive (1). Each dataframe is then shuffled by rows because initially the first 12500 rows are all negative and the rest are all positive.<br><br>
Further, some regex code is written to clean up the review.<br><br>
And then NLTK stopwords list is used to remove the stopwords from reviews. A couple of stopwords are removed from the stopwords list since they might play a crucial role in classifying reviews.

In [None]:
# Load the pre-built review tables; the 'review' column is read further down.
train_data, test_data = pd.read_csv('train_data.csv'), pd.read_csv('test_data.csv')

# Rows are stored negatives-first: 12500 zeros followed by 12500 ones.
train_data['target'] = np.asarray([0] * 12500 + [1] * 12500)
test_data['target'] = np.asarray([0] * 12500 + [1] * 12500)

# Shuffle the rows (train first, then test) and renumber the index from 0.
train_data = shuffle(train_data).reset_index(drop=True)
test_data = shuffle(test_data).reset_index(drop=True)

# Pull the review texts out as plain lists and the labels as NumPy arrays.
train_list, y_train = list(train_data['review']), np.asarray(train_data['target'])
test_list, y_test = list(test_data['review']), np.asarray(test_data['target'])

In [None]:
# Regex expressions to clean the reviews taken from a blog
# Raw strings keep the regex escapes (\[, \], \s) away from Python's own
# string-escape processing, which warns on unknown escapes such as "\s".
# Punctuation that is deleted outright.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# Runs of one or more <br /> tags, hyphens and slashes become a single space.
# Matching a run (instead of exactly two tags) stops a lone <br /> from being
# left behind as the junk tokens "<br" and ">".
REPLACE_WITH_SPACE = re.compile(r"(?:<br\s*/>)+|-|/")

def clean(reviews):
    """Lower-case each review and strip punctuation / HTML line breaks.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        One cleaned string per input review, in the same order. Characters
        matched by REPLACE_NO_SPACE are removed; matches of REPLACE_WITH_SPACE
        are replaced by a single space.
    """
    without_punctuation = (REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews)
    return [REPLACE_WITH_SPACE.sub(" ", line) for line in without_punctuation]

# Normalise both splits with the same cleaning rules.
train_list, test_list = clean(train_list), clean(test_list)

In [None]:
def remove_stopwords(tokens, stopwords_list):
    """Return the tokens that are not stopwords, preserving their order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review.
    stopwords_list : iterable of str
        Words to drop.

    Returns
    -------
    list of str
        ``tokens`` without any element found in ``stopwords_list``.
    """
    # A set gives constant-time membership tests; the original scanned the
    # whole stopword list once for every token.
    stopword_set = set(stopwords_list)
    return [word for word in tokens if word not in stopword_set]

# Start from NLTK's English stopword list ...
stopwords_list = stopwords.words('english')
# ... and take 'not' and 'very' out of it, so those two words stay in the reviews.
for kept_word in ('not', 'very'):
    stopwords_list.remove(kept_word)

## Building vocabulary

In [None]:
# Build vocabulary from training data
# Build dictionary of all words in training data, sort it by its frequency and
# take top (say) 1000 or 2000 values to be the vocabulary.

# token -> number of occurrences across the stopword-filtered training reviews
word_dic = {}
# tokenised training reviews, one list of tokens per review
X_train = []

for review in train_list:
  tokens = remove_stopwords(review.split(), stopwords_list)
  X_train.append(tokens)
  for token in tokens:
    word_dic[token] = word_dic.get(token, 0) + 1

vocab_size = 1500
# Most frequent words first; the sort is stable, so ties keep first-seen order.
words_sorted = sorted(word_dic, key=word_dic.get, reverse=True)
vocab = words_sorted[:vocab_size]

# Word-to-index and index-to-word dictionaries built from the vocabulary for later use
word2idx = {word: position for position, word in enumerate(vocab)}
idx2word = dict(enumerate(vocab))

In [None]:
# Prepare the training and test dataset with reviews containing words only from the vocabulary.

# Membership is tested against the word2idx dict (constant time) instead of the
# vocab list, which was scanned linearly for every token of every review.
# word2idx has exactly the vocabulary words as keys, so the result is the same.
for i in range(len(X_train)):
  X_train[i] = [word2idx[word] for word in X_train[i] if word in word2idx]

# Test reviews are tokenised on whitespace and reduced to in-vocabulary word ids.
X_test = []
for review in test_list:
  tokens = review.split()
  X_test.append([word2idx[word] for word in tokens if word in word2idx])

In [None]:
# Function for encoding word indexes to one hot vectors that will be input for RNN

def one_hot_encode(idx, vocab_size):
  """Return a float vector of length ``vocab_size`` that is 1 at ``idx`` and 0 elsewhere."""
  encoded = np.zeros(vocab_size)
  encoded[idx] = 1
  return encoded

# Initially, the one-hot vectors for every review were created and kept here (in the
# environment) before being passed to the RNN.
# That used up all of Colab's RAM for vocab_size > 800.
# So instead, the one-hot vectors are now built one review at a time inside rnn_model()
# and predict(), which are defined a few cells below.
# Each call builds the one-hot matrix for one review, uses it, and then discards it.
# The block below is the old eager version, disabled by wrapping it in a string literal.
'''
for i in range(len(X_train)):
  X_train[i] = np.array([one_hot_encode(idx,vocab_size) for idx in X_train[i]])

for i in range(len(X_test)):
  X_test[i] = np.array([one_hot_encode(idx,vocab_size) for idx in X_test[i]])
'''
# Reviews have different lengths, so each split is stored as an object array
# whose elements are the per-review lists of word ids.
# dtype=object is required: without it, recent NumPy versions raise ValueError
# when asked to build an array from ragged nested lists.
X_train = np.asarray(X_train, dtype=object)
X_test = np.asarray(X_test, dtype=object)

## Numpy RNN

The RNN structure followed here is as follows : <br><br>
X - input (array of reviews) ; y - output (0 or 1)<br>
Hidden state - h<sub>t</sub><br>
h<sub>t</sub> = tanh(U.X<sub>t</sub> + V.h<sub>t-1</sub> + b1) <br>
O = W.h<sub>T</sub> + b2 <br>
y_hat = sigmoid(O)


In [None]:
# Sigmoid activation function to be used on output layer
def sigmoid(Z):
  """Logistic function 1 / (1 + e^(-Z)), applied element-wise to ``Z``."""
  denominator = 1 + np.exp(-Z)
  return 1 / denominator

In [None]:
# Initializing weights for one hidden layer RNN
# Initializing weights to random normal gave better results than initializing with zeros. (Bias excluded)
def init_weights(nh, vocab_size):
  """Create the initial parameters of the single-hidden-layer RNN.

  Returns the tuple (U, V, W, b1, b2) with shapes (nh, vocab_size), (nh, nh),
  (1, nh), (nh, 1) and (1, 1). The three weight matrices are drawn from a
  standard normal via the global NumPy random state (in the order U, V, W);
  both biases start at zero.
  """
  input_to_hidden = np.random.randn(nh, vocab_size)
  hidden_to_hidden = np.random.randn(nh, nh)
  hidden_to_output = np.random.randn(1, nh)
  hidden_bias, output_bias = np.zeros((nh, 1)), np.zeros((1, 1))
  return input_to_hidden, hidden_to_hidden, hidden_to_output, hidden_bias, output_bias

In [None]:
# Forward pass is carried out review by review.
# For a single review, hidden states (hts) are calculated for each word in the review by looping over it
# And finally class probabilities are calculated as sigmoid of
# last hidden states' value together with weight and bias.

def forward_pass(input_review,weights):
  """Run the RNN over one review and return the class probability.

  Parameters
  ----------
  input_review : sequence of 1-D arrays
      One vector per word; each is reshaped to a column before use. May be
      empty.
  weights : tuple
      (U, V, W, b1, b2) as produced by init_weights.

  Returns
  -------
  y_hat : ndarray of shape (1, 1)
      sigmoid(W . h_last + b2).
  hts : list of ndarray
      Hidden state after each word, in order (empty for an empty review).
  """
  U,V,W,b1,b2 = weights
  hts = []
  ht = np.zeros((V.shape[0], 1))
  for x_t in input_review:
    ht = np.tanh(np.dot(U,x_t.reshape(-1,1)) + np.dot(V,ht) + b1)
    hts.append(ht)
  # ht is always the most recent hidden state: hts[-1] for a non-empty review,
  # the zero initial state for an empty one. This replaces the former bare
  # try/except around hts[-1], which could also hide unrelated errors.
  O = np.dot(W,ht) + b2
  y_hat = sigmoid(O)
  return y_hat, hts

In [None]:
# Binary Crossentropy loss
def loss_fn(y,y_hat,eps=1e-15):
  """Binary cross-entropy between a label and a predicted probability.

  Parameters
  ----------
  y : int or array
      True label(s), 0 or 1.
  y_hat : float or array
      Predicted probability of class 1.
  eps : float, optional
      ``y_hat`` is clamped to [eps, 1 - eps] before taking logs. Without the
      clamp a saturated prediction (exactly 0 or 1) evaluates 0 * log(0) and
      yields NaN, or log(0) and yields inf.

  Returns
  -------
  The loss, with the same shape as the broadcast of ``y`` and ``y_hat``.
  """
  y_hat = np.clip(y_hat, eps, 1 - eps)
  loss = -(y*np.log(y_hat) + (1-y)*np.log(1-y_hat))
  return loss

In [None]:
# Function to clip gradients to stop them exploding.
# Implemented as described in the RNN class.
# Clip the gradient if gradient norm is greater than a set threshold value.
# Threshold value of 0.5 worked best

def clip(gradient,threshold=0.5):
  """Rescale ``gradient`` so that its Euclidean norm does not exceed ``threshold``.

  The norm is taken over all elements. If it is not greater than ``threshold``
  the input object is returned unchanged; otherwise a rescaled copy with norm
  ``threshold`` is returned.
  """
  norm = np.sqrt(np.sum(gradient**2))
  if not norm > threshold:
    return gradient
  scale = threshold / norm
  return scale * gradient

In [None]:
# Backpropagation through time for the network.
# Notation : dX represents the gradient of the loss with respect to parameter X.
# W and b2 are used only once, at the last time point, because this is a 'many to one' RNN,
# so their gradients are computed once, outside the loop.
# U, V and b1 are used at every time step, so their gradients are accumulated while
# walking backwards over the words of the review.
def backprop(input_review,y,y_hat,hts,weights):
  """Compute clipped gradients of the cross-entropy loss for one review.

  Parameters
  ----------
  input_review : sequence of 1-D arrays
      The per-word input vectors that were fed to forward_pass.
  y : int
      True label (0 or 1).
  y_hat : ndarray of shape (1, 1)
      Predicted probability returned by forward_pass.
  hts : list of ndarray
      Hidden states returned by forward_pass, one (nh, 1) column per word.
  weights : tuple
      (U, V, W, b1, b2).

  Returns
  -------
  tuple
      (dU, dV, dW, db1, db2), each passed through clip().

  Notes
  -----
  Step t pairs the pre-activation gradient with the input x_t and the previous
  hidden state h_{t-1}; for t = 0 the previous state is the zero vector that
  forward_pass starts from. Reviews with zero or one word are handled.
  """
  U,V,W,b1,b2 = weights
  dU,dV,db1 = np.zeros(U.shape), np.zeros(V.shape), np.zeros(b1.shape)
  # Hidden state that precedes the first word (forward_pass starts from zeros).
  h_init = np.zeros((V.shape[0], 1))

  # Output layer: with sigmoid + cross-entropy, dLoss/dO = y_hat - y.
  dO = y_hat - y
  h_last = hts[-1] if len(hts) > 0 else h_init
  dW = np.dot(dO,h_last.T)
  db2 = dO
  # Gradient flowing into the last hidden state.
  dht = np.dot(W.T,dO)

  for t in reversed(range(len(hts))):
    # Back through tanh: gradient w.r.t. the pre-activation of step t.
    da = dht * (1-hts[t]**2)
    h_prev = hts[t-1] if t > 0 else h_init
    db1 += da
    dV += np.dot(da,h_prev.T)
    dU += np.dot(da,input_review[t].reshape(-1,1).T)
    # Propagate to the hidden state of the previous step.
    dht = np.dot(V.T,da)

  return tuple(clip(gradient) for gradient in (dU,dV,dW,db1,db2))

In [None]:
def update_weights(weights, gradients, learning_rate):
  """Apply one gradient-descent step: parameter - learning_rate * gradient.

  ``weights`` is (U, V, W, b1, b2) and ``gradients`` is (dU, dV, dW, db1, db2);
  both must have exactly five entries. Returns the updated parameters as a new
  5-tuple in the same order.
  """
  U,V,W,b1,b2 = weights
  dU,dV,dW,db1,db2 = gradients
  parameter_gradient_pairs = ((U,dU), (V,dV), (W,dW), (b1,db1), (b2,db2))
  return tuple(param - learning_rate * grad for param, grad in parameter_gradient_pairs)

In [None]:
# The final RNN model
# Update of weights is made after each review
# The model returns the trained weights
def rnn_model(X,Y,nh,num_epochs,vocab_size,learning_rate=0.01):
  """Train the RNN with one weight update per review and return the weights.

  X is indexed by review (``X.shape[0]`` reviews, each a sequence of word ids)
  and Y holds the matching labels. ``nh`` is the hidden size, ``num_epochs``
  the number of passes over X, and ``vocab_size`` the one-hot width.
  Returns the trained (U, V, W, b1, b2) tuple.
  """
  weights = init_weights(nh,vocab_size)
  loss_history = []  # per-review losses; collected but not returned
  num_reviews = X.shape[0]
  for _ in range(num_epochs):
    for i in range(num_reviews):
      # One-hot rows are built per review and discarded after the update.
      one_hot_review = np.array([one_hot_encode(idx,vocab_size) for idx in X[i]])
      label = Y[i]
      y_hat, hts = forward_pass(one_hot_review,weights)
      loss_history.append(loss_fn(label,y_hat))
      gradients = backprop(one_hot_review,label,y_hat,hts,weights)
      weights = update_weights(weights,gradients,learning_rate)
  return weights

In [None]:
def predict(X,weights,learning_rate=0.01):
  """Predict a 0/1 label for every review in X.

  Parameters
  ----------
  X : array
      Indexed by review (``X.shape[0]`` reviews); each element is a sequence
      of word ids.
  weights : tuple
      Trained (U, V, W, b1, b2).
  learning_rate : float, optional
      Unused; kept so existing calls keep working.

  Returns
  -------
  ndarray
      1 where the predicted probability is at least 0.5, otherwise 0.
  """
  # The one-hot width must equal the number of columns of U, so it is read
  # from the weights instead of from the notebook-level vocab_size variable.
  one_hot_width = weights[0].shape[1]
  y_pred = []
  for i in range(X.shape[0]):
    x = np.array([one_hot_encode(idx,one_hot_width) for idx in X[i]])
    y_hat, _ = forward_pass(x, weights)
    y_pred.append(0 if y_hat < 0.5 else 1)
  return np.asarray(y_pred)

In [None]:
def accuracy(Y,Y_pred):
  """Fraction of positions where ``Y`` and ``Y_pred`` are equal."""
  matches = (Y == Y_pred)
  return np.mean(matches)

In [None]:
# Train with 3 hidden units for 20 epochs at learning rate 0.06, timing the whole run.
start_time = time.time()
weights = rnn_model(
    X_train, y_train,
    nh=3, num_epochs=20, vocab_size=vocab_size, learning_rate=0.06,
)
end_time = time.time()
print('It took the model %d seconds to run' % int(end_time - start_time))

It took the model 1998 seconds to run


In [None]:
# Score the trained weights on the held-out test reviews.
Y_pred = predict(X=X_test, weights=weights)
print('Testing accuracy : %.3f' % accuracy(Y=y_test, Y_pred=Y_pred))

Testing accuracy : 0.763


In [None]:
# Score the same weights on the training reviews (Y_pred is overwritten here).
Y_pred = predict(X=X_train, weights=weights)
print('Training accuracy : %.3f' % accuracy(Y=y_train, Y_pred=Y_pred))

Training accuracy : 0.777


So, the accuracy on test set is 76.3%.<br><br>
Different combinations of vocab_size, hidden layer size and learning rate were tried and this particular combination gave the best result.