In [1]:
import os
from  tqdm import tqdm
from collections import Counter
import nltk
from nltk.tokenize import RegexpTokenizer
import numpy as np
import re
from torchtext import vocab as vc
import itertools
import random

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
torch.manual_seed(0)


<torch._C.Generator at 0x10ebc00f0>

## Data Preparation

In [2]:
imdb_dir = '/Users/sampath/dlProjects/imdb_sent/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')
test_dir = os.path.join(imdb_dir, 'test')


## Functions for reading the train and test data

In [3]:
## Load the each text into texts list
## corresponding label will be in labels list. 
def read_test_train_dir(path,):
    labels = []
    texts = []
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(train_dir, label_type)
        for fname in os.listdir(dir_name):
            if fname[-4:] == '.txt':
                f = open(os.path.join(dir_name, fname))
                texts.append(f.read())
                f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
    return texts,labels

## Read the test and train data set

In [4]:
train_texts,train_labels = read_test_train_dir(train_dir)
test_texts, test_labels = read_test_train_dir(test_dir)

## Functions for tokenizing the text

In [5]:
def get_paragraph_words(text):
    return (flatten([word_tokenize(s) for s in sent_tokenize(text)]))

sent_tokenize = nltk.sent_tokenize
word_tokenize = RegexpTokenizer(r'\w+').tokenize

def word_tokenize_para(text):
    return [word_tokenize(s) for s in sent_tokenize(text)]

def flatten(l):
    return [item for sublist in l for item in sublist]



## Populate the vocabulary counter

In [6]:
vocab_counter = Counter(flatten([get_paragraph_words(text) for text in train_texts]))   

In [7]:
len(vocab_counter)

93929

## Load the glove word vectors  100 dimension. 

In [8]:
#get w2v populated for vocabulary
w2v = vc.Vocab(vocab_counter,max_size=20000,min_freq=3,vectors='glove.6B.100d')

## Random shuffle test and train data sets

In [9]:
# randomly shuffle the training data
training_set = list(zip(train_texts,train_labels))
#shuffle works inplace and returns None . 
random.shuffle(training_set)

# randomly shuffle the training data
testing_set  = list(zip(test_texts,test_labels))
#shuffle works inplace and returns None . 
random.shuffle(testing_set)


## Since average sentence length is 246 words, setting sequence length to 250

In [10]:
maxSeqLength = 250


## Functions to get w2v from review text

In [11]:
#function to get vocabular indices from text returns list of indices (cut-off at maxSeqLength)
def stoiForReview(w2v, text,maxSeqLength):
    #trim the sentence to maxSeqLength, otherwise return with original length. 
    return [w2v.stoi[word] for word in get_paragraph_words(text)[0:maxSeqLength]]

#function to get word vectors for review - returns tensor of size 1, min(len(review),maxSeqLength),embedded_dim
def wordVectorsForReview(w2v,text,maxSeqLength):
    indexes = stoiForReview(w2v, text,maxSeqLength)
    #returns tensor with size [num_words,1,embedding_dim]
    #That extra 1 dimension is because PyTorch assumes everything is in batches - we’re just using a batch size of 1 here.
    sent_word_vectors = torch.cat([w2v.vectors[i].view(1,-1) for i in indexes]).view(len(indexes),1,-1)
    
    #batch first (1,seq_len,embedding_dim)
    #seq_len has been maximized to maxSeqLength
    sent_word_vectors = sent_word_vectors.view(1,len(sent_word_vectors),-1)
    
    return sent_word_vectors 


### Function to get data from start index (str_idx) to end index (end_idx) from list of data points give in t_set
#### function returns two tensors - first packed_padded_sequence with input w2v and labels variable

In [12]:
#Create nn.Emedding for easy lookup
idx2vec = w2v.vectors;
embedding = nn.Embedding(idx2vec.shape[0],idx2vec.shape[1])
embedding.weight = nn.Parameter(idx2vec) 
embedding.weight.requires_grad = False

def get_batch2(t_set,str_idx,end_idx):
        training_batch_set = t_set[str_idx:end_idx]
        
        input_texts,labels = zip(*training_batch_set)
        labels = torch.LongTensor(labels)
        input_text_ids = [stoiForReview(w2v,text,maxSeqLength) for text in input_texts]
        seq_lens = torch.LongTensor([len(i) for i in input_text_ids])
        padded_input_text_ids = Variable(torch.LongTensor(len(seq_lens), seq_lens.max()).zero_())
        for idx,(seq,seqlen) in enumerate(zip(input_text_ids,seq_lens)):
            padded_input_text_ids[idx,:seqlen] = torch.LongTensor(seq)
            
        #sort according seq lengths
        seq_lens, perm_idx = seq_lens.sort(0, descending=True)
        padded_input_text_ids = padded_input_text_ids[perm_idx]
        labels = labels[perm_idx]
        
        embed = embedding(padded_input_text_ids)
        packed_input = pack_padded_sequence(embed, seq_lens.numpy(),batch_first=True)


        
        return(packed_input,labels)
        #assign each training vector to left 
        
        #print(seq_lens)

In [13]:
def get_batch(t_set,str_idx,end_idx):
        training_batch_set = t_set[str_idx:end_idx]
        
        input_texts,labels = zip(*training_batch_set)
        
        #convert texts to vectors shape - Batch(=1),seq_length(cut-off at maxSeqLength),embedded_dim
        input_vectors = [wordVectorsForReview(w2v,text,maxSeqLength) for text in input_texts]
        
        #convert to variable w/ long tensor
        labels = Variable(torch.LongTensor(labels))
        
        seq_lens = torch.LongTensor([i.shape[1] for i in input_vectors])
        embedding_dim = input_vectors[0].shape[2]
        #batch_inputs  - [batch_size, seq_len,embedding_dim]
        batch_inputs = Variable(torch.zeros((len(seq_lens), seq_lens.max(),embedding_dim)))
        for idx,(seq,seqlen) in enumerate(zip(input_vectors,seq_lens)):
            batch_inputs[idx,:seqlen] = seq
        seq_lens, perm_idx = seq_lens.sort(0, descending=True)
        batch_inputs = batch_inputs[perm_idx]
        batch_inputs = pack_padded_sequence(batch_inputs, seq_lens.numpy(),batch_first=True)
        labels = labels[perm_idx]
        return(batch_inputs,labels)
        #assign each training vector to left 
        
        #print(seq_lens)

In [14]:
def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)

### RNN1 Model 

### Model -> GRU (context dimension - 64) + Logistic  + Softmax

In [15]:
class RNN1(nn.Module):
    def __init__(self, input_dim,context_dim,num_classes):
        super(RNN1,self).__init__()        
        self.context_dim = context_dim;
        # set pretrained glove embeddings for use. 
        #freeze the embeddin

        self.gru = nn.GRU(input_dim,context_dim,1,bias=True,batch_first=True)
        self.linear = nn.Linear(context_dim,num_classes);
    def forward(self,input,hidden):
        #use given hidden for initial_hidden_states
        all_h, last_h = self.gru(input,hidden);
        #since we have only 1 layer and 1 direction
        output = self.linear(last_h[0]);
        #return the last_h to re-feed for next batch
        return output,last_h;
    
    def init_hidden(self, batch_size):
        return Variable(torch.zeros(1,batch_size, self.context_dim))

### RNN2 Model
#### Bidrectional GRU

In [16]:
class RNN2(nn.Module):
    def __init__(self, input_dim,context_dim,num_classes):
        super(RNN2,self).__init__()        
        self.context_dim = context_dim;
        self.gru = nn.GRU(input_dim,context_dim,num_layers=1,bias=True,batch_first=True,dropout=0,bidirectional=True)
        # since we are using 2 directions
        self.linear = nn.Linear(2*context_dim,num_classes);
   
    def forward(self,input,hidden):
        #we dont need to initialize explicitly - 
        #h0 = Variable(torch.zeros(1,input.size(0),self.context_dim))
        all_h, last_h = self.gru(input,hidden);
        #last_h shape is 2,batch_size,context_dim (2 is for 2 directions)
        concated_h = torch.cat([last_h[0],last_h[1]],1)
        output = self.linear(concated_h);
        return output,last_h;
    
    def init_hidden(self, batch_size):
        #since we are using bi-directional use 2 layers. 
        return Variable(torch.zeros(2,batch_size, self.context_dim))
    

In [17]:
learning_rate = 0.001
batch_size = 50
#num_passes = 200//batch_size
num_passes = 25000//batch_size # number of batches with given batch_size
num_epochs = 50 #number of times we will go through all the training samples

In [18]:
input_dim = 100 # embedding dimension
context_dim = 50
num_classes = 2

criterion = nn.CrossEntropyLoss()  
model =  RNN2(input_dim,context_dim,num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

In [19]:
# Train the Model
model.train()
for epoch in range(num_epochs):
    #re-initialize after 
   # random.shuffle(training_set)
    hidden = model.init_hidden(batch_size)
    # reinitialize hidden layers to zero after each epoch
    for i in range(num_passes):
        str_idx = i * batch_size
        end_idx = (i+1) * batch_size
        inputs,labels = get_batch(training_set,str_idx,end_idx)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        #hidden = repackage_hidden(hidden)
        hidden.detach_()
        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs,hidden = model(inputs,hidden)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print ('pass [%d/%d], in epoch [%d/%d] Loss: %.4f' 
                   %(i+1, num_passes,epoch, num_epochs, loss.data[0]))

pass [100/500], in epoch [0/50] Loss: 0.6398
pass [200/500], in epoch [0/50] Loss: 0.5498
pass [300/500], in epoch [0/50] Loss: 0.4605
pass [400/500], in epoch [0/50] Loss: 0.4980
pass [500/500], in epoch [0/50] Loss: 0.4671
pass [100/500], in epoch [1/50] Loss: 0.3123
pass [200/500], in epoch [1/50] Loss: 0.4207
pass [300/500], in epoch [1/50] Loss: 0.4152
pass [400/500], in epoch [1/50] Loss: 0.4363
pass [500/500], in epoch [1/50] Loss: 0.4566
pass [100/500], in epoch [2/50] Loss: 0.2608
pass [200/500], in epoch [2/50] Loss: 0.3935
pass [300/500], in epoch [2/50] Loss: 0.3881
pass [400/500], in epoch [2/50] Loss: 0.3870
pass [500/500], in epoch [2/50] Loss: 0.4444
pass [100/500], in epoch [3/50] Loss: 0.2278
pass [200/500], in epoch [3/50] Loss: 0.3733
pass [300/500], in epoch [3/50] Loss: 0.3540
pass [400/500], in epoch [3/50] Loss: 0.3479
pass [500/500], in epoch [3/50] Loss: 0.4270
pass [100/500], in epoch [4/50] Loss: 0.2074
pass [200/500], in epoch [4/50] Loss: 0.3485
pass [300/

KeyboardInterrupt: 

# Training Accuracy

In [21]:
## Test the Model on training data
model.eval()
correct = 0
total = 0
testing_inputs,testing_labels = get_batch(training_set,0,25000)
hidden = model.init_hidden(25000)


outputs,hidden = model(testing_inputs,hidden)
_, predicted = torch.max(outputs.data, 1)
total = testing_labels.size(0)
correct = (predicted == testing_labels.data).sum()


print('Accuracy of the network on the  training data : %d %%' % (100 * correct / total))



Accuracy of the network on the  training data : 93 %


## Testing Accuracy

In [None]:
## Test the Model on training data
model.eval()
correct = 0
total = 0
testing_inputs,testing_labels = get_batch(testing_set,0,25000)
hidden = model.init_hidden(25000)


outputs,hidden = model(testing_inputs,hidden)
_, predicted = torch.max(outputs.data, 1)
total = testing_labels.size(0)
correct = (predicted == testing_labels.data).sum()


print('Accuracy of the network on the  test data: %d %%' % (100 * correct / total))

