In [195]:
import os
from  tqdm import tqdm
from collections import Counter
import nltk
from nltk.tokenize import RegexpTokenizer
import numpy as np
import re
from torchtext import vocab as vc
import itertools
import random

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
torch.manual_seed(0)


<torch._C.Generator at 0x1154310f0>

### Data Preparation

In [43]:
imdb_dir = '/Users/sampath/dlProjects/imdb_sent/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')
test_dir = os.path.join(imdb_dir, 'test')


In [6]:
## Load the each text into texts list
## corresponding label will be in labels list. 
def read_test_train_dir(path,):
    labels = []
    texts = []
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(train_dir, label_type)
        for fname in os.listdir(dir_name):
            if fname[-4:] == '.txt':
                f = open(os.path.join(dir_name, fname))
                texts.append(f.read())
                f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
    return texts,labels

In [7]:
train_texts,train_labels = read_test_train_dir(train_dir)
test_texts, test_labels = read_test_train_dir(test_dir)

In [8]:
def get_paragraph_words(text):
    return (flatten([word_tokenize(s) for s in sent_tokenize(text)]))

sent_tokenize = nltk.sent_tokenize
word_tokenize = RegexpTokenizer(r'\w+').tokenize

def word_tokenize_para(text):
    return [word_tokenize(s) for s in sent_tokenize(text)]

def flatten(l):
    return [item for sublist in l for item in sublist]



In [9]:
#Create vocabulary
vocab_counter = Counter(flatten([get_paragraph_words(text) for text in train_texts]))   

#get w2v populated for vocabulary
w2v = vc.Vocab(vocab_counter,max_size=50000,min_freq=5,vectors='glove.6B.50d')

In [100]:
# randomly shuffle the training data
train_comb = list(zip(train_texts,train_labels))
random.shuffle(train_comb)
train_texts2,train_labels2 = zip(*train_comb)
maxSeqLength = 250

In [104]:
def indexesFromSentence(w2v, text,maxSeqLength):
    #trim the sentence to maxSeqLength, otherwise return with original length. 
    return [w2v.stoi[word] for word in get_paragraph_words(text)[0:maxSeqLength]]

# pads a tensor of shape [num_words,1,embeddings_dim] to a maximum sentence length given by length argument. 
# it takes a tensor, keeps adding zero rows till length
def pad(tensor, length):
        return torch.cat([tensor, tensor.new(length - tensor.size(0), *tensor.size()[1:]).zero_()])

def variableFromSentence(w2v,text,maxSeqLength):
    indexes = indexesFromSentence(w2v, text,maxSeqLength)
    #returns tensor with size [num_words,1,embedding_dim]
    #That extra 1 dimension is because PyTorch assumes everything is in batches - we’re just using a batch size of 1 here.
    sent_word_vectors = torch.cat([w2v.vectors[i].view(1,-1) for i in indexes]).view(len(indexes),1,-1)
    
    #pad 500 , output is 500,1,embedding_dim
    #sent_word_vectors = pad(sent_word_vectors,maxSeqLength)
    
    #batch first (1,500,embedding_dim)
    sent_word_vectors = sent_word_vectors.view(1,len(sent_word_vectors),-1)
    
    return sent_word_vectors 

def variablesFromPair(pair):
    input_variable = variableFromSentence(w2v, pair[0])
    target_variable = pair[1]
    return (input_variable, target_variable)

In [105]:
train_vectors = [variableFromSentence(w2v,text,maxSeqLength) for text in train_texts2]
train_targets = train_labels2



In [147]:
train_vectors[2].shape

torch.Size([1, 195, 50])

In [116]:
len(get_paragraph_words(train_texts2[2]))

172

### RNN Model 

In [236]:
class RNN_Classifier(nn.Module):
    def __init__(self, input_dim,context_dim,num_classes):
        super(RNN_Classifier,self).__init__()        
        self.context_dim = context_dim;
        self.gru = nn.GRU(input_dim,context_dim,1,bias=True,batch_first=True)
        self.linear = nn.Linear(context_dim,num_classes);
   
    def forward(self,input):
        #we dont need to initialize explicitly - 
        #h0 = Variable(torch.zeros(1,input.size(0),self.context_dim))
        all_h, last_h = self.gru(input);
        #since we have only 1 layer and 1 direction
        output = self.linear(last_h[0]);
        return output;
    

In [237]:
input_dim = 50
context_dim = 64
num_classes = 2
model =  RNN_Classifier(input_dim,context_dim,num_classes)


In [40]:
criterion = nn.CrossEntropyLoss()  
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  
batch_size = 50
num_epochs = 25000//batch_size


In [238]:
def get_batch(str_idx,end_idx):
        input_vectors = train_vectors[str_idx:end_idx]
        labels = Variable(torch.LongTensor(train_labels2[str_idx:end_idx]))
        seq_lens = torch.LongTensor([i.shape[1] for i in input_vectors])
        embedding_dim = train_vectors[0].shape[2]
        #batch_inputs  - [batch_size, seq_len,embedding_dim]
        batch_inputs = Variable(torch.zeros((len(seq_lens), seq_lens.max(),embedding_dim)))
        for idx,(seq,seqlen) in enumerate(zip(input_vectors,seq_lens)):
            batch_inputs[idx,:seqlen] = seq
        seq_lens, perm_idx = seq_lens.sort(0, descending=True)
        batch_inputs = batch_inputs[perm_idx]
        batch_inputs = pack_padded_sequence(batch_inputs, seq_lens.numpy(),batch_first=True)
        labels = labels[perm_idx]
        return(batch_inputs,labels)
        #assign each training vector to left 
        
        #print(seq_lens)

In [None]:
# Train the Model
model.train()

for epoch in range(num_epochs):
    str_idx = epoch * batch_size
    end_idx = (epoch+1) * batch_size
    
    inputs,labels = get_batch(str_idx,end_idx)
    
    # Forward + Backward + Optimize
    optimizer.zero_grad()  # zero the gradient buffer
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 100 == 0:
        print ('Epoch [%d/%d],  Loss: %.4f' 
               %(epoch+1, num_epochs, loss.data[0]))

Epoch [100/500],  Loss: 0.7227
Epoch [200/500],  Loss: 0.7264
Epoch [300/500],  Loss: 0.7301


## Training Accuracy

In [18]:
## Test the Model on training data
model.eval()
correct = 0
total = 0
testing_inputs = Variable(torch.cat(train_vectors))
testing_labels = torch.LongTensor(train_targets)
outputs = model(testing_inputs)
_, predicted = torch.max(outputs.data, 1)
total = testing_labels.size(0)
correct = (predicted == testing_labels).sum()


print('Accuracy of the network on the  train reviews: %d %%' % (100 * correct / total))



Accuracy of the network on the  train reviews: 55 %


In [26]:
## Test the Model
model.eval()
correct = 0
total = 0
test_vectors = [variableFromSentence(w2v,text,maxSeqLength) for text in test_texts]
testing_inputs = Variable(torch.cat(test_vectors))
testing_labels = torch.LongTensor(test_labels)
outputs = model(testing_inputs)
_, predicted = torch.max(outputs.data, 1)
total = testing_labels.size(0)
correct = (predicted == testing_labels).sum()


print('Accuracy of the network on the 25000 test images: %d %%' % (100 * correct / total))


Accuracy of the network on the 25000 test images: 50 %


## MLP Model

In [249]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2);
        self.fc2 = nn.Linear(hidden_size, num_classes)  
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

   

In [250]:
# Loss and Optimizer
net = Net(25000,500,2)
criterion = nn.CrossEntropyLoss()  
learning_rate = 0.001
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)  
batch_size = 50
num_epochs = 25000//50
len(train_vectors)

25000

In [1]:
# Train the Model
net.train()

for epoch in range(num_epochs):
    str_idx = epoch * batch_size
    end_idx = (epoch+1) * batch_size
    input_tensor = torch.cat(train_vectors[str_idx:end_idx]);
    inputs = Variable(input_tensor.view(input_tensor.shape[0],-1))
    labels = Variable(torch.LongTensor(train_labels2[str_idx:end_idx]));
    
    # Forward + Backward + Optimize
    optimizer.zero_grad()  # zero the gradient buffer
    outputs = net(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 100 == 0:
        print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' 
               %(epoch+1, num_epochs, i+1, len(train_vectors)//batch_size, loss.data[0]))

NameError: name 'net' is not defined

In [252]:
## Test the Model
net.eval()
correct = 0
total = 0
test_vectors = [variableFromSentence(w2v,text,maxSeqLength) for text in test_texts]
test_input_tensor = torch.cat(test_vectors)
test_label_subset = torch.LongTensor(test_labels)

test_inputs = Variable(test_input_tensor.view(test_input_tensor.shape[0],-1))
outputs = net(test_inputs)
_, predicted = torch.max(outputs.data, 1)
total = test_label_subset.size(0)
correct = (predicted == test_label_subset).sum()


print('Accuracy of the network on the 25000 test images: %d %%' % (100 * correct / total))


Accuracy of the network on the 25000 test images: 79 %
