In [3]:
'''
    Author - Oyesh Mann Singh
    Date - 12/04/2018
    Description 
        - Basic deep learning with Pytorch
'''
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import torchtext
from torchtext import data, vocab

from sklearn.metrics import accuracy_score

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

CUDA_VISIBLE_DEVICES=1

In [4]:
'''
    Load training/testing/word2vec data
'''

root_path = './data/aclImdb/'

dict_path = './data/word2vec.txt'

vec = vocab.Vectors(dict_path)

In [5]:
'''
    Prepare dataset using torchtext
'''
LABEL = data.Field(unk_token=None, pad_token=None, sequential=False, use_vocab=False)
TEXT = data.Field(batch_first=True)

train_set, test_set = data.TabularDataset.splits(path=root_path, 
                                                  format='csv', 
                                                  skip_header = True, train = 'train.csv', test='test.csv',
                                                  fields=[('label',LABEL),('text',TEXT)])

# Further split train_set into train/validation
train_set, valid_set = train_set.split()

In [6]:
print("Fields of training/testing dataset: ", train_set.fields.items())
print("Length of training dataset: ", len(train_set))
print("Length of validation dataset: ", len(valid_set))
print("Length of test dataset: ", len(test_set))

Fields of training/testing dataset:  dict_items([('label', <torchtext.data.field.Field object at 0x00000242F803AFD0>), ('text', <torchtext.data.field.Field object at 0x00000242F8A5F080>)])
Length of training dataset:  17500
Length of validation dataset:  7500
Length of test dataset:  25000


In [7]:
# Checking a training data
print(vars(train_set[0]))

{'label': '1', 'text': ['many', 'people', 'judge', 'it', 'as', 'a', 'fan', 'service', 'film', 'because', 'a', 'lot', 'of', 'super', 'star', 'starring', 'in', 'this', 'movie', 'gackt', 'hyde', 'and', 'wang', 'lee', 'hom', 'is', 'very', 'famous', 'singer', 'in', 'japan', 'but', 'don', 't', 'judge', 'it', 'before', 'you', 'watch', 'is', 'what', 'i', 'say', 'gackt', 'and', 'staff', 'are', 'very', 'serious', 'when', 'made', 'this', 'film', 'and', 'they', 'worked', 'so', 'hard', 'it', 's', 'a', 'good', 'film', 'with', 'a', 'touchy', 'story', 'inside', 'several', 'scenes', 'can', 'be', 'so', 'fun', 'and', 'some', 'others', 'are', 'so', 'sad', 'they', 'made', 'it', 'so', 'good', 'until', 'i', 'can', 't', 'stop', 'watching', 'this', 'all', 'over', 'again', 'the', 'story', 'has', 'written', 'pretty', 'well', 'but', 'i', 'admit', 'that', 'their', 'act', 'are', 'little', 'disappointing', 'this', 'is', 'especially', 'for', 'hyde', 'because', 'his', 'skill', 'of', 'acting', 'is', 'under', 'from', 't

In [8]:
# Create an iterator
data_batch_size = 128

train_iter, val_iter, test_iter = data.BucketIterator.splits((train_set, valid_set, test_set), batch_size=data_batch_size, 
                                                   sort_key=lambda x: len(x.text), device=device, shuffle=True)

In [9]:
print(len(train_iter))

137


In [10]:
# Build the vocabulary
LABEL.build_vocab(train_set)
TEXT.build_vocab(train_set, valid_set, test_set, vectors=vec)

# Check the vocabulary shape
print(TEXT.vocab.vectors.shape)

torch.Size([102661, 50])


In [11]:
len(TEXT.vocab)

102661

In [12]:
print(vars(train_set[0]))

{'label': '1', 'text': ['many', 'people', 'judge', 'it', 'as', 'a', 'fan', 'service', 'film', 'because', 'a', 'lot', 'of', 'super', 'star', 'starring', 'in', 'this', 'movie', 'gackt', 'hyde', 'and', 'wang', 'lee', 'hom', 'is', 'very', 'famous', 'singer', 'in', 'japan', 'but', 'don', 't', 'judge', 'it', 'before', 'you', 'watch', 'is', 'what', 'i', 'say', 'gackt', 'and', 'staff', 'are', 'very', 'serious', 'when', 'made', 'this', 'film', 'and', 'they', 'worked', 'so', 'hard', 'it', 's', 'a', 'good', 'film', 'with', 'a', 'touchy', 'story', 'inside', 'several', 'scenes', 'can', 'be', 'so', 'fun', 'and', 'some', 'others', 'are', 'so', 'sad', 'they', 'made', 'it', 'so', 'good', 'until', 'i', 'can', 't', 'stop', 'watching', 'this', 'all', 'over', 'again', 'the', 'story', 'has', 'written', 'pretty', 'well', 'but', 'i', 'admit', 'that', 'their', 'act', 'are', 'little', 'disappointing', 'this', 'is', 'especially', 'for', 'hyde', 'because', 'his', 'skill', 'of', 'acting', 'is', 'under', 'from', 't

In [13]:
# Check the sample batch
sample_train_batch = next(iter(train_iter))

print(sample_train_batch.text.shape)

torch.Size([128, 831])


In [14]:
# Check the tensor for a sample input word
TEXT.vocab.vectors[TEXT.vocab.stoi['a']]

tensor([-1.4315, -2.6598,  0.4538, -0.4569,  1.1227,  0.2325, -0.2207,  2.3917,
        -2.2225,  1.6792, -0.8594,  1.1664,  0.8578,  0.0004, -1.2418, -3.3718,
         0.9664,  1.0759,  5.0070,  1.2052,  1.0962,  2.0620,  1.5243,  1.4128,
        -2.3810,  3.3441, -1.6822, -0.2576, -1.1957,  1.3121,  2.7869,  3.1460,
         1.5586,  0.4723,  1.2427,  0.6243, -1.7932, -3.9185,  2.2868, -2.0155,
         3.8783, -2.0760, -1.8423, -0.2244,  3.4017, -2.4928, -2.1677,  0.8344,
         3.0949, -3.1904])

In [15]:
# Check the top 10 frequent words in vocabulary
print(TEXT.vocab.freqs.most_common(10))

[('the', 667700), ('and', 324294), ('a', 322886), ('of', 289379), ('to', 268071), ('is', 211046), ('it', 190675), ('in', 186718), ('i', 175431), ('this', 150909)]


In [16]:
ex = train_set[0]
# ex.text, ex.label

In [18]:
class RNN(nn.Module):
    def __init__(self, weights, vocab_size, embedding_length=50, hidden_size=128, output_size=2):
        super().__init__()
        self.input_dim = vocab_size
        self.embedding_dim  = embedding_length
        self.hidden_dim = hidden_size
        self.output_dim = output_size
        
        self.embedding = nn.Embedding.from_pretrained(weights)
        self.rnn = nn.RNN(input_size=self.embedding_dim, hidden_size=self.hidden_dim)
        self.fc = nn.Linear(self.hidden_dim, self.output_dim)
        
    def init_hidden(self, bs):
        if torch.cuda.is_available():
            return Variable(torch.zeros(1, bs, self.hidden_dim).cuda())
        else:
            return Variable(torch.zeros(1, bs, self.hidden_dim))
    
    def forward(self, input_word):
        # input_word -> batch_size * sent_length = 128* 784
        # hidden -> num_layer * batch_size * hidden_dim = 1*128*128
        batch_size = input_word.shape[0] # 784
        h_0 = self.init_hidden(batch_size) #1*784*128
        
        embedded = self.embedding(input_word)   # embedded -> batch_size * sent_length * embedding_dim = 128*784*50
        
        embedded = embedded.permute(1,0,2)      # embedded -> sent_length * batch_size * embedding_dim = 784*128*50
        
        rnn_out, hidden = self.rnn(embedded, h_0)    # rnn_out -> sent_length * batch_size * hidden_dim
                                                # hidden -> num_layer * batch_size * hidden_dim
        output = self.fc(hidden[-1])          # output -> batch_size * output_dim
                                    
        return output    # output -> batch_size * output_dim

In [19]:
class RNN_2(nn.Module):
    def __init__(self, output_size, hidden_size, vocab_size, embedding_length, weights):
        super(RNN, self).__init__()

        """
        Arguments
        ---------
        batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
        output_size : 2 = (pos, neg)
        hidden_sie : Size of the hidden_state of the LSTM
        vocab_size : Size of the vocabulary containing unique words
        embedding_length : Embeddding dimension of GloVe word embeddings
        weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 

        """

#         self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length

        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
        self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
        self.rnn = nn.RNN(embedding_length, hidden_size, num_layers=1, bidirectional=False)
        self.label = nn.Linear(hidden_size, output_size)
        
    def init_hidden(self, batch_size):
        if torch.cuda.is_available():
            return Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
        else:
            return Variable(torch.zeros(1, batch_size, self.hidden_size))    

    def forward(self, input_sentences):

        """ 
        Parameters
        ----------
        input_sentence: input_sentence of shape = (batch_size, num_sequences)
        batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1)

        Returns
        -------
        Output of the linear layer containing logits for pos & neg class which receives its input as the final_hidden_state of RNN.
        logits.size() = (batch_size, output_size)

        """

        input = self.word_embeddings(input_sentences)
        input = input.permute(1, 0, 2)
        
        h_0 = self.init_hidden(input_sentences.shape[0])
        
        output, h_n = self.rnn(input, h_0)
        # h_n.size() = (4, batch_size, hidden_size)
        h_n = h_n.permute(1, 0, 2) # h_n.size() = (batch_size, 4, hidden_size)
        h_n = h_n.contiguous().view(h_n.size()[0], h_n.size()[1]*h_n.size()[2])
        # h_n.size() = (batch_size, 4*hidden_size)
        logits = self.label(h_n) # logits.size() = (batch_size, output_size)

        return logits

In [20]:
vocab_size, embedding_length = vec.vectors.shape
hidden_dim = 256
output_dim = len(LABEL.vocab)

model = RNN(weights=vec.vectors, hidden_size=hidden_dim,output_size=output_dim, vocab_size=vocab_size, embedding_length=embedding_length)

optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [21]:
y_gt = []
y_predicted = []
total_loss_train = 0

def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)

def train_model(model=model, train_iter=train_iter, epoch=0):
    model.train()
    
    # After epoch
    y_gt = []
    y_predicted = []
    steps = 0
    total_loss_train = 0

    optimizer.zero_grad()
    
    for index, (X,y) in enumerate(train_iter):
        target = X[0]
        target = target.view(-1)
        input_word = X[1]
        
        if torch.cuda.is_available():
            target = target.cuda()
            input_word = input_word.cuda()

#         print(input_word.shape)
        # Forward pass
    
        optimizer.zero_grad()
        
        output = model(input_word)
#         print(output.shape)

        loss = criterion(output, target)

        loss.backward()

        clip_gradient(model, 1e-1)

        optimizer.step()

        pred_idx = torch.max(output, dim=1)[1]

        y_gt += list(target.cpu().data.numpy())
        y_predicted += list(pred_idx.cpu().data.numpy())

        total_loss_train += loss.item()
        
        acc = accuracy_score(y_gt, y_predicted)

        steps += 1

        if steps % 40 == 0:
            print (f'Epoch: {epoch+1}, Idx: {index+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')

    train_acc = accuracy_score(y_gt, y_predicted)
    train_loss = total_loss_train/len(train_iter)
    
    return train_acc, train_loss

In [22]:
def eval_model(model=model, train_iter=val_iter, epoch=0):
    model.eval()
    
    # After epoch
    y_gt = []
    y_predicted = []
    steps = 0
    total_loss_train = 0

    for index, (X,y) in enumerate(train_iter):
        target = X[0]
        target = target.view(-1)
        input_word = X[1]

        if torch.cuda.is_available():
            target = target.cuda()
            input_word = input_word.cuda()

        # Forward pass
        output = model(input_word)

        loss = criterion(output, target)

        pred_idx = torch.max(output, dim=1)[1]

        y_gt += list(target.cpu().data.numpy())
        y_predicted += list(pred_idx.cpu().data.numpy())

        total_loss_train += loss.item()
        
        acc = accuracy_score(y_gt, y_predicted)

        steps += 1

        if steps % 40 == 0:
            print (f'Epoch: {epoch+1}, Idx: {index+1}, Validation Loss: {loss.item():.4f}, Validation Accuracy: {acc.item(): .2f}%')

    train_acc = accuracy_score(y_gt, y_predicted)
    train_loss = total_loss_train/len(val_iter)
    
    return train_acc, train_loss

In [None]:
num_epochs = 5

if torch.cuda.is_available():
    model = model.to(device)
    
for epoch in range(num_epochs):
    train_acc, train_loss = train_model(model, train_iter, epoch)
    val_acc, val_loss = eval_model(model, val_iter, epoch)
    
    print (f'Epoch: {epoch+1}, Training Accuracy: {train_acc:.4f}%, Training Loss: {train_loss:.4f}, Validation Accuracy: {val_acc:.4f}%, Validation Loss: {val_loss:.4f}')
    

Epoch: 1, Idx: 40, Training Loss: 0.6946, Training Accuracy:  0.51%
