<a href="https://colab.research.google.com/github/manishiitg/ML_Experiments/blob/master/autoencoder/pytorch_lstm_getting_started_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Simple LSTM for text classification using torchtext

In [0]:
%matplotlib inline

import pandas as pd
import numpy as np

In [5]:
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe

def normalize(comment):
    comment = comment.lower()
    lines = comment.split()
    lines = [x.strip(' ') for x in lines]
    lines = [x.replace('"', '') for x in lines]
    lines = [x.replace('\\"', '') for x in lines]
    lines = [x.replace(u'\xa0', u'') for x in lines]

    return lines

tokenize = lambda x: x.split()
TEXT = data.Field(sequential=True, tokenize=normalize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
LABEL = data.LabelField()

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train_data)

aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 11.3MB/s]
.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                          
100%|█████████▉| 399880/400000 [00:50<00:00, 12046.96it/s]

Above, we are tokenizing the text and label. 

Also we genering word embedding using glove vectors

In [6]:
word_embeddings = TEXT.vocab.vectors
print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
print ("Label Length: " + str(len(LABEL.vocab)))

train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data
train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)

'''Alternatively we can also use the default configurations'''
# train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

vocab_size = len(TEXT.vocab)

Length of Text Vocabulary: 233023
Vector size of Text Vocabulary:  torch.Size([233023, 300])
Label Length: 2
Length of Text Vocabulary: 233023
Vector size of Text Vocabulary:  torch.Size([233023, 300])
Label Length: 2


In [7]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

class LSTMClassifier(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, weights):
    super(LSTMClassifier, self).__init__()
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.vocab_size = vocab_size

    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.embedding.weight = nn.Parameter(weights, requires_grad=False)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1)

    self.hidden2out = nn.Linear(hidden_dim, output_size)
    self.softmax = nn.LogSoftmax()

    self.dropout_layer = nn.Dropout(p=0.2)


  def init_hidden(self, batch_size):
    h0 = torch.zeros(1, batch_size, self.hidden_dim).to(device)
    # Initialize cell state
    c0 = torch.zeros(1, batch_size, self.hidden_dim).to(device)
    return(h0,c0)


  def forward(self, batch):
    

    # print(batch.shape, "batch shape")
    self.hidden = self.init_hidden(batch.size(0))

    embeds = self.embedding(batch)
    embeds = embeds.permute(1, 0, 2)
    # print(embeds.shape, "embeds")
    # packed_input = pack_padded_sequence(embeds, lengths)
    outputs, (ht, ct) = self.lstm(embeds, self.hidden)

    # ht is the last hidden state of the sequences
    # ht = (1 x batch_size x hidden_dim)
    # ht[-1] = (batch_size x hidden_dim)
    output = self.dropout_layer(ht[-1])
    # print(output.shape, "output")
    output = self.hidden2out(output)
    # print(output.shape, "output")
    output = self.softmax(output)
    # print(output.shape, "output")

    return output

import torch.nn.functional as F
loss_fn = F.cross_entropy

batch_size = 32




def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)


def train_model(model, train_iter, epoch):
    total_epoch_loss = 0
    total_epoch_acc = 0
    model.to(device)
    optim = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    steps = 0
    model.train()
    for idx, batch in enumerate(train_iter):
        # print(idx)
        text = batch.text[0].to(device) # this is a tuple for some reason
        target = batch.label
        target = target.to(device)
        if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32.
            continue
        optim.zero_grad()
        prediction = model(text)
        loss = loss_fn(prediction, target)
        num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()
        acc = 100.0 * num_corrects/len(batch)
        loss.backward()
        # clip_gradient(model, 1e-1)
        optim.step()
        steps += 1
        
        if steps % 100 == 0:
            print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')
        
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()
        
    return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter)

def eval_model(model, val_iter):
    total_epoch_loss = 0
    total_epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text = batch.text[0]
            if (text.size()[0] is not 32):
                continue
            target = batch.label
            target = torch.autograd.Variable(target).long()
            if torch.cuda.is_available():
                text = text.cuda()
                target = target.cuda()
            prediction = model(text)
            loss = loss_fn(prediction, target)
            num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()
            acc = 100.0 * num_corrects/len(batch)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()

    return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter)

output_size = 2
hidden_size = 256
embedding_length = 300

model = LSTMClassifier(vocab_size, embedding_length,  hidden_size, output_size, word_embeddings)


cuda


In [8]:

for epoch in range(10):
  train_loss, train_acc = train_model(model, train_iter, epoch)
  val_loss, val_acc = eval_model(model, valid_iter)
    
  print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')




Epoch: 1, Idx: 100, Training Loss: 0.6917, Training Accuracy:  56.25%
Epoch: 1, Idx: 200, Training Loss: 0.6950, Training Accuracy:  46.88%
Epoch: 1, Idx: 300, Training Loss: 0.6929, Training Accuracy:  50.00%
Epoch: 1, Idx: 400, Training Loss: 0.6895, Training Accuracy:  59.38%
Epoch: 1, Idx: 500, Training Loss: 0.6615, Training Accuracy:  59.38%
Epoch: 01, Train Loss: 0.690, Train Acc: 51.74%, Val. Loss: 0.679772, Val. Acc: 54.30%
Epoch: 2, Idx: 100, Training Loss: 0.7088, Training Accuracy:  59.38%
Epoch: 2, Idx: 200, Training Loss: 0.6871, Training Accuracy:  56.25%
Epoch: 2, Idx: 300, Training Loss: 0.7076, Training Accuracy:  68.75%
Epoch: 2, Idx: 400, Training Loss: 0.6397, Training Accuracy:  65.62%
Epoch: 2, Idx: 500, Training Loss: 0.6778, Training Accuracy:  43.75%
Epoch: 02, Train Loss: 0.671, Train Acc: 57.25%, Val. Loss: 0.645017, Val. Acc: 61.32%
Epoch: 3, Idx: 100, Training Loss: 0.6255, Training Accuracy:  65.62%
Epoch: 3, Idx: 200, Training Loss: 0.6423, Training Accu