In this notebook you will find an example of Bidirectional LSTM implemented in PyTorch.

The task that we will try to solve in this notebook is MNIST images classification.

In [1]:
#Import dependencies
import torch
import torch.nn as nn
import torch.nn.functional as f
from torch.autograd import Variable
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
from torch.optim import Adam
import numpy as np

In [2]:
def accuracy(preds, y_true):
    '''
    Use this function to check accuracy of a model trained.
    
    :param: preds - predictions generated by neural network
    :param: y_true - true/real labels for each sample in the dataset
    '''
    correct = 0 
    assert len(preds) == len(y_true)
    
    for i in range(len(preds)):
        if np.argmax(preds[i]) == y_true[i]:
            correct += 1
    return correct / len(preds)

In [3]:
#hyperparams
#How many samples do we feed to the NN at onces
batch_size = 100
#step size-this param is used by optimizer
learning_rate = 0.003 
#How many times do we want to go through all samples in a dataset 
#NOTE: more is not always batter. More epochs can lead to overfit if you didnt regularize a network properly
epochs = 3
#number of RNN layers 
number_of_layers = 2
#Number of units/neurons in LSTM cell
rnn_units = 128
#If bi_dir == True LSTM layer will be bidirectional
bi_dir = True
#Number of numbers in the input vector (MNIST images are 28x28, so we put in_size 28 and sequnce_length to 28)
in_size = 28
seq_len = 28

#### Dataset preparation steps

In [4]:
#Download and transform MNIST training set
train_dataset = MNIST(root='./data/', 
                      train=True, 
                      transform=transforms.ToTensor(), 
                      download=True)

In [5]:
#Unpack and transform MNIST test set
test_dataset = MNIST(root='./data/', 
                      train=False, 
                      transform=transforms.ToTensor(), 
                      download=False)

In [6]:
#Create training set loader - DataLoader will help us with batching a dataset
train_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True, 
                                           num_workers=2)

In [7]:
#Create test set loader
test_loader = torch.utils.data.DataLoader(test_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False, 
                                           num_workers=2)

#### Create Bidirectional RNN in PyTorch

In [8]:
class BiRNN(nn.Module):
    
    def __init__(self, in_size, seq_len, rnn_units, number_of_layers, num_classes):
        '''
            Init function of the BiRNN class helps to setup everything that we need for bidirectional rnn.
            
            :param: in_size - size of an input vector
            :param: seq_len - how many time steps a network will have
            :param: rnn_units - number of units/neurons in the RNN
            :param: number_of_layers - number of RNN layers
            :param: num_classes - number of different classes in a dataset (e.p. MNIST has 10 classes)
        '''
        super(BiRNN, self).__init__()
        
        self.hidden_units = rnn_units
        self.num_layers = number_of_layers
        
        self.rnn = nn.LSTM(in_size, rnn_units, number_of_layers, batch_first=True, bidirectional=True)
        
        #Put number of units x2 -> We are doing this because of Bidirectional RNN
        self.output = nn.Linear(self.hidden_units*2, num_classes)
        
    def forward(self, X):
        '''
        This method is called for networks forward-prop.
        
        :param: X - batch of data from a dataset
        '''
        
        #Create starting states for the LSTM layer
        h0 = Variable(torch.zeros(self.num_layers*2, X.size(0), self.hidden_units)).cuda()
        #NOTE: that we have number of layers * 2 -> that is because we are using Bidirectional RNN
        c0 = Variable(torch.zeros(self.num_layers*2, X.size(0), self.hidden_units)).cuda()
        
        out, _ = self.rnn(X, (h0, c0))
        return f.softmax(self.output(out[:, -1, :]))

In [9]:
#Create BiRNN network object
rnn = BiRNN(in_size, seq_len, rnn_units, number_of_layers, 10)

In [10]:
#If you have avaliable cuda supported GPU card use this command to run the network on GPU instead of CPU
rnn.cuda()

BiRNN(
  (rnn): LSTM(28, 128, num_layers=2, batch_first=True, bidirectional=True)
  (output): Linear(in_features=256, out_features=10)
)

In [11]:
#Create object of the cross entropy loss function
criterion = nn.CrossEntropyLoss()

In [12]:
#Create object of Adam optimizer
optimizer = Adam(rnn.parameters(), lr=learning_rate)

#### Create training loop

In [13]:
for epoch in range(epochs):
    epoch_accuracy = [] 
    epoch_loss = []
    
    for images, labels in train_loader:
        
        #Create batch of images and labels
        X_batch = Variable(images.view(-1, seq_len, in_size)).cuda()
        y_batch = Variable(labels).cuda()
        
        optimizer.zero_grad() #set network grads to zero
        preds = rnn(X_batch) #get network prediction
        epoch_accuracy.append(accuracy(preds.cpu().data.numpy(), y_batch.cpu().data.numpy())) #get accuracy for the current batch
        loss = criterion(preds, y_batch) #calculate cross entropy loss for a given batch
        epoch_loss.append(loss.cpu().data.numpy()) #log batch loss
        loss.backward() #call backprop function in respect to calculated loss
        optimizer.step() #finally we call optimizers step function
        
    print("Epoch: {}/{}".format(epoch+1, epochs), 
          " | Epoch loss: {}".format(np.mean(epoch_loss)), 
          " | Epoch accuracy: {}".format(np.mean(epoch_accuracy)))



Epoch: 1/3  | Epoch loss: 1.7570611238479614  | Epoch accuracy: 0.7049
Epoch: 2/3  | Epoch loss: 1.5680830478668213  | Epoch accuracy: 0.8934999999999998
Epoch: 3/3  | Epoch loss: 1.5165644884109497  | Epoch accuracy: 0.9450999999999999
