# Building a Recurrent Neural Network with PyTorch
__Created:__ 09/05/2020 8:14PM

__Last Edited:__ 14/05/2020 7:40PM

__Firstly, I would like to thank Deep Learning Wizard for this resource.__
https://www.deeplearningwizard.com/deep_learning/practical_pytorch/pytorch_recurrent_neuralnetwork/

## Model A: 1 Hidden Layer (ReLU)
- Unroll 28 time steps
   - Each step input size: 28 x 1
   - Total per unroll: 28 x 28
      - Feedforward Neural Network input size: 28 x 28
- 1 Hidden Layer
- ReLU Activation Function


### Steps
1. Load Dataset
2. Make Dataset Iterable
3. Create Model Class
4. Instantiate Model Class (?)
5. Instantiate Loss Class
6. Instantiate Optimiser Class
7. Train Model

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dset

In [2]:
from torch.utils.data import DataLoader

### Rather than using MNIST, I will opt for Fashion MNIST in this project to explore a different dataset type

In [39]:
train_data = dset.FashionMNIST(root='./data',
                               train = True,
                               transform=transforms.ToTensor(),
                               download = True)

test_data = dset.FashionMNIST(root='./data',
                               train = False,
                               transform=transforms.ToTensor())

#### Make Dataset Iterable

In [40]:
train_loader = DataLoader(dataset = train_data, batch_size = 100, shuffle = True)
test_loader = DataLoader(dataset = test_data, batch_size = 100, shuffle = True)

In [5]:
batch_size = 100
n_iters = 3000

In [41]:
num_epochs = n_iters / (len(train_data) / batch_size)
num_epochs = int(num_epochs)

In [42]:
num_epochs

5

#### Create Model Class

In [26]:
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(RNNModel, self).__init__()
        
        # Hidden Dimensions
        self.hidden_dim = hidden_dim
        
        # Number of Hidden Layers
        self.layer_dim = layer_dim
        
        # Building your RNN
        # batch_first = True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, input_dim)
        # batch_dim = number of samples per batch
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True, nonlinearity='relu')
        
        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        # Initialize hidden state with zeros
        # (layer_dim, batch_size, hidden_dim)
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
        
        # We need to detach the hidden state to prevent exploding/vanishing gradients
        # This is part of the truncated backpropagation through time (BPTT)
        out, hn = self.rnn(x, h0.detach())
        
        # Index hidden state of last time step
        # out.size() --> 100, 28, 10
        # out[:, -1, :] --> 100, 10 --> just want last time step hidden states!
        out = self.fc(out[:, -1, :])
        # out.size() --> 100, 10
        return out

#### Instantiate Model Class
- 28 time steps
   - Each time step: input dimension = 28
- 1 hidden layer
- MNIST 1-9 digits --> output dimension = 10

In [43]:
input_dim = 28
hidden_dim = 100
layer_dim = 1
output_dim = 10

In [44]:
model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim)

#### Instantiate Loss Class
- Recurent Neural Network: __Cross Entropy Loss__


In [9]:
criterion = nn.CrossEntropyLoss()

#### Instantiate Optimizer Class
- Simplified equation
  - $\theta$ = $\theta$ - $\eta$ ⋅ $\Delta_\theta$
     - $\theta$ : parameters (out tensors with gradient accumulation abilities)
     - $\eta$ : learning rate (how fast we want to learn)
     - $\Delta_\theta$ : gradients of loss with respect to the model's parameters
- Even simpler equation
   - `parameters = parameters - learning_rate * parameter_gradients`
   - __At every iteration, we update our model's parameters__

In [10]:
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#### Total groups of parameters

In [32]:
len(list(model.parameters()))

6

#### Input to Hidden weight
We defined our hidden layer to have a size of 100. Because our input is of size 28 at each time step, this should give rise to a matrix of size 100 x 28.

In [33]:
# Input --> Hidden (A1)
list(model.parameters())[0].size()

torch.Size([100, 28])

__Input to Hidden Bias__

In [34]:
# Input --> Hidden BIAS (B1)
list(model.parameters())[2].size()

torch.Size([100])

__Hidden to Hidden__

In [15]:
# Hidden --> Hidden (A3)
list(model.parameters())[1].size()

torch.Size([100, 100])

__Hidden to Hidden Bias__

In [16]:
# Hidden --> Hidden BIAS (B3)
list(model.parameters())[3].size()

torch.Size([100])

__Hidden to Output__

In [17]:
# Hidden --> Output (A2)
list(model.parameters())[4].size()

torch.Size([10, 100])

__Hidden to Output Bias__

In [18]:
# Hidden --> Output BIAS (B2)
list(model.parameters())[5].size()

torch.Size([10])

#### Train Model
- Process
   1. __Convert input/labels to tensors with gradients accumulation abilities__
      - RNN Input: (1, 28)
      - CNN Input: (1, 28, 28)
      - FNN Input: (1, 28 * 28)
   2. Clear gradient buffers
   3. Get output given inputs
   4. Get loss
   5. Get gradients wrt parameters
   6. Update parameters using gradients
      - `parameters = parameters - learning_rate * parameter_gradients`
   7. REPEAT

In [46]:
train_data = dset.MNIST(root='./data',
                               train = True,
                               transform=transforms.ToTensor(),
                               download = True)

test_data = dset.MNIST(root='./data',
                               train = False,
                               transform=transforms.ToTensor())

train_loader = DataLoader(dataset = train_data, batch_size = 100, shuffle = True)
test_loader = DataLoader(dataset = test_data, batch_size = 100, shuffle = True)

0it [00:00, ?it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


 98%|███████████████████████████████████████████▉ | 9674752/9912422 [00:11<00:00, 876063.48it/s]

Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw



0it [00:00, ?it/s][A

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz



  0%|                                                                 | 0/28881 [00:00<?, ?it/s][A
32768it [00:00, 45474.02it/s]                                                                   [A

0it [00:00, ?it/s][A

Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz



  0%|                                                               | 0/1648877 [00:00<?, ?it/s][A
  1%|▍                                               | 16384/1648877 [00:00<00:29, 55145.65it/s][A
  3%|█▍                                              | 49152/1648877 [00:00<00:22, 69669.52it/s][A
  6%|██▊                                             | 98304/1648877 [00:01<00:17, 87506.93it/s][A
 13%|█████▉                                        | 212992/1648877 [00:01<00:12, 116581.83it/s][A
 26%|████████████                                  | 434176/1648877 [00:01<00:07, 158646.35it/s][A
 38%|█████████████████▎                            | 622592/1648877 [00:01<00:04, 206768.86it/s][A
 50%|██████████████████████▊                       | 819200/1648877 [00:02<00:03, 273045.47it/s][A
 63%|████████████████████████████▏                | 1032192/1648877 [00:02<00:01, 347501.75it/s][A
 76%|█████████████████████████████████▉           | 1245184/1648877 [00:02<00:00, 430234.35it/s][A

Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz



8192it [00:00, 10319.80it/s]                                                                    [A


Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw
Processing...
Done!


In [72]:
model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim)

In [20]:
num_epochs = 5

In [65]:
# Number of swteps to unroll
seq_dim = 28

iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        #model.train()
        
        # Load images as tensors with gradient accumulation abilities
        images = images.view(-1, seq_dim, input_dim).requires_grad_()
        
        # Clear gradients wrt parameters
        optimizer.zero_grad()
        
        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        outputs = model(images)
        
        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)
        
        # Getting gradients wrt parameters
        loss.backward()
        
        # Updating parameters
        optimizer.step()
        
        iter += 1
        
        if iter % 500 == 0:
            model.eval()
            
            # Calculate Accuracy
            correct = 0
            total = 0
            
            # Iterate through test dataset
            for images, labels in test_loader:
                
                # Load images to a Torch tensors with gradient accumulation abilities
                images = images.view(-1, seq_dim, input_dim)
                
                # Forward pass only to get logits/output
                outputs = model(images)
                
                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)
                
                # Total number of labels
                total += labels.size(0)
                
                # Total correct predictions
                correct += (predicted == labels).sum()
                
            accuracy = 100 * correct / total
            
            # Print Loss
            print('Epoch: {}. Iteration: {}. Loss: {:.2f}. Accuracy: {:.2f}'.format(epoch, iter, loss.item(), accuracy))

Epoch: 0. Iteration: 500. Loss: 2.31. Accuracy: 8.00
Epoch: 1. Iteration: 500. Loss: 2.31. Accuracy: 8.00
Epoch: 2. Iteration: 500. Loss: 2.31. Accuracy: 8.00
Epoch: 3. Iteration: 500. Loss: 2.31. Accuracy: 8.00
Epoch: 4. Iteration: 500. Loss: 2.30. Accuracy: 8.00


In [70]:
inputs.shape

torch.Size([100, 1, 28, 28])

In [73]:
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

RuntimeError: input must have 3 dimensions, got 4

Why doesn't the accuracy change??

In [29]:
total

10000

In [30]:
correct

tensor(1417)

Since there's something wrong with this model, I'm going to give LSTM a shot 

https://github.com/vinhkhuc/PyTorch-Mini-Tutorials/blob/master/6_lstm.py


# LSTM

In [49]:
from __future__ import division
import numpy as np

import torch
from torch.autograd import variable
from torch import optim, nn

In [50]:
class LSTMNet(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, output_dim, bias=False)
        
    def forward(self,x):
        batch_size = x.size()[1]
        h0 = Variable(torch.zeros([1, batch_size, self.hidden_dim]), requires_grad=False)
        c0 = Variable(torch.zeros([1, batch_size, self.hidden_dim]), requires_grad=False)
        fx, _ = self.lstm.forward(x, (h0, c0))
        return self.linear.forward(fx[-1])

In [51]:
def train(model, loss, optimizer, x_val, y_val):
    x = Variable(x_val, requires_grad=False)
    y = Variable(y_val, requires_grad=False)
    
    # Reset Gradient
    optimizer.zero_grad()
    
    # Forward
    fx = model.forward(x)
    output = loss.forward(fx,y)
    
    # Backward
    output.backward()
    
    # Update parameters
    optimizer.step()
    
    return output.item()

In [52]:
def predict(model, x_val):
    x = Variable(x_val, requires_grad=False)
    output = model.forward(x)
    return output.data.numpy().argmax(axis=1)

In [55]:
from data_util import load_mnist

ModuleNotFoundError: No module named 'data_util'

In [63]:
train_data = dset.FashionMNIST(root='./data',
                               train = True,
                               transform=transforms.ToTensor(),
                               download = True)

test_data = dset.FashionMNIST(root='./data',
                               train = False,
                               transform=transforms.ToTensor())

In [64]:
train_loader = DataLoader(dataset = train_data, batch_size = 100, shuffle = True)
test_loader = DataLoader(dataset = test_data, batch_size = 100, shuffle = True)

In [53]:
trX, teX, trY, teY = load_mnist(onehot=False)

NameError: name 'load_mnist' is not defined

In [62]:
train_data[0][0].shape

torch.Size([1, 28, 28])

for some reason, this particular model doesn't use a dataloader. Maybe try to implement that yourself

In [94]:
torch.manual_seed(42)

train_size = 60000
n_classes = 10
seq_length = 28
input_dim = 28
hidden_dim = 128
batch_size = 100
epochs = 10


# Convert the data to the shape (seq_length, num_samples, input_dim)

model = LSTMNet(input_dim, hidden_dim, n_classes)
loss = torch.nn.CrossEntropyLoss(reduction='elementwise_mean')
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
'''
for i in range(epochs):
    cost = 0.
    num_batches = train_size // batch_size
    for k in range(num_batches):
        start, end = k * batch_size, (k+1) * batch_size
        cost += train(model, loss, optimizer, trX[:, start:end, :], trY[start:end])
        predY = predict(model, teX)
        print('Epoch %d, cost = %f, acc = %.2f%%' %
             i + 1, cost / num_batches, 100. * np.mean(predY == teY))
'''

"\nfor i in range(epochs):\n    cost = 0.\n    num_batches = train_size // batch_size\n    for k in range(num_batches):\n        start, end = k * batch_size, (k+1) * batch_size\n        cost += train(model, loss, optimizer, trX[:, start:end, :], trY[start:end])\n        predY = predict(model, teX)\n        print('Epoch %d, cost = %f, acc = %.2f%%' %\n             i + 1, cost / num_batches, 100. * np.mean(predY == teY))\n"

In [96]:
for i in range(epochs):
    cost = 0
    for i, (images, labels) in enumerate(train_loader):
        images = images.view(-1, seq_dim, input_dim).requires_grad_()
        cost += train(model, loss, optimizer, images, labels)
        predY = predict(model, labels)
        print('Epoch %d, cost = %f, acc = %.2f%%' %
             i + 1, cost / num_batches, 100. * np.mean(predY == teY))

NameError: name 'Variable' is not defined

## LSTM Model obtained from Deep Learning wizard:
https://www.deeplearningwizard.com/deep_learning/practical_pytorch/pytorch_lstm_neuralnetwork/

In [76]:
batch_size = 100
n_iters = 3000
num_epochs = n_iters / (len(train_data) / batch_size)
num_epochs = int(num_epochs)

In [77]:
num_epochs

5

In [74]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building your LSTM
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # 28 time steps
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        # Index hidden state of last time step
        # out.size() --> 100, 28, 100
        # out[:, -1, :] --> 100, 100 --> just want last time step hidden states! 
        out = self.fc(out[:, -1, :]) 
        # out.size() --> 100, 10
        return out


In [97]:
# Device Configuration
device = torch.device('cpu')

In [100]:
input_dim = 28
hidden_dim = 100
layer_dim = 1
output_dim = 10

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
loss = torch.nn.CrossEntropyLoss(reduction='elementwise_mean')
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [101]:
# Number of steps to unroll
seq_dim = 28  

iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Load images as a torch tensor with gradient accumulation abilities
        images = images.view(-1, seq_dim, input_dim).requires_grad_()
        
        images = images.to(device)
        labels = labels.to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for images, labels in test_loader:
                # Resize images
                images = images.view(-1, seq_dim, input_dim)

                # Forward pass only to get logits/output
                outputs = model(images).argmax(axis=1)

                # Get predictions from the maximum value
                # _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                correct += (outputs == labels).sum()

            accuracy = 100 * correct / total

            # Print Loss
            print('Iteration: {}. Loss: {}. Number Correct: {}'.format(iter, loss.item(), correct))

Iteration: 500. Loss: 1.051063895225525. Number Correct: 5737
Iteration: 1000. Loss: 0.5534528493881226. Number Correct: 7568
Iteration: 1500. Loss: 0.6016789078712463. Number Correct: 7879
Iteration: 2000. Loss: 0.5808594822883606. Number Correct: 8101
Iteration: 2500. Loss: 0.5271108746528625. Number Correct: 8277
Iteration: 3000. Loss: 0.44537222385406494. Number Correct: 8365


In [102]:
len(test_data)

10000

## IT WORKS!!!!

In [81]:
labels

tensor([6, 1, 2, 0, 2, 3, 3, 5, 8, 3, 7, 3, 8, 3, 5, 9, 0, 8, 5, 9, 4, 3, 9, 1,
        2, 0, 1, 1, 1, 8, 1, 1, 8, 5, 0, 1, 5, 8, 0, 3, 2, 8, 8, 7, 5, 2, 5, 7,
        9, 1, 3, 1, 5, 7, 5, 0, 9, 3, 7, 7, 1, 2, 5, 6, 4, 9, 2, 4, 4, 3, 0, 4,
        2, 2, 0, 9, 2, 2, 3, 3, 0, 4, 1, 1, 3, 2, 4, 8, 5, 2, 2, 2, 6, 2, 2, 1,
        2, 2, 3, 4])

In [87]:
model(images).argmax(axis=1)

tensor([9, 5, 6, 5, 6, 5, 5, 9, 9, 9, 9, 5, 9, 9, 9, 9, 5, 5, 9, 9, 6, 9, 9, 9,
        6, 9, 9, 5, 9, 9, 5, 5, 6, 9, 5, 9, 9, 9, 5, 9, 6, 5, 9, 9, 9, 6, 9, 9,
        9, 9, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 6, 9, 9, 9, 9, 6, 6, 6, 9, 5, 9,
        6, 6, 5, 9, 9, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 6, 6, 6, 6, 9, 6, 5,
        9, 6, 9, 9])

In [88]:
for i, (images, labels) in enumerate(train_loader):
    break

In [91]:
images.shape

torch.Size([100, 1, 28, 28])

In [90]:
model(images).argmax(axis=1)

RuntimeError: input must have 3 dimensions, got 4

##

In [106]:
# Number of steps to unroll
seq_dim = 28  

iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Load images as a torch tensor with gradient accumulation abilities
        images = images.view(-1, seq_dim, input_dim).requires_grad_()
        
        images = images.to(device)
        labels = labels.to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for images, labels in test_loader:
                # Resize images
                images = images.view(-1, seq_dim, input_dim)

                # Forward pass only to get logits/output
                outputs = model(images).argmax(axis=1)

                # Get predictions from the maximum value
                # _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                correct += (outputs == labels).sum()

            accuracy = 100.* correct / total

            # Print Loss
            print('Iteration: {}. Loss: {:.3f}. Number Correct: {:.2f}'.format(iter, loss.item(), correct))

Iteration: 500. Loss: 0.40. Number Correct: 87.00
Iteration: 1000. Loss: 0.38. Number Correct: 87.00
Iteration: 1500. Loss: 0.26. Number Correct: 87.00
Iteration: 2000. Loss: 0.36. Number Correct: 88.00
Iteration: 2500. Loss: 0.32. Number Correct: 87.00
Iteration: 3000. Loss: 0.22. Number Correct: 87.00
