<p style="font-family:ComicSansMS; font-size: 30px;"> Recurrent Neural Network with PyTorch</p>

<p style="font-family:ComicSansMS; font-size: 24px; color: magenta"> Model A: 1 Hidden Layer (ReLU)</p>

<p style="font-family:ComicSansMS; font-size: 16px; color: magenta"> Unroll 28 time steps</p>
<p style="font-family:ComicSansMS; font-size: 22px; color: yellow"> 1 Hidden layer</p>
<p style="font-family:ComicSansMS; font-size: 16px; color: magenta"> ReLU Activation Function</p>

In [None]:
# Unroll 28 time steps
    # Each step input size: 28 x 1
        # Total per unroll: 28 x 28
        # Feedforward Neural Network input size: 28 x 28
# 1 Hidden layer
# ReLU Activation Function

In [None]:
# Steps
# Step 1: Load Dataset
# Step 2: Make Dataset Iterable
# Step 3: Create Model Class
# Step 4: Instantiate Model Class
# Step 5: Instantiate Loss Class
# Step 6: Instantiate Optimizer Class
# Step 7: Train Model

> Step 1: Loading MNIST Train Dataset

> Looking into the MNIST Dataset

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets

In [2]:
# MNIST training split, stored under ./data (downloaded if it is not there yet).
train_dataset = dsets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

# MNIST test split, read from the same ./data directory.
test_dataset = dsets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

100%|██████████| 9.91M/9.91M [00:04<00:00, 2.06MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 223kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.31MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 2.27MB/s]


In [25]:
# `train_data` / `train_labels` / `test_data` / `test_labels` are deprecated
# aliases in torchvision; `data` and `targets` are the supported names and
# hold the same tensors for both splits.
print(train_dataset.data.size())

print(train_dataset.targets.size())

print(test_dataset.data.size())

print(test_dataset.targets.size())

torch.Size([60000, 28, 28])
torch.Size([60000])
torch.Size([10000, 28, 28])
torch.Size([10000])


> Step 2: Make Dataset Iterable

> Creating iterable objects to loop through subsequently

In [7]:
batch_size = 100
n_iters = 3000
# Epochs needed for about n_iters updates: iterations / (batches per epoch).
num_epochs = int(n_iters / (len(train_dataset) / batch_size))

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Test batches keep the dataset order.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

> Step 3: Create Model Class

> 1 Layer RNN

In [8]:
class RNNModel(nn.Module):
    """Recurrent network (``nn.RNN``) with a linear readout of the last time step.

    Args:
        input_dim: Number of features fed in at every time step.
        hidden_dim: Size of the recurrent hidden state.
        layer_dim: Number of stacked recurrent layers.
        output_dim: Number of values produced by the readout layer.
        nonlinearity: Activation forwarded to ``nn.RNN`` (``'relu'`` or
            ``'tanh'``). Defaults to ``'relu'``, the value that used to be
            hard-coded here, so existing four-argument calls are unchanged.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, nonlinearity='relu'):
        super(RNNModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # batch_first=True makes input/output tensors
        # (batch_dim, seq_dim, feature_dim) instead of (seq_dim, batch_dim, feature_dim)
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim,
                          batch_first=True, nonlinearity=nonlinearity)

        # Readout layer: last hidden state -> output_dim values
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to (batch, output_dim)."""
        # All-zero initial hidden state of shape (layer_dim, batch, hidden_dim).
        # new_zeros allocates it with x's device and dtype, so the model keeps
        # working after .to(device) / .double(). It is a fresh constant that
        # does not track gradients, so no requires_grad_()/detach() is needed.
        h0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)

        # out holds the top layer's hidden state for every time step:
        # (batch, seq, hidden_dim), e.g. 100, 28, 100
        out, _ = self.rnn(x, h0)

        # Keep only the last time step: (batch, hidden_dim), e.g. 100, 100,
        # then read out to (batch, output_dim), e.g. 100, 10
        return self.fc(out[:, -1, :])

> Step 4: Instantiate Model Class

In [9]:
# 28 time steps
    # Each time step: input dimension = 28
# 1 hidden layer
# MNIST 0-9 digits
#  output dimension = 10

> Instantiate model class and assign to an object

In [10]:
# Model A hyperparameters: 28 features per time step, 100 hidden units,
# a single recurrent layer and 10 output classes.
input_dim, hidden_dim, layer_dim, output_dim = 28, 100, 1, 10

model = RNNModel(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    layer_dim=layer_dim,
    output_dim=output_dim,
)

> Step 5: Instantiate Loss Class

In [11]:
# Recurrent Neural Network: Cross Entropy Loss
    # Convolutional Neural Network: Cross Entropy Loss
    # Feedforward Neural Network: Cross Entropy Loss
    # Logistic Regression: Cross Entropy Loss
    # Linear Regression: MSE

> Cross Entropy Loss for Classification Task

In [12]:
# Cross-entropy criterion; the training loop calls it as criterion(outputs, labels).
criterion = nn.CrossEntropyLoss()

> Step 6: Instantiate Optimizer Class

In [13]:
# Even simpler equation
    # parameters = parameters - learning_rate * parameters_gradients
    # At every iteration, we update our model's parameters

In [14]:
# Plain SGD over every parameter tensor of the model.
learning_rate = 0.01
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

In [15]:
# Parameters In-Depth
# Input to Hidden Layer Affine Function
# A1, B1
# Hidden Layer to Output Affine Function
# A2, B2
# Hidden Layer to Hidden Layer Affine Function
# A3, B3

> Total groups of parameters

In [16]:
# Count the parameter tensors without materialising a list
sum(1 for _ in model.parameters())

6

> Input to Hidden Weight

In [17]:
# A1: input-to-hidden weight matrix (first parameter tensor)
list(model.parameters())[0].shape

torch.Size([100, 28])

> Input to Hidden Bias

In [18]:
# B1: input-to-hidden bias vector (third parameter tensor)
list(model.parameters())[2].shape

torch.Size([100])

> Hidden to Hidden

In [19]:
# A3: hidden-to-hidden weight matrix (second parameter tensor)
list(model.parameters())[1].shape

torch.Size([100, 100])

> Hidden to Hidden Bias

In [20]:
# B3: hidden-to-hidden bias vector (fourth parameter tensor)
list(model.parameters())[3].shape

torch.Size([100])

> Hidden to Output

In [21]:
# A2: hidden-to-output weight matrix of the readout layer (fifth parameter tensor)
list(model.parameters())[4].shape

torch.Size([10, 100])

> Hidden to Output Bias

In [22]:
# B2: hidden-to-output bias vector of the readout layer (sixth parameter tensor)
list(model.parameters())[5].shape

torch.Size([10])

> Step 7: Train Model

In [23]:
# Process
    # Convert inputs/labels to tensors with gradient accumulation abilities
        # RNN Input: (1, 28)
        # CNN Input: (1, 28, 28)
        # FNN Input: (1, 28*28)
    # Clear gradient buffers
    # Get output given inputs
    # Get loss
    # Get gradients w.r.t. parameters
    # Update parameters using gradients
    # parameters = parameters - learning_rate * parameters_gradients
    # REPEAT

> Same 7 step process for training models

In [24]:
# Number of steps to unroll: the image is fed as seq_dim chunks of input_dim values
seq_dim = 28

# Counts parameter updates. Named `iteration` (not `iter`) so the built-in
# iter() is not shadowed for the rest of the notebook.
iteration = 0
for epoch in range(num_epochs):
    for batch_images, batch_labels in train_loader:
        # The periodic evaluation below switches to eval mode; switch back
        # before every update.
        model.train()

        # Reshape the batch into sequences: (batch, seq_dim, input_dim).
        # Inputs do not need requires_grad; only the parameters are optimised.
        batch_images = batch_images.view(-1, seq_dim, input_dim)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        outputs = model(batch_images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, batch_labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iteration += 1

        if iteration % 500 == 0:
            model.eval()
            # Accuracy over the whole test set, accumulated as Python ints
            correct = 0
            total = 0
            # Evaluation needs no gradients, so skip building the autograd graph.
            with torch.no_grad():
                # Separate loop names keep the training batch above intact.
                for test_images, test_labels in test_loader:
                    test_images = test_images.view(-1, seq_dim, input_dim)

                    # Forward pass only to get logits/output
                    test_outputs = model(test_images)

                    # Predicted class = index of the largest logit
                    _, predicted = torch.max(test_outputs, 1)

                    # Total number of labels
                    total += test_labels.size(0)

                    # Total correct predictions
                    correct += (predicted == test_labels).sum().item()

            # Plain Python float, so it prints without float32 rounding noise
            accuracy = 100 * correct / total

            # Loss shown is that of the most recent training batch
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iteration, loss.item(), accuracy))

Iteration: 500. Loss: 2.300598382949829. Accuracy: 15.399999618530273
Iteration: 1000. Loss: 2.3022639751434326. Accuracy: 16.3700008392334
Iteration: 1500. Loss: 2.275730848312378. Accuracy: 20.09000015258789
Iteration: 2000. Loss: 2.0255444049835205. Accuracy: 25.719999313354492
Iteration: 2500. Loss: 1.3834927082061768. Accuracy: 61.16999816894531
Iteration: 3000. Loss: 0.7578946948051453. Accuracy: 72.77999877929688


<p style="font-family:ComicSansMS; font-size: 24px; color: magenta"> Model B: 2 Hidden Layer (ReLU)</p>

<p style="font-family:ComicSansMS; font-size: 16px; color: magenta"> Unroll 28 time steps</p>
<p style="font-family:ComicSansMS; font-size: 22px; color: yellow"> 2 Hidden layer</p>
<p style="font-family:ComicSansMS; font-size: 16px; color: magenta"> ReLU Activation Function</p>

> Steps

In [None]:
    # Step 1: Load Dataset
    # Step 2: Make Dataset Iterable
    # Step 3: Create Model Class
        # Step 4: Instantiate Model Class
    # Step 5: Instantiate Loss Class
    # Step 6: Instantiate Optimizer Class
    # Step 7: Train Model

> 2 Hidden Layer + ReLU

In [26]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets

'''
STEP 1: LOADING DATASET
'''
# Training split, stored under ./data (downloaded if it is not there yet).
train_dataset = dsets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

# Test split, read from the same directory.
test_dataset = dsets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

'''
STEP 2: MAKING DATASET ITERABLE
'''

batch_size = 100
n_iters = 3000
# Epochs needed for about n_iters updates: iterations / (batches per epoch).
num_epochs = int(n_iters / (len(train_dataset) / batch_size))

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Test batches keep the dataset order.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

'''
STEP 3: CREATE MODEL CLASS
'''

class RNNModel(nn.Module):
    """Recurrent network (``nn.RNN``) with a linear readout of the last time step.

    Args:
        input_dim: Number of features fed in at every time step.
        hidden_dim: Size of the recurrent hidden state.
        layer_dim: Number of stacked recurrent layers.
        output_dim: Number of values produced by the readout layer.
        nonlinearity: Activation forwarded to ``nn.RNN`` (``'relu'`` or
            ``'tanh'``). Defaults to ``'relu'``, the value that used to be
            hard-coded here, so existing four-argument calls are unchanged.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, nonlinearity='relu'):
        super(RNNModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # batch_first=True makes input/output tensors
        # (batch_dim, seq_dim, feature_dim)
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim,
                          batch_first=True, nonlinearity=nonlinearity)

        # Readout layer: last hidden state -> output_dim values
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to (batch, output_dim)."""
        # All-zero initial hidden state of shape (layer_dim, batch, hidden_dim),
        # allocated with x's device and dtype so the model keeps working after
        # .to(device) / .double(). It is a fresh constant that does not track
        # gradients, so no requires_grad_()/detach() is needed.
        h0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)

        # out holds the top layer's hidden state for every time step:
        # (batch, seq, hidden_dim), e.g. 100, 28, 100
        out, _ = self.rnn(x, h0)

        # Keep only the last time step: (batch, hidden_dim), e.g. 100, 100,
        # then read out to (batch, output_dim), e.g. 100, 10
        return self.fc(out[:, -1, :])

'''
STEP 4: INSTANTIATE MODEL CLASS
'''
input_dim = 28
hidden_dim = 100
layer_dim = 2  # ONLY CHANGE IS HERE FROM ONE LAYER TO TWO LAYER
output_dim = 10

model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim)

# JUST PRINTING MODEL & PARAMETERS
# Build the parameter list once instead of once per loop iteration.
params = list(model.parameters())
print(model)
print(len(params))
for param in params:
    print(param.size())

'''
STEP 5: INSTANTIATE LOSS CLASS
'''
criterion = nn.CrossEntropyLoss()

'''
STEP 6: INSTANTIATE OPTIMIZER CLASS
'''
learning_rate = 0.01

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

'''
STEP 7: TRAIN THE MODEL
'''

# Number of steps to unroll: the image is fed as seq_dim chunks of input_dim values
seq_dim = 28

# Counts parameter updates. Named `iteration` (not `iter`) so the built-in
# iter() is not shadowed.
iteration = 0
for epoch in range(num_epochs):
    for batch_images, batch_labels in train_loader:
        # The periodic evaluation below switches to eval mode; switch back
        # before every update.
        model.train()

        # Reshape the batch into sequences: (batch, seq_dim, input_dim).
        # Inputs do not need requires_grad; only the parameters are optimised.
        batch_images = batch_images.view(-1, seq_dim, input_dim)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        outputs = model(batch_images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, batch_labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iteration += 1

        if iteration % 500 == 0:
            model.eval()
            # Accuracy over the whole test set, accumulated as Python ints
            correct = 0
            total = 0
            # Evaluation needs no gradients, so skip building the autograd graph.
            with torch.no_grad():
                # Separate loop names keep the training batch above intact.
                for test_images, test_labels in test_loader:
                    # Resize images
                    test_images = test_images.view(-1, seq_dim, input_dim)

                    # Forward pass only to get logits/output
                    test_outputs = model(test_images)

                    # Predicted class = index of the largest logit
                    _, predicted = torch.max(test_outputs, 1)

                    # Total number of labels
                    total += test_labels.size(0)

                    # Total correct predictions
                    correct += (predicted == test_labels).sum().item()

            # Plain Python float, so it prints without float32 rounding noise
            accuracy = 100 * correct / total

            # Loss shown is that of the most recent training batch
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iteration, loss.item(), accuracy))

RNNModel(
  (rnn): RNN(28, 100, num_layers=2, batch_first=True)
  (fc): Linear(in_features=100, out_features=10, bias=True)
)
10
torch.Size([100, 28])
torch.Size([100, 100])
torch.Size([100])
torch.Size([100])
torch.Size([100, 100])
torch.Size([100, 100])
torch.Size([100])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])
Iteration: 500. Loss: 2.2966396808624268. Accuracy: 10.279999732971191
Iteration: 1000. Loss: 2.302166223526001. Accuracy: 11.479999542236328
Iteration: 1500. Loss: 2.2946512699127197. Accuracy: 15.5600004196167
Iteration: 2000. Loss: 2.2757158279418945. Accuracy: 18.469999313354492
Iteration: 2500. Loss: 2.1378917694091797. Accuracy: 21.81999969482422
Iteration: 3000. Loss: 1.1979039907455444. Accuracy: 64.70999908447266


<p style="font-family:ComicSansMS; font-size: 24px; color: magenta"> Model C: 2 Hidden Layer</p>

<p style="font-family:ComicSansMS; font-size: 16px; color: magenta"> Unroll 28 time steps</p>
<p style="font-family:ComicSansMS; font-size: 22px; color: magenta"> 2 Hidden layer</p>
<p style="font-family:ComicSansMS; font-size: 16px; color: yellow"> Tanh Activation Function</p>

> Steps    

In [None]:
# Step 1: Load Dataset
    # Step 2: Make Dataset Iterable
    # Step 3: Create Model Class
        # Step 4: Instantiate Model Class
    # Step 5: Instantiate Loss Class
    # Step 6: Instantiate Optimizer Class
    # Step 7: Train Model
# !!! "2 Hidden + Tanh"

In [27]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets

'''
STEP 1: LOADING DATASET
'''
# Training split, stored under ./data (downloaded if it is not there yet).
train_dataset = dsets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

# Test split, read from the same directory.
test_dataset = dsets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

'''
STEP 2: MAKING DATASET ITERABLE
'''

batch_size = 100
n_iters = 3000
# Epochs needed for about n_iters updates: iterations / (batches per epoch).
num_epochs = int(n_iters / (len(train_dataset) / batch_size))

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Test batches keep the dataset order.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

'''
STEP 3: CREATE MODEL CLASS
'''

class RNNModel(nn.Module):
    """Recurrent network (``nn.RNN``) with a linear readout of the last time step.

    Args:
        input_dim: Number of features fed in at every time step.
        hidden_dim: Size of the recurrent hidden state.
        layer_dim: Number of stacked recurrent layers.
        output_dim: Number of values produced by the readout layer.
        nonlinearity: Activation forwarded to ``nn.RNN`` (``'tanh'`` or
            ``'relu'``). Defaults to ``'tanh'``, the value that used to be
            hard-coded here, so existing four-argument calls are unchanged.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, nonlinearity='tanh'):
        super(RNNModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # batch_first=True makes input/output tensors
        # (batch_dim, seq_dim, feature_dim)
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim,
                          batch_first=True, nonlinearity=nonlinearity)

        # Readout layer: last hidden state -> output_dim values
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to (batch, output_dim)."""
        # All-zero initial hidden state of shape (layer_dim, batch, hidden_dim),
        # allocated with x's device and dtype so the model keeps working after
        # .to(device) / .double(). It is a fresh constant that does not track
        # gradients, so no requires_grad_()/detach() is needed.
        h0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)

        # out holds the top layer's hidden state for every time step:
        # (batch, seq, hidden_dim), e.g. 100, 28, 100
        out, _ = self.rnn(x, h0)

        # Keep only the last time step: (batch, hidden_dim), e.g. 100, 100,
        # then read out to (batch, output_dim), e.g. 100, 10
        return self.fc(out[:, -1, :])

'''
STEP 4: INSTANTIATE MODEL CLASS
'''
input_dim = 28
hidden_dim = 100
# Two stacked layers, as in Model B. What differs for Model C is the tanh
# nonlinearity in the model class and the larger learning rate in step 6.
layer_dim = 2
output_dim = 10

model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim)

# JUST PRINTING MODEL & PARAMETERS
# Build the parameter list once instead of once per loop iteration.
params = list(model.parameters())
print(model)
print(len(params))
for param in params:
    print(param.size())

'''
STEP 5: INSTANTIATE LOSS CLASS
'''
criterion = nn.CrossEntropyLoss()

'''
STEP 6: INSTANTIATE OPTIMIZER CLASS
'''
learning_rate = 0.1

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

'''
STEP 7: TRAIN THE MODEL
'''

# Number of steps to unroll: the image is fed as seq_dim chunks of input_dim values
seq_dim = 28

# Counts parameter updates. Named `iteration` (not `iter`) so the built-in
# iter() is not shadowed.
iteration = 0
for epoch in range(num_epochs):
    for batch_images, batch_labels in train_loader:
        # Explicit train/eval switching, matching the Model A and B loops.
        model.train()

        # Reshape the batch into sequences: (batch, seq_dim, input_dim).
        # Inputs do not need requires_grad; only the parameters are optimised.
        batch_images = batch_images.view(-1, seq_dim, input_dim)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        outputs = model(batch_images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, batch_labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iteration += 1

        if iteration % 500 == 0:
            model.eval()
            # Accuracy over the whole test set, accumulated as Python ints
            correct = 0
            total = 0
            # Evaluation needs no gradients, so skip building the autograd graph.
            with torch.no_grad():
                # Separate loop names keep the training batch above intact.
                for test_images, test_labels in test_loader:
                    # Resize images
                    test_images = test_images.view(-1, seq_dim, input_dim)

                    # Forward pass only to get logits/output
                    test_outputs = model(test_images)

                    # Predicted class = index of the largest logit
                    _, predicted = torch.max(test_outputs, 1)

                    # Total number of labels
                    total += test_labels.size(0)

                    # Total correct predictions
                    correct += (predicted == test_labels).sum().item()

            # Plain Python float, so it prints without float32 rounding noise
            accuracy = 100 * correct / total

            # Loss shown is that of the most recent training batch
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iteration, loss.item(), accuracy))

RNNModel(
  (rnn): RNN(28, 100, num_layers=2, batch_first=True)
  (fc): Linear(in_features=100, out_features=10, bias=True)
)
10
torch.Size([100, 28])
torch.Size([100, 100])
torch.Size([100])
torch.Size([100])
torch.Size([100, 100])
torch.Size([100, 100])
torch.Size([100])
torch.Size([100])
torch.Size([10, 100])
torch.Size([10])
Iteration: 500. Loss: 0.9490087032318115. Accuracy: 72.1500015258789
Iteration: 1000. Loss: 0.5472593903541565. Accuracy: 89.11000061035156
Iteration: 1500. Loss: 0.44565892219543457. Accuracy: 81.16999816894531
Iteration: 2000. Loss: 0.1718563288450241. Accuracy: 93.75
Iteration: 2500. Loss: 0.23098306357860565. Accuracy: 95.87000274658203
Iteration: 3000. Loss: 0.06793251633644104. Accuracy: 96.08999633789062


In [None]:
# Summary of Results 
# Model A	            Model B	            Model C
# ReLU	                ReLU	            Tanh
# 1 Hidden Layer	    2 Hidden Layers	    2 Hidden Layers
# 100 Hidden Units	    100 Hidden Units	100 Hidden Units
# 72.77%	            64.70%	            96.08%