In [None]:
# Install torchsummary
!pip install torchsummary

# Import libraries and methods
from torchsummary import summary
from torchvision import datasets, transforms
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Pick the compute device once, up front: CUDA when PyTorch can see a GPU,
# otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
# Leave the device as the cell's last expression so the notebook displays it
device

device(type='cuda')

In [None]:
# Number of samples propagated through the network per optimisation step.
# Tune it by watching resource utilisation (CPU, GPU, RAM) during training.
batch_size = 128

# Preprocessing shared by both splits, defined once so the train and test
# pipelines cannot drift apart:
#   1. convert the image to a tensor
#   2. normalise the single channel with the given mean / standard deviation
#      (0.1307 / 0.3081 -- presumably the MNIST training-set statistics; TODO confirm)
mnist_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

# The loaders serve the data in batches
# https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        "../data",
        train=True,
        download=True,
        transform=mnist_transform,
    ),
    batch_size=batch_size,
    shuffle=True,
)

# Test loader: same preprocessing, but train=False selects the held-out split.
# download=True lets this statement work on its own instead of relying on the
# training split having fetched the files first.
# Shuffling is disabled because the evaluation metrics are sums over the whole
# split and therefore do not depend on the order of the samples.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        "../data",
        train=False,
        download=True,
        transform=mnist_transform,
    ),
    batch_size=batch_size,
    shuffle=False,
)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 210868674.02it/s]

Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 14361078.11it/s]


Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 71610870.16it/s]

Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz





Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 20550732.22it/s]


Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw



# Some Notes on our naive model

We are going to write a network based on what we have learnt so far.

The input image is 28x28x1. We will keep adding layers until the receptive field (RF) reaches at least 32.


In [None]:
# Create a class to create the network
# All the convolutions used in the network are 3x3 kernels
class FirstDNN(nn.Module):
    """Naive all-convolutional classifier for 1x28x28 images with 10 classes.

    Seven 3x3 convolutions and two 2x2 max-pooling layers shrink the 28x28
    input down to a 1x1 map with 10 channels, which is flattened and turned
    into per-class log-probabilities.

    Notation used in the per-layer comments below:
        r_in  - input receptive field
        n_in  - number of input features (spatial size)
        j_in  - input jump / representation power of pixels, 1 on the first layer
        s     - stride
        r_out - output receptive field, r_in + (k - 1) * j_in
        n_out - number of output features, ((n_in + 2*p - k) / s) + 1
        j_out - output jump, j_in * s
    """

    def __init__(self):
        # Call the constructor of the parent class: nn.Module
        super(FirstDNN, self).__init__()

        # First layer: convolution
        # r_in:1, n_in:28, j_in:1, s:1, r_out:3, n_out:28, j_out:1
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)

        # Second layer: convolution
        # r_in:3 , n_in:28 , j_in:1 , s:1 , r_out:5 , n_out:28 , j_out:1
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)

        # Third layer: max pooling
        # r_in:5 , n_in:28 , j_in:1 , s:2 , r_out:6 , n_out:14 , j_out:2
        self.pool1 = nn.MaxPool2d(2, 2)

        # Fourth layer: convolution
        # r_in:6 , n_in:14 , j_in:2 , s:1 , r_out:10 , n_out:14 , j_out:2
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)

        # Fifth layer: convolution
        # r_in:10 , n_in:14 , j_in:2 , s:1 , r_out:14 , n_out:14 , j_out:2
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1)

        # Sixth layer: max pooling
        # r_in:14 , n_in:14 , j_in:2 , s:2 , r_out:16 , n_out:7 , j_out:4
        self.pool2 = nn.MaxPool2d(2, 2)

        # Seventh layer: convolution (no padding, so the map shrinks)
        # r_in:16 , n_in:7 , j_in:4 , s:1 , r_out:24 , n_out:5 , j_out:4
        self.conv5 = nn.Conv2d(256, 512, 3)

        # Eighth layer: convolution
        # r_in:24 , n_in:5 , j_in:4 , s:1 , r_out:32 , n_out:3 , j_out:4
        self.conv6 = nn.Conv2d(512, 1024, 3)

        # Ninth layer: convolution producing one channel per class
        # r_in:32 , n_in:3 , j_in:4 , s:1 , r_out:40 , n_out:1 , j_out:4
        self.conv7 = nn.Conv2d(1024, 10, 3)

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: batch of images; the layer comments and the model summary
                assume shape (N, 1, 28, 28).

        Returns:
            Tensor of shape (N, 10) holding log-probabilities per class,
            suitable for ``F.nll_loss``.
        """
        # Block 1: conv1 -> relu -> conv2 -> relu -> max pool
        x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x)))))
        # Block 2: conv3 -> relu -> conv4 -> relu -> max pool
        x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x)))))
        # Block 3: conv5 -> relu -> conv6 -> relu
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        # Final convolution yields the raw class scores (logits).
        # Deliberately NO ReLU here: clamping the logits to be non-negative
        # right before the softmax prevents the network from pushing a class
        # score below zero and zeroes the gradient of every clamped logit.
        x = self.conv7(x)
        # Flatten the (N, 10, 1, 1) map to (N, 10)
        x = x.view(-1, 10)
        # Log-softmax over the class dimension; dim is given explicitly because
        # the implicit-dimension form is deprecated and emits a warning.
        return F.log_softmax(x, dim=1)

In [None]:
# Build the network, then move its parameters onto the selected device
# https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.to
model = FirstDNN()
model = model.to(device)

In [None]:
# Show a layer-by-layer summary (output shapes and parameter counts)
# for a single-channel 28x28 input
input_shape = (1, 28, 28)
summary(model, input_size=input_shape)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 28, 28]             320
            Conv2d-2           [-1, 64, 28, 28]          18,496
         MaxPool2d-3           [-1, 64, 14, 14]               0
            Conv2d-4          [-1, 128, 14, 14]          73,856
            Conv2d-5          [-1, 256, 14, 14]         295,168
         MaxPool2d-6            [-1, 256, 7, 7]               0
            Conv2d-7            [-1, 512, 5, 5]       1,180,160
            Conv2d-8           [-1, 1024, 3, 3]       4,719,616
            Conv2d-9             [-1, 10, 1, 1]          92,170
Total params: 6,379,786
Trainable params: 6,379,786
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 1.51
Params size (MB): 24.34
Estimated Total Size (MB): 25.85
-------------------------------------

  return F.log_softmax(x)


In [None]:
# Defining functions to train and test the network


# Function to train the network given the model, device, train loader, optimizer and epoch
def train(model, device, train_loader, optimizer, epoch):
    """Run one pass over ``train_loader`` and update ``model`` in place.

    Args:
        model: network whose output is fed to ``F.nll_loss``.
        device: device the batches are moved to before the forward pass.
        train_loader: iterable yielding ``(data, target)`` batches.
        optimizer: optimizer holding the model's parameters.
        epoch: accepted for the caller's convenience; not used in the body.
    """
    # Switch on training-mode behaviour of the model
    model.train()

    # Wrap the loader so tqdm can render a progress bar
    progress = tqdm(train_loader)

    for batch_idx, batch in enumerate(progress):
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Drop the gradients accumulated during the previous step
        # https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html
        optimizer.zero_grad()

        # Forward pass, then negative log likelihood loss against the labels
        # https://pytorch.org/docs/stable/generated/torch.nn.functional.nll_loss.html
        loss = F.nll_loss(model(inputs), labels)

        # Backpropagate to get gradients, then let the optimizer apply them
        loss.backward()
        optimizer.step()

        # Show the latest batch loss next to the progress bar
        progress.set_description(desc=f"loss={loss.item()} batch_id={batch_idx}")


# Function to test the network given the model, device and test loader
def test(model, device, test_loader):
    # Set the model to evaluation mode
    model.eval()

    # Define variables to keep track of the test loss and the number of correct predictions
    test_loss = 0
    correct = 0

    # Tell PyTorch not to calculate gradients by using the no_grad() context manager
    with torch.no_grad():
        # Iterate over the test data
        for data, target in test_loader:
            # Send the input and target to the device
            data, target = data.to(device), target.to(device)
            # Pass the input through the model
            output = model(data)

            # Calculate the negative log likelihood loss for the batch by comparing the model's output to the target
            # Sum up batch loss
            test_loss += F.nll_loss(output, target, reduction="sum").item()

            # Get the index of the max log-probability
            # Out of the 10 output values, find the index of the one with the highest value
            pred = output.argmax(dim=1, keepdim=True)

            # If the prediction is correct by comparing to target with same dimension, increment the correct counter
            correct += pred.eq(target.view_as(pred)).sum().item()

    # Divide the test loss by the number of examples in the test set to get the average loss
    test_loss /= len(test_loader.dataset)

    # Print the average loss and the accuracy for the test set
    print(
        "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            test_loss,
            correct,
            len(test_loader.dataset),
            100.0 * correct / len(test_loader.dataset),
        )
    )

In [None]:
# Hyper-parameters for this run
# LEARNING_RATE controls the step size of the optimizer
# MOMENTUM controls how much the previous step affects the current step
LEARNING_RATE = 0.01
MOMENTUM = 0.9
NUM_EPOCHS = 1

# Stochastic gradient descent over all model parameters
# https://pytorch.org/docs/stable/generated/torch.optim.SGD.html
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

# Train, then evaluate, once per epoch (epochs are numbered from 1)
for epoch in range(1, NUM_EPOCHS + 1):
    print(f"Epoch: {epoch}")
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

Epoch: 1


  return F.log_softmax(x)
loss=0.8639147281646729 batch_id=468: 100%|██████████| 469/469 [00:35<00:00, 13.11it/s]



Test set: Average loss: 0.9417, Accuracy: 6066/10000 (61%)

