# Code 2

The notes for this experiment (target, result and analysis) are at the end of the notebook.

![](https://1.bp.blogspot.com/-HGCac9oDqdI/XOxeOCB0E0I/AAAAAAAAQl4/zkGtCTlFUbIvg3PA_q2csMxsUgH1sQBuQCLcBGAs/s1600/IMG_2294.JPG)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as f
from torchvision import datasets, transforms
from torchsummary import summary
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm

In [None]:
# Training pipeline: a small random rotation (within +/- 6.9 degrees, with the
# area outside the rotated image filled with the value 1), then tensor
# conversion and normalisation.
# NOTE(review): 0.1307 / 0.3081 are presumably the MNIST pixel mean / std - confirm.
train_transforms = transforms.Compose(
    [
        transforms.RandomRotation((-6.9, 6.9), fill=(1,)),
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

# Test pipeline: no augmentation, only tensor conversion and the same normalisation.
test_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

In [None]:
# Build the MNIST train and test splits under ../data (downloaded when missing),
# each wired to its own transform pipeline.
train = datasets.MNIST(
    '../data',
    train=True,
    download=True,
    transform=train_transforms,
)
test = datasets.MNIST(
    '../data',
    train=False,
    download=True,
    transform=test_transforms,
)

In [None]:
# Dropout probability shared by every nn.Dropout layer of the network.
dropout = 0.05

# Seed the global PyTorch RNG so runs are repeatable.
torch.manual_seed(400)

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

# The CUDA RNG is seeded too, with the same seed as the CPU generator.
if cuda:
    torch.cuda.manual_seed(400)

In [None]:
# Keyword arguments shared by both DataLoaders; worker processes and pinned
# memory are only requested when running on a GPU.
dataloader_args = dict(shuffle=True, batch_size=64)
if cuda:
    dataloader_args.update(num_workers=2, pin_memory=True)

In [None]:
# Batch iterators over the two splits, both configured from `dataloader_args`.
train_loader = torch.utils.data.DataLoader(train, **dataloader_args)
test_loader = torch.utils.data.DataLoader(test, **dataloader_args)

In [None]:
# dataiter = iter(train_loader)
# images, labels = next(dataiter)
# plt.imshow(images[1].numpy().squeeze(), cmap='gray_r')

In [None]:
# Per-epoch metric histories, filled in by train_eval_model().
train_losses, train_accuracy = [], []
test_losses, test_accuracy = [], []

In [None]:
def train_eval_model(model, train_loader, optimizer, device, epochs=1, test=False, test_loader=None):
    """Train `model` for `epochs` epochs, optionally evaluating after each one.

    Per-epoch metrics are appended to the module-level lists `train_losses`,
    `train_accuracy`, `test_losses` and `test_accuracy`.

    Args:
        model: network to optimise. It is expected to return log-probabilities,
            since the loss is computed with `f.nll_loss`.
        train_loader: iterable of `(data, target)` training batches.
        optimizer: optimizer already bound to the model's parameters.
        device: device every batch is moved to before the forward pass.
        epochs: number of passes over `train_loader`.
        test: when true, evaluate on `test_loader` after every epoch.
        test_loader: iterable of `(data, target)` evaluation batches; required
            when `test` is true.

    Raises:
        ValueError: if `test` is true but `test_loader` is None.
    """
    if test and test_loader is None:
        raise ValueError("test_loader must be provided when test=True")

    for epoch in range(epochs):
        # Re-enter train mode at the start of EVERY epoch: the evaluation pass
        # below switches the model to eval mode, and without this reset all
        # epochs after the first would train with Dropout disabled and
        # BatchNorm running statistics frozen.
        model.train()

        correct = processed = 0
        train_loss = 0.0

        print(f"\n epoch num ================================= {epoch+1}")

        pbar = tqdm(train_loader)

        for batch_idx, (data, target) in enumerate(pbar):
            data, target = data.to(device), target.to(device)  # move batch to `device`

            optimizer.zero_grad()  # gradients accumulate by default, so clear them first

            output = model(data)  # forward pass

            loss = f.nll_loss(output, target)  # mean negative log-likelihood over the batch

            loss.backward()  # backpropagate

            optimizer.step()  # update the weights

            batch_size = len(data)
            batch_loss = loss.item()

            # `loss` is a batch mean; weight it by the batch size so the epoch
            # total is a per-sample sum (a smaller last batch is handled correctly).
            train_loss += batch_loss * batch_size

            pred = output.argmax(dim=1, keepdim=True)  # index of the max log-probability

            correct += pred.eq(target.view_as(pred)).sum().item()  # count correct predictions

            processed += batch_size  # samples seen so far in this epoch

            # Update the bar while it is still running so the text is actually shown.
            pbar.set_description(desc=f'loss={batch_loss} batch_id={batch_idx}')

        # Guard against an empty loader so the averages below cannot divide by zero.
        denominator = max(processed, 1)

        acc = 100 * correct / denominator

        # Store the average per-sample loss, on the same scale as `test_losses`.
        train_loss /= denominator

        train_losses.append(train_loss)

        train_accuracy.append(acc)

        print('Train metrics: accuracy: {}/{} ({:.4f}%)'.format(correct, processed, acc))

        if test:  # evaluation pass
            model.eval()  # inference behaviour for Dropout / BatchNorm

            correct = test_loss = 0

            with torch.no_grad():  # no gradients are needed for evaluation

                for data, target in test_loader:

                    data, target = data.to(device), target.to(device)

                    output = model(data)

                    test_loss += f.nll_loss(output, target, reduction='sum').item()  # sum up batch loss

                    pred = output.argmax(dim=1, keepdim=True)  # index of the max log-probability

                    correct += pred.eq(target.view_as(pred)).sum().item()

            test_loss /= len(test_loader.dataset)
            test_losses.append(test_loss)
            test_accuracy.append(100. * correct / len(test_loader.dataset))

            print('Test metrics: average loss: {:.4f}, accuracy: {}/{} ({:.5f}%)\n'.format(
                test_loss, correct, len(test_loader.dataset),
                100. * correct / len(test_loader.dataset)))

In [None]:
class Network_2(nn.Module):
    """Small fully-convolutional classifier that returns 10 log-probabilities per image.

    The shape and receptive-field (RF) comments below assume a single-channel
    28x28 input, which is also what the final `view(-1, 10)` relies on.

    Args:
        dropout_rate: probability used by every `nn.Dropout` layer. When None
            (the default) the module-level `dropout` value is used, which is
            the original behaviour.
    """

    def __init__(self, dropout_rate=None):

        super().__init__()

        # Fall back to the notebook-level setting so `Network_2()` keeps working unchanged.
        if dropout_rate is None:
            dropout_rate = dropout

        # Input block
        self.convblock1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=12, kernel_size=(3, 3), padding=1, bias=False),
            nn.BatchNorm2d(12),
            nn.ReLU(),
            nn.Dropout(dropout_rate))  # input = 28, output = 28, RF = 3

        # Convolution block 1
        self.convblock2 = nn.Sequential(
            nn.Conv2d(12, 16, 3, padding=0, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Dropout(dropout_rate))  # input = 28, output = 26, RF = 5

        # Transition block 1: a 1x1 convolution mixes channels without growing the RF
        self.convblock3 = nn.Sequential(
            nn.Conv2d(16, 10, 1, padding=0, bias=False),
            nn.ReLU())  # input = 26, output = 26, RF = 5

        self.pool1 = nn.MaxPool2d(2, 2)  # input = 26, output = 13, RF = 6

        # From here on the blocks apply ReLU before BatchNorm; the order is kept
        # as-is so the Sequential indices (and state_dict keys) do not change.
        self.convblock4 = nn.Sequential(
            nn.Conv2d(10, 16, 3, padding=0, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout(dropout_rate))  # input = 13, output = 11, RF = 10

        self.convblock5 = nn.Sequential(
            nn.Conv2d(16, 16, 3, padding=0, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout(dropout_rate))  # input = 11, output = 9, RF = 14

        # Transition via 1x1 to reduce params and let the next 3x3 layer pick relevant channels
        self.convblock6 = nn.Sequential(
            nn.Conv2d(16, 10, 1, padding=0, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(10),
            nn.Dropout(dropout_rate))  # input = 9, output = 9, RF = 14

        self.convblock7 = nn.Sequential(
            nn.Conv2d(10, 16, 3, padding=0, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout(dropout_rate))  # input = 9, output = 7, RF = 18

        # Average over the remaining 7x7 map so each channel becomes a single value
        self.avg = nn.AvgPool2d(7)  # input = 7, output = 1

        # Final 1x1 convolution producing the 10 class channels
        self.convblock8 = nn.Sequential(
            nn.Conv2d(16, 10, 1, padding=0, bias=False))

    def forward(self, x):
        """Return log-softmax class scores with 10 columns for the batch `x`."""
        x = self.convblock1(x)
        x = self.convblock2(x)
        x = self.convblock3(x)
        x = self.pool1(x)
        x = self.convblock4(x)
        x = self.convblock5(x)
        x = self.convblock6(x)
        x = self.convblock7(x)
        x = self.avg(x)
        x = self.convblock8(x)
        x = x.view(-1, 10)  # flatten the per-image 10x1x1 output to 10 scores
        return f.log_softmax(x, dim=-1)

In [None]:
# Build the network, move it to the selected device and print a per-layer
# summary for a single-channel 28x28 input.
model = Network_2()
model = model.to(device)
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 12, 28, 28]             108
       BatchNorm2d-2           [-1, 12, 28, 28]              24
              ReLU-3           [-1, 12, 28, 28]               0
           Dropout-4           [-1, 12, 28, 28]               0
            Conv2d-5           [-1, 16, 26, 26]           1,728
       BatchNorm2d-6           [-1, 16, 26, 26]              32
              ReLU-7           [-1, 16, 26, 26]               0
           Dropout-8           [-1, 16, 26, 26]               0
            Conv2d-9           [-1, 10, 26, 26]             160
             ReLU-10           [-1, 10, 26, 26]               0
        MaxPool2d-11           [-1, 10, 13, 13]               0
           Conv2d-12           [-1, 16, 11, 11]           1,440
             ReLU-13           [-1, 16, 11, 11]               0
      BatchNorm2d-14           [-1, 16,

In [None]:
# SGD with momentum over all of the model's parameters.
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Train for 14 epochs, evaluating on the test loader after each one.
train_eval_model(
    model,
    train_loader,
    optimizer,
    device,
    epochs=14,
    test=True,
    test_loader=test_loader,
)

  0%|          | 0/938 [00:00<?, ?it/s]




100%|██████████| 938/938 [00:25<00:00, 37.34it/s]

Train metrics: accuracy: 52391/60000 (87.3183%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.1021, accuracy: 9687/10000 (96.87000%)




100%|██████████| 938/938 [00:24<00:00, 38.55it/s]

Train metrics: accuracy: 58002/60000 (96.6700%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0570, accuracy: 9818/10000 (98.18000%)




100%|██████████| 938/938 [00:23<00:00, 39.52it/s]

Train metrics: accuracy: 58570/60000 (97.6167%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0399, accuracy: 9877/10000 (98.77000%)




100%|██████████| 938/938 [00:24<00:00, 39.07it/s]

Train metrics: accuracy: 58874/60000 (98.1233%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0489, accuracy: 9848/10000 (98.48000%)




100%|██████████| 938/938 [00:23<00:00, 39.13it/s]

Train metrics: accuracy: 58991/60000 (98.3183%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0340, accuracy: 9889/10000 (98.89000%)




100%|██████████| 938/938 [00:23<00:00, 39.81it/s]

Train metrics: accuracy: 59101/60000 (98.5017%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0366, accuracy: 9883/10000 (98.83000%)




100%|██████████| 938/938 [00:23<00:00, 39.88it/s]

Train metrics: accuracy: 59210/60000 (98.6833%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0355, accuracy: 9893/10000 (98.93000%)




100%|██████████| 938/938 [00:23<00:00, 39.90it/s]

Train metrics: accuracy: 59253/60000 (98.7550%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0485, accuracy: 9863/10000 (98.63000%)




100%|██████████| 938/938 [00:23<00:00, 39.84it/s]

Train metrics: accuracy: 59263/60000 (98.7717%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0288, accuracy: 9910/10000 (99.10000%)




100%|██████████| 938/938 [00:23<00:00, 39.51it/s]

Train metrics: accuracy: 59316/60000 (98.8600%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0308, accuracy: 9907/10000 (99.07000%)




100%|██████████| 938/938 [00:23<00:00, 39.71it/s]

Train metrics: accuracy: 59345/60000 (98.9083%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0287, accuracy: 9915/10000 (99.15000%)




100%|██████████| 938/938 [00:23<00:00, 39.52it/s]

Train metrics: accuracy: 59350/60000 (98.9167%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0278, accuracy: 9904/10000 (99.04000%)




100%|██████████| 938/938 [00:23<00:00, 39.99it/s]

Train metrics: accuracy: 59400/60000 (99.0000%)



  0%|          | 0/938 [00:00<?, ?it/s]

Test metrics: average loss: 0.0258, accuracy: 9917/10000 (99.17000%)




100%|██████████| 938/938 [00:23<00:00, 40.15it/s]

Train metrics: accuracy: 59392/60000 (98.9867%)





Test metrics: average loss: 0.0289, accuracy: 9912/10000 (99.12000%)



# Code 2 Notes:

## Target:
Reduce the gap between train and test accuracy and push the model towards the 99.4% target. Dropout is used to reduce over-fitting, and image augmentation is used to expose the network to more variation so that it learns better.

## Result:
Test accuracy crosses 99% and stays consistent over the last epochs, while train accuracy comes down, which narrows the gap between the two.
Final train accuracy: 98.98%
Final test accuracy : 99.12% 
> when trained and tested for 14 epochs


## Analysis:
The results are still not great; more advanced approaches are needed to increase the test accuracy further.

Important point to note is that the gap between the two accuracies has reduced a lot in comparison to previous code.
