# Code 3

Please find the notes at the end


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as f
from torchvision import datasets, transforms
from torchsummary import summary
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR

In [2]:
# Train data transformation

train_transforms = transforms.Compose([
                                       transforms.RandomRotation((-6.9, 6.9), fill=(1,)),
                                       transforms.RandomAffine(degrees=15), #translate=(0.1, 0.1), scale=(0.8, 1.2)),
                                       transforms.ToTensor(),
                                       transforms.Normalize(mean=(0.1307,), std=(0.3081,))
                                    ])

# Test data transformations

test_transforms = transforms.Compose([
                                      transforms.ToTensor(),
                                      transforms.Normalize(mean=(0.1307,), std=(0.3081,))
])

In [3]:
# download dataset and create train and test sets separately

train = datasets.MNIST(root = '../data', train = True, download = True, transform= train_transforms)

test = datasets.MNIST(root = '../data', train = False, download = True, transform=test_transforms)

In [4]:
# set seed values
torch.manual_seed(400)
dropout = 0.05

In [5]:
cuda = torch.cuda.is_available()

if cuda:
    torch.cuda.manual_seed(400)
    
device = torch.device("cuda" if cuda else "cpu")

In [6]:
dataloader_args = dict(shuffle=True, batch_size = 64, num_workers=2, pin_memory=True) if cuda else dict(shuffle=True, batch_size = 64)

In [7]:
#Dataloaders

train_loader = torch.utils.data.DataLoader(dataset=train, **dataloader_args)

test_loader = torch.utils.data.DataLoader(dataset=test, **dataloader_args)

In [8]:
# dataiter = iter(train_loader)
# images, labels = next(dataiter)
# plt.imshow(images[1].numpy().squeeze(), cmap='gray_r')

In [9]:
train_losses = []
train_accuracy = []

test_losses = []
test_accuracy = []

In [10]:
def train_eval_model(model, train_loader, optimizer, device, epochs=1, test=False, test_loader=None, scheduler=None):
          
    model.train() # set the train mode
    
    # iterate over for `epochs` epochs and keep storing valuable info

    for epoch in range(epochs):
      correct = processed = train_loss = 0
    #   if scheduler:
    #     scheduler.step()

      print(f"\n epoch num ================================= {epoch+1}")
      
      pbar = tqdm(train_loader)
      
      for batch_idx, (data, target) in enumerate(pbar):
          data, target = data.to(device), target.to(device) # move data to `device`

          optimizer.zero_grad() # zero out the gradients to avoid accumulating them over loops

          output = model(data) # get the model's predictions

          loss = f.nll_loss(output, target) # calculate Negative Log Likelihood loss using ground truth labels and the model's predictions

          train_loss += loss.item() # add up the train loss

          loss.backward() # boom ! The magic function to perform backpropagation and calculate the gradients

          optimizer.step() # take 1 step for the optimizer and update the weights

          pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability

          correct += pred.eq(target.view_as(pred)).sum().item() #  compare and see how many predictions are coorect and then add up the count

          processed += len(data) # total processed data size

      acc = 100 * correct/processed

      train_losses.append(train_loss)

      train_accuracy.append(acc)

      if scheduler:
        print("\n\n\t\t\tLast LR -->", scheduler.get_last_lr())
        scheduler.step()

      pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}')
      
      
      train_loss /= len(train_loader.dataset)
      print('\n\t\t\tTrain metrics: accuracy: {}/{} ({:.4f}%)'.format(correct,
                                                              len(train_loader.dataset),
                                                              correct * 100 / len(train_loader.dataset)))

      if test: # moving to evaluation
          model.eval() # set the correct mode
          
          correct = test_loss = 0

          with torch.no_grad(): # to disable gradient calculation with no_grad context
              
              for data, target in test_loader:

                  data, target = data.to(device), target.to(device)
                  
                  output = model(data)
                  
                  test_loss += f.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
                  
                  pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
                  
                  correct += pred.eq(target.view_as(pred)).sum().item()

          test_loss /= len(test_loader.dataset)
          test_losses.append(test_loss)
          test_accuracy.append(100. * correct / len(test_loader.dataset))

          print('\n\tTest metrics: average loss: {:.4f}, accuracy: {}/{} ({:.5f}%)\n'.format(
              test_loss, correct, len(test_loader.dataset),
              100. * correct / len(test_loader.dataset)))

In [11]:
class Network3(nn.Module):        
    def __init__(self):

        super().__init__()

        # Input Block
        self.convblock1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=12, kernel_size=(3, 3), padding=1, bias=False),
            nn.BatchNorm2d(12),
            nn.ReLU(),
            nn.Dropout(dropout)) #input = 28, output = 28, RF = 3

        # CONVOLUTION BLOCK 1
        self.convblock2 = nn.Sequential(
            nn.Conv2d(12, 16, 3, padding=0, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Dropout(dropout)) #input = 28, output = 26, RF = 5
        
        # TRANSITION BLOCK 1, let's have a mix of channels without extracting features here
        self.convblock3 = nn.Sequential(
            nn.Conv2d(16, 10, 1, padding=0, bias=False),
            nn.ReLU()) #input = 28, output = 26, RF = 5
        
        self.pool1 = nn.MaxPool2d(2, 2) #input = 26, output = 13, RF = 10

        
        self.convblock4 = nn.Sequential(
            nn.Conv2d(10, 16, 3, padding=0, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout(dropout)) #input = 13, output = 11, RF = 12

        
        self.convblock5 = nn.Sequential(
            nn.Conv2d(16, 16, 3, padding=0, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout(dropout)) #input = 11, output = 9, RF = 14
        
        # Transition via 1x1 to reduce params and allow selection of relevant channels for next 3x3 layer to extract features
        self.convblock6 = nn.Sequential(
            nn.Conv2d(16, 10, 1, padding=0, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(10),
            nn.Dropout(dropout)) #input = 9, output = 9, RF = 14

        self.convblock7 = nn.Sequential(
            nn.Conv2d(10, 16, 3, padding=0, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout(dropout)) #input = 9, output = 7, RF = 16
        
        self.avg = nn.AvgPool2d(7) # Average Pool layer to reduce dimensions and have a larger view for incoming dimensions to make a decision
        
        # Final layer with 1x1 to have 10 output channels
        self.convblock8 = nn.Sequential(
            nn.Conv2d(16, 10, 1, padding=0, bias=False)) 

    def forward(self, x):
        x = self.convblock1(x)
        x = self.convblock2(x)
        x = self.convblock3(x)
        x = self.pool1(x) 
        x = self.convblock4(x)
        x = self.convblock5(x)
        x = self.convblock6(x)
        x = self.convblock7(x)
        x = self.avg(x)
        x = self.convblock8(x)
        x = x.view(-1, 10)
        return f.log_softmax(x, dim=-1)

In [12]:
# check model summary
model = Network3().to(device)
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 12, 28, 28]             108
       BatchNorm2d-2           [-1, 12, 28, 28]              24
              ReLU-3           [-1, 12, 28, 28]               0
           Dropout-4           [-1, 12, 28, 28]               0
            Conv2d-5           [-1, 16, 26, 26]           1,728
       BatchNorm2d-6           [-1, 16, 26, 26]              32
              ReLU-7           [-1, 16, 26, 26]               0
           Dropout-8           [-1, 16, 26, 26]               0
            Conv2d-9           [-1, 10, 26, 26]             160
             ReLU-10           [-1, 10, 26, 26]               0
        MaxPool2d-11           [-1, 10, 13, 13]               0
           Conv2d-12           [-1, 16, 11, 11]           1,440
             ReLU-13           [-1, 16, 11, 11]               0
      BatchNorm2d-14           [-1, 16,

In [13]:
optimizer = optim.SGD(model.parameters(), lr=0.075,momentum=0.9, nesterov=True)
scheduler = StepLR(optimizer=optimizer, step_size=1, gamma=0.65)

train_eval_model(model, train_loader, optimizer, device, epochs=14, test=True, test_loader=test_loader,scheduler=scheduler)

  0%|          | 0/938 [00:00<?, ?it/s]




100%|██████████| 938/938 [00:27<00:00, 33.58it/s]



			Last LR --> [0.075]

			Train metrics: accuracy: 55526/60000 (92.5433%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0629, accuracy: 9811/10000 (98.11000%)




100%|██████████| 938/938 [00:27<00:00, 33.97it/s]



			Last LR --> [0.04875]

			Train metrics: accuracy: 58340/60000 (97.2333%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0720, accuracy: 9755/10000 (97.55000%)




100%|██████████| 938/938 [00:27<00:00, 33.71it/s]



			Last LR --> [0.0316875]

			Train metrics: accuracy: 58729/60000 (97.8817%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0371, accuracy: 9896/10000 (98.96000%)




100%|██████████| 938/938 [00:27<00:00, 33.51it/s]



			Last LR --> [0.020596875]

			Train metrics: accuracy: 58984/60000 (98.3067%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0378, accuracy: 9882/10000 (98.82000%)




100%|██████████| 938/938 [00:28<00:00, 33.41it/s]



			Last LR --> [0.013387968750000001]

			Train metrics: accuracy: 59102/60000 (98.5033%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0240, accuracy: 9931/10000 (99.31000%)




100%|██████████| 938/938 [00:28<00:00, 33.25it/s]



			Last LR --> [0.0087021796875]

			Train metrics: accuracy: 59217/60000 (98.6950%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0219, accuracy: 9942/10000 (99.42000%)




100%|██████████| 938/938 [00:28<00:00, 33.41it/s]



			Last LR --> [0.005656416796875001]

			Train metrics: accuracy: 59251/60000 (98.7517%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0211, accuracy: 9935/10000 (99.35000%)




100%|██████████| 938/938 [00:28<00:00, 33.32it/s]



			Last LR --> [0.003676670917968751]

			Train metrics: accuracy: 59266/60000 (98.7767%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0216, accuracy: 9936/10000 (99.36000%)




100%|██████████| 938/938 [00:28<00:00, 33.34it/s]



			Last LR --> [0.0023898360966796883]

			Train metrics: accuracy: 59324/60000 (98.8733%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0211, accuracy: 9937/10000 (99.37000%)




100%|██████████| 938/938 [00:28<00:00, 33.29it/s]



			Last LR --> [0.0015533934628417976]

			Train metrics: accuracy: 59313/60000 (98.8550%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0202, accuracy: 9942/10000 (99.42000%)




100%|██████████| 938/938 [00:28<00:00, 33.45it/s]



			Last LR --> [0.0010097057508471684]

			Train metrics: accuracy: 59337/60000 (98.8950%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0203, accuracy: 9944/10000 (99.44000%)




100%|██████████| 938/938 [00:28<00:00, 33.03it/s]



			Last LR --> [0.0006563087380506594]

			Train metrics: accuracy: 59336/60000 (98.8933%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0197, accuracy: 9945/10000 (99.45000%)




100%|██████████| 938/938 [00:27<00:00, 33.59it/s]



			Last LR --> [0.00042660067973292865]

			Train metrics: accuracy: 59334/60000 (98.8900%)



  0%|          | 0/938 [00:00<?, ?it/s]


	Test metrics: average loss: 0.0201, accuracy: 9945/10000 (99.45000%)




100%|██████████| 938/938 [00:28<00:00, 33.36it/s]



			Last LR --> [0.0002772904418264036]

			Train metrics: accuracy: 59354/60000 (98.9233%)






	Test metrics: average loss: 0.0198, accuracy: 9945/10000 (99.45000%)



# Code 3 Notes:

## Target:
To have test accuracy consistently be above 99.40% (at least last 4 epochs), push the learning capacity of the model further while ensuring the gap between the two accuracies stays close.

## Result:
Test accuracy crosses 99% and has consistency in last epochs with train accuracy being below 99% mark, thus having some more learning tendency, perhaps more epochs and experiments with LR could train accuracy more.
Final train accuracy: 98.92%
Final test accuracy : 99.45% 
>  trained and tested for 14 epochs


## Analysis:
Dropout (along with Augmentation) definitely played it's role in ensuring that the gap between two accuracies lowers down, resulting in lower train accuracy and allowing network to be pushed further along with test accuracy. Augmentation and Dropout have not only avoided over-fitting, they have resulted in very small level of under-fitting as model could definitely be pushed further as per train accuracy values.

StepLR is another main concept implemented in code 3 and thus allowed to decrease LR with increasing epochs. Had to experiment with multiple combinations of LR and Gamma though.

Train accuracy is still not high as expected. 

Further experiments with augmentation can be done. Dropout value has to be fixed in this case but it's position can be experimented with further.

