### MNIST Classifier model with < 10K parameters

In this notebook, a conv net is trained on the MNIST dataset. The model is loosely based on **LeNet** with some modern enhancements.

The Model has **<10K parameters** and achieves **>99.4% accuracy** on the MNIST test set.<br>

#### Key techniques used:
*   Training convergence and stability improvements
  *   Batch Normalization
  *   Adam Optimizer

*   Regularization techniques
  *   Data Augmentation
  *   Weight Decay
<br>

In [1]:
import random
import time

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchvision
from torchvision import datasets, transforms

In [2]:
def get_size(img_shape, filters):
  """ Calculate the spatial (height, width) shape of a conv net's output.

  Args:
    img_shape: shape of the input as a tensor (or anything accepted by
      `torch.as_tensor`, e.g. a tuple or `torch.Size`). Only the last two
      entries are used.
    filters: iterable of layer specs, applied in order. Each spec is either
      ('conv', kernel_size, stride, padding) or ('pool', kernel_size).
      Specs with any other tag are ignored.

  Returns:
    A float numpy array holding the two output sizes.
  """
  # Work on a float copy: the caller's tensor is never modified and integer
  # inputs do not trip over true division.
  out_shape = torch.as_tensor(img_shape)[-2:].clone().float()
  for f in filters:
    if f[0] == 'pool':
      # pooling with stride == kernel size and no padding rounds down,
      # so an odd size loses its last row/column
      out_shape = torch.floor(out_shape / f[1])
    elif f[0] == 'conv':
      k, s, p = f[1:]
      # standard conv output size: floor((in + 2p - (k-1) - 1) / s) + 1
      out_shape = torch.floor((out_shape + 2*p - (k-1) - 1)/s) + 1
  return out_shape.cpu().numpy()

In [3]:
class Model(nn.Module):
  """ Small LeNet-style conv net.

  Five unpadded convolutions (5x5, 5x5, 3x3, 3x3, 3x3) with two 2x2 max-pools,
  followed by a fully-connected head. Batch norm follows conv_2, conv_4,
  conv_5 and every hidden FC layer; the layers that feed a batch norm are
  created without a bias.
  """

  def __init__(self, input_shape, out_dim, fc_layers=None):
    """
    Args:
      input_shape: shape of one input, convertible with `torch.tensor`. Only
        the last two entries (height, width) are read; the conv stack itself
        is fixed to a single input channel.
      out_dim: number of outputs of the final linear layer.
      fc_layers: widths of the hidden fully-connected layers, in order.
        Defaults to a single hidden layer of 100 units.
    """
    super().__init__()

    # `None` sentinel instead of a mutable list default shared between calls
    if fc_layers is None:
      fc_layers = [100]

    # details of the conv net layers; must mirror the conv/pool modules
    # created below, since it is only used to derive the flattened size
    lyrs = [('conv', 5, 1, 0),
            ('conv', 5, 1, 0),
            ('pool', 2),
            ('conv', 3, 1, 0),
            ('conv', 3, 1, 0),
            ('pool', 2),
            ('conv', 3, 1, 0),]

    # initialize the conv layers (creation order is kept stable so seeded
    # weight initialisation and state-dict keys do not change)
    self.conv_1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0, bias=True)
    self.conv_2 = nn.Conv2d(6, 6, kernel_size=5, stride=1, padding=0, bias=False)
    self.bn_1 = nn.BatchNorm2d(6, affine=True)
    self.conv_3 = nn.Conv2d(6, 16, kernel_size=3, stride=1, padding=0, bias=True)
    self.conv_4 = nn.Conv2d(16, 16, kernel_size=3, stride=1, padding=0, bias=False)
    self.bn_2 = nn.BatchNorm2d(16, affine=True)

    self.conv_5 = nn.Conv2d(16, 16, kernel_size=3, stride=1, padding=0, bias=False)
    self.bn_3 = nn.BatchNorm2d(16, affine=True)

    self.pool = nn.MaxPool2d(kernel_size=2)

    t_out = get_size(torch.tensor(input_shape), lyrs)
    # flattened feature count = out_height * out_width * 16 channels of conv_5
    self._conv_out_dim = int(np.prod(t_out)*16)

    # initialize the fully-connected layers
    fc = []
    inp = self._conv_out_dim

    for lyr_dim in fc_layers:
      fc.append(nn.Linear(inp, lyr_dim, bias=False))
      fc.append(nn.BatchNorm1d(lyr_dim, affine=True))
      fc.append(nn.ReLU())
      inp = lyr_dim
    fc.append(nn.Linear(inp, out_dim))
    self._fc = nn.Sequential(*fc)

    self.parameter_count()

  def parameter_count(self):
    """ Print and return the number of parameters in the model """
    num_params = sum(params.numel() for params in self.parameters())
    print(f'Model has {num_params} parameters...')
    return num_params

  def forward(self, x):
    """ Map a batch of single-channel images to `out_dim` outputs per sample """
    x = F.relu(self.conv_1(x))
    x = self.pool(F.relu(self.bn_1(self.conv_2(x))))
    x = F.relu(self.conv_3(x))
    x = self.pool(F.relu(self.bn_2(self.conv_4(x))))
    x = F.relu(self.bn_3(self.conv_5(x)))
    # Keep the batch dimension fixed: an unexpected spatial size now raises
    # instead of being silently folded into a larger batch.
    x = x.view(x.size(0), self._conv_out_dim)
    x = self._fc(x)
    return x

In [4]:
# major hyperparameters

# reproducibility
seed = 20

# optimisation
lr = 0.002
weight_decay = 0.001
batch_size = 64
num_epochs = 40

# use the GPU when one is visible to torch, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [5]:
# set seed
# Seed every RNG the notebook may draw from, not just torch's:
# NOTE(review): depending on the installed torchvision version the random
# transforms may use Python's `random` rather than torch's generator.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# covers every visible GPU (not only the current one); ignored without CUDA
torch.cuda.manual_seed_all(seed)

In [6]:
# data preprocessing and augmentation

# random affine jitter (rotation / shift / zoom), applied to 30% of the samples
affine_jitter = transforms.RandomAffine(10, scale=(0.95, 1.05), translate=(0.1, 0.1))
da_transforms = transforms.RandomApply([affine_jitter], p=0.3)

# training: augmentation first, then tensor conversion and normalisation
train_steps = [
    da_transforms,
    transforms.ToTensor(),
    transforms.Normalize((0.5, ), (0.5, )),
]
train_transform = transforms.Compose(train_steps)

# testing: same conversion and normalisation, no augmentation
test_steps = [
    transforms.ToTensor(),
    transforms.Normalize((0.5, ), (0.5, )),
]
test_transform = transforms.Compose(test_steps)

In [7]:
# setup the train/test data loaders

trainset = datasets.MNIST(root='./data', train=True,
                          download=True, transform=train_transform)
testset = datasets.MNIST(root='./data', train=False,
                         download=True, transform=test_transform)

# training batches are reshuffled every epoch; the trailing partial batch is dropped
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, drop_last=True)
# accuracy does not depend on sample order, so the test set is read in a
# fixed order (this also avoids drawing from the global RNG during evaluation)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, drop_last=False)

In [8]:
# setup model, loss, and optimizer

# Model only reads the last two entries (height, width) of the shape, so pass
# exactly those instead of a slice that also carries the dataset size.
network = Model(trainset.data.shape[-2:], len(trainset.classes))
network.to(device)
print(f'Using {device}...')

criterion = nn.CrossEntropyLoss(reduction='mean')

optimizer = optim.Adam(network.parameters(), lr=lr, weight_decay=weight_decay, eps=1e-05)
# multiply the learning rate by 0.2 every 10 scheduler steps; the deprecated
# `verbose` argument is omitted (False was its default)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.2)

Model has 9430 parameters...
Using cuda...


In [9]:
def train(net, criterion, optimizer, scheduler=None, num_epochs=30, train_set=trainloader, test_set=testloader):
  """
  Trains the provided model for specified number of epochs

  Args:
    net: the model to optimise (updated in place).
    criterion: loss called as `criterion(outputs, labels)`.
    optimizer: optimizer holding `net`'s parameters.
    scheduler: optional LR scheduler, stepped once at the end of every epoch.
    num_epochs: number of passes over `train_set`.
    train_set: iterable of batches; element 0 is the inputs, element 1 the labels.
    test_set: iterable of batches used for the periodic accuracy report.

  Returns:
    (loss_hist, time_hist): per-epoch mean training loss (rounded to 4
    decimals) and per-epoch wall-clock seconds (rounded to 2 decimals).
  """
  loss_hist = []
  time_hist = []

  # make sure batch norm is in training mode even if the caller evaluated
  # the model beforehand
  net.train()

  for epoch in range(1, num_epochs+1):
    batch_losses = []
    start_time = time.time()

    for data in train_set:
      inputs, labels = data[0].to(device), data[1].to(device)
      optimizer.zero_grad()

      outputs = net(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

      batch_losses.append(loss.item())

    loss_hist.append(np.round(np.mean(batch_losses), 4))
    time_hist.append(np.round(time.time()-start_time, 2))
    print(f'Epoch {epoch}  -  loss: {loss_hist[-1]}  time: {time_hist[-1]}')
    if (epoch%5) == 0:
      # report on the model being trained (`net`), not on a notebook global
      print('---------------------------------------')
      get_accuracy(net, train_set, label='Train')
      get_accuracy(net, test_set, label='Test')
      print('---------------------------------------')
      # get_accuracy switches the model to eval mode; switch back
      net.train()
    if scheduler is not None:
      scheduler.step()

  return loss_hist, time_hist

In [10]:
def get_accuracy(model, dataloader, label='Test'):
  """ Test the performance of the network on the provided dataset

  Puts `model` in eval mode (and leaves it there), runs it over every batch
  without tracking gradients, prints the accuracy and returns it.

  Args:
    model: network whose outputs are per-class scores along dim 1.
    dataloader: iterable of batches; element 0 is the inputs, element 1 the labels.
    label: prefix used in the printed line.

  Returns:
    Accuracy in percent, as a float.

  Raises:
    ValueError: if `dataloader` yields no samples.
  """
  model.eval()

  # Run on whatever device the model's weights live on; a model without
  # parameters gets the batches exactly as the loader produced them.
  first_param = next(model.parameters(), None)
  target = None if first_param is None else first_param.device

  correct = 0
  total = 0
  with torch.no_grad():
    for data in dataloader:
      images, labels = data[0], data[1]
      if target is not None:
        images, labels = images.to(target), labels.to(target)
      outputs = model(images)
      _, predicted = torch.max(outputs, 1)
      # plain counters instead of squeeze + cat, which breaks when a batch
      # holds a single sample
      correct += (predicted == labels).sum().item()
      total += labels.numel()

  if total == 0:
    raise ValueError('dataloader yielded no samples')

  acc = 100 * correct / total
  print(f'{label} Accuracy: {acc:.2f}%')
  return acc

In [11]:
# run the full training schedule; the trailing semicolon keeps the cell from
# echoing whatever the call evaluates to
train(
    network,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=num_epochs,
);

Epoch 1  -  loss: 0.2513  time: 17.11
Epoch 2  -  loss: 0.1096  time: 16.87
Epoch 3  -  loss: 0.0869  time: 16.87
Epoch 4  -  loss: 0.077  time: 17.45
Epoch 5  -  loss: 0.0771  time: 17.43
---------------------------------------
Train Accuracy: 97.81%
Test Accuracy: 98.69%
---------------------------------------
Epoch 6  -  loss: 0.0742  time: 17.85
Epoch 7  -  loss: 0.0724  time: 17.56
Epoch 8  -  loss: 0.0699  time: 17.43
Epoch 9  -  loss: 0.0662  time: 17.44
Epoch 10  -  loss: 0.0653  time: 17.49
---------------------------------------
Train Accuracy: 98.51%
Test Accuracy: 99.06%
---------------------------------------
Epoch 11  -  loss: 0.0472  time: 18.51
Epoch 12  -  loss: 0.0414  time: 16.87
Epoch 13  -  loss: 0.0401  time: 17.35
Epoch 14  -  loss: 0.0395  time: 19.4
Epoch 15  -  loss: 0.0383  time: 18.36
---------------------------------------
Train Accuracy: 99.20%
Test Accuracy: 99.37%
---------------------------------------
Epoch 16  -  loss: 0.0386  time: 17.68
Epoch 17  - 