## Import Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
import torch.nn.functional as F  


import torchvision.datasets as dset
import torchvision.transforms as T

import matplotlib.pyplot as plt

import numpy as np

## Set up Training Device and Global Variables

In [2]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cuda


In [3]:
# Settings read by the training / evaluation helpers defined further down.
dtype = torch.float32  # floating-point type the input batches are cast to
print_every = 100  # report the training loss every `print_every` iterations

## Import Data and Set up Training Batches

In [4]:
NUM_TRAIN = 49000  # training samples taken from the 50000-image training file

# Convert each image to a tensor, then normalize every RGB channel
# with fixed per-channel mean / std values.
transform = T.Compose([
    T.ToTensor(),
    T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Training split: random batches drawn from indices [0, NUM_TRAIN).
cifar10_train = dset.CIFAR10('./datasets', train=True, download=True,
                             transform=transform)
train_sampler = sampler.SubsetRandomSampler(range(NUM_TRAIN))
loader_train = DataLoader(cifar10_train, batch_size=64, sampler=train_sampler)

# Validation split: the remaining indices [NUM_TRAIN, 50000) of the same file.
cifar10_val = dset.CIFAR10('./datasets', train=True, download=True,
                           transform=transform)
val_sampler = sampler.SubsetRandomSampler(range(NUM_TRAIN, 50000))
loader_val = DataLoader(cifar10_val, batch_size=64, sampler=val_sampler)

# Test split: the separate test file, iterated in order.
cifar10_test = dset.CIFAR10('./datasets', train=False, download=True,
                            transform=transform)
loader_test = DataLoader(cifar10_test, batch_size=64)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [5]:
# Augmented Data

NUM_TRAIN = 49000

# Same tensor conversion / normalization as the plain pipeline, followed by
# random geometric augmentations applied each time the transform is called.
transformAug = T.Compose([
                T.ToTensor(),
                T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
                T.RandomAffine(degrees=15, translate=(0.15, 0.15)), # Image Shift & Turn
                T.RandomHorizontalFlip(p=0.5) # Image Mirroring
            ])

# Data augmentation only on the training set.
# The dataset has to be built with `transformAug`; passing the plain
# `transform` here would make `loader_train_aug` yield un-augmented images.
# A separate dataset name is used so the un-augmented `cifar10_train`
# defined earlier is not rebound.
cifar10_train_aug = dset.CIFAR10('./datasets', train=True, download=True,
                                 transform=transformAug)
loader_train_aug = DataLoader(cifar10_train_aug, batch_size=64,
                              sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)))

Files already downloaded and verified


## Set Up Util Functions

In [6]:
def flatten(x):
    """
    Collapse every dimension after the first into a single one.

    Inputs:
    - x: A tensor whose first dimension is the batch size N, e.g. (N, C, H, W).

    Returns: A tensor of shape (N, D), where D is the product of the remaining
    dimensions of x.
    """
    N = x.shape[0]  # batch size
    # reshape() returns a view when the memory layout allows it and copies
    # otherwise, so non-contiguous inputs (e.g. after permute/transpose) are
    # handled too; view() raises a RuntimeError on those.
    return x.reshape(N, -1)


class Flatten(nn.Module):
    """Module form of `flatten`, usable as a layer inside nn.Sequential."""

    def forward(self, x):
        flattened = flatten(x)
        return flattened

In [10]:
def check_accuracy(loader, model, device, dtype):
    """
    Measure the classification accuracy of a model over every batch of a loader.

    Inputs:
    - loader: A DataLoader yielding (inputs, labels) batches.
    - model: A PyTorch Module mapping an input batch to per-class scores.
    - device: The torch.device the batches are moved to before the forward pass.
    - dtype: The floating-point dtype the inputs are cast to.

    Returns: The accuracy as a float in [0, 1] (0.0 when the loader yields no
    samples). The result is also printed. The model is left in evaluation mode.
    """
    # Datasets without a `train` flag (e.g. TensorDataset) are reported as test data
    # instead of raising AttributeError.
    if getattr(loader.dataset, 'train', False):
        print('Checking accuracy on validation set')
    else:
        print('Checking accuracy on test set')
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)
            scores = model(x)
            _, preds = scores.max(1)  # index of the highest score per sample
            # .item() keeps the running count a plain Python int
            num_correct += (preds == y).sum().item()
            num_samples += preds.size(0)
    # Guard against an empty loader, which would otherwise divide by zero.
    acc = num_correct / num_samples if num_samples > 0 else 0.0
    print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
    return acc

In [11]:
def train_model(model, optimizer, device, dtype, epochs=1, loader=None, val_loader=None):
    """
    Train a model on CIFAR-10 using the PyTorch Module API.
    
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - device: The torch.device the model and batches are moved to
    - dtype: The floating-point dtype the input batches are cast to
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    - loader: (Optional) DataLoader providing the training batches; defaults to
      the notebook-level `loader_train`
    - val_loader: (Optional) DataLoader used for the periodic accuracy check;
      defaults to the notebook-level `loader_val`
    
    Returns: Nothing, but prints model accuracies during training.
    """
    # Defaults are resolved at call time so existing calls keep using the
    # notebook-level loaders.
    if loader is None:
        loader = loader_train
    if val_loader is None:
        val_loader = loader_val

    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for _ in range(epochs):
        for t, (x, y) in enumerate(loader):
            # check_accuracy switches the model to eval mode, so training mode
            # is re-enabled on every iteration.
            model.train()
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)

            # Clear gradients left over from the previous step, backpropagate,
            # then apply the parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss.item()))
                check_accuracy(val_loader, model, device, dtype)
                print()

In [19]:
def train_model_aug(model, optimizer, device, dtype, epochs=1, loader=None, val_loader=None):
    """
    Train a model on a CIFAR-10 augmented dataset using the PyTorch Module API.
    
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - device: The torch.device the model and batches are moved to
    - dtype: The floating-point dtype the input batches are cast to
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    - loader: (Optional) DataLoader providing the training batches; defaults to
      the notebook-level `loader_train_aug`
    - val_loader: (Optional) DataLoader used for the periodic accuracy check;
      defaults to the notebook-level `loader_val`
    
    Returns: Nothing, but prints model accuracies during training.
    """
    # Defaults are resolved at call time so existing calls keep using the
    # notebook-level loaders.
    if loader is None:
        loader = loader_train_aug
    if val_loader is None:
        val_loader = loader_val

    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for _ in range(epochs):
        for t, (x, y) in enumerate(loader):
            # check_accuracy switches the model to eval mode, so training mode
            # is re-enabled on every iteration.
            model.train()
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each parameter of the model.
            loss.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()

            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss.item()))
                check_accuracy(val_loader, model, device, dtype)
                print()  # blank separator line, matching train_model's output


## No Data Augmentation

In [12]:
lr=0.001

# Architecture 1: five (conv - batchnorm - leaky relu - maxpool) stages followed
# by a small dense head. Each 2x2 max-pool halves the spatial size, so a 32x32
# input is reduced to 1x1 with 256 channels before flattening.
#
# The network ends with the last Linear layer and outputs raw class scores
# (logits). No Softmax layer is appended: the training loop uses
# F.cross_entropy, which applies log-softmax internally, so an extra Softmax
# would be applied twice. The predicted class (arg-max) is unaffected.
model = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding='same'),
    nn.BatchNorm2d(32),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding='same'),
    nn.BatchNorm2d(64),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding='same'),
    nn.BatchNorm2d(64),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding='same'),
    nn.BatchNorm2d(128),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding='same'),
    nn.BatchNorm2d(256),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Flatten(),  # (N, 256, 1, 1) -> (N, 256)
    nn.Linear(256, 64),
    nn.Dropout(0.5),
    nn.BatchNorm1d(64),
    nn.ReLU(),
    nn.Linear(64, 10),  # raw logits for the 10 classes
)

In [None]:
# Train `model` for 10 epochs on the un-augmented loader with Adam.
optimizer = optim.Adam(params=model.parameters(), lr=lr)
train_model(model=model, optimizer=optimizer, device=device, dtype=dtype, epochs=10)

In [None]:
# Accuracy of `model` on the held-out validation split.
check_accuracy(loader=loader_val, model=model, device=device, dtype=dtype)

In [15]:
lr = 0.001

# Architecture 2: two ((conv - batchnorm - leaky relu) x2 - maxpool) stages with
# unpadded convolutions, followed by a dense head.
# Spatial size for a 32x32 input: 32 -> 30 -> 28 -> pool 14 -> 13 -> 12 -> pool 6,
# giving 64 * 6 * 6 = 2304 features after flattening.
#
# The network ends with the last Linear layer and outputs raw class scores
# (logits). No Softmax layer is appended: the training loop uses
# F.cross_entropy, which applies log-softmax internally, so an extra Softmax
# would be applied twice. The predicted class (arg-max) is unaffected.
model2 = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding='valid'),
    nn.BatchNorm2d(32),
    nn.LeakyReLU(),
    nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding='valid'),
    nn.BatchNorm2d(32),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=2, padding='valid'),
    nn.BatchNorm2d(64),
    nn.LeakyReLU(),
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=2, padding='valid'),
    nn.BatchNorm2d(64),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Flatten(),  # (N, 64, 6, 6) -> (N, 2304)
    nn.Linear(2304, 1024),
    nn.Dropout(0.5),
    nn.BatchNorm1d(1024),
    nn.ReLU(),
    nn.Linear(1024, 10),  # raw logits for the 10 classes
)

In [None]:
# Train `model2` for 10 epochs on the un-augmented loader with Adam.
optimizer = optim.Adam(params=model2.parameters(), lr=lr)
train_model(model=model2, optimizer=optimizer, device=device, dtype=dtype, epochs=10)

In [None]:
# Accuracy of `model2` on the held-out validation split.
check_accuracy(loader=loader_val, model=model2, device=device, dtype=dtype)

## Include Data Augmentation

In [20]:
lr=0.001

# Architecture 1 (same layout as `model`), trained on the augmented loader:
# five (conv - batchnorm - leaky relu - maxpool) stages reduce a 32x32 input to
# 1x1 with 256 channels, followed by a small dense head.
#
# The network ends with the last Linear layer and outputs raw class scores
# (logits). No Softmax layer is appended: the training loop uses
# F.cross_entropy, which applies log-softmax internally, so an extra Softmax
# would be applied twice. The predicted class (arg-max) is unaffected.
model_aug = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding='same'),
    nn.BatchNorm2d(32),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding='same'),
    nn.BatchNorm2d(64),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding='same'),
    nn.BatchNorm2d(64),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding='same'),
    nn.BatchNorm2d(128),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding='same'),
    nn.BatchNorm2d(256),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Flatten(),  # (N, 256, 1, 1) -> (N, 256)
    nn.Linear(256, 64),
    nn.Dropout(0.5),
    nn.BatchNorm1d(64),
    nn.ReLU(),
    nn.Linear(64, 10),  # raw logits for the 10 classes
)

In [None]:
# Train `model_aug` for 10 epochs through the augmented-data training loop.
optimizer = optim.Adam(params=model_aug.parameters(), lr=lr)
train_model_aug(model=model_aug, optimizer=optimizer, device=device, dtype=dtype, epochs=10)

In [None]:
# Accuracy of `model_aug` on the held-out validation split.
check_accuracy(loader=loader_val, model=model_aug, device=device, dtype=dtype)

In [23]:
lr = 0.001

# Architecture 2 with an extra pair of convolutions, trained on the augmented
# loader: three ((conv - batchnorm - leaky relu) x2 - maxpool) stages with
# 'same' padding. Spatial size for a 32x32 input: 32 -> 16 -> 8 -> 4, giving
# 128 * 4 * 4 = 2048 features after flattening.
#
# The network ends with the last Linear layer and outputs raw class scores
# (logits). No Softmax layer is appended: the training loop uses
# F.cross_entropy, which applies log-softmax internally, so an extra Softmax
# would be applied twice. The predicted class (arg-max) is unaffected.
model2_aug = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding='same'),
    nn.BatchNorm2d(32),
    nn.LeakyReLU(),
    nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding='same'),
    nn.BatchNorm2d(32),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding='same'),
    nn.BatchNorm2d(64),
    nn.LeakyReLU(),
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding='same'),
    nn.BatchNorm2d(64),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding='same'),
    nn.BatchNorm2d(128),
    nn.LeakyReLU(),
    nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding='same'),
    nn.BatchNorm2d(128),
    nn.LeakyReLU(),
    nn.MaxPool2d(2, stride=2),
    nn.Flatten(),  # (N, 128, 4, 4) -> (N, 2048)
    nn.Linear(2048, 512),
    nn.Dropout(0.5),
    nn.BatchNorm1d(512),
    nn.ReLU(),
    nn.Linear(512, 10),  # raw logits for the 10 classes
)

In [None]:
# Train `model2_aug` for 10 epochs through the augmented-data training loop.
optimizer = optim.Adam(params=model2_aug.parameters(), lr=lr)
train_model_aug(model=model2_aug, optimizer=optimizer, device=device, dtype=dtype, epochs=10)

In [None]:
# Accuracy of `model2_aug` on the held-out validation split.
check_accuracy(loader=loader_val, model=model2_aug, device=device, dtype=dtype)

## Select Best Model and Finish Training

Based on the 10-epoch training results we pick the best model to further train

In [None]:
# Continue training the selected model. A new Adam instance is created here,
# so the optimizer state starts fresh while the model keeps its current weights.
optimizer = optim.Adam(params=model2_aug.parameters(), lr=lr)
train_model_aug(
    model=model2_aug,
    optimizer=optimizer,
    device=device,
    dtype=dtype,
    epochs=100,  # Train for 100 epochs
)

In [None]:
# Validation accuracy of the selected model after the extended training run.
check_accuracy(loader=loader_val, model=model2_aug, device=device, dtype=dtype)

## Model Structure and Results

The models tested consisted of variations of two distinct architectures: the first one being *(conv-batchnorm-relu-pool)xN -> (dense-dropout-batchnorm-relu)xM -> softmax* and the second one being *((conv-batchnorm-relu)x2 -> pool)xN -> (dense-dropout-batchnorm-relu)xM -> softmax* 

LeakyReLU was used instead of regular ReLU as the models seemed to converge much faster this way and with a small increase in accuracy (about 1-2%). A big increase in training speed also came from the use of batchnorm between the convolutional and dense layers (2d for the convolutional ones and 1d for the dense).


### For the first architecture the models tested were the following:

#### 1) Initial Model
-32 filter conv layer (3x3 kernels with 'same' padding for standard dimensions) - batchnorm2d - leaky relu - MaxPool <br />
-64 filter conv layer (3x3 kernels with 'same' padding) - batchnorm2d - leaky relu - MaxPool(stride=2) <br />
-64 filter conv layer (3x3 kernels with 'same' padding) - batchnorm2d - leaky relu - MaxPool(stride=2) <br />
-128 filter conv layer (3x3 kernels with 'same' padding) - batchnorm2d - leaky relu - MaxPool(stride=2) <br />
-256 filter conv layer (3x3 kernels with 'same' padding) - batchnorm2d - leaky relu - MaxPool(stride=2) <br />
-dense layer with 128 neurons (from 256 inputs) - relu - dense with 64 neurons - relu - dense with 10 neurons - softmax

Validation Accuracy: 74%, Training Accuracy: 78.9% 

#### 2) Single Hidden Layer Dense Network
Similar architecture with reduced depth of the dense neural network to only 3 layers: <br />

-dense layer with 64 neurons (from 256 inputs) - relu - dense with 10 neurons - softmax

Validation Accuracy: 77%, Training Accuracy: 83.39%

We notice a faster convergence towards optimal weights, as we can see an increase in both validation and training set accuracy after training for 10 epochs. If we increase the training time we expect similar results to the previous model, but the computational cost is also lower here, so this architecture seems preferable (at least for 10 epochs). 

*Conclusion:* a single hidden layer dense network is capable of handling the features extracted through the convolutional layers in a more efficient way than a double hidden layer one.

#### 3) Multiple Hidden Layer Dense Network
This time we increase the features at the output of the convolutional layers by applying a single stride pooling layer at the last 2 levels and at the same time we use a multiple layer dense network to handle the output: <br />

-dense layer with 512 neurons (from 1024 inputs) - relu - dense with 256 neurons - relu - dense with 128 neurons - relu - 
dense with 64 neurons - relu - dense with 10 neurons - softmax

Validation Accuracy: 71%, Training Accuracy: 72.3% 

Again we see that a deeper dense network seems to be doing a much worse job when we restrict the training time, while even with an increased number of epochs it might still have issues with converging to a better minimum compared to the more shallow network

#### 4) and 5) AvgPool 

By using the first two models but substituting MaxPool for AvgPool we get a slightly worse performance by 2-3% in each case so using MaxPool with this specific dataset seems optimal

#### 6) Smaller kernels

Changing the kernels of the second model to 2x2 kernels (and adjusting the padding to 'valid' - practically no padding to fit the right dimensions)

Validation Accuracy: 68.5%, Training Accuracy: 72.8% which is worse than the initial 3x3 kernel model

#### 7) Larger kernels

Changing the kernels of the second model to 5x5 kernels (and this time keeping the padding to 'same' to fit the right dimensions)

Validation Accuracy: 73.8%, Training Accuracy: 77.04% which is better than the 2x2 kernel model but still not as good as the original

#### 8) Without Dropout and Batch Norm

By removing the Dropout and BatchNorm1d layers from the dense NNs we get the following:

Validation Accuracy: 70.3%, Training Accuracy: 79.37% so we see that without these two extra layers the network is prone to overfitting to the training set and increasing its training accuracy but at the same time doing worse on unknown data


### For the second architecture the models tested were the following:

#### 1) Initial Model


-32 filter conv layer (3x3 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-32 filter conv layer (3x3 kernels with 'same' padding) - batchnorm2d - leaky relu <br />
-MaxPool(stride=2) <br />
-64 filter conv layer (3x3 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-64 filter conv layer (3x3 kernels with 'same' padding) - batchnorm2d - leaky relu <br />
-MaxPool(stride=2) <br />
-128 filter conv layer (3x3 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-256 filter conv layer (3x3 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-MaxPool(stride=2) <br />
-dense layer with 64 neurons (from 256 inputs) - relu - dense with 10 neurons - softmax

Validation Accuracy: 71.2%, Training Accuracy: 73.11% 

#### 2)  and 3) Combination of different kernels

Similarly to the initial model, but swapping the kernels of the second convolutional layer of each pair for 2x2 kernels, we get the following results
 
Validation Accuracy: 68.4%, Training Accuracy: 69.83% we notice again that the 2x2 kernels don't seem to work as well as the 3x3 ones on this particular problem

By doing the same but this time substituting only the 3x3 kernels of first pair of convolutional layers for 5x5 kernels we get 

Validation Accuracy: 70.8%, Training Accuracy: 71.3% so similarly to what we noticed before the 5x5 kernels seem to be doing better than the 2x2 ones but still the results we get are optimal only in the case of 3x3 kernels

#### 4), 5) and 6) Removing a pair of layers

By removing the last pair of layers we can experiment with a simpler and thus much easier to train CNN: 

-32 filter conv layer (3x3 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-32 filter conv layer (3x3 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-MaxPool(stride=2) <br />
-64 filter conv layer (3x3 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-64 filter conv layer (3x3 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-MaxPool(stride=2) <br />
-dense layer with 512 neurons (from 1600 inputs) - relu - dense with 64 neurons - relu - dense with 10 neurons - softmax

Validation Accuracy: 74.6%, Training Accuracy: 77.43% Despite the fact that this CNN is not as deep as the initial one it seems to be doing almost equally well 

By changing the 3x3 kernels to 2x2 kernels we get the following

Validation Accuracy: 77.1%, Training Accuracy: 85.02% which is the best result we have got so far; however, the much higher training accuracy does not carry over to the validation set, so the model is overfitting to the training data

Finally we combine 2x2 and 3x3 layers in the following way

-32 filter conv layer (3x3 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-32 filter conv layer (3x3 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-MaxPool(stride=2) <br />
-64 filter conv layer (2x2 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-64 filter conv layer (2x2 kernels with 'valid' padding) - batchnorm2d - leaky relu <br />
-MaxPool(stride=2) <br />
-dense layer with 512 neurons (from 2304 inputs) - relu - dense with 64 neurons - relu - dense with 10 neurons - softmax

Validation Accuracy: 78.2%, Training Accuracy: 85.57% Similarly to the previous one, we get even better results but the model again seems to be overfitting to the training data

### Data Augmentation

We attempt to enhance our dataset by adding specific transformations to several of the images such as horizontal flip (with a probability of 0.5) and random affine, a combination of rotation and shift, both horizontal and vertical. 

We experiment with two transformations: 

-Random Horizontal Flip (p=0.5) and Random Affine (degrees=15 translate=(0.15,0.15)) <br />
-Random Horizontal Flip (p=0.5) and Random Affine (degrees=30 translate=(0.2,0.2)) <br />

The models used are the ones that did the best during the previous parts. <br />
From the first architecture: Model number 2 as well as a similar version of this network with an extra convolutional layer with 512 filters added before the dense layers, in order to see if a deeper model does better in the case of the augmented dataset. <br />
From the second architecture: Model number 6 was used as well as a version of this model with an added pair of convolutional layers with 128 filters each before the dense layers, to compare again the more shallow model to one more capable of adapting to the dataset. <br />

#### The results for the first augmentation after 10 epochs were the following:<br />

##### Architecture 1: 

Best model: validation accuracy 76.5% and training accuracy* 82.5% <br />
Deeper model: validation accuracy 75.4% and training accuracy 80.32% <br />

##### Architecture 2: 

Best model: validation accuracy 75.5% and training accuracy* 82.3% <br />
Deeper model: validation accuracy 82.2% and training accuracy 86.85% <br />

*accuracy on the initial training set (not on the augmented one)

#### The results for the second augmentation after 10 epochs were the following:<br />

##### Architecture 1: 

Best model: validation accuracy 76.5% and training accuracy* 81.82% <br />
Deeper model: validation accuracy 73.7% and training accuracy 79.72% <br />

##### Architecture 2: 

Best model: validation accuracy 75.3% and training accuracy* 81.15% <br />
Deeper model: validation accuracy 80.8% and training accuracy 85.75% <br />

*accuracy on the initial training set (not on the augmented one) <br />

For the first architecture we notice the initial model doing much better than the one with the extra layer while, for the second one, the addition of an extra pair of convolutional layers seems to be substantially improving the performance when training with the augmented dataset and the extra transformations seem to be affecting negatively the more shallow model's 
performance on the validation set. <br />

After further training however (up to 100 epochs), the deeper models start having a better performance on the validation set (> 85%) while being able to almost fully memorize the training set (training accuracy > 98%). Still, due to serious overfitting after that many epochs, more adjustments to the hyperparameters and/or the architecture itself are needed for the model to generalize better.

##### Model Choice: 
Only the best models for each architecture were left: Models 2 and 6 (model, model2) for architecture 1 and 2 respectively without the use of data augmentation as well as the same model for architecture 1 (model_aug) and the model with extra layers (model2_aug) for architecture 2 when using data augmentation.