# Project 1: Classification, weight sharing, auxiliary losses 


The objective of this project is to test different architectures to compare two digits visible in a
two-channel image. It aims at showing in particular the impact of weight sharing, and of the use of an
auxiliary loss to help the training of the main objective.
It should be implemented with PyTorch-only code, in particular without using other external libraries
such as scikit-learn or numpy.

The goal of this project is to implement a deep network such that, given as input a series of 2 × 14 × 14
tensors, corresponding to pairs of 14 × 14 grayscale images, it predicts for each pair whether the first digit is
less than or equal to the second. The training and test sets should contain 1,000 pairs each, and the size of the images allows experiments to run rapidly, even in the VM with a single core and no GPU.
You can generate the data sets to use with the function `generate_pair_sets(N)` defined in the file
`dlc_practical_prologue.py`. This function returns six tensors: `train_input`, `train_target`, `train_classes`, `test_input`, `test_target` and `test_classes`.

## Set-up: 

In [None]:
import torch
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
import argparse
import os
import urllib
import torch.nn as nn

import matplotlib.pyplot as plt
from dlc_practical_prologue import *
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Install a global urllib opener that sends a browser-like User-agent header
# (presumably so the dataset download is not rejected by the server).
# The standard-library module is imported directly: the `six.moves` shim only
# exists for Python 2 compatibility, and this notebook already needs Python 3
# (it uses f-strings).
import urllib.request

opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

In [None]:
# Create the data directory if it is missing. `exist_ok=True` makes the call
# idempotent and avoids the race between a separate existence check and the
# creation itself.
os.makedirs('../data/', exist_ok=True)

In [None]:
# Run this once to download the MNIST data set.
# The server that normally hosts it is currently unreliable, so for now this
# mirror is the only way to get the data.
'''
!wget www.di.ens.fr/~lelarge/MNIST.tar.gz
!tar -zxvf MNIST.tar.gz
'''

## Data: 

In [None]:
# Generate the six data tensors (inputs, targets and classes for the training
# split and for the test split), asking for 1000 pairs.
(train_input, train_target, train_classes,
 test_input, test_target, test_classes) = generate_pair_sets(1000)

In [None]:
# Report the size of every tensor, training split first and test split second.
input_sizes = (train_input.size(), test_input.size())
target_sizes = (train_target.size(), test_target.size())
classes_sizes = (train_classes.size(), test_classes.size())
print(f'Training and test input size: {input_sizes}')
print(f'Training and test target size: {target_sizes}')
print(f'Training and test classes size: {classes_sizes}')

Generate the dataset needed for training. Because our data has a special structure, we write our own `Dataset` class so that a `DataLoader` can be used later. Remember that `target` is 1 if the digit in the first image is less than or equal to the digit in the second image.

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over image pairs, their comparison target and digit classes.

    The three containers are indexed in parallel: sample ``i`` is made of
    ``pairs[i]``, ``target[i]`` and ``classes[i]``.
    """

    def __init__(self, pairs, target, classes):
        """Keep references to the three parallel containers.

        Args:
            pairs: image pairs, expected as (2, 14, 14) per sample.
            target: comparison label per sample, expected in {0, 1}.
            classes: the two digit classes per sample, each expected in [0, 9].
        """
        self.pairs = pairs
        self.target = target
        self.classes = classes

    def __len__(self):
        """Return the number of samples, i.e. the length of ``pairs``."""
        return len(self.pairs)

    def __getitem__(self, index):
        """Return the ``(pair, target, classes)`` triple stored at ``index``."""
        return self.pairs[index], self.target[index], self.classes[index]

Create datasets (training and validation):

In [None]:
# Wrap each split in a `Dataset` so that it can be fed to a data loader.
training_set = Dataset(pairs=train_input,
                       target=train_target,
                       classes=train_classes)
test_set = Dataset(pairs=test_input,
                   target=test_target,
                   classes=test_classes)

Have a look:

In [None]:
# Show the first six training samples: one row per sample, first image on the
# left and second image on the right, with the digit classes and the target
# written in the titles.
fig, ax = plt.subplots(6, 2, figsize=(5, 18))
for j in range(6):
    # Fetch the sample once through normal indexing instead of calling
    # `__getitem__` directly for every field.
    pair, target, classes = training_set[j]
    im1, im2 = pair[0], pair[1]
    ax[j, 0].imshow(im1, cmap='gray')
    ax[j, 1].imshow(im2, cmap='gray')
    ax[j, 1].set_title(f'Cipher: {classes[1]}')
    ax[j, 0].set_title(f'Cipher: {classes[0]}, target: {target}')

## Models:

### Model architectures:

#### Baseline: 
- Loss: CE (cross entropy) 
- Optimizer: SGD optimizer
- Activation function: softmax

#### Experiment 1: 
Add one more hidden layer: Accuracy: 77.6%

#### Experiment 2:
Use Sigmoid instead of ReLU: Accuracy: 52.6%, Avg loss: 0.010993 

#### Experiment 3:
Use Tanh instead of ReLU: Accuracy: 71.9%, Avg loss: 0.009163

#### Experiment 4:
Add Batch Normalization with eps=1e-05, momentum=0.1, affine=True, track_running_stats=True: Accuracy: 75.8%, Avg loss: 0.007833

#### Ideas
- Dropout
- Batch normalization


In [None]:
# Basic model with two layers and a two digit output:
class Model_1(nn.Module):
    def __init__(self):
        super().__init__()
        input_size = 2 * 14 * 14
        hidden_sizes = [392, 392]
        # two digit output, probability of being 1 or 0:
        output_size = 2
        # flatten images to 1D input:
        self.flatten = nn.Flatten()
        # then two hidden layers:
        self.model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                                   nn.ReLU(),
                                   nn.BatchNorm1d(num_features=hidden_sizes[0]),
                                   nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                                   nn.ReLU(),
                                   nn.BatchNorm1d(num_features=hidden_sizes[1]),
                                   nn.Linear(hidden_sizes[1], output_size))
        # no need to add softmax at the end because already in CE loss.
    def forward(self, x):
        # flatten 2D->1D
        x = self.flatten(x)
        # predict probabilities:
        logits = self.model(x)
        return logits

### Train model:

##### Load data:

In [None]:
# Loader settings shared by both splits; change `num_workers` when on GPU.
params = dict(batch_size=64, shuffle=True, num_workers=0)
training_generator = DataLoader(training_set, **params)
test_generator = DataLoader(test_set, **params)

##### Call model: 

In [None]:
# Build the baseline network, move it to the selected device and show its layers.
model = Model_1()
model = model.to(device)
print(model)

##### Training:
This is a binary classification with two output units, so we use `torch.nn.CrossEntropyLoss` rather than `BCELoss` (which is meant for a single output unit). The `Softmax` activation is already included in this loss function, so the model outputs raw logits.

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader`` and return the epoch loss.

    Args:
        dataloader: iterable of ``(X, y, Y)`` batches that exposes a
            ``dataset`` attribute with a length. ``X`` is the model input and
            ``y`` the target given to ``loss_fn``; ``Y`` is not used here.
        model: module to optimise.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: optimizer holding the parameters of ``model``.

    Returns:
        The sum of the per-batch losses divided by the number of samples in
        ``dataloader.dataset``.
    """
    size = len(dataloader.dataset)

    # Feed batches on the device the model lives on; without this a model
    # moved to CUDA fails on the CPU tensors produced by the loader.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    # Make sure mode-dependent layers (e.g. BatchNorm) are in training mode,
    # whatever mode a previous evaluation left the model in.
    model.train()

    train_loss = 0
    for batch, (X, y, Y) in enumerate(dataloader):
        if device is not None:
            X, y = X.to(device), y.to(device)

        # Compute prediction and loss:
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        train_loss += batch_loss

        # Progress report every 10 batches.
        if batch % 10 == 0:
            current = batch * len(X)
            print(f"loss: {batch_loss:>7f}  [{current:>5d}/{size:>5d}]")

    # Summed batch losses scaled by the number of samples:
    train_loss /= size
    return train_loss


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    test_loss, correct = 0, 0
    softmax = torch.nn.Softmax(dim=1)

    with torch.no_grad():
        for X, y, Y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()

            # Softmax to get probabilities:
            prob = softmax(pred)
            # calculate number of correct predictions:
            correct += (prob.argmax(1) == y).type(torch.float).sum().item()
    # return average test loss and accuracy:
    test_loss /= size
    correct /= size
    print(
        f"Validation Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n"
    )
    return 100 * correct, test_loss

Hyperparameters and optimizers:

In [None]:
# Optimisation settings.
learning_rate, batch_size, epochs = 1e-3, 64, 25

# Cross-entropy on the two logits, minimised with plain SGD over all model parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

In [None]:
# Per-epoch history, filled in as training progresses.
training_loss = []
test_loss = []
accuracy = []

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    # One optimisation pass over the training data, then one evaluation pass.
    epoch_train_loss = train_loop(training_generator, model, loss_fn, optimizer)
    epoch_accuracy, epoch_test_loss = test_loop(test_generator, model, loss_fn)

    training_loss.append(epoch_train_loss)
    test_loss.append(epoch_test_loss)
    accuracy.append(epoch_accuracy)
print("Done!")


In [None]:
# Save the model and optimizer states:
checkpoint_path = '../data/Lena/lena_ex_4.pth'
# `torch.save` does not create missing parent directories, and only
# '../data/' is created earlier in this notebook, so make sure the target
# folder exists first.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, checkpoint_path)

Plot losses and accuracy:

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(8, 4))

# Left panel: accuracy per epoch. The curve is drawn once; it was previously
# plotted twice by a duplicated line.
axs[0].plot(accuracy)
axs[0].set_xlabel('Num epochs')
axs[0].set_title('Accuracy')

# Right panel: test and training loss per epoch.
axs[1].plot(test_loss, label=' test_loss')
axs[1].plot(training_loss, label='train_loss')
axs[1].set_xlabel('Num epochs')
axs[1].set_title('Loss')
# Attach the legend explicitly to the loss panel instead of relying on it
# being the current axes.
axs[1].legend()

### Predictions on test set:

In [None]:
# Make a few predictions on one test batch and display them next to the images:
size = len(test_generator.dataset)
softmax = torch.nn.Softmax(dim=1)
fig, ax = plt.subplots(6, 2, figsize=(5, 18))

# Only one batch is needed, so fetch it directly instead of iterating over
# the whole loader and ignoring every batch but the first.
X, y, Y = next(iter(test_generator))

# Predict in evaluation mode (so BatchNorm uses its stored statistics) and on
# the device the model lives on; restore the previous mode afterwards.
model_device = next(model.parameters()).device
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        prob = softmax(model(X.to(model_device)))
finally:
    model.train(was_training)
# Bring predictions back to the CPU for display.
prediction = prob.argmax(1).type(torch.float).cpu()

# Show at most six samples (fewer if the batch is smaller).
for j in range(min(6, len(X))):
    im1 = X[j][0, :, :]
    im2 = X[j][1, :, :]
    target = y[j]
    classes = Y[j]
    pred = prediction[j]
    ax[j, 0].imshow(im1, cmap='gray')
    ax[j, 1].imshow(im2, cmap='gray')
    ax[j, 0].set_title(
        f'Cipher: {classes[0]}, target: {target}, pred: {pred}')
    ax[j, 1].set_title(f'Cipher: {classes[1]}')