# Project 1: Classification, weight sharing, auxiliary losses 


The objective of this project is to test different architectures to compare two digits visible in a
two-channel image. It aims at showing in particular the impact of weight sharing, and of the use of an
auxiliary loss to help the training of the main objective.
It should be implemented with PyTorch only code, in particular without using other external libraries
such as scikit-learn or numpy.

The goal of this project is to implement a deep network such that, given as input a series of 2 ×14×14
tensor, corresponding to pairs of 14 × 14 grayscale images, it predicts for each pair if the first digit is
lesser or equal to the second. The training and test set should be 1, 000 pairs each, and the size of the images allows to run experiments rapidly, even in the VM with a single core and no GPU.
You can generate the data sets to use with the function generate˙pair˙sets(N) defined in the file
dlc˙practical˙prologue.py. This function returns six tensors:

## Set-up: 

In [1]:
import torch
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
import argparse
import os
import urllib
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
from sklearn.metrics import roc_auc_score # roc auc metric
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from dlc_practical_prologue import *
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from torchvision.models import * 

In [3]:
from signSGD import signSGD
from Nadam import Nadam

In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [None]:
# control the randomness
torch.manual_seed(0)

In [5]:
from six.moves import urllib
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

In [6]:
if not os.path.exists('../data/'):
    os.makedirs('../data/')

In [7]:
# Run this once to download the MNIST data-set. 
# There is a problem with the server on which it's hosted so only way right now 
# to have it :( 

"""
!wget www.di.ens.fr/~lelarge/MNIST.tar.gz
!tar -zxvf MNIST.tar.gz
"""

'\n!wget www.di.ens.fr/~lelarge/MNIST.tar.gz\n!tar -zxvf MNIST.tar.gz\n'

## Data: 

In [None]:
train_input, train_target, train_classes, test_input, test_target, test_classes = generate_pair_sets(
    1000)

In [None]:
print(f'Training and test input size: {train_input.size(), test_input.size()}')
print(f'Training and test target size: {train_target.size(), test_target.size()}')
print(f'Training and test classes size: {train_classes.size(), test_classes.size()}')

Generate dataset needed for training. For this as we have a special data case we rewrite the `Dataset` class in order to use a `dataloader` later. Remember `target` is 1 if first number is smaller or equal than the second image.  

In [18]:
class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, pairs, target, classes):
        'Initialization'
        # target = (0,1)
        self.target = target
        # image pairs (2,14,14)
        self.pairs = pairs
        # cipher classes (2 in [0,9])
        self.classes = classes

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.pairs)

    def __getitem__(self, index):
        'Generates one sample of data'
        # image pairs
        X = self.pairs[index]
        # target:
        y = self.target[index]
        # classes:
        Y = self.classes[index]
        return X, y, Y

Create datasets (training and validation):

In [None]:
training_set = Dataset(train_input, train_target, train_classes)
test_set = Dataset(test_input, test_target, test_classes)

Have a look:

In [None]:
fig, ax = plt.subplots(6, 2, figsize=(5, 18))
for j in range(6):
    im1 = training_set.__getitem__(j)[0][0, :, :]
    im2 = training_set.__getitem__(j)[0][1, :, :]
    target = training_set.__getitem__(j)[1]
    classes = training_set.__getitem__(j)[2]
    ax[j, 0].imshow(im1, cmap='gray')
    ax[j, 1].imshow(im2, cmap='gray')
    ax[j, 1].set_title(f'Cipher: {classes[1]}')
    ax[j, 0].set_title(f'Cipher: {classes[0]}, target: {target}')

## Models:

### Model architectures:

#### Baseline: 
- Loss: CE (cross entropy) 
- Optimizer: SGD optimizer
- Activation function: softmax

In [8]:
# Basic model with two layers and a two digit output:
class Model_1(nn.Module):
    def __init__(self):
        super().__init__()
        input_size = 2 * 14 * 14
        hidden_sizes = [392, 392]
        # two digit output, probability of being 1 or 0:
        output_size = 2
        # flatten images to 1D input:
        self.flatten = nn.Flatten()
        # then two hidden layers:
        self.model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                                   nn.ReLU(),
                                   nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                                   nn.ReLU(),
                                   nn.Linear(hidden_sizes[1], output_size))
        # no need to add softmax at the end because already in CE loss.
    def forward(self, x):
        # flatten 2D->1D
        x = self.flatten(x)
        # predict probabilities:
        logits = self.model(x)
        return logits

#### Siamese-RNN: 
- Two RNN - flatten outputs of both, concatenate and then linear model over it. 
- Loss: CE (cross entropy) 
- Optimizer: Adam optimizer
- Activation function: softmax

In [9]:
class ImageRNN(nn.Module):

    # declaraction of variables
    def __init__(self, batch_size, n_steps, n_inputs, n_neurons, n_outputs):
        super(ImageRNN, self).__init__()

        # we need an intermediate output, because we are using a siamese RNN network
        intermediate_output_size = 10

        # two digit output, probability of being 1 or 0:
        output_size = 2

        self.n_neurons = n_neurons
        self.batch_size = batch_size
        self.n_steps = n_steps
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        
        # flatten images to 1D input:
        self.flatten = nn.Flatten()
        
        self.basic_rnn = nn.RNN(self.n_inputs, self.n_neurons,  bias=False)

        self.FC = nn.Linear(self.n_neurons, intermediate_output_size, bias=False)
        self.fcout = nn.Linear(2*intermediate_output_size, output_size,  bias=False)
        
        
    # initialize hidden weights that have zero values
    def init_hidden(self, ):
        # (num_layers, batch_size, n_neurons)
        return (torch.zeros(1, self.batch_size, self.n_neurons))

    def forward(self, X):
        # transforms X to dimensions: n_steps X batch_size X n_inputs
        X1 = X[:, 0, :].view(-1, 14, 14).permute(1, 0, 2)
        X2 = X[:, 1, :].view(-1, 14, 14).permute(1, 0, 2)
        # X1.shape = torch.Size([14, 64, 14])
        
        # ------------------------------------------------------------------
        # Input image 1 (first image of the pair)
        # Predict the number on first image
        # ------------------------------------------------------------------
        self.batch_size = X1.size(1)
        self.hidden = self.init_hidden() 
        #self.hidden.shape = torch.Size([1, 40, 50])
        
        # lstm_out => n_steps, batch_size, n_neurons (hidden states for each time step)
        # self.hidden => 1, batch_size, n_neurons (final state from each lstm_out)
        lstm_out, self.hidden = self.basic_rnn(X1, self.hidden)

        # lstm_out.shape = torch.Size([14, 64, 50]) 
        # self.hidden.shape = torch.Size([1, 64, 50])
        
        # Size batch x 10 (cipher predictions on image)
        out1 = self.FC(self.hidden)
        # out1.shape = torch.Size([1, 64, 10])
        
        # ------------------------------------------------------------------
        # Input image 2 (second image of the pair)
        # Predict the number on second image
        # ------------------------------------------------------------------
        self.batch_size = X2.size(1)
        self.hidden = self.init_hidden()
        
        # lstm_out => n_steps, batch_size, n_neurons (hidden states for each time step)
        # self.hidden => 1, batch_size, n_neurons (final state from each lstm_out)
        lstm_out, self.hidden = self.basic_rnn(X2, self.hidden)
        
        # Size batch x 10 (cipher predictions on image)
        out2 = self.FC(self.hidden)
        
        # ------------------------------------------------------------------
        # 0-1 Prediction 
        # Predict if first image bigger than second
        # ------------------------------------------------------------------
        
        # concatenate into size (batch x 20)
        output = torch.cat((out1[0], out2[0]), 1)   
        # output.shape = torch.Size([64, 20])
        
        # flatten 2D->1D
        output_ = self.flatten(output)     
        # output_.shape = torch.Size([64, 20])        
        
        # predict probabilities:
        logits = self.fcout(output_)
        return out1.view(-1, self.n_outputs), out2.view(
            -1, self.n_outputs), logits 

#### Siamese RNN 2 (longer linear part): 
- Two RNN - flatten outputs of both, concatenate and then linear model over it
- Longer linear model here
- Loss: CE (cross entropy)
- Optimizer: Adam optimizer
- Activation function: softmax

In [10]:
class ImageRNN_2(nn.Module):

    # declaraction of variables
    def __init__(self, batch_size, n_steps, n_inputs, n_neurons, n_outputs):
        super(ImageRNN_2, self).__init__()

        # we need an intermediate output, because we are using a siamese RNN network
        intermediate_output_size = 10

        # two digit output, probability of being 1 or 0:
        fina_output_size = 2

        self.n_neurons = n_neurons
        self.batch_size = batch_size
        self.n_steps = n_steps
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        
        # flatten images to 1D input:
        self.flatten = nn.Flatten()
        
        self.basic_rnn = nn.RNN(self.n_inputs, self.n_neurons,  bias=False)

        self.FC = nn.Linear(self.n_neurons, intermediate_output_size, bias=False)
        self.fcout = nn.Linear(2*intermediate_output_size, fina_output_size,  bias=False)
        
        # Input last linear model:
        input_size = 2 * 10
        hidden_sizes = [150, 150]
        
        # then two hidden layers:
        self.final = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                                   nn.ReLU(),
                                   nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                                   nn.ReLU(),
                                   nn.Linear(hidden_sizes[1], fina_output_size))
        
        
    # initialize hidden weights that have zero values
    def init_hidden(self, ):
        # (num_layers, batch_size, n_neurons)
        return (torch.zeros(1, self.batch_size, self.n_neurons))

    def forward(self, X):
        # transforms X to dimensions: n_steps X batch_size X n_inputs
        X1 = X[:, 0, :].view(-1, 14, 14).permute(1, 0, 2)
        X2 = X[:, 1, :].view(-1, 14, 14).permute(1, 0, 2)
        # X1.shape = torch.Size([14, 64, 14])

        # ------------------------------------------------------------------
        # Input image 1 (first image of the pair)
        # Predict the number on first image
        # ------------------------------------------------------------------
        self.batch_size = X1.size(1)
        self.hidden = self.init_hidden()
        
        # lstm_out => n_steps, batch_size, n_neurons (hidden states for each time step)
        # self.hidden => 1, batch_size, n_neurons (final state from each lstm_out)
        lstm_out, self.hidden = self.basic_rnn(X1, self.hidden)
        # lstm_out.shape = torch.Size([14, 64, 50]) 
        # self.hidden.shape = torch.Size([1, 64, 50])

        # Size batch x 10 (cipher predictions on image)
        out1 = self.FC(self.hidden)
        # out1.shape = torch.Size([1, 64, 10])

        # ------------------------------------------------------------------
        # Input image 2 (second image of the pair)
        # Predict the number on second image
        # ------------------------------------------------------------------
        self.batch_size = X2.size(1)
        self.hidden = self.init_hidden()
        
        # lstm_out => n_steps, batch_size, n_neurons (hidden states for each time step)
        # self.hidden => 1, batch_size, n_neurons (final state from each lstm_out)
        lstm_out, self.hidden = self.basic_rnn(X2, self.hidden)
        
        # Size batch x 10 (cipher predictions on image)
        out2 = self.FC(self.hidden)
        
        # ------------------------------------------------------------------
        # 0-1 Prediction 
        # Predict if first image bigger than second
        # ------------------------------------------------------------------
        
        # concatenate into size (batch x 20)
        output = torch.cat((out1[0], out2[0]), 1)   
        # output.shape = torch.Size([64, 20])

        # flatten 2D->1D
        output_ = self.flatten(output)      
        
        # predict probabilities:
        #logits = self.fcout(output_)
        logits = self.final(output_)
        return out1.view(-1, self.n_outputs), out2.view(
            -1, self.n_outputs), logits 

#### Basic Siamese CNN:

Basic CNN with two layers and two linear layers. 
- Siamese CNN
- Auxiliary losses

In [11]:
import torch.nn.functional as F

class Conv_basic(nn.Module):
    # declaraction of variables
    def __init__(self):
        super(Conv_basic, self).__init__()

        # we need an intermediate output, because we are using a siamese RNN network
        intermediate_output_size = 10

        # two digit output, probability of being 1 or 0:
        fina_output_size = 2
        self.n_outputs = 10
        # flatten images to 1D input:
        self.flatten = nn.Flatten()

        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(40, 30)
        self.fc2 = nn.Linear(30, 10)
        
        self.fcout = nn.Linear(2 * intermediate_output_size,
                               fina_output_size,
                               bias=False)

    def forward(self, X):
        # transforms X to dimensions: n_steps X batch_size X n_inputs
        X1 = X[:, 0, :].view(-1, 1, 14, 14)
        X2 = X[:, 1, :].view(-1, 1, 14, 14)
        # X1.shape = torch.Size([64, 1, 14, 14])
        
        # ------------------------------------------------------------------
        # Input image 1 (first image of the pair)
        # Predict the number on first image
        # ------------------------------------------------------------------
        x = F.relu(F.max_pool2d(self.conv1(X1), 2))
        # x.shape = torch.Size([64, 10, 5, 5])
        x = F.relu(F.max_pool2d(x, 2))
        # x.shape = torch.Size([64, 10, 2, 2])
        x = x.view(-1, 40)
        # x.shape = torch.Size([64, 40])
        x = F.relu(self.fc1(x))
        # x.shape = torch.Size([64, 30])
        x = F.dropout(x, training=self.training)
        # x.shape = torch.Size([64, 30])
        
        # Size batch x 10 (cipher predictions on image)
        out1 = self.fc2(x)
        # out1.shape = torch.Size([64, 10])
        
        # ------------------------------------------------------------------
        # Input image 2 (second image of the pair)
        # Predict the number on second image
        # ------------------------------------------------------------------
        x = F.relu(F.max_pool2d(self.conv1(X2), 2))
        x = F.relu(F.max_pool2d(x, 2))
        x = x.view(-1, 40)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        # Size batch x 10 (cipher predictions on image)
        out2 = self.fc2(x)
        
        # ------------------------------------------------------------------
        # 0-1 Prediction
        # Predict if first image bigger than second
        # ------------------------------------------------------------------

        # concatenate into size (batch x 20)
        #X = self.flatten(X)
        output = torch.cat((out1, out2), 1)
        # flatten 2D->1D
        output_ = self.flatten(output)
        
        # predict probabilities:
        logits = self.fcout(output_)
        return out1.view(-1, self.n_outputs), out2.view(-1,
                                                        self.n_outputs), logits

#### Deep Siamese CNN (best model):

In [12]:
# Basic model with two layers and a two digit output:
class SiameseConvNet2(nn.Module):
    def __init__(self):
        super(SiameseConvNet2, self).__init__()
        self.length = 14
        self.input_size = 1 * 14 * 14
        # we need an intermediate output, because we are using a siamese network
        intermediate_output_size = 10
        self.n_outputs = 10
        # two digit output, probability of being 1 or 0:
        final_output_size = 2
        
        # flatten images to 1D input:
        self.flatten = nn.Flatten()
        # convolutional layers
        kernel_size = 3
        n_channel = 5
        self.conv_layer = nn.Sequential(nn.Conv2d(1, n_channel, kernel_size),
                                        nn.ReLU(),
                                        nn.Conv2d(n_channel, 2 * n_channel, kernel_size),
                                        nn.ReLU(),
                                        nn.Conv2d(2 * n_channel, 4 * n_channel, kernel_size),
                                        nn.BatchNorm2d(4 * n_channel),
                                        nn.ReLU(),
                                        nn.Dropout2d(p=0.3),
                                        nn.Conv2d(4 * n_channel, 8 * n_channel, kernel_size),
                                        nn.ReLU(),
                                        nn.Dropout2d(p=0.4),
                                        nn.Conv2d(8 * n_channel, 16 * n_channel, kernel_size),
                                        nn.ReLU(),
                                        nn.Dropout2d(p=0.5),
                                        nn.Conv2d(16 * n_channel, 32 * n_channel, kernel_size),
                                        nn.ReLU(),
                                        nn.Dropout2d(p=0.6)
                                        )
        def compute_conv2d_size(length):
            return  length - (kernel_size - 1) - 1 + 1
        
        length_out = self.length
        depth = 4
        
        for i in range(depth):
            length_out = compute_conv2d_size(length_out)
                
        # For 10 digits:
        concat_size = 640
        self.fcout1 = nn.Linear(concat_size, intermediate_output_size)
        
        # For 0-1 output: 
        self.fcout2 = nn.Linear(2 * intermediate_output_size,
                               final_output_size,
                               bias=False)

    def forward(self, x):
        x1 = x[:, 0].view(-1, 1, self.length, self.length)
        x2 = x[:, 1].view(-1, 1, self.length, self.length)
        # x1.shape = torch.Size([64, 1, 14, 14])
        
        x1 = self.conv_layer(x1)
        # x1.shape = torch.Size([64, 160, 2, 2])
        x1 = x1.view(-1, 640)
        # x1.shape = torch.Size([64, 640])
        
        # Size batch x 10 (cipher predictions on image)
        out1 = self.fcout1(x1)
        # out1.shape = torch.Size([64, 10])
        
        x2 = self.conv_layer(x2) 
        x2 = x2.view(-1, 640)
        # Size batch x 10 (cipher predictions on image)
        out2 = self.fcout1(x2)
        
        # ------------------------------------------------------------------
        # 0-1 Prediction
        # Predict if first image bigger than second
        # ------------------------------------------------------------------

        # concatenate into size (batch x 20)
        #X = self.flatten(X)
        output = torch.cat((out1, out2), 1)
        # flatten 2D->1D
        output_ = self.flatten(output)
        
        # predict probabilities:
        logits = self.fcout2(output)
        return out1.view(-1, self.n_outputs), out2.view(-1,
                                                        self.n_outputs), logits

### Training functions:
Binary classification with two output units --> so `CrossEntropyLoss()` so need to use `torch.nn.CrossEntropyLoss` instead of `BCELoss` (BCE for 1 digit output). The `Softmax` activation is already included in this loss function. 

In [13]:
from sklearn.metrics import roc_auc_score
softmax = torch.nn.Softmax(dim=1)

def train_loop(dataloader, model, loss_fn, optimizer, w, print_loss=True):
    size = len(dataloader.dataset)
    train_loss, correct, accuracy_numbers = 0, 0, 0
    tp, fp, fn, tn = 0, 0, 0, 0
    aucs = []
    for batch, (X, y, Y) in enumerate(dataloader):
        # Compute prediction and loss:
        pred1, pred2, logits = model(X)

        # Softmax to get probabilities:
        prob = softmax(logits)
        
        # calculate number of correct predictions:
        correct += (prob.argmax(1) == y).type(torch.float).sum().item()
        accuracy_numbers += (pred1.argmax(1) == Y[:,0]).type(torch.float).sum().item()
        accuracy_numbers += (pred2.argmax(1) == Y[:,1]).type(torch.float).sum().item()
        
        # roc-auc score: 
        aucs.append(roc_auc_score(y.detach().numpy(), prob.detach().numpy()[:, 1]))

        # [0-1] pred loss:
        loss = loss_fn(logits, y)

        # [0-9] pred loss for each pair:
        loss_aux_1 = loss_fn(pred1, Y[:, 0])
        loss_aux_2 = loss_fn(pred2, Y[:, 1])
        
        # true positives and other rates:
        tp += (prob.argmax(1) * y).type(torch.float).sum().item()
        tn += ((1 - prob.argmax(1)) * (1 - y)).type(
                torch.float).sum().item()
        fp += ((1 - y) * prob.argmax(1)).sum().type(
                torch.float).sum().item()
        fn += (y * (1 - prob.argmax(1))).sum().type(
                torch.float).sum().item()
        
        # Backpropagation:
        optimizer.zero_grad()
        loss = w[0]*loss + w[1] * loss_aux_1 + w[2] * loss_aux_2  # 0.4 is weight for auxillary classifier

        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        
    
        loss, current = loss.item(), batch * len(X)
        """
        if print_loss:
            if batch % 10 == 0:
                    print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")"""

    # return average training loss:
    train_loss /= size
    correct *= 100 / size
    accuracy_numbers *= 100 / (2 * size)
    auc = sum(aucs)/len(aucs)
    
    # F1 score:
    epsilon = 1e-7
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    f1 = 2 * 100 * (precision * recall) / (precision + recall + epsilon)
    
    return correct, f1, accuracy_numbers, train_loss, auc


def test_loop(dataloader, model, loss_fn, e, w, print_loss=True):
    size = len(dataloader.dataset)
    test_loss, correct = 0, 0
    tp, fp, fn, tn, accuracy_numbers = 0, 0, 0, 0, 0
    softmax = torch.nn.Softmax(dim=1)
    aucs = []
    with torch.no_grad():
        for X, y, Y in dataloader:
            pred1, pred2, logits = model(X)
            
            # [0-1] pred loss:
            loss = loss_fn(logits, y)

            # [0-9] pred loss for each pair:
            loss_aux_1 = loss_fn(pred1, Y[:, 0])
            loss_aux_2 = loss_fn(pred2, Y[:, 1])
            
            # sum with weights for total loss: 
            loss = w[0]*loss + w[1] * loss_aux_1 + w[2] * loss_aux_2 
            test_loss += loss.item()
            
            # Softmax to get probabilities:
            prob = softmax(logits)

            # calculate number of correct predictions:
            correct += (prob.argmax(1) == y).type(torch.float).sum().item()
            accuracy_numbers += (pred1.argmax(1) == Y[:,0]).type(torch.float).sum().item()
            accuracy_numbers += (pred2.argmax(1) == Y[:,1]).type(torch.float).sum().item()
            
            # roc-auc score:
            aucs.append(roc_auc_score(y.detach().numpy(), prob.detach().numpy()[:, 1]))
            
            # true positives and other rates:
            tp += (prob.argmax(1) * y).type(torch.float).sum().item()
            tn += ((1 - prob.argmax(1)) * (1 - y)).type(
                torch.float).sum().item()
            fp += ((1 - y) * prob.argmax(1)).sum().type(
                torch.float).sum().item()
            fn += (y * (1 - prob.argmax(1))).sum().type(
                torch.float).sum().item()

    # return average test loss and accuracy:
    test_loss /= size
    correct *= 100 / size
    accuracy_numbers *= 100 / (2 * size)
    auc = sum(aucs)/len(aucs)
    
    # F1 score:
    epsilon = 1e-7
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    f1 = 2 * 100 * (precision * recall) / (precision + recall + epsilon)

    if e % 10 == 0:
        if print_loss:
            print(
                f"Validation Error: \n Accuracy: {(correct):>0.1f}%, F1: {(f1):>0.1f}%, Accuracy ciphers: {(accuracy_numbers):>0.1f}%, Avg loss: {test_loss:>8f} \n "
            )
    return correct, f1, accuracy_numbers, test_loss, auc

In [24]:
def train_eval(model,
               optimizer,
               loss_fn,
               w,training_generator, test_generator,
               epochs=25,
               save=False,
               print_loss=False,
               print_epoch=False):


    train_perf, test_perf = [], []

    for t in range(epochs):
        if print_epoch:
            if t % 10 == 0:
                print(f"Epoch {t+1}\n-------------------------------")
        train_perf.append(
            train_loop(training_generator,
                       model,
                       loss_fn,
                       optimizer,
                       w,
                       print_loss=print_loss))
        test_perf.append(
            test_loop(test_generator,
                      model,
                      loss_fn,
                      e=t,
                      w=w,
                      print_loss=print_loss))
    #print("Done!")
    if save:
        torch.save(
            {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_perf': train_perf,
                'test_perf': test_perf
            }, '../data/{}'.format(type(model).__name__))
        print("Saved!")

    return train_perf, test_perf

In [15]:
def plot_performance(train_perf, test_perf):
    def sub_plot(axs_id, train_data, test_data, train_label, test_label,
                 x_label, title):
        axs[axs_id].plot(train_data, label=train_label)
        axs[axs_id].plot(test_data, label=test_label)
        axs[axs_id].set_xlabel(x_label)
        axs[axs_id].set_title(title)
        axs[axs_id].legend()

    fig, axs = plt.subplots(1, 4, figsize=(15, 5))

    train_accs = list(zip(*train_perf))[0]
    test_accs = list(zip(*test_perf))[0]
    test_f1 = list(zip(*test_perf))[1]
    train_f1 = list(zip(*train_perf))[1]
    
    # Accuracy 0-1 and F1 score:
    sub_plot(0, train_accs, test_accs, 'train accuracy', 'test accuracy',
             'Num epochs', 'Accuracy')
    axs[0].plot(test_f1, label='test_f1')
    axs[0].plot(train_f1, label='train_f1')
    axs[0].legend()
    
    # Accuracy on ciphers:
    test_accs_ciphers = list(zip(*test_perf))[2]
    train_accs_ciphers = list(zip(*train_perf))[2]
    sub_plot(1, train_accs_ciphers,  test_accs_ciphers,
             'train accuracy','test accuracy', 'Num epochs', 'Accuracy ciphers')
    
    # Losses:
    train_losses = list(zip(*train_perf))[3]
    test_losses = list(zip(*test_perf))[3]
    sub_plot(2, train_losses, test_losses, 'train loss', 'test loss',
             'Num epochs', 'Loss')
    
    # Roc-auc curve:
    train_aucs = list(zip(*train_perf))[4]
    test_aucs = list(zip(*test_perf))[4]
    sub_plot(3, train_aucs, test_aucs, 'train auc', 'test auc', 'Num epochs', 'ROC AUC')

In [16]:
def plot_performance_runs(performance,
                          metrics=['Accuracy', 'F1','Cipher Accuracy','Loss', 'ROC_AUC'],
                          style=0):
    df = pd.DataFrame(columns=['Run', 'is_Test', 'epoch'] + metrics)
    for i in range(len(performance)):
        for j in range(2):
            df_temp = pd.DataFrame(
                performance[i][j],
                columns=['Accuracy', 'F1', 'Cipher Accuracy', 'Loss', 'ROC_AUC'
                         ]).reset_index().rename(columns={'index': 'epoch'})
            df_temp.insert(0, "is_Test", j)
            df_temp.insert(0, "Run", i)
            df = df.append(df_temp)
            df = df.reset_index().drop(columns=['index'])

    sns.set_theme()
    for metric in metrics:
        if style == 0:
            g = sns.lineplot(data=df, x='epoch', y=metric, hue='is_Test')
            g.legend(['Train', 'Test'])
        if style == 1:
            g = sns.lineplot(data=df,
                             x='epoch',
                             y=metric,
                             hue='is_Test',
                             style='Run')
            g.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.title("{} vs {}".format(metric, "epoch"))
        plt.show()

### Train model:

In [None]:
# ------------------------------------------------------------------
# Control the randomness
# ------------------------------------------------------------------
torch.manual_seed(0)

# ------------------------------------------------------------------
# Perform n_runs runs
# ------------------------------------------------------------------
perf = []
n_runs = 1

params_model = {
    'batch_size': 64,
    'n_steps': 64,
    'n_inputs': 14,
    'n_neurons': 50,
    'n_outputs': 10
}

params_t = {
    'epochs': 100,
    'learning_rate': 1e-3,
    'loss_fn': nn.CrossEntropyLoss(),
    # weights given to main and auxiliary losses
    'w': [1, 0.8, 0.8]
}

for i in range(n_runs):
    ### ------------------------------------------------------------------
    ### Initialize/reset the model and optimizer (i.e. reset the weights)
    ### ------------------------------------------------------------------
    print(f'Run [{i}/{n_runs}]')
    model = ImageRNN(**params_model).to(device)
    learning_rate = 1e-3
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=params_t['learning_rate'])

    ### ------------------------------------------------------------------
    ### Train and evaluate
    ### ------------------------------------------------------------------
    loss_fn = nn.CrossEntropyLoss()
    perf += [
        train_eval(model,
                   optimizer,
                   params_t['loss_fn'],
                   params_t['w'],
                   params_t['epochs'],
                   print_loss=False,
                   print_epoch=False)
    ]

# ------------------------------------------------------------------
# Plot performance
# ------------------------------------------------------------------
plot_performance_runs(perf, ['Loss', 'F1','Cipher Accuracy','Accuracy', 'ROC_AUC'])

# roc curves
"""
y_probas = nn.functional.softmax(model(test_input), dim=1).detach().numpy()
y_test = test_target
skplt.metrics.plot_roc(y_test, y_probas)
plt.show()
"""

#### Siamese-RNN:

In [None]:
# ------------------------------------------------------------------
# control the randomness
# ------------------------------------------------------------------

torch.manual_seed(0)

# ------------------------------------------------------------------
# initialize/reset the model and optimizer (i.e. reset the weights)
# ------------------------------------------------------------------
params_model = {
    'batch_size': 64,
    'n_steps': 64,
    'n_inputs': 14,
    'n_neurons': 50,
    'n_outputs': 10
}

model = ImageRNN(**params_model).to(device)

print("model: {}\nParameters:".format(type(model).__name__))
for parameter in model.parameters():
    print(parameter.shape)
print()

params_t = {
    'epochs': 100,
    'learning_rate': 1e-3,
    'loss_fn': nn.CrossEntropyLoss(),
    # weights given to main and auxiliary losses
    'w': [1, 0.8, 0.8]
}

optimizer = torch.optim.Adam(model.parameters(), lr=params_t['learning_rate'])

# ------------------------------------------------------------------
# train and evaluate
# ------------------------------------------------------------------
loss_fn = nn.CrossEntropyLoss()
train_perf, test_perf = train_eval(model,
                                   optimizer,
                                   params_t['loss_fn'], params_t['w'],
                                   params_t['epochs'],
                                   save=False,
                                   print_loss=True,
                                   print_epoch=True)

# ------------------------------------------------------------------
# plot performance
# ------------------------------------------------------------------
plot_performance(train_perf, test_perf)

#### Siamese-RNN 2:

In [None]:
# ------------------------------------------------------------------
# control the randomness
# ------------------------------------------------------------------

torch.manual_seed(0)

# ------------------------------------------------------------------
# initialize/reset the model and optimizer (i.e. reset the weights)
# ------------------------------------------------------------------
params_model = {
    'batch_size': 64,
    'n_steps': 64,
    'n_inputs': 14,
    'n_neurons': 50,
    'n_outputs': 10
}

model = ImageRNN_2(**params_model).to(device)

print("model: {}\nParameters:".format(type(model).__name__))
for parameter in model.parameters():
    print(parameter.shape)
print()

params_t = {
    'epochs': 100,
    'learning_rate': 1e-3,
    'loss_fn': nn.CrossEntropyLoss(),
    # weights given to main and auxiliary losses
    'w': [1, 0.8, 0.8]
}

optimizer = torch.optim.Adam(model.parameters(), lr=params_t['learning_rate'])

# ------------------------------------------------------------------
# train and evaluate
# ------------------------------------------------------------------
loss_fn = nn.CrossEntropyLoss()
train_perf, test_perf = train_eval(model,
                                   optimizer,
                                   params_t['loss_fn'], params_t['w'],
                                   params_t['epochs'],
                                   save=False,
                                   print_loss=True,
                                   print_epoch=True)

# ------------------------------------------------------------------
# plot performance
# ------------------------------------------------------------------
plot_performance(train_perf, test_perf)

In [49]:
import random

# randomize runs:
# ------------------------------------------------------------------
# initialize/reset the model and optimizer (i.e. reset the weights)
# ------------------------------------------------------------------
params_model = {
    'batch_size': 64,
    'n_steps': 64,
    'n_inputs': 14,
    'n_neurons': 50,
    'n_outputs': 10
}

model = ImageRNN_2(**params_model).to(device)

print("model: {}\nParameters:".format(type(model).__name__))
for parameter in model.parameters():
    print(parameter.shape)
print()

params_t = {
    'epochs': 50,
    'learning_rate': 1e-3,
    'loss_fn': nn.CrossEntropyLoss(),
    # weights given to main and auxiliary losses
    'w': [1, 0.8, 0.8]
}

optimizer = torch.optim.Adam(model.parameters(), lr=params_t['learning_rate'])
loss_fn = nn.CrossEntropyLoss()

N = 10
random_seeds = random.sample(range(1, 30), 10)
metrics = pd.DataFrame(
    columns=['Runs','Accuracy', 'F1', 'Accuracy_cipher', 'Loss', 'AUC'])
metrics['Runs'] = range(N)

for i in range(N):
    print('Run: {}/{}'.format(i + 1, N))
    train_input, train_target, train_classes, test_input, test_target, test_classes = generate_pair_sets(
        1000)
    training_set = Dataset(train_input, train_target, train_classes)
    test_set = Dataset(test_input, test_target, test_classes)
    
    # Data loader for model, change num_workers when on GPU:
    params_ = {'batch_size': 64, 'shuffle': True, 'num_workers': 0}
    training_generator = torch.utils.data.DataLoader(training_set, **params_)
    test_generator = torch.utils.data.DataLoader(test_set, **params_)
    
    train_perf, test_perf = train_eval(model,
                                       optimizer,
                                       params_t['loss_fn'],
                                       params_t['w'],
                                       training_generator,
                                       test_generator,
                                       params_t['epochs'],
                                       save=False,
                                       print_loss=False,
                                       print_epoch=False)
    print(test_perf[-1])
    correct, f1, accuracy_numbers, train_loss, auc = test_perf[-1]
    metrics.iloc[i,1]= correct
    metrics.iloc[i,2] = f1
    metrics.iloc[i,3] = accuracy_numbers
    metrics.iloc[i,4]= train_loss
    metrics.iloc[i,5] = auc
metrics

model: ImageRNN_2
Parameters:
torch.Size([50, 14])
torch.Size([50, 50])
torch.Size([10, 50])
torch.Size([2, 20])
torch.Size([150, 20])
torch.Size([150])
torch.Size([150, 150])
torch.Size([150])
torch.Size([2, 150])
torch.Size([2])

Run: 1/10
(68.9, 72.45349365903061, 53.85, 0.04982289886474609, 0.7727789072925929)
Run: 2/10
(69.9, 72.10379480251976, 52.7, 0.0527953028678894, 0.7852484855917605)
Run: 3/10
(69.10000000000001, 71.72918071426872, 53.35, 0.05151599407196045, 0.7695533773450342)
Run: 4/10
(72.2, 74.86437111667775, 59.35, 0.051769250392913815, 0.7742951849298675)
Run: 5/10
(69.10000000000001, 70.37391636717999, 59.550000000000004, 0.051922221899032596, 0.7602308530135335)
Run: 6/10
(73.10000000000001, 76.46543680918307, 59.900000000000006, 0.05153920006752014, 0.7943658762411963)
Run: 7/10
(74.0, 77.03179710916949, 59.85, 0.046590493202209475, 0.8062588537844837)
Run: 8/10
(73.60000000000001, 77.20206753091092, 60.75, 0.048461812734603885, 0.7972592769271908)
Run: 9/10
(70.9,

Unnamed: 0,Runs,Accuracy,F1,Accuracy_cipher,Loss,AUC
0,0,68.9,72.4535,53.85,0.0498229,0.772779
1,1,69.9,72.1038,52.7,0.0527953,0.785248
2,2,69.1,71.7292,53.35,0.051516,0.769553
3,3,72.2,74.8644,59.35,0.0517693,0.774295
4,4,69.1,70.3739,59.55,0.0519222,0.760231
5,5,73.1,76.4654,59.9,0.0515392,0.794366
6,6,74.0,77.0318,59.85,0.0465905,0.806259
7,7,73.6,77.2021,60.75,0.0484618,0.797259
8,8,70.9,74.4961,62.1,0.0493848,0.792621
9,9,73.4,75.5963,61.15,0.0481885,0.82366


In [50]:
metrics.mean(), metrics.std()

(Runs                4.500000
 Accuracy           71.420000
 F1                 74.231643
 Accuracy_cipher    58.255000
 Loss                0.050199
 AUC                 0.787627
 dtype: float64,
 Runs               3.027650
 Accuracy           2.067097
 F1                 2.422594
 Accuracy_cipher    3.525104
 Loss               0.002016
 AUC                0.019119
 dtype: float64)

#### Basic convnet: 

In [None]:
# ------------------------------------------------------------------
# control the randomness
# ------------------------------------------------------------------

torch.manual_seed(0)

# ------------------------------------------------------------------
# initialize/reset the model and optimizer (i.e. reset the weights)
# ------------------------------------------------------------------
params_model = {
    'batch_size': 64,
    'n_steps': 64,
    'n_inputs': 14,
    'n_neurons': 50,
    'n_outputs': 10
}

model = Conv_basic().to(device)

print("model: {}\nParameters:".format(type(model).__name__))
for parameter in model.parameters():
    print(parameter.shape)
print()

params_t = {
    'epochs': 100,
    'learning_rate': 1e-3,
    'loss_fn': nn.CrossEntropyLoss(),
    # weights given to main and auxiliary losses
    'w': [1, 0.8, 0.8]
}

optimizer = torch.optim.Adam(model.parameters(), lr=params_t['learning_rate'])

# ------------------------------------------------------------------
# train and evaluate
# ------------------------------------------------------------------
loss_fn = nn.CrossEntropyLoss()
train_perf, test_perf = train_eval(model,
                                   optimizer,
                                   params_t['loss_fn'],
                                   params_t['w'],
                                   params_t['epochs'],
                                   save=False,
                                   print_loss=True,
                                   print_epoch=True)

# ------------------------------------------------------------------
# plot performance
# ------------------------------------------------------------------
plot_performance(train_perf, test_perf)

In [44]:
import random

# randomize runs:
# ------------------------------------------------------------------
# initialize/reset the model and optimizer (i.e. reset the weights)
# ------------------------------------------------------------------
params_model = {
    'batch_size': 64,
    'n_steps': 64,
    'n_inputs': 14,
    'n_neurons': 50,
    'n_outputs': 10
}

model = Conv_basic().to(device)

print("model: {}\nParameters:".format(type(model).__name__))
for parameter in model.parameters():
    print(parameter.shape)
print()

params_t = {
    'epochs': 100,
    'learning_rate': 1e-3,
    'loss_fn': nn.CrossEntropyLoss(),
    # weights given to main and auxiliary losses
    'w': [1, 0.8, 0.8]
}

optimizer = torch.optim.Adam(model.parameters(), lr=params_t['learning_rate'])
loss_fn = nn.CrossEntropyLoss()

N = 10
random_seeds = random.sample(range(1, 30), 10)
metrics = pd.DataFrame(
    columns=['Runs','Accuracy', 'F1', 'Accuracy_cipher', 'Loss', 'AUC'])
metrics['Runs'] = range(N)

for i in range(N):
    print('Run: {}/{}'.format(i + 1, N))
    train_input, train_target, train_classes, test_input, test_target, test_classes = generate_pair_sets(
        1000)
    training_set = Dataset(train_input, train_target, train_classes)
    test_set = Dataset(test_input, test_target, test_classes)
    
    # Data loader for model, change num_workers when on GPU:
    params_ = {'batch_size': 64, 'shuffle': True, 'num_workers': 0}
    training_generator = torch.utils.data.DataLoader(training_set, **params_)
    test_generator = torch.utils.data.DataLoader(test_set, **params_)
    
    train_perf, test_perf = train_eval(model,
                                       optimizer,
                                       params_t['loss_fn'],
                                       params_t['w'],
                                       training_generator,
                                       test_generator,
                                       params_t['epochs'],
                                       save=False,
                                       print_loss=False,
                                       print_epoch=False)
    print(test_perf[-1])
    correct, f1, accuracy_numbers, train_loss, auc = test_perf[-1]
    metrics.iloc[i,1]= correct
    metrics.iloc[i,2] = f1
    metrics.iloc[i,3] = accuracy_numbers
    metrics.iloc[i,4]= train_loss
    metrics.iloc[i,5] = auc
metrics

model: Conv_basic
Parameters:
torch.Size([10, 1, 5, 5])
torch.Size([10])
torch.Size([20, 10, 5, 5])
torch.Size([20])
torch.Size([30, 40])
torch.Size([30])
torch.Size([10, 30])
torch.Size([10])
torch.Size([2, 20])

Run: 1/10
(73.9, 75.07162822403426, 68.2, 0.03203369951248169, 0.8339582987254909)
Run: 2/10
(78.60000000000001, 79.84933585169613, 72.60000000000001, 0.027249876260757446, 0.8759174062024448)
Run: 3/10
(80.10000000000001, 81.99094521288292, 78.65, 0.02354672122001648, 0.8949376791273713)
Run: 4/10
(80.9, 82.6835851940273, 78.95, 0.022619142532348632, 0.8967654293156648)
Run: 5/10
(82.60000000000001, 84.38060540190328, 82.2, 0.021699075341224672, 0.8997894884161121)
Run: 6/10
(82.4, 84.17265685551051, 82.2, 0.024346720218658448, 0.8953469068996494)
Run: 7/10
(82.7, 84.53976263946917, 81.0, 0.02281933259963989, 0.9087170995886285)
Run: 8/10
(84.30000000000001, 85.76608746044158, 81.9, 0.022233237862586974, 0.9165412220480633)
Run: 9/10
(81.80000000000001, 82.69961475729765, 81

Unnamed: 0,Runs,Accuracy,F1,Accuracy_cipher,Loss,AUC
0,0,73.9,75.0716,68.2,0.0320337,0.833958
1,1,78.6,79.8493,72.6,0.0272499,0.875917
2,2,80.1,81.9909,78.65,0.0235467,0.894938
3,3,80.9,82.6836,78.95,0.0226191,0.896765
4,4,82.6,84.3806,82.2,0.0216991,0.899789
5,5,82.4,84.1727,82.2,0.0243467,0.895347
6,6,82.7,84.5398,81.0,0.0228193,0.908717
7,7,84.3,85.7661,81.9,0.0222332,0.916541
8,8,81.8,82.6996,81.85,0.0232998,0.892814
9,9,83.0,84.489,82.55,0.0234803,0.90839


In [46]:
metrics.mean(), metrics.std()

(Runs                4.500000
 Accuracy           81.030000
 F1                 82.564327
 Accuracy_cipher    79.010000
 Loss                0.024333
 AUC                 0.892318
 dtype: float64,
 Runs               3.027650
 Accuracy           2.979952
 F1                 3.120127
 Accuracy_cipher    4.845834
 Loss               0.003105
 AUC                0.023287
 dtype: float64)

#### Deeper Convnet: 

In [None]:
# ------------------------------------------------------------------
# control the randomness
# ------------------------------------------------------------------

torch.manual_seed(0)

# ------------------------------------------------------------------
# initialize/reset the model and optimizer (i.e. reset the weights)
# ------------------------------------------------------------------
params_model = {
    'batch_size': 64,
    'n_steps': 64,
    'n_inputs': 14,
    'n_neurons': 50,
    'n_outputs': 10
}

model = SiameseConvNet2().to(device)

print("model: {}\nParameters:".format(type(model).__name__))
for parameter in model.parameters():
    print(parameter.shape)
print()

params_t = {
    'epochs': 100,
    'learning_rate': 1e-3,
    'loss_fn': nn.CrossEntropyLoss(),
    # weights given to main and auxiliary losses
    'w': [1, 0.8, 0.8]
}

optimizer = torch.optim.Adam(model.parameters(), lr=params_t['learning_rate'])

# ------------------------------------------------------------------
# train and evaluate
# ------------------------------------------------------------------
loss_fn = nn.CrossEntropyLoss()
train_perf, test_perf = train_eval(model,
                                   optimizer,
                                   params_t['loss_fn'],
                                   params_t['w'],
                                   params_t['epochs'],
                                   save=False,
                                   print_loss=True,
                                   print_epoch=True)

# ------------------------------------------------------------------
# plot performance
# ------------------------------------------------------------------
plot_performance(train_perf, test_perf)

### Predictions on test set:

In [None]:
# Make a few predictions:
params_ = {'batch_size': 64, 'shuffle': True, 'num_workers': 0}
test_generator = torch.utils.data.DataLoader(test_set, **params_)
size = len(test_generator.dataset)
softmax = torch.nn.Softmax(dim=1)
fig, ax = plt.subplots(6, 2, figsize=(10, 18))

with torch.no_grad():
    for batch, (X, y, Y) in enumerate(test_generator):
        if batch == 0:
            pred1, pred2, pred = model(X)
            prob = softmax(pred)
            prediction = prob.argmax(1).type(torch.float)
            pred_cipher_1 = pred1.argmax(1).type(torch.float)
            pred_cipher_2 = pred2.argmax(1).type(torch.float)
            for j in range(6):
                im1 = X[j][0, :, :]
                im2 = X[j][1, :, :]
                target = y[j]
                classes = Y[j]
                pred = prediction[j]
                pred1 = int(pred_cipher_1[j])
                pred2 = int(pred_cipher_2[j])
                ax[j, 0].imshow(im1, cmap='gray')
                ax[j, 1].imshow(im2, cmap='gray')
                ax[j, 0].set_title(
                    f'Cipher (true/pred): {classes[0]}/{pred1}, target: {target}, pred: {pred}')
                ax[j, 1].set_title(f'Cipher (true/pred): {classes[1]}/{pred2}')