# NEURAL NETWORKS AND DEEP LEARNING

## Homework 1 - Supervised Deep Learning

### Classification task

Puppin Michele - 1227474

In [None]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F

# Seed both NumPy's and PyTorch's global random generators with the same value
SEED = 25
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training device: {device}")

## Dataset

In [None]:
# Download (if needed) the MNIST train and test splits into 'classifier_data'.
# No transform is attached here; both names are rebound later with a transform.
train_dataset, test_dataset = (
    torchvision.datasets.MNIST('classifier_data', train=is_train, download=True)
    for is_train in (True, False)
)

## Early stopping

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    The object is called once per epoch with the current validation loss and
    the model. Whenever the loss improves, the model's ``state_dict`` is saved
    to ``path``; after ``patience`` consecutive calls without improvement the
    ``early_stop`` flag is set to True (the caller is responsible for checking
    it and leaving the training loop).
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to (passed to ``torch.save``).
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0            # consecutive calls without improvement
        self.best_score = None      # best (negated) validation loss seen so far
        self.early_stop = False
        # np.inf instead of np.Inf: the capitalized alias was removed in NumPy 2.0
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one validation result and update the stopping state.

        Args:
            val_loss (float): validation loss of the current epoch.
            model: object exposing ``state_dict()``; checkpointed on improvement.
        """
        # Work with the negated loss so that "higher is better"
        score = -val_loss

        if self.best_score is None:
            # First call: always counts as an improvement
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Not better than the best score by at least delta
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Save the model's state_dict to ``self.path`` and remember ``val_loss`` as the new minimum."""
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

## Network definition

In [None]:
class Net(nn.Module):
    """Small convolutional classifier: two conv + ReLU + max-pool stages
    followed by dropout and two fully connected layers producing 10 outputs.

    Besides ``forward`` the class bundles its own training, validation,
    prediction and (de)serialization helpers.
    """

    def __init__(self, DropProb = 0):
        """Build the layers.

        Args:
            DropProb (float): dropout probability applied to the flattened
                convolutional features before the first linear layer.
        """
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1,  out_channels=16, kernel_size=3, stride=1, padding=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=2)
        self.pool = nn.MaxPool2d(2, 2)
        self.drop = nn.Dropout(DropProb)
        # 32 channels of 8x8: the spatial size reached from a 28x28 input
        # (28 -> 30 -> 15 -> 17 -> 8 through conv1, pool, conv2, pool)
        self.fc1 = nn.Linear(32*8*8, 128)
        self.fc2 = nn.Linear(128, 10)
        self.act = nn.ReLU()
        print('Network initialized')

    def forward(self, x, return_layer = 0):
        """Run the network on a batch.

        Args:
            x (Tensor): input batch with one channel.
            return_layer (int): 1 returns the ReLU output of ``conv1``,
                2 returns the ReLU output of ``conv2``; any other value
                returns the output of ``fc2`` (no softmax is applied).
        """
        x = self.act(self.conv1(x))
        if return_layer == 1:
            return x
        x = self.pool(x)
        x = self.act(self.conv2(x))
        if return_layer == 2:
            return x
        x = self.pool(x)
        x = torch.flatten(x, 1)
        x = self.drop(x)
        x = self.act(self.fc1(x))
        x = self.fc2(x)
        return x

    def train_nn(self, train_loader, optimizer, loss_func, device):
        """Train for one pass over ``train_loader``.

        Returns:
            list: one loss value (0-d NumPy array) per batch.
        """
        train_loss= []
        self.train()
        for sample_batched in train_loader:
            x_batch = sample_batched[0].to(device)
            label_batch = sample_batched[1].to(device)
            out = self.forward(x_batch)
            loss = loss_func(out, label_batch)
            # Clear gradients left over from the previous batch
            self.zero_grad()
            loss.backward()
            optimizer.step()
            loss_batch = loss.detach().cpu().numpy()
            train_loss.append(loss_batch)
        return train_loss

    def validation_nn(self, val_loader, loss_func, device):
        """Evaluate on ``val_loader`` in eval mode without gradient tracking.

        Returns:
            list: one loss value (0-d NumPy array) per batch.
        """
        val_loss = []
        self.eval()
        with torch.no_grad():
            for sample_batched in val_loader:
                x_batch = sample_batched[0].to(device)
                label_batch = sample_batched[1].to(device)
                out = self.forward(x_batch)
                loss = loss_func(out, label_batch)
                loss_batch = loss.detach().cpu().numpy()
                val_loss.append(loss_batch)
        return val_loss

    def fit(self, train_loader, val_loader, optimizer, loss_func, epochs, device):
        """Train for up to ``epochs`` epochs with early stopping on the validation loss.

        An ``EarlyStopping`` helper (patience 25) is called after every epoch
        with the mean validation loss; it checkpoints the model to its default
        path whenever that loss improves. The checkpoint is NOT reloaded here,
        so the network keeps the weights of the last epoch that ran.

        Returns:
            tuple: (train_loss_log, val_loss_log), the per-epoch mean losses.
        """
        train_loss_log = []
        val_loss_log = []

        early_stopping = EarlyStopping(patience = 25, verbose = False)

        for epoch in range(epochs):
            print(epoch)
            # Training
            train_loss = self.train_nn(train_loader, optimizer, loss_func, device)
            train_loss_log.append(np.mean(train_loss))
            # Validation
            val_loss = self.validation_nn(val_loader, loss_func, device)
            mean_val_loss = np.mean(val_loss)
            val_loss_log.append(mean_val_loss)

            early_stopping(mean_val_loss, self)
            if early_stopping.early_stop:
                print("Early stopping")
                break

        return train_loss_log, val_loss_log

    def predict(self, input_loader, loss_func, device):
        """Run the network over ``input_loader`` in eval mode.

        Returns:
            tuple: (inputs, outputs, labels, test_loss) where the first three
            are the per-batch tensors concatenated along dimension 0 (kept on
            ``device``) and ``test_loss`` is ``loss_func`` evaluated once on
            all outputs and labels together.
        """
        inputs = []
        outputs = []
        labels = []
        self.eval()
        with torch.no_grad():
            for sample_batched in input_loader:
                x_batch = sample_batched[0].to(device)
                label = sample_batched[1].to(device)
                out = self.forward(x_batch)
                inputs.append(x_batch)
                outputs.append(out)
                labels.append(label)
        inputs = torch.cat(inputs)
        outputs = torch.cat(outputs)
        labels = torch.cat(labels)
        test_loss = loss_func(outputs, labels)
        return inputs, outputs, labels, test_loss

    def save(self, path):
        """Save the network's state_dict to ``path`` with ``torch.save``."""
        torch.save(self.state_dict(), path)

    def load(self, path):
        """Load a state_dict previously written by ``save``.

        The tensors are mapped onto the device the network currently lives on,
        so a checkpoint written on a GPU can be loaded on a CPU-only machine.
        """
        target_device = next(self.parameters()).device
        self.load_state_dict(torch.load(path, map_location=target_device))

    def restart(self):
        """Re-initialize every layer with fresh parameters.

        The configured dropout probability and the current device are carried
        over (a bare ``__init__()`` would reset dropout to 0 and leave the new
        parameters on the CPU). New parameter objects are created, so any
        optimizer built before this call must be rebuilt.
        """
        drop_prob = self.drop.p
        target_device = next(self.parameters()).device
        self.__init__(drop_prob)
        self.to(target_device)

## Random Grid Search

In [None]:
def RandomGridSearch(config, train_load, val_load, rep, device):
    """Random search over a hyper-parameter grid.

    For each of ``rep`` repetitions one value per key of ``config`` is drawn
    with ``np.random.choice``, a fresh ``Net`` is built and trained with
    ``Net.fit``, and the resulting loss curves are recorded.

    Args:
        config (dict): maps each hyper-parameter name to a list of candidate
            values. The keys read here are 'Dropout', 'Epochs', 'Optimizer'
            ('Adam' or 'SGD'), 'LearningRate' and 'Regularization'.
        train_load: training data loader passed to ``Net.fit``.
        val_load: validation data loader passed to ``Net.fit``.
        rep (int): number of random configurations to try.
        device: device the model is moved to and trained on.

    Returns:
        tuple: (par_log, train_loss_log, val_loss_log); entry ``i`` of each
        list holds the sampled parameters and the per-epoch training /
        validation losses of repetition ``i``.

    Raises:
        ValueError: if the sampled optimizer name is neither 'Adam' nor 'SGD'.
    """
    par_log = []
    train_loss_log = []
    val_loss_log = []

    for i in range(rep):
        print(i)

        # Draw one candidate value for every hyper-parameter
        sample_params = {k: np.random.choice(config[k]) for k in config.keys()}

        # Reject unknown optimizers up front: previously `opt` stayed unbound
        # (or kept the previous repetition's optimizer, bound to another model)
        optimizer_name = sample_params['Optimizer']
        if optimizer_name not in ('Adam', 'SGD'):
            raise ValueError(f"Unsupported optimizer: {optimizer_name}")

        par_log.append(sample_params)

        DropProb = sample_params['Dropout']

        model = Net(DropProb).to(device)

        loss_func = nn.CrossEntropyLoss()
        epochs = sample_params['Epochs']

        if optimizer_name == 'Adam':
            opt = optim.Adam(model.parameters(), lr = sample_params['LearningRate'], weight_decay = sample_params['Regularization'])
        else:
            opt = optim.SGD( model.parameters(), lr = sample_params['LearningRate'], weight_decay = sample_params['Regularization'], momentum=0.9)

        # Training & validation
        train_loss, val_loss = model.fit(train_load, val_load, opt, loss_func, epochs, device)

        # Storing training/validation loss
        train_loss_log.append( train_loss )
        val_loss_log.append( val_loss )

    return par_log, train_loss_log, val_loss_log

## Training and Testing the network

### Model selection

In [None]:
# Candidate values for every hyper-parameter explored by the random search
dict_params = dict(
    LearningRate=[0.01, 0.001, 0.0001],
    Regularization=[1e-4, 1e-5, 1e-6],
    Dropout=[0, 0.1, 0.15],
    Epochs=[20, 40, 60],
    Optimizer=['SGD', 'Adam'],
)

In [None]:
# Random transformations: one of blur / rotation / perspective distortion is
# picked at random each time `random_transform` is applied.
# NOTE: `random_transform` is only defined here; it is not part of
# `composed_transform` below.
add_noise = transforms.GaussianBlur(kernel_size=3)
rotation = transforms.RandomRotation(30)
distortion = transforms.RandomPerspective(0.5)
random_transform = transforms.RandomChoice([add_noise, rotation, distortion])

# Deterministic preprocessing: tensor conversion followed by normalization
composed_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

In [None]:
# MNIST training split with the tensor conversion + normalization applied on access
train_dataset = torchvision.datasets.MNIST(
    root='classifier_data',
    train=True,
    download=True,
    transform=composed_transform,
)

In [None]:
# Random 50000 / 10000 split of the training data; only the training part is shuffled
train_set, val_set = torch.utils.data.random_split(train_dataset, lengths=[50000, 10000])
train_load = DataLoader(dataset=train_set, batch_size=64, shuffle=True)
val_load = DataLoader(dataset=val_set, batch_size=64, shuffle=False)

In [None]:
# Try 20 random hyper-parameter configurations
params_list, train_loss_list, val_loss_list = RandomGridSearch(
    config=dict_params,
    train_load=train_load,
    val_load=val_load,
    rep=20,
    device=device,
)

In [None]:
# Select best parameters: the configuration whose validation loss at its
# last recorded epoch is the lowest
best_params = params_list[
    np.argmin([losses[-1] for losses in val_loss_list])
]
best_params

### Train with best parameters

In [None]:
# Hard-coded hyper-parameters used for the final training run
# (this rebinds `best_params`, replacing whatever the search cell selected)
best_params = dict(
    LearningRate=0.0001,
    Regularization=1e-4,
    Dropout=0.15,
    Epochs=60,
    Optimizer='Adam',
)

In [None]:
# Final run: train on the whole training dataset.
train_load = DataLoader(train_dataset, batch_size=20, shuffle=True, num_workers=0 )
# No separate hold-out set here: the "validation" loss (and therefore the
# early stopping inside fit) is computed on the training data itself.
val_load = train_load

DropProb = best_params['Dropout']

model = Net(DropProb).to(device)

loss_func = nn.CrossEntropyLoss()
epochs = best_params['Epochs']

# Build the optimizer; fail loudly on an unknown name instead of leaving
# `opt` undefined (or bound to an optimizer from a previously run cell)
if best_params['Optimizer']=='Adam':
    opt = optim.Adam(model.parameters(), lr = best_params['LearningRate'], weight_decay = best_params['Regularization'])
elif best_params['Optimizer']=='SGD':
    opt = optim.SGD( model.parameters(), lr = best_params['LearningRate'], weight_decay = best_params['Regularization'], momentum=0.9)
else:
    raise ValueError(f"Unsupported optimizer: {best_params['Optimizer']}")

# Training & validation
train_loss, val_loss = model.fit(train_load, val_load, opt, loss_func, epochs, device)

In [None]:
# Plot Training and Validation loss, one point per epoch
for curve, curve_label in ((train_loss, 'Training'), (val_loss, 'Validation')):
    plt.plot(curve, label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Write the figure to disk and hand it to the browser before displaying it
plt.savefig('TrainValLoss_Class.pdf', bbox_inches='tight')
files.download('TrainValLoss_Class.pdf')
plt.show()

In [None]:
# Save trained model: network weights through Net.save, optimizer state through torch.save
model.save(path='net_class_parameters.torch')
torch.save(obj=opt.state_dict(), f='optimizer_class_state.torch')

### Test the trained model

In [None]:
# Load test set with tensor conversion + normalization
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])
test_dataset = torchvision.datasets.MNIST(
    root='classifier_data',
    train=False,
    download=True,
    transform=test_transform,
)

# Fixed order: no shuffling for evaluation
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

In [None]:
# Run predictions on the test set with the weights saved to disk
loss_func = nn.CrossEntropyLoss()

model.load(path='net_class_parameters.torch')

inputs, outputs, labels, test_loss = model.predict(
    input_loader=test_loader,
    loss_func=loss_func,
    device=device,
)

In [None]:
# Evaluate test accuracy
# Move network outputs and labels to NumPy
outputs = outputs.detach().cpu().numpy()
labels = labels.detach().cpu().numpy()

# Predicted class = index of the largest output of each sample
predicted_labels = list(outputs.argmax(axis=1))
# A non-zero difference marks a misclassified sample
diffs = np.asarray(predicted_labels) - labels
wrong = np.count_nonzero(diffs)
test_accuracy = 1 - wrong/len(outputs)
print("Test accuracy: ", test_accuracy)

### Confusion matrix for the test set

In [None]:
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          save_path='models/'):
    """
    Plot a confusion matrix as an annotated image and save it to disk.

    Args:
        cm (ndarray): square confusion matrix (rows: true labels,
            columns: predicted labels, as labelled on the axes).
        classes (sequence): tick labels, one per class.
        normalize (bool): if True, each row is divided by its sum and values
            are printed with three decimals; rows summing to zero stay zero.
        title (str): figure title.
        cmap: matplotlib colormap used for the image.
        save_path (str): prefix of the output file; the figure is written to
            ``save_path + "_picConfMatrix.png"``.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guard the division so that a class with no samples gives 0, not NaN
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(15, 15))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, fontsize=15)
    plt.yticks(tick_marks, classes, fontsize=15)

    fmt = '.3f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), size=11,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=30)
    plt.xlabel('Predicted label', fontsize=30)
    # Apply the layout BEFORE saving so the file matches what is displayed
    plt.tight_layout()
    plt.savefig(save_path+"_picConfMatrix.png", dpi=400)

In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(labels, predicted_labels)
categories = list(range(10))
# plot_confusion_matrix appends "_picConfMatrix.png" to save_path, so pass a
# bare prefix: the previous './confusion.pdf' produced a PNG file named
# 'confusion.pdf_picConfMatrix.png'
plot_confusion_matrix(cm, categories, normalize=False, save_path='./confusion')

## Weights Histogram

In [None]:
# Copy the parameters of every layer to NumPy (weights and biases)
# First convolutional layer
c1_w, c1_b = (p.data.cpu().numpy() for p in (model.conv1.weight, model.conv1.bias))

# Second convolutional layer
c2_w, c2_b = (p.data.cpu().numpy() for p in (model.conv2.weight, model.conv2.bias))

# First fully connected layer
h1_w, h1_b = (p.data.cpu().numpy() for p in (model.fc1.weight, model.fc1.bias))

# Second fully connected layer
h2_w, h2_b = (p.data.cpu().numpy() for p in (model.fc2.weight, model.fc2.bias))

# Weights histogram: one panel per layer, 50 bins each
fig, axs = plt.subplots(4, 1, figsize=(12,8))
panels = (
    (c1_w, 'First convolutional layer weights'),
    (c2_w, 'Second convolutional layer weights'),
    (h1_w, 'First fully connected layer weights'),
    (h2_w, 'Second fully connected layer weights'),
)
for ax, (weights, panel_title) in zip(axs, panels):
    ax.hist(weights.flatten(), 50)
    ax.set_title(panel_title)
for ax in axs:
    ax.grid()
plt.tight_layout()
plt.savefig('Weights_Class.pdf', bbox_inches='tight')
files.download('Weights_Class.pdf')
plt.show()

## Activation Profiles

In [None]:
def get_activation(layer, input, output):
    """Forward hook: store the ReLU of the hooked layer's output.

    The result is written to the module-level name ``activation`` and
    overwritten on every call; nothing is returned.
    """
    global activation
    activation = output.relu()

### Register hook
# After every forward pass the global `activation` holds ReLU(fc2 output)
hook_handle = model.fc2.register_forward_hook(get_activation)

### Analyze activations
model = model.to(device)
model.eval()
tloader = DataLoader(test_dataset, batch_size=1, shuffle=True)
drawn = []
try:
    with torch.no_grad():
        for _ in range(3):
            # A fresh iterator reshuffles the loader, so every draw is an
            # independent random test sample (as with the former for/break loops)
            x, label = (t.to(device) for t in next(iter(tloader)))
            y = model(x)
            drawn.append((x, label, y, activation))
finally:
    ### Remove hook
    # Always detach the hook, even if a forward pass fails
    hook_handle.remove()

(x1, label1, y1, z1), (x2, label2, y2, z2), (x3, label3, y3, z3) = drawn

### Plot activations
fig, axs = plt.subplots(3, 1, figsize=(12,8))
for ax, (_, label, _, z) in zip(axs, drawn):
    # `use_line_collection` was removed from stem() in Matplotlib 3.8; the
    # line-collection rendering it requested is the default behaviour
    ax.stem(z.cpu().numpy()[0])
    # .item() gives a Python int; formatting a 1-element array with %d is deprecated
    ax.set_title('Label = %d' % label.item())
plt.tight_layout()
plt.savefig('Activations_Class.pdf', bbox_inches='tight')
files.download('Activations_Class.pdf')

plt.show()

## Receptive fields

In [None]:
# Loader yielding single, randomly ordered test images
example = DataLoader(
    dataset=test_dataset,
    batch_size=1,
    shuffle=True,
)

In [None]:
# Feature maps of the first convolutional layer (after ReLU) for one random test image
with torch.no_grad():
    sample_batched = next(iter(example))
    x_batch = sample_batched[0].to(device)
    first_conv = model.forward(x_batch, 1).cpu().numpy()

In [None]:
# Top row: first five conv1 kernels; bottom row: the matching conv1 feature
# maps of the sampled image
fig, axs = plt.subplots(2, 5, figsize=(15,5))
axs = axs.flatten()
for i in range(5):
    axs[i].imshow(c1_w[i, 0, :, :], cmap='Greys')
    axs[i+5].imshow(first_conv[0, i, :, :], cmap='Greys')
for ax in axs:
    ax.set_yticks([])
    ax.set_xticks([])

plt.savefig('Filters1_Class.pdf', bbox_inches='tight')
files.download('Filters1_Class.pdf')
# plt.show() displays the figure; the former argument-less plt.plot() drew nothing
plt.show()

In [None]:
 # Feature maps of the second convolutional layer (after ReLU) for one random test image
 with torch.no_grad():
     sample_batched = next(iter(example))
     x_batch = sample_batched[0].to(device)
     second_conv = model.forward(x_batch, 2).cpu().numpy()

In [None]:
# Top row: first five conv2 kernels (only their slice for input channel 0);
# bottom row: the matching conv2 feature maps of the sampled image
fig, axs = plt.subplots(2, 5, figsize=(15,5))
axs = axs.flatten()
for i in range(5):
    axs[i].imshow(c2_w[i, 0, :, :], cmap='Greys')
    axs[i+5].imshow(second_conv[0, i, :, :], cmap='Greys')
for ax in axs:
    ax.set_yticks([])
    ax.set_xticks([])

plt.savefig('Filters2_Class.pdf', bbox_inches='tight')
files.download('Filters2_Class.pdf')
# plt.show() displays the figure; the former argument-less plt.plot() drew nothing
plt.show()