# Deep Learning Applications: Laboratory #1

In this first laboratory we worked with some elements of Deep Models. 

## Exercise 1: 
The scope of this exercise was to study a Multilayer Perceptron and its performance on a relatively easy dataset, MNIST.
A Multilayer Perceptron (MLP) is a type of feedforward neural network consisting of multiple layers of neurons. It is composed of:
* Input layer which receives input features.
* Hidden layers - 1+ layers with neurons that apply transformations using weights, biases, and activation functions.
* Output layer which produces final predictions for implemented tasks.

## Exercise 1.1: A baseline MLP for MNIST Classification

In this exercise, we implemented an MLP to classify handwritten digits from the MNIST dataset. The steps for this exercise:

1. Data Preparation - we loaded the MNIST dataset, which consists of grayscale images of handwritten digits (0–9) and split the dataset into training, validation, and test sets.  

2. MLP Model Implementation - we defined an MLP.  

3. Training the Model - we trained the model for multiple epochs using the training set, optimizing using Adam.  

4. Model Evaluation - after each epoch, we would evaluate the model on the validation set and compute validation accuracy.

5. Plot Performance Curves

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import wandb
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report
from typing import List, Tuple
from torchvision import transforms
from torchvision.datasets import MNIST, CIFAR10
from torch.utils.data import Subset, DataLoader
from functools import reduce

In [None]:
def load_mnist_data(val_size: int = 5000):
    """Load MNIST and split the training set into train and validation parts.

    The official training split is shuffled with ``np.random.permutation``;
    the first ``val_size`` shuffled indices become the validation subset and
    the remaining ones the training subset. The official test split is
    returned untouched.

    Args:
        val_size: Number of training samples to hold out for validation.

    Returns:
        A ``(ds_train, ds_val, ds_test)`` tuple of datasets.
    """
    # Same preprocessing for every split: tensor conversion + normalisation.
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])

    full_train = MNIST(root='./data', train=True, download=True, transform=mnist_transform)
    ds_test = MNIST(root='./data', train=False, download=True, transform=mnist_transform)

    # One random permutation drives both subsets, so they never overlap.
    shuffled = np.random.permutation(len(full_train))
    val_idx, train_idx = shuffled[:val_size], shuffled[val_size:]

    return Subset(full_train, train_idx), Subset(full_train, val_idx), ds_test

In [None]:
# this function is to train a model for a single epoch
def train_epoch(model, dl, opt, epoch, device='cpu'):
    """Run one optimisation pass over ``dl`` and return the mean batch loss.

    Args:
        model: Network to train; switched to training mode here.
        dl: Iterable yielding ``(inputs, targets)`` batches.
        opt: Optimizer stepped once per batch.
        epoch: Epoch index, used for the progress-bar label and wandb logging.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The mean of the per-batch cross-entropy losses.
    """
    model.train()
    batch_losses = []

    progress = tqdm(dl, desc=f'Training epoch {epoch}', leave=True)
    for inputs, targets in progress:
        inputs = inputs.to(device)
        targets = targets.to(device)

        opt.zero_grad()
        loss = F.cross_entropy(model(inputs), targets)
        loss.backward()
        opt.step()

        loss_value = loss.item()
        batch_losses.append(loss_value)

        # One wandb record per batch, tagged with the current epoch.
        wandb.log({"Train_loss": loss_value, "epoch": epoch})

    return np.mean(batch_losses)

# this function is to evaluate the model
def evaluate_model(model, dl, epoch, is_test=False, device='cpu', use_wandb=False):
    """Compute accuracy and a per-class report for ``model`` on ``dl``.

    Args:
        model: Network to evaluate; switched to eval mode here.
        dl: Iterable yielding ``(inputs, targets)`` batches.
        epoch: Epoch index attached to the logged metric. For backward
            compatibility a ``str`` or ``torch.device`` is also accepted here
            and is then interpreted as ``device`` (see the note below).
        is_test: When true the accuracy is logged as ``"Test_accuracy"``
            instead of ``"Validation_accuracy"``.
        device: Device the batches are moved to before the forward pass.
        use_wandb: Force logging to wandb. Logging also happens whenever a
            wandb run is currently active, which keeps the previous
            always-log behaviour for existing callers.

    Returns:
        ``(accuracy, report)`` where ``report`` is the text produced by
        ``sklearn.metrics.classification_report``.
    """
    # Call sites written as evaluate_model(model, dl, device) bind the device
    # to `epoch` and leave `device` at 'cpu', which breaks on CUDA models.
    # A str/torch.device can never be an epoch index, so treat it as the device.
    if isinstance(epoch, (str, torch.device)):
        device, epoch = epoch, None

    model.eval()
    predictions, gts = [], []

    with torch.no_grad():
        for xs, ys in tqdm(dl, desc='Evaluating', leave=False):
            xs, ys = xs.to(device), ys.to(device)
            preds = torch.argmax(model(xs), dim=1)

            gts.append(ys.cpu())
            predictions.append(preds.cpu())

    gts = torch.cat(gts).numpy()
    predictions = torch.cat(predictions).numpy()

    acc = accuracy_score(gts, predictions)
    report = classification_report(gts, predictions, zero_division=0, digits=3)

    # logging accuracy with wandb; skipped when no run is active so the
    # function can also be used outside of a tracked experiment
    if use_wandb or wandb.run is not None:
        metric_name = "Test_accuracy" if is_test else "Validation_accuracy"
        payload = {metric_name: acc}
        if epoch is not None:
            payload["epoch"] = epoch
        wandb.log(payload)

    return acc, report


# this function is to plot the loss curve and validation accuracy
def plot_validation_curves(losses_and_accs):
    """Draw the training-loss and validation-accuracy curves side by side.

    Args:
        losses_and_accs: Sequence of ``(loss, accuracy)`` pairs, one per epoch.
    """
    losses, accs = zip(*losses_and_accs)

    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(16, 8))

    # Left panel: average training loss.
    loss_ax.plot(losses, marker='o', label='Training Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Average Training Loss per Epoch')
    loss_ax.legend()

    # Right panel: validation accuracy, best value reported in the title.
    best_acc = np.max(accs)
    best_epoch = np.argmax(accs) + 1
    acc_ax.plot(accs, marker='o', label='Validation Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.set_title(f'Best Accuracy = {best_acc:.4f} @ epoch {best_epoch}')
    acc_ax.legend()

    fig.tight_layout()
    plt.show()

### A basic MLP

The MLP class constructs a feedforward neural network with one input layer, some hidden layers with ReLU activation, and an output layer.

In [None]:
class MLP(nn.Module):
    """Fully connected classifier: input layer, hidden stack, linear head.

    Args:
        input_size: Number of features after flattening one sample.
        hidden_size: Width shared by the input layer and every hidden layer.
        hidden_layers: Number of extra ``hidden_size -> hidden_size`` layers.
        output_size: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, hidden_layers, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)

        stack = []
        for _ in range(hidden_layers):
            stack.append(nn.Linear(hidden_size, hidden_size))
        self.hidden_layers = nn.ModuleList(stack)

        self.head = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Flatten each sample, apply the ReLU layers, and return the logits."""
        h = x.flatten(1)
        # fc1 and every hidden layer are followed by a ReLU; the head is not.
        for layer in (self.fc1, *self.hidden_layers):
            h = F.relu(layer(h))
        return self.head(h)

def train_model(model, dl_train, dl_val, epochs, opt, device):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: Network to train.
        dl_train: Loader of training batches.
        dl_val: Loader of validation batches.
        epochs: Number of epochs to run.
        opt: Optimizer used by ``train_epoch``.
        device: Device used for both training and evaluation.

    Returns:
        List of ``(mean_training_loss, validation_accuracy)`` per epoch.
    """
    losses_and_accs = []
    for epoch in range(epochs):
        loss = train_epoch(model, dl_train, opt, epoch, device)
        # evaluate_model's third positional parameter is `epoch`, so the
        # device must go by keyword; passing it positionally left evaluation
        # on the default 'cpu' device even when the model lives on the GPU.
        val_acc, _ = evaluate_model(model, dl_val, epoch, device=device)
        losses_and_accs.append((loss, val_acc))

    # Nothing to plot when no epoch was run.
    if losses_and_accs:
        plot_validation_curves(losses_and_accs)
    return losses_and_accs

In [None]:
# Train and evaluate the baseline MLP on MNIST, tracking the run with wandb.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# hyperparameters
epochs = 15
lr = 0.0001
batch_size = 128
input_size = 28 * 28
width = 16
depth = 2

wandb.init(project="basic-training", config={
    "epochs": epochs,
    "learning_rate": lr,
    "batch_size": batch_size,
    "width": width,
    "depth": depth
})

ds_train, ds_val, ds_test = load_mnist_data()

# creating dataloaders (only the training loader needs shuffling)
dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=4)
dl_val = DataLoader(ds_val, batch_size=batch_size, num_workers=4)
dl_test = DataLoader(ds_test, batch_size=batch_size, shuffle=False, num_workers=4)

# instantiating model and optimizer
model_mlp = MLP(input_size, width, depth, 10).to(device)
optimizer = torch.optim.Adam(model_mlp.parameters(), lr=lr)

losses_and_accs = train_model(model_mlp, dl_train, dl_val, epochs, optimizer, device)
# evaluate_model's third positional parameter is `epoch`; pass the device by
# keyword so the test batches land on the same device as the model.
test_acc, test_report = evaluate_model(model_mlp, dl_test, epochs - 1, is_test=True, device=device)
print(f"Test Accuracy: {test_acc}\nTest Report:\n{test_report}")

wandb.finish()

We trained this MLP for 15 epochs, reaching a training loss of 0.25 and a validation accuracy of 0.92. The plots produced during training can be seen here.
![Alt Text](plots/llm_mnist.png)

We can say that we trained an MLP classifier that achieves reasonable accuracy on MNIST. Training for more epochs could improve it further, but these results demonstrate that on a simple dataset such as MNIST an MLP's performance is more than enough.

## Exercise 1.2: CNN with and without Residual connections

In this exercise, we extended the previous process from a simple MLP to Convolutional Neural Networks but on a more difficult dataset such as CIFAR-10. The goal is to analyze how deeper CNN architectures affect performance and to explore the benefits of residual connections.

* defined a CNN with multiple convolutional layers and ConvBlocks that apply ReLU activation and pooling operations to extract features from images.
* trained the model for multiple epochs using the Adam optimizer.
* monitored training loss and validation accuracy over time.

We also wanted to compare models with different numbers of ConvBlocks to observe how increasing depth affects performance with and without residual connections. The aim is to show that deeper networks without residual connections do not always perform better, due to vanishing gradients or overfitting.

In [None]:
def load_cifar10_data(val_size: int = 5000):
    """Load CIFAR-10 and split the training set into train and validation parts.

    The official training split is shuffled with ``np.random.permutation``;
    the first ``val_size`` shuffled indices become the validation subset and
    the remaining ones the training subset. The official test split is
    returned untouched.

    Args:
        val_size: Number of training samples to hold out for validation.

    Returns:
        A ``(ds_train, ds_val, ds_test)`` tuple of datasets.
    """
    # Per-channel normalisation constants applied to every split.
    channel_mean = (0.4914, 0.4822, 0.4465)
    channel_std = (0.2470, 0.2435, 0.2616)
    cifar_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])

    full_train = CIFAR10(root='./data', train=True, download=True, transform=cifar_transform)
    ds_test = CIFAR10(root='./data', train=False, download=True, transform=cifar_transform)

    # One random permutation drives both subsets, so they never overlap.
    shuffled = np.random.permutation(len(full_train))
    val_idx, train_idx = shuffled[:val_size], shuffled[val_size:]

    return Subset(full_train, train_idx), Subset(full_train, val_idx), ds_test

In [None]:
class ConvBlock(nn.Module):
    """Stack of ``num`` same-width convolutions, each preceded by a ReLU.

    Args:
        num: Number of convolution layers in the block.
        channels: Input and output channel count of every convolution.
        size: Kernel size; padding is ``(size - 1) // 2``.
    """

    def __init__(self, num=1, channels=8, size=3):
        super().__init__()
        pad = (size - 1) // 2
        convs = [
            nn.Conv2d(channels, channels, kernel_size=size, padding=pad)
            for _ in range(num)
        ]
        self.layers = nn.ModuleList(convs)

    def forward(self, x):
        """Apply ``conv(relu(.))`` once per layer; no ReLU follows the last conv."""
        out = x
        for conv in self.layers:
            out = conv(F.relu(out))
        return out

class CNN(nn.Module):
    """Plain (non-residual) CNN: stem conv, three ConvBlocks, linear classifier.

    Args:
        num: Number of convolutions inside each ConvBlock.
        channels: Channel width used after the stem convolution.
        size: Kernel size of the stem convolution and of the first ConvBlock.

    Note:
        The second and third ConvBlocks always use kernel size 2, whatever
        ``size`` is. ConvBlock pads with ``(size - 1) // 2``, which is 0 for
        kernel size 2, so every convolution in those blocks shrinks the
        feature map by one pixel per spatial dimension.
    """

    def __init__(self, num=2, channels=8, size=3):
        super().__init__()
        stem_pad = (size - 1) // 2
        self.conv1 = nn.Conv2d(3, channels, kernel_size=size, padding=stem_pad)
        self.cblock1 = ConvBlock(num=num, channels=channels, size=size)
        self.cblock2 = ConvBlock(num=num, channels=channels, size=2)
        self.cblock3 = ConvBlock(num=num, channels=channels, size=2)

        self.adaptive_pool = nn.AdaptiveAvgPool2d(1)
        self.output = nn.Linear(channels, 10)

    def forward(self, x):
        """Return the 10 class logits for a batch of 3-channel images."""
        feats = self.cblock1(F.relu(self.conv1(x)))
        # Max pooling (window 3, stride 2) sits between consecutive ConvBlocks.
        feats = self.cblock2(F.max_pool2d(feats, 3, 2))
        feats = self.cblock3(F.max_pool2d(feats, 3, 2))
        pooled = self.adaptive_pool(feats).flatten(1)
        return self.output(pooled)

In [None]:
# Train and evaluate the plain CNN on CIFAR-10, tracking the run with wandb.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# hyperparameters
epochs = 10
lr = 0.001
batch_size = 128
num_blocks = 3  # ConvBlocks in the CNN
channels = 16
size = 3

ds_train, ds_val, ds_test = load_cifar10_data(val_size=5000)

dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=4)
dl_val = DataLoader(ds_val, batch_size=batch_size, num_workers=4)
dl_test = DataLoader(ds_test, batch_size=batch_size, shuffle=False, num_workers=4)

model = CNN(num=num_blocks, channels=channels, size=size).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

wandb.init(project="cnn_cifar10", config={
    "epochs": epochs,
    "learning_rate": lr,
    "batch_size": batch_size,
    "num_blocks": num_blocks,
    "channels": channels,
    "kernel_size": size
})

train_model(model, dl_train, dl_val, epochs, optimizer, device)

# evaluate_model's third positional parameter is `epoch`; pass the device by
# keyword so the test batches land on the same device as the model.
test_acc, test_report = evaluate_model(model, dl_test, epochs - 1, is_test=True, device=device)
print(f'Final test accuracy: {test_acc:.4f}')
print(f'Test classification report:\n{test_report}')

wandb.finish()

We trained this CNN without residual connections over 10 epochs. The model used 3 ConvBlocks. The results that can be seen in the image show that we arrived at a value of training loss equal to 1.31 and validation accuracy of 0.56.

![Alt Text](plots/cnn_cifar10.png)

So with a deeper and better model such as a CNN, on a more difficult dataset, in 10 epochs we couldn't achieve the same results as in the exercise above.

Next, we decided to modify the architecture to include residual connections, which help deeper networks retain gradient flow and learn more effectively.

In [None]:
class ResidualBlock(nn.Module):
    """Two 3x3 convolutions wrapped by an identity skip connection.

    Args:
        channels: Input and output channel count of both convolutions.
    """

    def __init__(self, channels):
        super().__init__()
        # kernel 3 with padding 1 keeps the spatial size, so the skip
        # connection can be added without any reshaping.
        conv_kwargs = dict(kernel_size=3, padding=1)
        self.conv1 = nn.Conv2d(channels, channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(channels, channels, **conv_kwargs)

    def forward(self, x):
        """Return ``relu(conv2(relu(conv1(x))) + x)``."""
        residual = self.conv2(F.relu(self.conv1(x)))
        return F.relu(residual + x)

class ResNetCNN(nn.Module):
    """CNN built from a stem convolution followed by residual blocks.

    Args:
        depth: Number of ResidualBlocks stacked after the stem convolution.
        channels: Channel width of the stem and of every residual block.
            Defaults to 64, the width that used to be hard-coded.
        num_classes: Number of output logits. Defaults to 10, the value that
            used to be hard-coded.
    """

    def __init__(self, depth=2, channels=64, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, channels, kernel_size=3, padding=1)
        self.residual_blocks = nn.ModuleList([
            ResidualBlock(channels) for _ in range(depth)
        ])
        self.output = nn.Linear(channels, num_classes)
        self.adaptive_pool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        """Return the class logits for a batch of 3-channel images."""
        x = F.relu(self.conv1(x))
        for block in self.residual_blocks:
            x = block(x)
        x = F.max_pool2d(x, 2)
        # Global average pooling collapses each channel to a single value.
        x = self.adaptive_pool(x)
        x = x.flatten(1)
        return self.output(x)

In [None]:
# Train and evaluate the residual CNN on CIFAR-10, tracking the run with wandb.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

epochs = 10
lr = 0.001
batch_size = 128
num_blocks = 3  # ResidualBlocks in the ResNetCNN
# The two values below describe the model that is actually built:
# ResNetCNN uses 64 channels and ResidualBlock uses 3x3 kernels. They are only
# recorded in the wandb config, so they must match the architecture.
channels = 64
size = 3

ds_train, ds_val, ds_test = load_cifar10_data(val_size=5000)

dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=4)
dl_val = DataLoader(ds_val, batch_size=batch_size, num_workers=4)
dl_test = DataLoader(ds_test, batch_size=batch_size, shuffle=False, num_workers=4)

model = ResNetCNN(depth=num_blocks).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

wandb.init(project="resnet_cifar10", config={
    "epochs": epochs,
    "learning_rate": lr,
    "batch_size": batch_size,
    "num_blocks": num_blocks,
    "channels": channels,
    "kernel_size": size
})

train_model(model, dl_train, dl_val, epochs, optimizer, device)

# evaluate_model's third positional parameter is `epoch`; pass the device by
# keyword so the test batches land on the same device as the model.
test_acc, test_report = evaluate_model(model, dl_test, epochs - 1, is_test=True, device=device)
print(f'Final test accuracy: {test_acc:.4f}')
print(f'Test classification report:\n{test_report}')

wandb.finish()

Here we trained for 10 epochs, the model had 3 ResidualBlocks meaning it was deeper. After training with these parameters we have training loss of 0.75 and validation accuracy of 0.70.

![Alt Text](plots/cnnresnet_cifar10.png)

We can see that the results are considerably better, but this comes at a much higher computational cost.
Nonetheless, we have evidence that introducing architectural improvements such as residual connections improves the performance of a model. 

## Exercise 2: The effectiveness of Residual Connections
Now we will use our two models (with and without residual connections) to try to understand why the residual versions of the networks learn more effectively.
We will do this using gradient values.

In [None]:
def analyze_gradient_flow(model, dataloader, device, max_batches=11):
    """Collect per-parameter gradient norms over the first batches of a loader.

    For each batch the gradients are reset, a forward/backward pass with
    cross-entropy loss is run (no optimizer step is taken), and the norm of
    every parameter gradient is recorded.

    Args:
        model: Network to inspect; switched to training mode here.
        dataloader: Iterable yielding ``(data, target)`` batches.
        device: Device the batches are moved to before the forward pass.
        max_batches: Maximum number of batches to process. The default of 11
            matches the previously hard-coded cut-off (batch indices 0 to 10
            inclusive). ``None`` processes the whole loader.

    Returns:
        ``np.ndarray`` with one row per processed batch and one column per
        parameter tensor that received a gradient.
    """
    from itertools import islice

    model.train()
    gradient_norms = []

    # islice stops after max_batches items without fetching an extra batch.
    for data, target in islice(dataloader, max_batches):
        data, target = data.to(device), target.to(device)

        model.zero_grad()

        loss = F.cross_entropy(model(data), target)
        loss.backward()

        gradient_norms.append([
            param.grad.norm().item()
            for param in model.parameters()
            if param.grad is not None
        ])

    return np.array(gradient_norms)

In [None]:
# Compare gradient norms of the plain CNN and the residual CNN at initialisation.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_blocks = 3
channels = 16
kernel_size = 3
batch_size = 128

# Use this cell's kernel_size; the global `size` is whatever an earlier cell
# left behind and would silently build a different architecture.
plain_model = CNN(num=num_blocks, channels=channels, size=kernel_size).to(device)
# NOTE(review): ResNetCNN is built with its own default width, so the two
# models do not share the same channel count - keep this in mind when
# comparing absolute gradient magnitudes.
resnet_model = ResNetCNN(depth=num_blocks).to(device)

ds_train, ds_val, ds_test = load_cifar10_data(val_size=5000)
dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=4)

plain_grads = analyze_gradient_flow(plain_model, dl_train, device)
resnet_grads = analyze_gradient_flow(resnet_model, dl_train, device)

plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.title("CNN Gradient Norms")
plt.boxplot(plain_grads)

plt.subplot(1, 2, 2)
plt.title("ResNet CNN Gradient Norms")
plt.boxplot(resnet_grads)

plt.tight_layout()
plt.show()

print("\nGradient Statistics Comparison:")
print("-" * 40)
print(f"CNN. Mean: {plain_grads.mean():.6f}, deviation: {plain_grads.std():.6f}")
print(f"ResNet CNN. Mean: {resnet_grads.mean():.6f}, deviation: {resnet_grads.std():.6f}")
print("-" * 40)

We obtained plots such as the one in the image. The values to compare are:
CNN. Mean: 0.011, deviation: 0.022
ResNet CNN. Mean: 0.115, deviation: 0.095

![Alt Text](plots/gradcomp.png)

As we can see, the mean gradient is significantly higher in the ResNet model, and the gradient variance is also higher, meaning gradients are more diverse and spread out.

Residual connections help mitigate vanishing gradients, which is a major issue in deep networks. The plain CNN has a very low deviation, meaning the gradients are almost uniform and may fail to capture complex patterns.