In [172]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import random_split
from datetime import datetime
import numpy as np

# Getting the same results with train and train_manual_update
- Write torch.manual_seed(42) at the beginning of your notebook.
- Write torch.set_default_dtype(torch.double) at the beginning of your notebook to alleviate precision errors

In [173]:
torch.manual_seed(42)
torch.set_default_dtype(torch.double)

# Tasks
Load, analyse and preprocess the CIFAR-10 dataset. Split it into 3
datasets: training, validation and test. Take a subset of these datasets
by keeping only 2 labels: cat and car

In [174]:
# Select the available device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def load_cifar(train_val_split=0.9, data_path='../data/', preprocessor=None):
    """
    Loads the CIFAR-10 dataset, splits it into train/validation/test subsets, and
    reduces it to only two classes (cat and car). Returns three lists corresponding
    to the training, validation, and test datasets.

    Args:
        train_val_split (float): The fraction of the dataset to use for training (the remainder is for validation).
        data_path (str): The directory path where the CIFAR-10 dataset should be downloaded/stored.
        preprocessor (torchvision.transforms.Compose): Transformations to apply to images upon loading.

    Returns:
        tuple: (train_data, val_data, test_data), each a list of (image, label) pairs,
               where label is 0 for 'cat' and 1 for 'car'.
    """
    
    # If no preprocessor is specified, define a default set of transforms
    if preprocessor is None:
        preprocessor = transforms.Compose([
            transforms.Resize((16, 16)),  # Resize images to 16x16
            transforms.ToTensor(),
            transforms.Normalize(
                mean=(0.4915, 0.4823, 0.4468),
                std=(0.2470, 0.2435, 0.2616)
            )
        ])
    
    # Load the full CIFAR-10 dataset for training+validation
    data_train_val = datasets.CIFAR10(
        data_path, train=True, download=True, transform=preprocessor
    )
    # Load the full CIFAR-10 dataset for testing
    data_test = datasets.CIFAR10(
        data_path, train=False, download=True, transform=preprocessor
    )

    # Calculate the split sizes for training and validation
    n_train = int(len(data_train_val) * train_val_split)
    n_val = len(data_train_val) - n_train
    
    # Split the dataset into train and validation, using a fixed random seed for reproducibility
    data_train, data_val = random_split(
        data_train_val, [n_train, n_val],
        generator=torch.Generator().manual_seed(42)
    )
    
    # only keep images labeled as "car" (label=1) or "cat" (label=3) in the original CIFAR-10
    # Map them to labels: car -> 0, cat -> 1
    label_map = {1: 0, 3: 1}
    
    # Create filtered training subset with only cars and cats
    cifar_cars_cats_train = [
        (img, label_map[label])
        for (img, label) in data_train
        if label in [1, 3]
    ]
    # Similarly for validation
    cifar_cars_cats_val = [
        (img, label_map[label])
        for (img, label) in data_val
        if label in [1, 3]
    ]
    # And for testing
    cifar_cars_cats_test = [
        (img, label_map[label])
        for (img, label) in data_test
        if label in [1, 3]
    ]
    
    # Print dataset sizes for sanity check
    print("Training set size:", len(cifar_cars_cats_train))
    print("Validation set size:", len(cifar_cars_cats_val))
    print("Test set size:", len(cifar_cars_cats_test))

    return cifar_cars_cats_train, cifar_cars_cats_val, cifar_cars_cats_test

# Load the filtered CIFAR-10 datasets
data_train, data_val, data_test = load_cifar()

def compute_accuracy(model, loader):
    """
    Computes the classification accuracy of a given PyTorch model on a specified dataloader.

    Args:
        model (nn.Module): The neural network model to evaluate.
        loader (DataLoader): A PyTorch DataLoader providing (images, labels) batches.

    Returns:
        float: The computed accuracy (0.0 to 1.0).
    """
    model.eval()  # Switch model to evaluation mode
    correct = 0
    total = 0

    # don't need gradients when simply running inference
    with torch.no_grad():
        for imgs, labels in loader:
            # Move data and labels to the same device as the model
            imgs = imgs.to(device=device, dtype=torch.double)
            labels = labels.to(device=device)
            
            # Forward pass: get the predictions
            outputs = model(imgs)
            # Convert raw logit outputs to predicted class indices
            _, predicted = torch.max(outputs, dim=1)
            
            total += labels.size(0)
            correct += int((predicted == labels).sum())

    accuracy = correct / total
    print('Accuracy: {:.2f}'.format(accuracy))
    
    return accuracy


Files already downloaded and verified
Files already downloaded and verified
Training set size: 9017
Validation set size: 983
Test set size: 2000


Write a MyMLP class that implements a MLP in PyTorch (so only fully
connected layers) such that:
    
    - The input dimension is 768(= 16 ∗ 16 ∗ 3) and the output dimension is 2 (for the 2 classes).
    - The hidden layers have respectively 128 and 32 hidden units.
    - All activation functions are ReLU. The last layer has no activation function since the cross-entropy loss already includes a softmax activation
function.

In [175]:
class MyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(16*16*3, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, 2)
    
    def forward(self, x):
        out = torch.flatten(x, start_dim=1)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))    #no softmax, cross-entropy handles that
        
        out = self.fc3(out)
        
        return out     
            

Write a train(n_epochs, optimizer, model, loss_fn, train_loader) function that trains model for n_epochs epochs given an optimizer optimizer, a loss function loss_fn and a dataloader train_loader.

In [176]:
def train(n_epochs, optimizer, model, loss_fn, train_loader):
    
    n_batch = len(train_loader)
    losses_train = []
    model.train()

    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        
        for imgs, labels in train_loader:
            imgs = imgs.to(device=device, dtype=torch.double)
            labels = labels.to(device=device)

            optimizer.zero_grad()
            outputs = model(imgs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            
            # Accumulate the loss from this batch
            loss_train += loss.item()

        # After the entire epoch, average out
        loss_train /= n_batch
        losses_train.append(loss_train)

        if epoch == 1 or epoch % 5 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.5f}'.format(
                datetime.now().time(), epoch, loss_train))

    return losses_train

Write a similar function train manual_update that has no optimizer parameter, but a learning rate lr parameter instead and that manually updates each trainable parameter of model using equation (2). Do not forget to zero out all gradients after each iteration. 

Train 2 instances of MyMLP, one using train and the other using train_manual_update (use the same parameter values for both models). Compare their respective training losses. To get exactly the same results with both functions, see section 3.3

In [177]:
def train_manual_update(n_epochs, model, loss_fn, train_loader, 
                        lr=1e-2, momentum_coeff=0., weight_decay=0.):
    model.train()
    n_batch = len(train_loader)
    losses_train = []
    
    dict_mom = {}  # for momentum terms

    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        
        for imgs, labels in train_loader:
            imgs = imgs.to(device=device, dtype=torch.double)
            labels = labels.to(device=device)

            # Forward
            outputs = model(imgs)
            loss = loss_fn(outputs, labels)

            # Backward
            loss.backward()

            # Manual updates
            with torch.no_grad():
                for name, p in model.named_parameters():
                    if p.grad is not None:
                        grad = p.grad

                        # L2 regularization
                        if weight_decay:
                            grad = grad + weight_decay * p.data

                        # Momentum
                        if momentum_coeff:
                            if name in dict_mom:
                                grad = grad + momentum_coeff * dict_mom[name]
                            dict_mom[name] = grad

                        # Update the parameter
                        p.data = p.data - lr * grad

                        # Reset the gradient
                        p.grad.zero_()

            # Accumulate this batch’s loss
            loss_train += loss.item()

        # Average over batches
        loss_train /= n_batch
        losses_train.append(loss_train)

        if epoch == 1 or epoch % 5 == 0:
            print('{}  |  Epoch {}  |  Training loss {:.5f}'.format(
                datetime.now().time(), epoch, loss_train))

    return losses_train

In [178]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Training on device {device}.")

# Define training hyperparameters
n_epochs = 30          # number of epochs
batch_size = 256       # batch size for training
seed = 265             # random seed for reproducibility

# Create DataLoaders for training and validation (no shuffling to keep data order deterministic)
train_loader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=False)
val_loader = torch.utils.data.DataLoader(data_val, batch_size=batch_size, shuffle=False)

# Define the loss function (cross-entropy for classification)
loss_fn = nn.CrossEntropyLoss()

# Lists of possible hyperparameter values for learning rate, momentum, and weight decay
list_lr = [0.01]*6
list_momentum = [0, 0, 0.9, 0.9, 0.9, 0.8]
list_decay = [0, 0.01, 0, 0.01, 0.001, 0.01]

# Construct a list of parameter dictionaries for each run
params = [{
        "lr": list_lr[i],
        "mom": list_momentum[i],
        "decay": list_decay[i],
    } for i in range(len(list_lr))
]

print("\n   Global parameters:")
print("batch_size = ", batch_size)
print("n_epoch = ", n_epochs)
print("loss_fn = ", nn.CrossEntropyLoss())
print("seed = ", seed)

# These lists will store accuracies and model references so that we can pick the best model later
accuracies = []
models = []

# Iterate over all parameter configurations
for i in range(len(list_lr)):
    
    print("\n =========================================== ")
    print("   Current parameters: ")
    # Print current hyperparameters in a readable format
    print("".join(['%s = %s\n' % (key, value) for (key, value) in params[i].items()]))
    
    print(" Using Pytorch's SGD ")
    
    # Set a fixed random seed to ensure reproducible initial weights
    torch.manual_seed(seed)
    
    # Initialize the model and move it to the chosen device
    model = MyNet()
    model.to(device=device)

    # Create a built-in PyTorch SGD optimizer with the current hyperparameters
    optimizer = optim.SGD(
        model.parameters(), 
        lr=params[i]["lr"], 
        momentum=params[i]["mom"], 
        weight_decay=params[i]["decay"]
    )

    # Train the model using the built-in optimizer
    loss_train = train(
        n_epochs=n_epochs,
        optimizer=optimizer,
        model=model,
        loss_fn=loss_fn,
        train_loader=train_loader,
    )
    
    # Compute and print accuracies for the built-in-optimizer-trained model
    print("\n Accuracies ")
    print("Training")
    acc = compute_accuracy(model, train_loader)
    print("Validation")
    acc_val = compute_accuracy(model, val_loader)

    # Now train a new model using manual updates (same hyperparameters)
    print("\n Using manual update")
    torch.manual_seed(seed)  # Ensure same initial weights
    model_manual = MyNet()
    model_manual.to(device=device)

    loss_train_manual = train_manual_update(
        n_epochs=n_epochs,
        model=model_manual,
        loss_fn=loss_fn,
        train_loader=train_loader,
        lr=params[i]["lr"],
        momentum_coeff=params[i]["mom"], 
        weight_decay=params[i]["decay"],
    )

    # Compute and print accuracies for the model trained with manual updates
    print("\n --- Accuracies --- ")
    print("Training", end="")
    acc_manual = compute_accuracy(model_manual, train_loader)
    print("Validation", end="")
    acc_manual_val = compute_accuracy(model_manual, val_loader)
    
    # model selection:
    # 1) Store the validation accuracy from the built-in SGD model
    accuracies.append(acc)
    models.append(model)

    # 2) Store the validation accuracy from the manually updated model
    accuracies.append(acc_manual)
    models.append(model_manual)


Training on device cpu.

   Global parameters:
batch_size =  256
n_epoch =  30
loss_fn =  CrossEntropyLoss()
seed =  265

   Current parameters: 
lr = 0.01
mom = 0
decay = 0

 Using Pytorch's SGD 
00:32:20.534162  |  Epoch 1  |  Training loss 0.67882
00:32:20.651903  |  Epoch 5  |  Training loss 0.56008
00:32:20.805914  |  Epoch 10  |  Training loss 0.43988
00:32:20.942234  |  Epoch 15  |  Training loss 0.37431
00:32:21.088623  |  Epoch 20  |  Training loss 0.32573
00:32:21.233918  |  Epoch 25  |  Training loss 0.29081
00:32:21.383780  |  Epoch 30  |  Training loss 0.26554

 Accuracies 
Training
Accuracy: 0.90
Validation
Accuracy: 0.87

 Using manual update
00:32:21.427262  |  Epoch 1  |  Training loss 0.67882
00:32:21.536881  |  Epoch 5  |  Training loss 0.56008
00:32:21.673672  |  Epoch 10  |  Training loss 0.43988
00:32:21.810336  |  Epoch 15  |  Training loss 0.37431
00:32:21.955657  |  Epoch 20  |  Training loss 0.32573
00:32:22.090796  |  Epoch 25  |  Training loss 0.29081
00:32:

In [179]:
i_best_model = np.argmax(accuracies)
best_model = models[i_best_model]

params_best_model = params[i_best_model]

print(
    "\nThe best model was trained with",
    "".join(['\n    %s = %s' % (key, value) for (key, value) in params[i_best_model].items()]))

print("Training accuracy of the best model: ")
compute_accuracy(best_model, train_loader)
print("Validation accuracy of the best model: ")
compute_accuracy(best_model, val_loader)


The best model was trained with 
    lr = 0.01
    mom = 0
    decay = 0
Training accuracy of the best model: 
Accuracy: 0.97
Validation accuracy of the best model: 
Accuracy: 0.91


0.9094608341810784

In [180]:
test_loader = torch.utils.data.DataLoader(data_test, batch_size = batch_size, shuffle = False)

print('Test Accuracy score of the best model')
compute_accuracy(best_model, test_loader)

Test Accuracy score of the best model
Accuracy: 0.91


0.913