[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/upm-classes/image-understanding-2022-2023/blob/main/practice3/practice3.ipynb)

Fill with your data:

- Full name 1: 
- Full name 2: 

# Image classification

In this practice, you will learn how to build an image classifier using PyTorch in Google Colab. You will use the CIFAR-10 dataset, which consists of 60,000 32x32 color images in 10 classes. The goal is to train a neural network to correctly classify the images into their respective classes.

### Step 1: Set up Google Colab

Make sure to select "GPU" as the runtime type. For this you need to click on the menu Runtime/Change Runtime Type and select GPU.

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device:', device)

# Stop right away when the runtime was started without a GPU
assert device.type == 'cuda', 'GPU is not selected.'

In [None]:
def set_seed(seed):
    """Seed the NumPy and PyTorch random number generators.

    Besides seeding, cuDNN is configured for repeatable results:
    deterministic algorithms are requested and the auto-tuner
    (``benchmark``) is switched off.

    Args:
      seed: integer used to seed every generator.
    """
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different convolution algorithm on each
    # run, so it has to be disabled when reproducibility is the goal.
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one (no-op without CUDA)
    torch.cuda.manual_seed_all(seed)

### Step 2: Load the CIFAR-10 dataset

Next, you will load the CIFAR-10 dataset using PyTorch's built-in datasets module. You will also split the dataset into training and validation sets.

In [None]:
set_seed(0)
batch_size = 10

# Preprocessing shared by every split: tensor conversion followed by a
# per-channel normalization with mean 0.5 and std 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Download (when missing) and load the CIFAR-10 train and test splits
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)

# Hold out 10,000 of the 50,000 training images for validation
trainset, valset = torch.utils.data.random_split(trainset, [40000, 10000])

# One data loader per split; only the training loader shuffles its samples
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2)
validloader = torch.utils.data.DataLoader(
    valset, batch_size=batch_size, shuffle=False, num_workers=2)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2)

# Class names, looked up by integer label
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

In [None]:
# Helper to display an image tensor
def imshow(img):
    """Undo the 0.5/0.5 normalization and display the image with matplotlib.

    Args:
      img: image tensor; assumed to be channel-first (C, H, W), since axis 0
        is moved to the end before plotting.
    """
    unnormalized = img / 2 + 0.5  # inverse of (x - 0.5) / 0.5
    channels_last = np.transpose(unnormalized.numpy(), (1, 2, 0))
    plt.imshow(channels_last)
    plt.show()


# Take the first batch produced by the shuffled training loader
dataiter = iter(trainloader)
images, labels = next(dataiter)

# Tile the whole batch into one grid image and display it
grid = torchvision.utils.make_grid(images)
imshow(grid)
# Print the class name of every image, padded to at least 5 characters
names = [f'{classes[labels[j]]:5s}' for j in range(batch_size)]
print(' '.join(names))

### Step 3: Define the neural network

Next, you will define the neural network using PyTorch's nn module. You can choose any architecture you like, but for simplicity, we will use a basic convolutional neural network (CNN).

In [None]:
# Reseed the generators so the weight initialization in this cell is repeatable
set_seed(0)

# Convolutional neural network used as the image classifier
class Network(nn.Module):
    """Small CNN: four conv + batch-norm + ReLU stages and one linear layer.

    Every convolution uses a 5x5 kernel with stride 1 and padding 1, so each
    one shrinks the spatial size by 2. A 2x2 max-pooling sits between the
    second and third convolution. The linear layer expects 24 feature maps of
    10x10, which corresponds to 32x32 inputs, and produces 10 scores.
    """

    def __init__(self):
        super().__init__()

        # Geometry shared by all convolutions
        conv_kwargs = dict(kernel_size=5, stride=1, padding=1)

        # Layers are created in the same order as they are applied
        self.conv1 = nn.Conv2d(3, 12, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(12)
        self.conv2 = nn.Conv2d(12, 12, **conv_kwargs)
        self.bn2 = nn.BatchNorm2d(12)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv4 = nn.Conv2d(12, 24, **conv_kwargs)
        self.bn4 = nn.BatchNorm2d(24)
        self.conv5 = nn.Conv2d(24, 24, **conv_kwargs)
        self.bn5 = nn.BatchNorm2d(24)
        self.fc1 = nn.Linear(24 * 10 * 10, 10)

    def forward(self, input):
        """Return the 10 class scores for a batch of images.

        Args:
          input: batch of 3-channel images; the flattening step assumes the
            feature maps are 10x10 after the last convolution.

        Returns:
          Tensor with one row of 10 unnormalized scores per image.
        """
        x = F.relu(self.bn1(self.conv1(input)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        x = F.relu(self.bn4(self.conv4(x)))
        x = F.relu(self.bn5(self.conv5(x)))
        # Flatten the feature maps into one vector per image
        flat = x.view(-1, 24 * 10 * 10)
        return self.fc1(flat)

# Create a new, untrained instance of the network defined above
model = Network()

### Step 4: Train the neural network

Now you will train the neural network using the Adam optimizer with a learning rate of 0.001, a weight decay of 0.0001, and a batch size of 10.

In [None]:
set_seed(0)

# Run on the selected device (the notebook requires a GPU runtime). The model
# is moved before the optimizer is built so the optimizer tracks the
# parameters that are actually trained.
model = model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
num_epochs = 10

# Mean loss of every epoch, used afterwards to draw the loss curves
train_loss = []
valid_loss = []

# Train the network
for epoch in range(num_epochs):
    # --- Training phase ---
    # Training mode has to be restored at the start of every epoch: the
    # validation phase below switches the model to eval mode, which would
    # otherwise leave the batch-norm layers in inference behaviour from the
    # second epoch onwards.
    model.train()
    running_loss = 0.0
    pbar = tqdm(enumerate(trainloader, 0), total=len(trainloader))
    for i, data in pbar:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Epochs are shown 1-based so the last one reads num_epochs/num_epochs
        pbar.set_description(
            f'Train Loss: {loss.item():.6f},  epoch: {epoch + 1}/{num_epochs}')
    train_loss.append(running_loss / len(trainloader))

    # --- Validation phase: no gradient tracking, no parameter updates ---
    model.eval()
    pbar = tqdm(enumerate(validloader, 0), total=len(validloader))
    running_val_loss = 0.0
    with torch.no_grad():
        for i, data in pbar:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item()
            pbar.set_description(
                f'Validation Loss: {loss.item():.6f},  epoch: {epoch + 1}/{num_epochs}')
        valid_loss.append(running_val_loss / len(validloader))

# Bring the model back to the CPU: the saving and evaluation cells that follow
# feed it CPU tensors and store a CPU checkpoint.
model = model.to('cpu')

In [None]:
#### Save the trained weights (state dict only) to disk
weights = model.state_dict()
torch.save(weights, 'pretrained_model_1.ckpt')

In [None]:
# Loss curves: mean training and validation loss of every epoch
epochs = range(1, num_epochs + 1)

plt.figure(figsize=(8, 4))
plt.plot(epochs, train_loss, label='Train Loss')
plt.plot(epochs, valid_loss, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

### Step 5: Evaluate the neural network

Finally, you will evaluate the performance of the neural network on the test set.

In [None]:
### Loading the pretrained model if it exists
# A single path variable keeps the existence check and the load in sync; it
# must match the file name written by the saving cell.
checkpoint_path = 'pretrained_model_1.ckpt'
if os.path.exists(checkpoint_path):
    model.load_state_dict(torch.load(checkpoint_path))

In [None]:
# Run the network over the test set and collect true and predicted labels
model.eval()

y_true, y_pred = [], []
with torch.no_grad():
    for images, labels in testloader:
        scores = model(images)
        # The predicted class is the index of the largest score in each row
        predicted = scores.max(dim=1).indices
        y_true += labels.tolist()
        y_pred += predicted.tolist()

In [None]:
# Build the confusion matrix of the test predictions and plot it with the
# class names as tick labels
confusion_mat = confusion_matrix(y_true=y_true, y_pred=y_pred)
disp = ConfusionMatrixDisplay(confusion_mat, display_labels=classes)
disp.plot()
plt.show()

In [None]:
def accuracy_np(predictions, targets):
    """
    Calculates the accuracy of predicted labels given the actual labels, using NumPy.

    Accuracy is the fraction of positions where the predicted label equals the
    actual label; e.g. predictions [1, 1, 1, 0] against targets [1, 1, 1, 1]
    give 0.75.

    NOTE: exercise placeholder. Until the body is filled in, every call raises
    NotImplementedError.

    Args:
      predictions: a list or array of predicted labels (shape: N)
      targets: a list or array of actual labels (shape: N)

    Returns:
      accuracy: the accuracy of the predictions (scalar)

    Raises:
      NotImplementedError: always, while the implementation is missing.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Sanity checks for accuracy_np: every label correct, then 3 out of 4 correct
assert accuracy_np([1, 1, 1, 1], [1, 1, 1, 1]) == 1.0
assert accuracy_np([1, 1, 1, 0], [1, 1, 1, 1]) == 0.75

In [None]:
def recall_np(predictions, targets, class_id):
    """
    Calculates the recall of a given class given the predicted labels and actual labels, using NumPy.

    Recall of a class is the number of samples of that class that were
    predicted as that class, divided by the number of samples whose actual
    label is that class.

    NOTE: exercise placeholder. Until the body is filled in, every call raises
    NotImplementedError.

    Args:
      predictions: a list or array of predicted labels (shape: N)
      targets: a list or array of actual labels (shape: N)
      class_id: the ID of the class to calculate recall for

    Returns:
      recall: the recall of the specified class (scalar),
      if the divisor is 0 (the class never appears in targets),
      the result should be 0.

    Raises:
      NotImplementedError: always, while the implementation is missing.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Class 0 never appears in the targets, so the divisor is 0 and recall is 0
assert recall_np([1, 2, 1, 1], [1, 1, 2, 1], 0) == 0
# All three samples whose target is class 1 are predicted as class 1
assert recall_np([1, 1, 1, 1], [1, 1, 2, 1], 1) == 1.0

In [None]:
def precision_np(predictions, targets, class_id):
    """
    Calculates the precision of a given class given the predicted labels and actual labels, using NumPy.

    Precision of a class is the number of samples predicted as that class
    whose actual label is that class, divided by the number of samples
    predicted as that class.

    NOTE: exercise placeholder. Until the body is filled in, every call raises
    NotImplementedError.

    Args:
      predictions: a list or array of predicted labels (shape: N)
      targets: a list or array of actual labels (shape: N)
      class_id: the ID of the class to calculate precision for

    Returns:
      precision: the precision of the specified class (scalar),
      if the divisor is 0 (the class is never predicted),
      the result should be 0.

    Raises:
      NotImplementedError: always, while the implementation is missing.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Class 0 is never predicted, so the divisor is 0 and precision is 0
assert precision_np([1, 2, 1, 1], [1, 1, 2, 1], 0) == 0
# Four samples are predicted as class 1 and three of them really are class 1
assert precision_np([1, 1, 1, 1], [1, 1, 2, 1], 1) == 0.75

In [None]:
# Overall accuracy of the predictions collected on the test set
accuracy = accuracy_np(y_pred, y_true)
print(f'Accuracy_ {accuracy}')

In [None]:
# Recall of every class, one line per class name
print('Recall per class:')
for class_id, class_name in enumerate(classes):
    class_recall = recall_np(y_pred, y_true, class_id)
    print(f'{class_name}: {class_recall}')

In [None]:
# Precision of every class, one line per class name
print('Precision per class:')
for class_id, class_name in enumerate(classes):
    class_precision = precision_np(y_pred, y_true, class_id)
    print(f'{class_name}: {class_precision}')

# Image classification with Data Augmentation

### Step 6: Define the transformations
We can easily obtain more varied training images by adding some transformations to them. In particular, we added:
- RandomHorizontalFlip: randomly flips the image horizontally (mirrors it left to right).
- ColorJitter: randomly adjusts brightness, contrast, saturation, and hue of the image.
- GaussianBlur: applies Gaussian blur with a given kernel size and standard deviation.

In [None]:
set_seed(0)
batch_size = 10

# Normalization shared by the training and the evaluation pipelines
normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

# Training pipeline: random augmentations, then tensor conversion and
# normalization
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1,
                           saturation=0.1, hue=0.1),
    transforms.GaussianBlur(kernel_size=3,
                            sigma=(0.1, 2.0)),
    transforms.ToTensor(),
    normalize,
])

# Evaluation pipeline: no random augmentation, so validation and test results
# are repeatable and comparable with the model trained without augmentation
eval_transform = transforms.Compose([
    transforms.ToTensor(),
    normalize,
])

# Load the CIFAR-10 dataset. The training split is loaded twice so that the
# same images can be served both with and without augmentation.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainset_eval = torchvision.datasets.CIFAR10(root='./data', train=True,
                                             download=True,
                                             transform=eval_transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=eval_transform)

# Split the training set into training and validation sets
trainset, valset = torch.utils.data.random_split(trainset, [40000, 10000])
# Keep the validation indices chosen by the split, but read those images from
# the un-augmented copy of the training data
valset = torch.utils.data.Subset(trainset_eval, valset.indices)


# Initialize the data loaders; only the training loader shuffles its samples
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)
validloader = torch.utils.data.DataLoader(valset, batch_size=batch_size,
                                          shuffle=False, num_workers=2)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog',
           'horse', 'ship', 'truck')

### Step 7: Instantiate a new model

In [None]:
# Reseed the generators before building the model so its weight
# initialization is repeatable
set_seed(0)
# Create a new, untrained network that replaces the previously trained model
model = Network()

### Step 8: Train the model

In [None]:
set_seed(0)

# Run on the selected device (the notebook requires a GPU runtime). The model
# is moved before the optimizer is built so the optimizer tracks the
# parameters that are actually trained.
model = model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
num_epochs = 10

# Mean loss of every epoch, used afterwards to draw the loss curves
train_loss = []
valid_loss = []

# Train the network
for epoch in range(num_epochs):
    # --- Training phase ---
    # Training mode has to be restored at the start of every epoch: the
    # validation phase below switches the model to eval mode, which would
    # otherwise leave the batch-norm layers in inference behaviour from the
    # second epoch onwards.
    model.train()
    running_loss = 0.0
    pbar = tqdm(enumerate(trainloader, 0), total=len(trainloader))
    for i, data in pbar:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Epochs are shown 1-based so the last one reads num_epochs/num_epochs
        pbar.set_description(
            f'Train Loss: {loss.item():.6f},  epoch: {epoch + 1}/{num_epochs}')
    train_loss.append(running_loss / len(trainloader))

    # --- Validation phase: no gradient tracking, no parameter updates ---
    model.eval()
    pbar = tqdm(enumerate(validloader, 0), total=len(validloader))
    running_val_loss = 0.0
    with torch.no_grad():
        for i, data in pbar:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item()
            pbar.set_description(
                f'Validation Loss: {loss.item():.6f},  epoch: {epoch + 1}/{num_epochs}')
        valid_loss.append(running_val_loss / len(validloader))

# Bring the model back to the CPU: the saving and evaluation cells that follow
# feed it CPU tensors and store a CPU checkpoint.
model = model.to('cpu')

In [None]:
#### Save the trained weights (state dict only) to disk
weights = model.state_dict()
torch.save(weights, 'pretrained_model_2.ckpt')

In [None]:
# Loss curves: mean training and validation loss of every epoch
epochs = range(1, num_epochs + 1)

plt.figure(figsize=(8, 4))
plt.plot(epochs, train_loss, label='Train Loss')
plt.plot(epochs, valid_loss, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

### Step 9: Evaluate the model

In [None]:
### Reload the saved weights when the checkpoint file is present
checkpoint_path = 'pretrained_model_2.ckpt'
if os.path.exists(checkpoint_path):
    saved_weights = torch.load(checkpoint_path)
    model.load_state_dict(saved_weights)

In [None]:
# Run the network over the test set and collect true and predicted labels
model.eval()

y_true, y_pred = [], []
with torch.no_grad():
    for images, labels in testloader:
        scores = model(images)
        # The predicted class is the index of the largest score in each row
        predicted = scores.max(dim=1).indices
        y_true += labels.tolist()
        y_pred += predicted.tolist()

In [None]:
# Build the confusion matrix of the test predictions and plot it with the
# class names as tick labels
confusion_mat = confusion_matrix(y_true=y_true, y_pred=y_pred)
disp = ConfusionMatrixDisplay(confusion_mat, display_labels=classes)
disp.plot()
plt.show()

In [None]:
# Overall accuracy of the predictions collected on the test set
accuracy = accuracy_np(y_pred, y_true)
print(f'Accuracy_ {accuracy}')

In [None]:
# Recall of every class, one line per class name
print('Recall per class:')
for class_id, class_name in enumerate(classes):
    class_recall = recall_np(y_pred, y_true, class_id)
    print(f'{class_name}: {class_recall}')

In [None]:
# Precision of every class, one line per class name
print('Precision per class:')
for class_id, class_name in enumerate(classes):
    class_precision = precision_np(y_pred, y_true, class_id)
    print(f'{class_name}: {class_precision}')

## Answer the following questions

1. Analyze the plots (loss curves) in both models. What do they indicate? Discuss it in detail. (2 points)

YOUR ANSWER HERE

2. Analyze the confusion matrix in both models. Which images are more problematic for the models? Which images are easier? Discuss it in detail. (2 points)

YOUR ANSWER HERE

3. Analyze the metrics (accuracy, precision, and recall) of both models. Discuss the results in detail. (2 points)

YOUR ANSWER HERE

4. How can you improve the results of the models? (1 point)

YOUR ANSWER HERE