In [None]:
import os

# Toggle: True reads the assets from a local `assets.zip`, False mounts Google Drive.
local_assets_b = False

if local_assets_b:
    # NOTE(review): this path ends in "P3" while the Drive path below ends in "P2" —
    # confirm which project folder is intended.
    assets_dir = "/content/assets/P3/"

    if not os.path.isdir(assets_dir):
        # Raise explicitly instead of `assert`, which is stripped under `python -O`.
        if not os.path.isfile("assets.zip"):
            raise FileNotFoundError(
                "assets.zip not found in the current working directory"
            )
        # Extract with the standard library so a failure raises instead of being
        # hidden behind an unchecked shell exit code.
        import zipfile

        with zipfile.ZipFile("assets.zip") as archive:
            archive.extractall()
else:
    from google.colab import drive

    drive.mount('/content/drive')
    assets_dir = '/content/drive/MyDrive/InterviewKickstart/assets/P2/'

# Transfer Learning Introduction

Transfer learning is a technique in deep learning where pre-trained models trained on large-scale datasets are leveraged to solve new tasks with limited labeled data. It involves taking a pre-trained model, which has learned rich and generalized features from a source task, and fine-tuning it on a target task.

### What is VGG?
VGG (Visual Geometry Group) is a popular deep convolutional neural network (CNN) architecture developed by the Visual Geometry Group at the University of Oxford. VGGNet is known for its simplicity and effectiveness in image classification tasks. It consists of multiple convolutional layers followed by fully connected layers. The most common variant, VGG-16, has 16 layers, including 13 convolutional layers and 3 fully connected layers. VGGNet has achieved impressive results on various image classification benchmarks, including the ImageNet challenge.

### What is CIFAR-10?
CIFAR-10 is a widely used benchmark dataset in computer vision and machine learning. It consists of 60,000 small-sized color images (32x32 pixels) belonging to 10 different classes, with 6,000 images per class. The dataset is split into a training set of 50,000 images and a test set of 10,000 images. The classes in CIFAR-10 include common objects like airplanes, automobiles, birds, cats, deer, dogs, frogs, horses, ships, and trucks. CIFAR-10 serves as a good dataset for evaluating and benchmarking image classification models.

When it comes to transfer learning, VGG is often used as a backbone model. Its pre-trained weights, which have been learned on the large-scale ImageNet dataset, capture generic features like edges, textures, and shapes that are beneficial for various visual recognition tasks. By leveraging the pre-trained VGG model, we can fine-tune it on the CIFAR-10 dataset to perform image classification. The lower-level layers of VGG capture low-level features, such as edges and corners, while the higher-level layers learn more complex features. This enables VGG to extract meaningful representations from images and generalize well to new tasks with limited labeled data.

By fine-tuning VGG on the CIFAR-10 dataset, we can take advantage of the pre-trained weights and learn task-specific features for image classification. This approach is effective when the target task has a similar domain or visual characteristics as the source task on which VGG was pre-trained. Transfer learning with VGG can help achieve better performance on CIFAR-10 by leveraging the knowledge learned from ImageNet, even with a smaller dataset.

## Visualizing different layers of VGG

In the VGG network, as the input image progresses through the convolutional layers, it undergoes a series of transformations and feature extractions. One fascinating aspect of VGG is the ability to visualize the learned features at different convolutional layers, which provides insights into the network's inner workings.

By examining the feature maps of various convolutional layers, we can observe how the network progressively captures different levels of abstraction. In the early layers, such as the first few convolutional layers, the network tends to learn simple and low-level features like edges, corners, and textures. These features are more local and specific to certain image regions.

As we move deeper into the network, the feature maps become more complex and abstract. Higher-level layers capture more global and semantic information, focusing on object parts, shapes, and textures. These learned features are more robust and capable of representing more complex visual patterns.

Visualizing the intermediate layers of VGG helps us understand how the network gradually builds a hierarchy of features, with each layer refining and enhancing the representations learned in the previous layers. It also highlights the network's ability to transform raw pixel values into rich, hierarchical feature representations that enable accurate image classification and object detection.

By analyzing the visualizations of different conv layers in VGG, we gain valuable insights into the network's feature learning process, offering a glimpse into how deep convolutional neural networks extract and encode meaningful visual information from images. This understanding not only aids in interpreting the network's decisions but also provides a foundation for developing improved architectures and advancing the field of computer vision.






### Importing the necessary modules

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import matplotlib.pyplot as plt
from PIL import Image

### Setting up the Transformations for Images

In [None]:
# Preprocessing for the image that will be visualised:
# resize it to 224x224 and turn it into a tensor.
transform = transforms.Compose(
    [transforms.Resize(size=(224, 224)), transforms.ToTensor()]
)

### Load the Model

In [None]:
# Load the pre-trained VGG-16 model from torchvision.
# NOTE(review): `pretrained=` is reported as deprecated in newer torchvision
# releases in favour of `weights=` — confirm the installed version before switching.
model = models.vgg16(pretrained=True)
# Print the layer layout so the blocks inspected in the next cell can be identified
print(model)

### Extracting the Convolution Layers from VGG

In [None]:
# Weights of every convolution layer found, in the order they appear in the model
model_weights = []
# The convolution layers themselves (VGG-16 has 13 of them)
conv_layers = []
# Top-level children (blocks) of the model
model_children = list(model.children())

# Collect every Conv2d that sits directly inside a Sequential block,
# together with its weights. Blocks that are not Sequential are only reported.
for block_num, block in enumerate(model_children):
    print("Block: ", block_num, " : ", type(block))
    if not isinstance(block, nn.Sequential):
        continue
    for layer_num, child in enumerate(block.children()):
        print("Layer: ", layer_num, " : ", type(child))
        if isinstance(child, nn.Conv2d):
            model_weights.append(child.weight)
            conv_layers.append(child)

# Number of convolution layers collected above
counter = len(conv_layers)
print(f"Total convolution layers found: {counter}")

### Load the image

In [None]:
# Force three RGB channels: a greyscale, CMYK or RGBA file would otherwise
# produce a tensor whose channel count does not match the 3-channel input
# the convolution layers are run on.
image = Image.open(fp=assets_dir + 'dog_image.jpeg').convert("RGB")
plt.imshow(image)
image = transform(image)
# Add a leading batch dimension so the tensor matches the input layout of VGG
print(f"Image shape before: {image.shape}")
image = image.unsqueeze(0)
print(f"Image shape after: {image.shape}")

### Run the image on the Extracted Convolution Layers and Visualise them

In [None]:
# Set the model to evaluation mode
model.eval()

# Feed the image through the extracted convolution layers one after another.
# NOTE: conv_layers holds only the nn.Conv2d modules, so any non-convolution
# layers that sit between them in the full network are not applied here.
# no_grad avoids recording an autograd graph for these inference-only passes.
with torch.no_grad():
    results = [conv_layers[0](image)]
    for conv in conv_layers[1:]:
        results.append(conv(results[-1]))
outputs = results

# One figure per convolution layer, showing at most its first 16 feature maps
# on a 2x8 grid.
for num_layer, layer_output in enumerate(outputs):
    plt.figure(figsize=(50, 10))
    # Drop the batch dimension: keep the feature maps of the single input image
    layer_viz = layer_output[0, :, :, :]
    print("Layer ", num_layer + 1)
    for i, feature_map in enumerate(layer_viz[:16]):
        plt.subplot(2, 8, i + 1)
        plt.imshow(feature_map, cmap='gray')
        plt.axis("off")
    plt.show()
    plt.close()

## Transfer Learning vs from Scratch

Below we will develop two models to show the benefits of transfer learning: one fine-tuned from a pre-trained VGG and one trained from scratch. Transfer learning helps us save both time and cost, since far less computation is required to reach good performance.

### Using VGG for Transfer Learning

In [None]:
# define the imports
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
from torchvision.models import vgg16

The below code defines a transformation pipeline for the CIFAR-10 dataset. It resizes the images to a size of 224x224 pixels, converts them to tensors, and then applies normalization with mean (0.5, 0.5, 0.5) and standard deviation (0.5, 0.5, 0.5). These transformations are commonly used to preprocess the CIFAR-10 images before feeding them into the VGG model for training or inference.

### Setting up the Transformations for Images

In [None]:
# Preprocessing applied to every CIFAR-10 image before it reaches VGG:
#   1. resize to 224x224,
#   2. convert to a tensor,
#   3. normalise each of the three channels with mean 0.5 and std 0.5.
# NOTE(review): torchvision documents its pre-trained weights as expecting
# ImageNet statistics (mean 0.485/0.456/0.406, std 0.229/0.224/0.225) —
# confirm whether 0.5/0.5 is intentional here.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

### Load the Dataset

In [None]:
import os

# Directory the CIFAR-10 files are stored in; created up front if missing
cifar_root_dir = './.data/cifar-10'
os.makedirs(name=cifar_root_dir, exist_ok=True)

# Build the training split first, then the test split; both use the same
# root directory and the same preprocessing pipeline.
train_dataset, test_dataset = (
    torchvision.datasets.CIFAR10(
        root=cifar_root_dir, train=is_train, download=True, transform=transform
    )
    for is_train in (True, False)
)

### Datasets and DataLoaders


In PyTorch, datasets and dataloaders are essential components for handling and processing data during the training and testing phases of machine learning models.

Datasets represent collections of data samples with their corresponding labels. They provide an interface to access individual samples, allowing users to retrieve and preprocess data for training or evaluation. PyTorch provides built-in datasets like MNIST, CIFAR-10, and ImageNet, but custom datasets can also be created to work with specific data.

Dataloaders, on the other hand, are utilities that enable efficient data loading and batching. They take a dataset as input and allow users to define batch sizes, shuffle the data, and apply transformations to the samples. Dataloaders are especially useful when dealing with large datasets, as they enable the model to process data in small batches, reducing memory requirements and speeding up training. They are key components in PyTorch that facilitate data handling and preparation for machine learning tasks.

Fine-tuning a large network such as VGG on all ten CIFAR-10 classes takes a long time, so to keep this demo fast we filter the dataset down to just two classes: cats and dogs.

In [None]:
# CIFAR-10 class ids of the two classes we train on: 3 is "cat" and 5 is "dog"
class_ids = [3, 5]
# Names in the same order as `class_ids`, so class_names[k] names output index k
class_names = ['cat', 'dog']
# Map the CIFAR-10 ids onto contiguous output-layer indices (3 -> 0, 5 -> 1),
# which makes it easier to compare predictions with labels
label_map = {class_id: index for index, class_id in enumerate(class_ids)}

# Positions of the samples whose label is one of the selected classes
train_indices = [idx for idx, label in enumerate(train_dataset.targets) if label in label_map]
test_indices = [idx for idx, label in enumerate(test_dataset.targets) if label in label_map]

# Restrict both splits to those samples. The subsets still yield the original
# CIFAR-10 ids, so labels must be passed through `label_map` before use.
train_dataset = torch.utils.data.Subset(dataset=train_dataset, indices=train_indices)
test_dataset = torch.utils.data.Subset(dataset=test_dataset, indices=test_indices)

### Defining the DataLoaders

In [None]:
# Wrap both subsets in loaders that yield batches of 32 samples;
# shuffling is enabled for the training loader only.
train_loader, test_loader = (
    torch.utils.data.DataLoader(dataset=split, batch_size=32, shuffle=do_shuffle)
    for split, do_shuffle in ((train_dataset, True), (test_dataset, False))
)

The last layer of VGG has 1000 outputs because the network was trained on ImageNet, which contains 1000 classes. We replace that layer with a new one that connects the second-to-last layer (4096 features) to the number of classes in our task, i.e. 2.

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from VGG-16 with its pre-trained weights
vgg = vgg16(pretrained=True)
# Replace the final classifier layer (1000 ImageNet outputs) with a new linear
# layer that has one output per entry in `class_names`
vgg.classifier[6] = nn.Linear(in_features=4096, out_features=len(class_names))

vgg.to(device)

### Loss Function and Optimizer

In [None]:
# SGD over all VGG parameters (learning rate 0.001, momentum 0.9),
# trained against a cross-entropy loss
optimizer = optim.SGD(vgg.parameters(), lr=1e-3, momentum=0.9)
criterion = nn.CrossEntropyLoss()

### Training the Model

Note: The training cell below takes around 10 minutes to run. If you would rather not wait, skip it and load an already saved model instead (see "Load your already saved model" below).

In [None]:
# Fine-tune VGG for a fixed number of epochs and report the test accuracy
# after each one.
num_epochs = 4
for epoch in range(num_epochs):
    # Switch back to training mode at the start of every epoch: the evaluation
    # pass at the end of the previous epoch leaves the model in eval mode.
    vgg.train()
    running_loss = 0.0

    for images, labels in train_loader:
        # Remap the CIFAR-10 class ids to output-layer indices via label_map (3 -> 0, 5 -> 1)
        labels = torch.tensor([label_map[class_id] for class_id in labels.tolist()])
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()  # Clear gradients from previous iteration
        outputs = vgg(images)  # Forward pass to get model predictions
        loss = criterion(outputs, labels)  # Loss between predictions and actual labels
        loss.backward()  # Backpropagation to compute gradients
        optimizer.step()  # Update model parameters using the computed gradients
        running_loss += loss.item()  # Sum of the per-batch losses over the epoch

    # Calculate accuracy on the test set

    # Evaluation mode changes layer behaviour only; gradient tracking is
    # disabled separately by torch.no_grad() below.
    vgg.eval()
    # Running counts of correct and total predictions
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in test_loader:
            # Same id -> index remapping as in the training pass
            labels = torch.tensor([label_map[class_id] for class_id in labels.tolist()])
            images, labels = images.to(device), labels.to(device)

            outputs = vgg(images)
            # Predicted class = index of the largest output for each image
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {running_loss:.4f} - Test Accuracy: {accuracy:.2f}%")

### Save your Model

In [None]:
import os

# Directory that holds the saved VGG state dict; created here if it is missing
cifar_modelstate_root_dir = './.model-state/cifar-10'
os.makedirs(name=cifar_modelstate_root_dir, exist_ok=True)

# The string literal below is inert on purpose: remove the surrounding
# triple quotes to actually save the trained model's state dict.
"""
# Save the model
state_dict = vgg.state_dict()
torch.save(state_dict, cifar_modelstate_root_dir+"/vgg_model_state_dict.pt")
"""

### Load your already saved model

In [None]:
from torchvision.models import vgg16
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim

# Load a previously saved model so the training cell does not have to be re-run
file_path = cifar_modelstate_root_dir+"/vgg_model_state_dict.pt"

# Resolve the target device first so the checkpoint can be mapped onto it while loading
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture the state dict was saved from:
# VGG-16 with the final classifier layer replaced by a 2-output linear layer
loaded_vgg_model = vgg16(pretrained=False)
loaded_vgg_model.classifier[6] = nn.Linear(in_features=4096, out_features=2)
# map_location lets a checkpoint that was saved on a GPU be loaded on a CPU-only machine
saved_state_dict = torch.load(file_path, map_location=device)
# Load the state dictionary into the model
loaded_vgg_model.load_state_dict(saved_state_dict)
# Move the model to the device and set it to evaluation mode
loaded_vgg_model = loaded_vgg_model.to(device)
loaded_vgg_model.eval()

### Evaluating the model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
def evaluate(model, data_loader, average='weighted'):
    """Compute classification metrics for `model` over `data_loader`.

    Labels yielded by the loader are remapped through the module-level
    `label_map` before being compared with the predictions, and images are
    moved to the module-level `device` for the forward pass.

    Args:
        model: The network to evaluate; it is switched to evaluation mode.
        data_loader: Iterable yielding `(images, labels)` batches.
        average: Averaging mode handed to precision, recall and F1 alike, so
            the three scores are directly comparable. Defaults to 'weighted',
            matching the metrics reported for the from-scratch CNN.

    Returns:
        A tuple `(accuracy, precision, recall, f1, cf_matrix)`.
    """
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for images, labels in data_loader:
            # Remap the CIFAR-10 class ids to output-layer indices via label_map
            true_labels.extend(label_map[class_id] for class_id in labels.tolist())

            # Forward pass
            outputs = model(images.to(device))
            # Predicted class = index of the largest output for each image
            predicted_labels = outputs.argmax(dim=1)
            predictions.extend(predicted_labels.cpu().tolist())

    # Score the predictions against the true labels
    accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
    precision = precision_score(y_true=true_labels, y_pred=predictions, average=average)
    recall = recall_score(y_true=true_labels, y_pred=predictions, average=average)
    f1 = f1_score(y_true=true_labels, y_pred=predictions, average=average)
    cf_matrix = confusion_matrix(y_true=true_labels, y_pred=predictions)

    return accuracy, precision, recall, f1, cf_matrix

In [None]:
# By default the model restored from disk is evaluated. To evaluate the model
# trained in this session instead, use the commented-out call with `vgg`.
# test_accuracy, test_precision, test_recall, test_f1, cf_matrix = evaluate(vgg, test_loader)
test_accuracy, test_precision, test_recall, test_f1, cf_matrix = evaluate(loaded_vgg_model, test_loader)

# Report the final test-set performance, one metric per line
for metric_name, metric_value in (
    ("Accuracy", test_accuracy),
    ("Precision", test_precision),
    ("Recall", test_recall),
    ("F1 Score", test_f1),
):
    print(f"Test {metric_name}: {metric_value:.4f}")
print("Test Confusion Matrix:")
print(cf_matrix)

### Developing a model from Scratch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Run on the GPU when CUDA is available; otherwise use the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### CNN Model Architecture

The CNN class below is a small architecture that we build and train from scratch, so that we can compare its image-classification performance with that of the fine-tuned VGG model.

In [None]:
# Define model architecture
class CNN(nn.Module):
    """
    Convolutional Neural Network (CNN) for image classification.

    This CNN consists of two convolutional layers, each followed by a ReLU and a
    max-pooling layer, and two fully connected layers. The input images are
    expected to have three channels (RGB) and a spatial size of 32x32: the two
    2x2 poolings reduce that to 8x8, which is what `fc1` is sized for.

    Args:
        num_classes (int): The number of classes in the classification task.

    Attributes:
        conv1 (nn.Conv2d): The first convolutional layer with 16 output channels and a kernel size of 3x3.
        relu1 (nn.ReLU): The ReLU activation function applied after the first convolutional layer.
        pool1 (nn.MaxPool2d): The max-pooling layer with a kernel size of 2x2 after the first convolutional layer.
        conv2 (nn.Conv2d): The second convolutional layer with 32 output channels and a kernel size of 3x3.
        relu2 (nn.ReLU): The ReLU activation function applied after the second convolutional layer.
        pool2 (nn.MaxPool2d): The max-pooling layer with a kernel size of 2x2 after the second convolutional layer.
        fc1 (nn.Linear): The first fully connected layer with 64 units.
        relu3 (nn.ReLU): The ReLU activation function applied after the first fully connected layer.
        fc2 (nn.Linear): The second fully connected layer with `num_classes` units for classification.

    Methods:
        forward(x): Performs a forward pass through the network given an input tensor x.
                    Returns the output tensor after passing through the fully connected layers.
    """

    def __init__(self, num_classes):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(in_features=32 * 8 * 8, out_features=64)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=64, out_features=num_classes)

    def forward(self, x):
        """Return the class scores for a batch of images `x`.

        Raises:
            RuntimeError: If the spatial size of `x` does not produce the
                32 * 8 * 8 features per sample that `fc1` expects.
        """
        x = self.pool1(self.relu1(self.conv1(x)))
        x = self.pool2(self.relu2(self.conv2(x)))
        # Flatten per sample instead of view(-1, 32 * 8 * 8): the batch
        # dimension is always preserved, so an unexpected input size fails
        # loudly in fc1 rather than silently changing the batch size.
        x = x.flatten(start_dim=1)
        x = self.relu3(self.fc1(x))
        x = self.fc2(x)
        return x

### Training Parameters

In [None]:
# Hyperparameters for the from-scratch CNN:
# optimiser step size and number of samples per batch
learning_rate = 1e-3
batch_size = 32

### Transformations

In [None]:
# Preprocessing for the from-scratch CNN: convert each image to a tensor and
# normalise its three channels with mean 0.5 and std 0.5. No resizing is done.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

### Loading the dataset

In [None]:
# Load the CIFAR-10 train and test splits through the `CIFAR10` class that this
# section's import cell brings in, instead of relying on `torchvision` having
# been imported by an earlier section.
# NOTE(review): root './data' differs from the './.data/cifar-10' root used for
# the transfer-learning run, so the files are presumably downloaded a second
# time — confirm whether the two roots should match.
train_dataset = CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = CIFAR10(root='./data', train=False, download=True, transform=transform)

### Dataset and Dataloader

In [None]:
# CIFAR-10 class ids of the two classes we train on: 3 is "cat" and 5 is "dog"
class_ids = [3, 5]
# Names in the same order as `class_ids`, so class_names[k] names output index k
class_names = ['cat', 'dog']
# Map the CIFAR-10 ids onto contiguous output-layer indices (3 -> 0, 5 -> 1),
# which makes it easier to compare predictions with labels
label_map = {class_id: index for index, class_id in enumerate(class_ids)}

# Positions of the samples whose label is one of the selected classes
train_indices = [idx for idx, label in enumerate(train_dataset.targets) if label in label_map]
test_indices = [idx for idx, label in enumerate(test_dataset.targets) if label in label_map]

# Restrict both splits to those samples. The subsets still yield the original
# CIFAR-10 ids, so labels must be passed through `label_map` before use.
train_dataset = torch.utils.data.Subset(dataset=train_dataset, indices=train_indices)
test_dataset = torch.utils.data.Subset(dataset=test_dataset, indices=test_indices)

In [None]:
# Define the data loaders using the `batch_size` declared under
# "Training Parameters" and the `DataLoader` class imported for this section,
# so changing that parameter actually affects training.
# Shuffling is enabled for the training loader only.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

### Loss Criterion and Optimizer

In [None]:
# Create the from-scratch CNN with two output classes and move it to the device
model = CNN(num_classes=2).to(device)

# Cross-entropy loss, optimised with SGD. The step size comes from the
# `learning_rate` declared under "Training Parameters" rather than a repeated
# literal, so the two cannot drift apart.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9)

### Training the model and Evaluation

In [None]:
# Train the from-scratch CNN for a fixed number of epochs and report
# loss and classification metrics on the test set after each one.
num_epochs = 4
for epoch in range(num_epochs):
    model.train()  # Training mode, so the model can learn from the training data
    train_loss = 0.0
    for images, labels in train_loader:
        # Remap the CIFAR-10 class ids to output-layer indices via label_map (3 -> 0, 5 -> 1)
        labels = torch.tensor([label_map[class_id] for class_id in labels.tolist()])
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()  # Clear gradients from previous iteration
        outputs = model(images)  # Forward pass to get model predictions
        loss = criterion(outputs, labels)  # Loss between predictions and actual labels
        loss.backward()  # Backpropagation to compute gradients
        optimizer.step()  # Update model parameters using the computed gradients
        # Weight the batch loss by the batch size so the division below
        # yields a per-sample average
        train_loss += loss.item() * images.size(0)
    train_loss /= len(train_loader.dataset)

    # Evaluation on the test set
    model.eval()
    test_loss = 0.0
    true_labels = []
    pred_labels = []
    with torch.no_grad():
        for images, labels in test_loader:
            # Same id -> index remapping as in the training pass
            labels = torch.tensor([label_map[class_id] for class_id in labels.tolist()])
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            # Weight by batch size, exactly as for train_loss: the sum is
            # divided by the number of samples below, not the number of batches
            test_loss += loss.item() * images.size(0)
            # Predicted class = index of the largest output for each image
            predicted = outputs.argmax(dim=1)
            # Collect the true and the predicted labels for the metrics below
            true_labels.extend(labels.cpu().tolist())
            pred_labels.extend(predicted.cpu().tolist())

    test_loss /= len(test_loader.dataset)

    # Calculate evaluation metrics based on the true labels and the predicted labels
    accuracy = accuracy_score(y_true=true_labels, y_pred=pred_labels)
    precision = precision_score(y_true=true_labels, y_pred=pred_labels, average='weighted')
    recall = recall_score(y_true=true_labels, y_pred=pred_labels, average='weighted')
    f1 = f1_score(y_true=true_labels, y_pred=pred_labels, average='weighted')

    # Print the evaluation metrics
    print(f"Epoch {epoch + 1}/{num_epochs} - "
          f"Train Loss: {train_loss:.4f} - "
          f"Test Loss: {test_loss:.4f} - "
          f"Accuracy: {accuracy:.4f} - "
          f"Precision: {precision:.4f} - "
          f"Recall: {recall:.4f} - "
          f"F1 Score: {f1:.4f}")