### Question 1
Download the Fashion_MNIST dataset. You can find it on the official Fashion-MNIST website or by using PyTorch's torchvision.datasets module. Split the dataset into training, validation and testing sets. A common split is 80% of the data to train, 10% to validate, and 10% to test scenarios, but you can adjust this as needed. Normalize the images. This involves scaling the pixel values to a range between 0 and 1.

In [1]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

# Define transformations for the dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # Normalize to [-1, 1] range
])

# Download and load the FashionMNIST dataset
dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)

# Define the size of each split
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Split the dataset
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Create data loaders for each split
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Print sizes to verify
print(f"Training set size: {train_size}")
print(f"Validation set size: {val_size}")
print(f"Test set size: {test_size}")


Training set size: 48000
Validation set size: 6000
Test set size: 6000


### Question 2
Implement a MLP for classification. 

#### a) Flatten the images into a single dimensional vector before feeding it to the model. 
#### b) Write a pre-processing module for all the images.  
#### c) Write the Forward pass from scratch. 

In [2]:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(28*28, 256)  # First fully connected layer
        self.fc2 = nn.Linear(256, 128)    # Second fully connected layer
        self.fc3 = nn.Linear(128, 10)     # Output layer

    def forward(self, x):
        # a. Flatten the images
        x = x.view(-1, 28*28)
        
        # First layer with ReLU activation
        x = torch.relu(self.fc1(x))
        
        # Second layer with ReLU activation
        x = torch.relu(self.fc2(x))
        
        # Output layer
        x = self.fc3(x)
        
        return x

model = MLP()


#### d)Write the Backward pass from scratch. 

In [3]:
def backward_pass(model, loss, learning_rate):
    # Zero the gradients
    for param in model.parameters():
        if param.grad is not None:
            param.grad.data.zero_()
    
    # Compute gradients
    loss.backward()
    
    # Update parameters
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad


#### e) Write the module for cross entropy loss 

def cross_entropy_loss(output, target):
    # Apply softmax to output
    softmax_output = torch.softmax(output, dim=1)
    
    # Calculate cross-entropy loss
    loss = -torch.sum(target * torch.log(softmax_output))
    return loss


#### f) Experiment with different hyperparameters like number of layers, dropout, objective function, etc. and settle with a combination which performs the best for the given problem

In [5]:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_size=28*28, hidden_sizes=[256, 128], output_size=10, dropout_prob=0.0):
        super(MLP, self).__init__()
        layers = []
        current_size = input_size
        
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(current_size, hidden_size))
            layers.append(nn.ReLU())
            if dropout_prob > 0:
                layers.append(nn.Dropout(dropout_prob))
            current_size = hidden_size
        
        layers.append(nn.Linear(current_size, output_size))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten the images
        return self.network(x)


In [6]:
import torch.optim as optim

def train_and_evaluate(model, train_loader, val_loader, num_epochs, learning_rate, criterion):
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        model.train()
        for images, labels in train_loader:
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        val_loss /= len(val_loader)
        val_accuracy = 100 * correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')
    
    return val_loss, val_accuracy


In [7]:
# Hyperparameter configurations
hidden_layer_configs = [
    [256],
    [256, 128],
    [512, 256, 128]
]
dropout_probs = [0.0, 0.2, 0.5]
learning_rates = [0.01, 0.001]
num_epochs = 10

best_val_loss = float('inf')
best_model = None
best_config = {}

# Experiment with different hyperparameters
for hidden_layers in hidden_layer_configs:
    for dropout_prob in dropout_probs:
        for lr in learning_rates:
            print(f'Experimenting with hidden_layers={hidden_layers}, dropout_prob={dropout_prob}, learning_rate={lr}')
            model = MLP(input_size=28*28, hidden_sizes=hidden_layers, output_size=10, dropout_prob=dropout_prob)
            criterion = nn.CrossEntropyLoss()
            val_loss, val_accuracy = train_and_evaluate(model, train_loader, val_loader, num_epochs, lr, criterion)
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = model
                best_config = {
                    'hidden_layers': hidden_layers,
                    'dropout_prob': dropout_prob,
                    'learning_rate': lr,
                    'val_loss': val_loss,
                    'val_accuracy': val_accuracy
                }

print(f'Best Configuration: {best_config}')

Experimenting with hidden_layers=[256], dropout_prob=0.0, learning_rate=0.01
Epoch [1/10], Validation Loss: 0.6025, Validation Accuracy: 79.12%
Epoch [2/10], Validation Loss: 0.5331, Validation Accuracy: 81.55%
Epoch [3/10], Validation Loss: 0.4955, Validation Accuracy: 82.63%
Epoch [4/10], Validation Loss: 0.4742, Validation Accuracy: 83.23%
Epoch [5/10], Validation Loss: 0.4633, Validation Accuracy: 83.70%
Epoch [6/10], Validation Loss: 0.4479, Validation Accuracy: 84.27%
Epoch [7/10], Validation Loss: 0.4516, Validation Accuracy: 83.95%
Epoch [8/10], Validation Loss: 0.4299, Validation Accuracy: 84.67%
Epoch [9/10], Validation Loss: 0.4281, Validation Accuracy: 84.77%
Epoch [10/10], Validation Loss: 0.4190, Validation Accuracy: 85.17%
Experimenting with hidden_layers=[256], dropout_prob=0.0, learning_rate=0.001
Epoch [1/10], Validation Loss: 1.3085, Validation Accuracy: 68.35%
Epoch [2/10], Validation Loss: 0.9563, Validation Accuracy: 73.18%
Epoch [3/10], Validation Loss: 0.8165, V

Epoch [1/10], Validation Loss: 2.0404, Validation Accuracy: 45.55%
Epoch [2/10], Validation Loss: 1.6474, Validation Accuracy: 51.13%
Epoch [3/10], Validation Loss: 1.3258, Validation Accuracy: 60.97%
Epoch [4/10], Validation Loss: 1.1297, Validation Accuracy: 65.88%
Epoch [5/10], Validation Loss: 1.0068, Validation Accuracy: 68.30%
Epoch [6/10], Validation Loss: 0.9226, Validation Accuracy: 70.92%
Epoch [7/10], Validation Loss: 0.8600, Validation Accuracy: 72.52%
Epoch [8/10], Validation Loss: 0.8136, Validation Accuracy: 72.55%
Epoch [9/10], Validation Loss: 0.7768, Validation Accuracy: 73.32%
Epoch [10/10], Validation Loss: 0.7490, Validation Accuracy: 73.88%
Experimenting with hidden_layers=[512, 256, 128], dropout_prob=0.0, learning_rate=0.01
Epoch [1/10], Validation Loss: 0.7257, Validation Accuracy: 73.42%
Epoch [2/10], Validation Loss: 0.5885, Validation Accuracy: 78.27%
Epoch [3/10], Validation Loss: 0.5285, Validation Accuracy: 80.92%
Epoch [4/10], Validation Loss: 0.4915, Va

## Question 3
Implement a CNN backbone model using pytorch.
#### a) Build a small CNN model consisting of 5 convolution layers. Each convolution layer would be followed by a ReLU activation and a max pooling layer

In [8]:
import torch
import torch.nn as nn

class CNN_Model(nn.Module):
    def __init__(self, num_kernels=[16, 32, 64, 128, 256], kernel_size=3):
        super(CNN_Model, self).__init__()

        self.layer1_conv = nn.Conv2d(1, num_kernels[0], kernel_size, padding=1)
        self.layer1_activ = nn.ReLU()
        self.layer1_maxpool = nn.MaxPool2d(2, stride=2)

        self.layer2_conv = nn.Conv2d(num_kernels[0], num_kernels[1], kernel_size, padding=1)
        self.layer2_activ = nn.ReLU()
        self.layer2_maxpool = nn.MaxPool2d(2, stride=2)

        self.layer3_conv = nn.Conv2d(num_kernels[1], num_kernels[2], kernel_size, padding=1)
        self.layer3_activ = nn.ReLU()
        self.layer3_maxpool = nn.MaxPool2d(2, stride=2)

        self.layer4_conv = nn.Conv2d(num_kernels[2], num_kernels[3], kernel_size, padding=1)
        self.layer4_activ = nn.ReLU()
        self.layer4_maxpool = nn.MaxPool2d(2, stride=2)

        self.layer5_conv = nn.Conv2d(num_kernels[3], num_kernels[4], kernel_size, padding=1)
        self.layer5_activ = nn.ReLU()
        self.layer5_maxpool = nn.MaxPool2d(2, stride=2)

        self.fc1 = nn.Linear(num_kernels[4] * 1 * 1, 256)
        self.fc2 = nn.Linear(256, 10)

    def forward(self, x):
        x = self.layer1_conv(x)
        x = self.layer1_activ(x)
        x = self.layer1_maxpool(x)

        x = self.layer2_conv(x)
        x = self.layer2_activ(x)
        x = self.layer2_maxpool(x)

        x = self.layer3_conv(x)
        x = self.layer3_activ(x)
        x = self.layer3_maxpool(x)

        x = self.layer4_conv(x)
        x = self.layer4_activ(x)
        x = self.layer4_maxpool(x)

        x = self.layer5_conv(x)
        x = self.layer5_activ(x)
        x = self.layer5_maxpool(x)

        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.fc2(x)

        return x

# Example instantiation
model = CNN_Model()


In [9]:
model

CNN_Model(
  (layer1_conv): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (layer1_activ): ReLU()
  (layer1_maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (layer2_conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (layer2_activ): ReLU()
  (layer2_maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (layer3_conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (layer3_activ): ReLU()
  (layer3_maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (layer4_conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (layer4_activ): ReLU()
  (layer4_maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (layer5_conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (layer5_activ): ReLU()
  (layer5_maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, d

#### b) Experiment with different kernel size, number of kernel each layer (keep number of filter same in each layer, double it in each layer etc) and settle with a combination which performs the best for the given problem

In [10]:
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

# Define transformations for the dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # Normalize to [-1, 1] range
])

# Download and load the FashionMNIST dataset
dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)

# Define the size of each split
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Split the dataset
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Create data loaders for each split
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Training and evaluation function
def train_and_evaluate(model, train_loader, val_loader, num_epochs, learning_rate, criterion):
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        model.train()
        for images, labels in train_loader:
            outputs = model(images)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        val_loss /= len(val_loader)
        val_accuracy = 100 * correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')
    
    return val_loss, val_accuracy

# Hyperparameter configurations
kernel_sizes = [2, 1]
num_kernels_configs = [
    [16, 32, 64, 128, 256],
    [32, 64, 128, 256, 256]
]
learning_rates = [0.01, 0.001]
num_epochs = 10

best_val_loss = float('inf')
best_model = None
best_config = {}

# Experiment with different hyperparameters
for kernel_size in kernel_sizes:
    for num_kernels in num_kernels_configs:
        for lr in learning_rates:
            print(f'Experimenting with kernel_size={kernel_size}, num_kernels={num_kernels}, learning_rate={lr}')
            model = CNN_Model(num_kernels=num_kernels, kernel_size=kernel_size)
            criterion = nn.CrossEntropyLoss()
            val_loss, val_accuracy = train_and_evaluate(model, train_loader, val_loader, num_epochs, lr, criterion)
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = model
                best_config = {
                    'kernel_size': kernel_size,
                    'num_kernels': num_kernels,
                    'learning_rate': lr,
                    'val_loss': val_loss,
                    'val_accuracy': val_accuracy
                }

print(f'Best Configuration: {best_config}')


Experimenting with kernel_size=2, num_kernels=[16, 32, 64, 128, 256], learning_rate=0.01
Epoch [1/10], Validation Loss: 2.3011, Validation Accuracy: 16.60%
Epoch [2/10], Validation Loss: 2.2978, Validation Accuracy: 9.58%
Epoch [3/10], Validation Loss: 2.2809, Validation Accuracy: 34.00%
Epoch [4/10], Validation Loss: 1.2359, Validation Accuracy: 52.72%
Epoch [5/10], Validation Loss: 0.8204, Validation Accuracy: 67.92%
Epoch [6/10], Validation Loss: 0.7345, Validation Accuracy: 68.52%
Epoch [7/10], Validation Loss: 0.6791, Validation Accuracy: 72.62%
Epoch [8/10], Validation Loss: 0.5836, Validation Accuracy: 78.30%
Epoch [9/10], Validation Loss: 0.5306, Validation Accuracy: 80.08%
Epoch [10/10], Validation Loss: 0.5302, Validation Accuracy: 79.35%
Experimenting with kernel_size=2, num_kernels=[16, 32, 64, 128, 256], learning_rate=0.001
Epoch [1/10], Validation Loss: 2.3034, Validation Accuracy: 9.72%
Epoch [2/10], Validation Loss: 2.3026, Validation Accuracy: 9.72%
Epoch [3/10], Valid

RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x1024 and 256x256)

#### c) Try different weight initialization methods (random, Xavier, He) 

In [None]:
def initialize_weights(model, method='xavier'):
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            if method == 'xavier':
                nn.init.xavier_uniform_(m.weight)
            elif method == 'he':
                nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
            elif method == 'random':
                nn.init.uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

# Example usage
model = CNN_Model(num_kernels=best_config['num_kernels'], kernel_size=best_config['kernel_size'])
initialize_weights(model, method='xavier')
initialize_weights(model, method='he')
initialize_weights(model, method='random')

#### d) After extracting feature from CNN model use MLP for classification

In [None]:
# Initialize the models
cnn_model = CNN_Model(num_kernels=best_config['num_kernels'], kernel_size=best_config['kernel_size'])
mlp_model = MLP()

# Initialize weights if needed
# initialize_weights(cnn_model, method='xavier')
# initialize_weights(mlp_model, method='xavier')

# Criterion
criterion = nn.CrossEntropyLoss()

# Train and evaluate
val_loss, val_accuracy = train_and_evaluate(cnn_model, mlp_model, train_loader, val_loader, num_epochs, learning_rate, criterion)
print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')