#### Question 1 

<p>Download the Fashion_MNIST dataset. You can find it on the official Fashion-MNIST website or by using PyTorch's torchvision.datasets module. Split the dataset into training, validation and testing sets. A common split is 80% of the data to train, 10% to validate, and 10% to test scenarios, but you can adjust this as needed. Normalize the images. This involves scaling the pixel values to a range between 0 and 1.</p>

---

#### 1. Importing Libraries

In [44]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split

#### 2. Reading Dataset

In [28]:
# Load the Fashion-MNIST train and test CSV files into DataFrames
# (paths are relative to the notebook's working directory).
train = pd.read_csv('./Dataset/fashion-mnist_train.csv')

# Read separately here; both frames are concatenated in the next cell.
test = pd.read_csv('./Dataset/fashion-mnist_test.csv')

In [30]:
# Stack the train and test rows into a single frame with a fresh 0..N-1 index
# so the whole dataset can be re-split later.
df = pd.concat([train, test], ignore_index=True)

In [31]:
df.shape

(70000, 785)

In [32]:
# Keep the original column labels; the pixel names are reused when the
# normalized frame is rebuilt.
df_cols = df.columns

In [33]:
df.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Separate the class labels from the pixel columns (scaling happens in a later cell)
df_labels = df['label']

# Everything after the first column is treated as pixel data.
# Assumes 'label' is the first column, which matches the 785-column shape
# and the 784-wide rows reported by the neighbouring cells.
df_img = df.iloc[:, 1:]
df_img.values

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 1, 3, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [35]:
# View the pixel data as a 2-D array with one row per image
df_images_flat = df_img.values.reshape(df_img.shape[0], -1)

In [36]:
len(df_images_flat[1])

784

In [37]:
# Normalize data: cast to float32 and divide by 255 so pixel values fall in [0, 1]
# (assumes the raw intensities are in the 0-255 range)
df_images_flat = df_images_flat.astype('float32') / 255.0

In [39]:
len(df_images_flat)

70000

In [40]:
# Rebuild a DataFrame from the scaled pixel array, reusing the original pixel column names
normalized_df = pd.DataFrame.from_records(df_images_flat, columns = df_cols[1:])

# Re-attach the class labels as the last column (named 'labels', not 'label')
normalized_df['labels'] = df_labels

In [41]:
normalized_df.head()

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784,labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019608,0.0,0.0,...,0.0,0.0,0.117647,0.168627,0.0,0.0,0.0,0.0,0.0,6
3,0.0,0.0,0.0,0.003922,0.007843,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.003922,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [45]:
# Splitting the dataset
# random_split / DataLoader index the dataset by integer row position. A
# DataFrame indexed that way performs a column lookup instead, so the pixel
# and label columns are wrapped in a TensorDataset that yields
# (image, label) pairs.
pixel_tensor = torch.tensor(
    normalized_df.drop(columns=['labels']).values, dtype=torch.float32
)
label_tensor = torch.tensor(normalized_df['labels'].values, dtype=torch.long)
full_dataset = torch.utils.data.TensorDataset(pixel_tensor, label_tensor)

# 80% train / 10% validation; the test split takes the remainder so the three
# sizes always add up to the full dataset length.
train_size = int(0.8 * len(full_dataset))
val_size = int(0.1 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(
    full_dataset, [train_size, val_size, test_size]
)

# Only the training data is reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

#### Question 2
Implement a MLP for classification.

In [None]:
# MLP Implementation
class MLP:
    """Two-layer perceptron (input -> ReLU hidden layer -> class logits).

    The parameters are plain tensors. Gradients are derived by hand in
    ``backward`` and applied in place with vanilla gradient descent, so
    autograd is not involved.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the parameters.

        Args:
            input_size: number of features per flattened sample.
            hidden_size: number of hidden units.
            num_classes: number of output logits.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes

        # Small random weights break the symmetry between units; biases start at zero.
        self.W1 = torch.randn(input_size, hidden_size) * 0.01
        self.b1 = torch.zeros(hidden_size)
        self.W2 = torch.randn(hidden_size, num_classes) * 0.01
        self.b2 = torch.zeros(num_classes)

    def eval(self):
        """Do nothing and return ``self``.

        Mirrors ``nn.Module.eval`` so evaluation code that calls
        ``model.eval()`` also works with this hand-written model.
        """
        return self

    # Forward Pass (Part 2c)
    def forward(self, x):
        """Return the logits for a batch and cache the intermediates for ``backward``.

        Args:
            x: batch whose elements can be reshaped to ``(-1, input_size)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalized scores.
        """
        x = x.reshape(-1, self.input_size)  # Flatten each sample

        # First layer: input to hidden
        self.z1 = x @ self.W1 + self.b1
        self.a1 = self.relu(self.z1)

        # Second layer: hidden to output (no softmax here; the loss applies it)
        self.z2 = self.a1 @ self.W2 + self.b2

        return self.z2

    def relu(self, z):
        """Element-wise ``max(0, z)``."""
        return torch.clamp(z, min=0)

    def relu_derivative(self, z):
        """Derivative of ReLU: 1 where ``z > 0``, otherwise 0."""
        return (z > 0).float()

    # Backward Pass (Part 2d)
    def backward(self, x, y, outputs, learning_rate):
        """Apply one gradient-descent step for the mean softmax cross-entropy loss.

        Must be called right after ``forward`` on the same batch because it
        reads the cached ``z1`` and ``a1``.

        Args:
            x: the input batch that was passed to ``forward``.
            y: class indices of shape ``(batch,)`` or one-hot targets of
                shape ``(batch, num_classes)``.
            outputs: the logits returned by ``forward`` for this batch.
            learning_rate: step size of the update.
        """
        m = y.size(0)

        # The gradient of softmax cross-entropy w.r.t. the logits is
        # (probabilities - one-hot targets), not (logits - labels).
        probs = torch.softmax(outputs, dim=1)
        if y.dim() == 1:
            targets = torch.zeros_like(probs)
            targets.scatter_(1, y.long().view(-1, 1), 1.0)
        else:
            targets = y.to(probs.dtype)
        dL_dz2 = probs - targets

        dL_dW2 = (self.a1.t() @ dL_dz2) / m
        dL_db2 = dL_dz2.mean(dim=0)

        # Propagate through the output weights, then gate by the ReLU derivative.
        dL_da1 = dL_dz2 @ self.W2.t()
        dL_dz1 = dL_da1 * self.relu_derivative(self.z1)

        dL_dW1 = (x.reshape(-1, self.input_size).t() @ dL_dz1) / m
        dL_db1 = dL_dz1.mean(dim=0)

        # All gradients were computed from the pre-update weights, so the
        # update order below does not matter.
        self.W1 -= learning_rate * dL_dW1
        self.b1 -= learning_rate * dL_db1
        self.W2 -= learning_rate * dL_dW2
        self.b2 -= learning_rate * dL_db2

# Cross Entropy Loss (Part 2e)
def cross_entropy_loss(outputs, labels):
    """Return the mean softmax cross-entropy of a batch.

    Args:
        outputs: logits of shape ``(batch, num_classes)``.
        labels: integer class indices of shape ``(batch,)``.

    Returns:
        Scalar tensor with the average negative log-likelihood of the true classes.
    """
    log_probs = torch.log_softmax(outputs, dim=1)
    # Picking the log-probability of the true class is the same as multiplying
    # by a one-hot matrix and summing, but it needs no global class count and
    # stays on the device/dtype of `outputs`.
    true_class_log_probs = log_probs.gather(1, labels.long().view(-1, 1))
    return -true_class_log_probs.mean()

# Hyperparameters of the hand-written MLP
input_size = 28 * 28  # length of one flattened 28x28 image
hidden_size = 128
num_classes = 10
learning_rate = 0.01
num_epochs = 20

model = MLP(input_size, hidden_size, num_classes)

# Mini-batch training: one forward pass, loss computation and manual
# parameter update per batch of train_loader.
for epoch in range(num_epochs):
    for images, labels in train_loader:
        outputs = model.forward(images)

        # The loss value is only used for reporting; it is not passed to backward().
        loss = cross_entropy_loss(outputs, labels)

        # Backward pass
        model.backward(images, labels, outputs, learning_rate)

    # Reports the loss of the last batch of the epoch, not an epoch average.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Validation and Testing
def evaluate(loader, net=None):
    """Return the classification accuracy (in percent) over ``loader``.

    Args:
        loader: iterable of ``(images, labels)`` batches.
        net: object exposing ``forward(images) -> logits``. Defaults to the
            module-level ``model`` so existing ``evaluate(loader)`` calls
            keep working.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` when the loader yields
        no samples.
    """
    net = model if net is None else net

    # Only nn.Module-style models have an eval mode; a plain Python class
    # without that method is evaluated as is.
    if hasattr(net, "eval"):
        net.eval()

    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            outputs = net.forward(images)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Avoid ZeroDivisionError on an empty loader.
        return 0.0
    return 100 * correct / total

# Accuracy (in percent) of the trained model on the held-out validation and test loaders
val_accuracy = evaluate(val_loader)
test_accuracy = evaluate(test_loader)

print(f'Validation Accuracy: {val_accuracy:.2f}%')
print(f'Test Accuracy: {test_accuracy:.2f}%')

#### Question 3
Implement a CNN backbone model using PyTorch.

In [None]:
class CNN_Model(nn.Module):
    """Five conv -> ReLU -> max-pool stages followed by two dense layers (10 logits).

    Every convolution uses a 2x2 kernel with padding 1 and the default
    stride 1, which grows each spatial side by one pixel. For a 28x28
    single-channel input the feature maps are:

        stage 1: conv 29x29 -> pool(2, stride 1) 28x28, 16 channels
        stage 2: conv 29x29 -> pool(2, stride 1) 28x28, 32 channels
        stage 3: conv 29x29 -> pool(2, stride 2) 14x14, 64 channels
        stage 4: conv 15x15 -> pool(2, stride 2) 7x7,   128 channels
        stage 5: conv 8x8   -> pool(2, stride 2) 4x4,   128 channels

    so the flattened feature vector has 128 * 4 * 4 = 2048 entries.
    """

    def __init__(self):
        super().__init__()

        # Stage 1: 1 input channel -> 16 feature maps
        self.layer1_conv = nn.Conv2d(1, 16, 2, padding=1)
        self.layer1_activ = nn.ReLU()
        self.layer1_maxpool = nn.MaxPool2d(2, stride=1)

        # Stage 2: 16 -> 32 feature maps
        self.layer2_conv = nn.Conv2d(16, 32, 2, padding=1)
        self.layer2_activ = nn.ReLU()
        self.layer2_maxpool = nn.MaxPool2d(2, stride=1)

        # Stage 3: 32 -> 64 feature maps; first stage that halves the resolution
        self.layer3_conv = nn.Conv2d(32, 64, 2, padding=1)
        self.layer3_activ = nn.ReLU()
        self.layer3_maxpool = nn.MaxPool2d(2, stride=2)

        # Stage 4: 64 -> 128 feature maps
        self.layer4_conv = nn.Conv2d(64, 128, 2, padding=1)
        self.layer4_activ = nn.ReLU()
        self.layer4_maxpool = nn.MaxPool2d(2, stride=2)

        # Stage 5: 128 -> 128 feature maps
        self.layer5_conv = nn.Conv2d(128, 128, 2, padding=1)
        self.layer5_activ = nn.ReLU()
        self.layer5_maxpool = nn.MaxPool2d(2, stride=2)

        # Classifier head: 2048 flattened features -> 20 hidden units -> 10 logits
        self.dense1 = nn.Linear(2048, 20)
        self.out = nn.Linear(20, 10)

    def forward(self, x):
        """Return logits of shape ``(batch, 10)`` for ``x`` of shape ``(batch, 1, H, W)``.

        Raises:
            RuntimeError: if the spatial size of ``x`` does not produce 2048
                features per sample.
        """
        x = self.layer1_maxpool(self.layer1_activ(self.layer1_conv(x)))
        x = self.layer2_maxpool(self.layer2_activ(self.layer2_conv(x)))
        x = self.layer3_maxpool(self.layer3_activ(self.layer3_conv(x)))
        x = self.layer4_maxpool(self.layer4_activ(self.layer4_conv(x)))
        x = self.layer5_maxpool(self.layer5_activ(self.layer5_conv(x)))

        # Flatten per sample. Keeping the batch dimension fixed makes a wrongly
        # sized input fail in dense1 instead of being silently regrouped into a
        # different number of rows.
        x = torch.flatten(x, 1)

        # Without a nonlinearity here the two dense layers would collapse into
        # a single linear map.
        x = torch.relu(self.dense1(x))
        return self.out(x)

b. Experiment with different kernel sizes and different numbers of kernels in each layer

In [None]:
#Train the model
def cnn_model_train(train_dataloader, cnn_model, loss_func, optimizer):
    """Train ``cnn_model`` for one pass over ``train_dataloader``.

    Args:
        train_dataloader: iterable of ``(inputs, targets)`` batches.
        cnn_model: module to optimise; it is switched to training mode.
        loss_func: callable ``loss_func(predictions, targets)`` returning a scalar tensor.
        optimizer: optimizer holding the parameters of ``cnn_model``.

    Prints the loss of every 100th batch.
    """
    cnn_model.train()

    for batch, (x_train, y_train) in enumerate(train_dataloader, start=1):
        # Gradients accumulate by default. Clearing them before the backward
        # pass guarantees that nothing left over from earlier work leaks into
        # this update.
        optimizer.zero_grad()

        y_predict = cnn_model(x_train)
        loss = loss_func(y_predict, y_train)

        # Backpropagate the prediction loss, then adjust the parameters.
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            print(f"Train loss: {loss.item()}")

In [None]:
#Test the model
def cnn_model_test(test_dataloader, cnn_model, loss_func):
    """Evaluate ``cnn_model`` on ``test_dataloader`` and print the results.

    Args:
        test_dataloader: sized iterable of ``(inputs, targets)`` batches.
        cnn_model: module to evaluate; it is switched to evaluation mode.
        loss_func: callable ``loss_func(predictions, targets)`` returning a scalar tensor.

    Returns:
        ``(avg_loss, accuracy)``: the loss averaged over batches and the
        fraction of correctly classified samples in ``[0, 1]``. Both are
        ``0.0`` for an empty loader.
    """
    # Evaluation mode matters for layers such as BatchNorm and Dropout.
    cnn_model.eval()
    num_batches = len(test_dataloader)

    loss_sum, correct, seen = 0.0, 0, 0
    # No gradients are needed while testing.
    with torch.no_grad():
        for x_valid, y_valid in test_dataloader:
            predict = cnn_model(x_valid)
            # Accumulate across batches instead of keeping only the latest batch.
            loss_sum += loss_func(predict, y_valid).item()
            correct += (predict.argmax(dim=1) == y_valid).sum().item()
            seen += y_valid.size(0)

    avg_loss = loss_sum / num_batches if num_batches else 0.0
    accuracy = correct / seen if seen else 0.0
    print(f"Test loss: {avg_loss}")
    print(f"Test accuracy: {100 * accuracy:.2f}%")
    return avg_loss, accuracy

In [None]:
# Instantiate the network, the cross-entropy loss and an Adam optimizer (learning rate 0.001)
cnn_model = CNN_Model()
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.001)
# NOTE(review): FMNIST_DataLoader is not defined in the visible part of this
# notebook; confirm where it comes from. It is unpacked as a (train, test)
# pair of loaders.
train_dataloader, test_dataloader = FMNIST_DataLoader()

c. Try different weight initialization methods (random, Xavier, He)

In [None]:
# Smoke test only, not part of the actual run: fetch a single batch from the
# training loader, print its index and stop.
for i, (x,y) in enumerate(train_dataloader):
    print(i)
    #print(x[0])
    #print(y[0])
    break

In [None]:
# Run the CNN for a fixed number of epochs; each epoch is one training pass
# followed by one evaluation pass on the test loader.
epochs = 2
for i in range(epochs):
    print(f"epoch {i+1}")
    cnn_model_train(train_dataloader, cnn_model, loss_func, optimizer)
    cnn_model_test(test_dataloader, cnn_model, loss_func)