In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torchvision import datasets
import torchvision.transforms as transforms
from torchvision.models import resnet34
from torch.utils.data import DataLoader

from sklearn.metrics import confusion_matrix, f1_score
import math
from tqdm import tqdm

In [2]:
# convert data to a normalized torch.FloatTensor
transform = transforms.Compose([    transforms.ToTensor(), transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010])
])


In [3]:
# loading the train data
train_data = datasets.CIFAR10('data', train=True,
                              download=True, transform=transform)
train_dataloader = DataLoader(train_data, batch_size=100,shuffle=True)

#loading the test data
test_data = datasets.CIFAR10('data', train=False,
                             download=True, transform=transform)
test_dataloader = DataLoader(test_data, batch_size=100, shuffle=True)
# You should define x_train and y_train

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data\cifar-10-python.tar.gz


100%|███████████████████████████████████████████████████████████████| 170498071/170498071 [00:45<00:00, 3757048.37it/s]


Extracting data\cifar-10-python.tar.gz to data
Files already downloaded and verified


### Dense (fully connected) layer

In [4]:
class Dense:
    def __init__(self,n_inputs,n_neurons):
        # He Weight Initialization
        self.weights = np.random.randn(n_inputs, n_neurons) * np.sqrt(2 / n_inputs)
        self.biases = np.zeros((1, n_neurons))
    
    def forward(self,inputs):
        self.inputs = inputs
        return np.dot(inputs, self.weights) + self.biases

    def backward(self,output_error):
        # calculating errors
        self.inputs_error = np.dot(output_error, self.weights.T)
        self.weights_grad = np.dot(self.inputs.T, output_error)
        self.biases_grad = output_error
        return self.inputs_error

In [5]:

w = np.random.randn(10, 10) * np.sqrt(2 / 10)
b = np.zeros((1, 10))
print(w.shape)
print(b.shape)
print(b)

(10, 10)
(1, 10)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


### Activation Layers


In [6]:
class ReLU:
    def forward(self,inputs):
        self.inputs = inputs
        return np.maximum(0,self.inputs)

    def backward(self, output_error):
        return output_error > 0
        

In [7]:
class Sigmoid:
    def forward(self,inputs):
        self.outputs = 1 / (1 + np.exp(-inputs))
        return self.outputs

    def backward(self,output_error):
        self.outputs_grad = output_error * (1-self.outputs) * self.outputs 
        return self.outputs_grad

In [8]:
class Softmax:
    def forward(self, inputs):
        exp_inputs = np.exp(inputs)
        self.output = exp_inputs / np.sum(exp_inputs, axis=1, keepdims=True)
        return self.output
    
    def backward(self, output_error):
        # For a softmax output y and target t, the derivative dy/dx_i can be computed as:
        # dy/dx_i = y_i * (1 - y_i) if i=j, and -y_i * y_j if i!=j
        # where j is the index of the correct class
        # More details: https://deepnotes.io/softmax-crossentropy#derivative-of-softmax
        
        # Here, output_error is the derivative of the loss with respect to the output of the softmax layer.
        # We need to compute the derivative of the loss with respect to the input to the softmax layer.
        # Using the chain rule, this can be written as:
        # dL/dx_i = dL/dy_j * dy_j/dx_i, where j is the index of the correct class
        # dy_j/dx_i can be computed using the above formula
        
        self.input_error = output_error * self.output - output_error * np.sum(self.output * self.output, axis=1, keepdims=True)
        return self.input_error


### Loss function

In [9]:
class Categorical_Cross_Entropy_loss:
    def forward(self,softmax_output,class_label):
        self.softmax_output = softmax_output
        self.class_label = class_label
        self.batch_size = softmax_output.shape[0]
        self.loss = -np.sum(np.log(softmax_output[np.arange(self.batch_size), class_label])) / self.batch_size
        return self.loss
        
    def backward(self,softmax_output,class_label):
        # The derivative of the CCE loss with respect to the softmax output y_i can be computed as:
        # dL/dy_i = y_i - t_i, where t_i is 1 if i is the index of the correct class, and 0 otherwise
        # More details: https://deepnotes.io/softmax-crossentropy
        
        # Here, softmax_output is the output of the softmax layer, and class_label is the index of the correct class for each input
        # We need to compute the derivative of the loss with respect to the input to the softmax layer.
        # Using the chain rule, this can be written as:
        # dL/dx_i = dL/dy_i * dy_i/dx_i, where dy_i/dx_i can be computed using the softmax backward function.
        
        output_error = softmax_output.copy()
        output_error[np.arange(self.batch_size), class_label] -= 1
        output_error /= self.batch_size
        
        return output_error

### Optimizer

In [10]:
class CosineScheduler:
    def __init__(self, max_update, base_lr=0.01, final_lr=0,
               warmup_steps=0, warmup_begin_lr=0):
        self.base_lr_orig = base_lr
        self.max_update = max_update
        self.final_lr = final_lr
        self.warmup_steps = warmup_steps
        self.warmup_begin_lr = warmup_begin_lr
        self.max_steps = self.max_update - self.warmup_steps

    def get_warmup_lr(self, epoch):
        increase = (self.base_lr_orig - self.warmup_begin_lr) \
                       * float(epoch) / float(self.warmup_steps)
        return self.warmup_begin_lr + increase

    def __call__(self, epoch):
        if epoch < self.warmup_steps:
            return self.get_warmup_lr(epoch)
        if epoch <= self.max_update:
            self.base_lr = self.final_lr + (
                self.base_lr_orig - self.final_lr) * (1 + math.cos(
                math.pi * (epoch - self.warmup_steps) / self.max_steps)) / 2
        return self.base_lr

In [11]:
class FactorScheduler:
    def __init__(self, factor=1, stop_factor_lr=1e-7, base_lr=0.001):
        self.factor = factor
        self.stop_factor_lr = stop_factor_lr
        self.base_lr = base_lr

    def __call__(self, num_update):
        self.base_lr = max(self.stop_factor_lr, self.base_lr * self.factor)
        return self.base_lr

In [12]:
class SGD:
    def __init__(self, learning_rate=0.001):
        self.scheduler = FactorScheduler(factor=0.9, stop_factor_lr=1e-2, base_lr=2.0)
    def __call__(self, layer, num_epoch):
        # Update layer parameters based on gradient descent rule
        layer.weights = layer.weights - self.scheduler(num_epoch) * layer.weights_grad
        layer.biases  =  layer.biases - self.scheduler(num_epoch) * layer.biases_grad

### Architecture

In [13]:
feature_extractor = resnet34(pretrained=True)
num_features = feature_extractor.fc.in_features

for param in feature_extractor.parameters():
    param.requires_grad = False

feature_extractor.fc = nn.Identity() 



In [14]:
#model
Layer1 = Dense(num_features,20)
Act1 = ReLU()
Layer2 = Dense(20,10)
Act2 = Softmax()
Loss = Categorical_Cross_Entropy_loss()
Optimizer = SGD(learning_rate=0.01)

### Train

In [None]:


epochs = 20

for epoch in range(epochs):
    epoch_loss = 0
    epoch_accuracy = 0
    i = 0 
    for x_train, y_train in tqdm(train_dataloader, desc=f"Epoch {epoch+1}", colour="blue"):
        # Forward pass
        x = feature_extractor(x_train)
        x = Layer1.forward(x)
        x = Act1.forward(x)
        x = Layer2.forward(x)
        x = Act2.forward(x)
        loss = Loss.forward(x, y_train)

        # Report batch metrics
        y_predict = np.argmax(x, axis=1)
        accuracy = np.mean(y_train.numpy() == y_predict)
        epoch_loss += loss
        epoch_accuracy += accuracy

        # Backward pass
        x = Loss.backward(x, y_train)
        x = Act2.backward(x)
        x = Layer2.backward(x)
        x = Act1.backward(x)
        x = Layer1.backward(x)

        # Update parameters
        Optimizer(Layer1, epoch)
        Optimizer(Layer2, epoch)

        i += 1

    # Report epoch metrics
    epoch_loss /= len(train_dataloader)
    epoch_accuracy /= len(train_dataloader)
    print(f'Epoch: {epoch+1}')
    print(f'Loss: {epoch_loss:.5f}')
    print(f'Accuracy: {epoch_accuracy:.5f}')
    print('--------------------------')


Epoch 1: 100%|[34m███████████████████████████████████████████████████████████████████████[0m| 500/500 [01:57<00:00,  4.26it/s][0m


Epoch: 1
Loss: 2.30608
Accuracy: 0.10076
--------------------------


Epoch 2: 100%|[34m███████████████████████████████████████████████████████████████████████[0m| 500/500 [01:52<00:00,  4.45it/s][0m


Epoch: 2
Loss: 2.30259
Accuracy: 0.09920
--------------------------


Epoch 3: 100%|[34m███████████████████████████████████████████████████████████████████████[0m| 500/500 [01:51<00:00,  4.49it/s][0m


Epoch: 3
Loss: 2.30258
Accuracy: 0.10116
--------------------------


Epoch 4: 100%|[34m███████████████████████████████████████████████████████████████████████[0m| 500/500 [01:48<00:00,  4.60it/s][0m


Epoch: 4
Loss: 2.30260
Accuracy: 0.09976
--------------------------


Epoch 5: 100%|[34m███████████████████████████████████████████████████████████████████████[0m| 500/500 [01:48<00:00,  4.61it/s][0m


Epoch: 5
Loss: 2.30258
Accuracy: 0.09952
--------------------------


Epoch 6: 100%|[34m███████████████████████████████████████████████████████████████████████[0m| 500/500 [01:50<00:00,  4.54it/s][0m


Epoch: 6
Loss: 2.30258
Accuracy: 0.10054
--------------------------


Epoch 7: 100%|[34m███████████████████████████████████████████████████████████████████████[0m| 500/500 [01:47<00:00,  4.64it/s][0m


Epoch: 7
Loss: 2.30259
Accuracy: 0.10040
--------------------------


Epoch 8: 100%|[34m███████████████████████████████████████████████████████████████████████[0m| 500/500 [01:50<00:00,  4.54it/s][0m


Epoch: 8
Loss: 2.30258
Accuracy: 0.10140
--------------------------


Epoch 9: 100%|[34m███████████████████████████████████████████████████████████████████████[0m| 500/500 [01:44<00:00,  4.77it/s][0m


Epoch: 9
Loss: 2.30260
Accuracy: 0.10050
--------------------------


Epoch 10: 100%|[34m██████████████████████████████████████████████████████████████████████[0m| 500/500 [01:44<00:00,  4.79it/s][0m


Epoch: 10
Loss: 2.30259
Accuracy: 0.09926
--------------------------


Epoch 11: 100%|[34m██████████████████████████████████████████████████████████████████████[0m| 500/500 [01:43<00:00,  4.84it/s][0m


Epoch: 11
Loss: 2.30258
Accuracy: 0.10102
--------------------------


Epoch 12: 100%|[34m██████████████████████████████████████████████████████████████████████[0m| 500/500 [01:37<00:00,  5.11it/s][0m


Epoch: 12
Loss: 2.30259
Accuracy: 0.09936
--------------------------


Epoch 13: 100%|[34m██████████████████████████████████████████████████████████████████████[0m| 500/500 [01:31<00:00,  5.45it/s][0m


Epoch: 13
Loss: 2.30258
Accuracy: 0.09970
--------------------------


Epoch 14: 100%|[34m██████████████████████████████████████████████████████████████████████[0m| 500/500 [01:30<00:00,  5.52it/s][0m


Epoch: 14
Loss: 2.30258
Accuracy: 0.10138
--------------------------


Epoch 15: 100%|[34m██████████████████████████████████████████████████████████████████████[0m| 500/500 [01:34<00:00,  5.31it/s][0m


Epoch: 15
Loss: 2.30258
Accuracy: 0.09972
--------------------------


Epoch 16: 100%|[34m██████████████████████████████████████████████████████████████████████[0m| 500/500 [01:42<00:00,  4.86it/s][0m


Epoch: 16
Loss: 2.30258
Accuracy: 0.10074
--------------------------


Epoch 17:  41%|[34m████████████████████████████▋                                         [0m| 205/500 [00:46<01:04,  4.56it/s][0m

### Evaluation

In [None]:
#Confusion Matrix for the training set
cm_train = confusion_matrix(y_train, y_predict)
plt.subplots(figsize=(10, 6))
sb.heatmap(cm_train, annot = True, fmt = 'g')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix for the training set")
plt.show()

#Confusion Matrix for the test set
# // To Do