In [128]:
import torch
import torchvision
from torchvision.datasets import MNIST
import matplotlib.pyplot as plt
import torchvision.transforms.v2 as v2
import torch.nn as nn
import torch.nn.functional as F

### Get the data for the MNIST dataset
* `10000 samples for train set`
* `5000 samples for test set`

Both the sets will be balanced.

In [6]:
transform =v2.Compose([
    v2.ToImage,
    v2.ToDtype(torch.float,scale=True)
])

train = MNIST('./mnist_dataset',train=True,download=False,transform=transform)
test = MNIST('./mnist_dataset',train=False,download=False,transform = transform)



In [147]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [91]:
class_names = train.classes
class_names
classes= torch.tensor([0,1,2,3,4,5,6,7,8,9])
X_train,y_train = train.data,train.targets


In [97]:
def sample_k_per_class(X, y, classes, k):
    x_samples = []
    y_samples = []

    for cls in classes:
        indices = (y == cls).nonzero(as_tuple=True)[0]
        
        if len(indices) < k:
            raise ValueError(f"Not enough samples in class {cls} to sample {k} items.")

        chosen = indices[torch.randperm(len(indices))[:k]]

        x_samples.append(X[chosen])
        y_samples.append(y[chosen])

    X_out = torch.cat(x_samples, dim=0)
    y_out = torch.cat(y_samples, dim=0)

    perm = torch.randperm(len(y_out))
    return X_out[perm], y_out[perm]
def count_trainable_layers(model):
    return sum(1 for m in model.modules() if any(p.requires_grad for p in m.parameters()))
X_train,y_train = sample_k_per_class(X_train,y_train,classes,1000)

In [79]:
X_test,y_test = test.data,test.targets
X_test,y_test = sample_k_per_class(X_test,y_test,classes,100)

In [133]:
X_train = X_train.type(torch.float)
X_test = X_test.type(torch.float)

In [148]:
X_train,X_test = X_train.to(device),X_test.to(device)
y_train,y_test = y_train.to(device), y_test.to(device)

In [195]:
class MyConvNetwork(nn.Module):
    def __init__(self,config,width):
        super().__init__()
        self.layers  = nn.ModuleList()
        for in_ch,out_ch in config:
            self.layers.append(
                nn.Conv2d(in_ch,out_ch,kernel_size=3,padding='same')
            )
        self.final_width = width - len(config)
        last_out_channels = config[-1][1]
        self.maxpool = nn.MaxPool2d(2,1)
        self.l1 = nn.Linear(last_out_channels*self.final_width*self.final_width,10) 
    def forward(self,x):
        for layer in self.layers:
            x = layer(x)
            x= F.relu(x)
            x = self.maxpool(x)
        
        x = torch.flatten(x,1)
        
        x = self.l1(x)

        return x


In [None]:
# model(torch.unsqueeze(X_train[0],dim=0))

tensor([[-0.3954, -0.5699,  0.2147, -0.2486,  0.2880, -0.0752, -0.4586, -0.3145,
         -0.3568,  0.2415]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [197]:
# model = MyConvNetwork([[1,3],[3,8],[8,16],[16,24],[24,32],[32,24],[24,16],[16,8],[8,3],[3,3]],width=28).to(device)
# opt = torch.optim.Adam(model.parameters(),lr=0.01)
# sch = torch.optim.lr_scheduler.StepLR(opt, step_size=10, gamma=0.1)
# loss_fn = torch.nn.CrossEntropyLoss()

In [196]:
# for epoch in range(50):
    
#     model.train()
#     y_pred_train = model(X_train)
#     y_pred_labels = y_pred_train.argmax(dim=1)
#     correct = (y_pred_labels == y_train).sum().item()
#     trainacc = 100 * correct / len(y_train)

#     loss = loss_fn(y_pred_train,y_train)
#     trainloss = loss.item()
#     opt.zero_grad()
#     loss.backward()

#     for name, param in model.named_parameters():
#         if param.grad is not None:
#             grad_norm = param.grad.data.norm(2).item()
#             if grad_norm < 1e-8 or grad_norm > 1e2:
#                 print("Vanishing or explosion of gradient encountered")
#     opt.step()
#     sch.step()



#     model.eval()
#     with torch.inference_mode():
#         y_pred_test = model(X_test)
#         loss = loss_fn(y_pred_test,y_test)
#         y_pred_labels = y_pred_test.argmax(dim=1)
#         correct = (y_pred_labels == y_test).sum().item()
#         testacc = 100 * correct / len(y_test)

#         testloss = loss.item()
    
#     if epoch%5==0:
#         print(f"{epoch} :: Train loss : {trainloss} Train Acc : {trainacc} -------------------------- Test loss : {testloss}  Test Acc : {testacc}")


In [198]:
config_ls = []
num_models = 3
layers = 5
for i in range(num_models):
    hold = [[1,3]]
    for _ in range(layers-1):
        hold.append([3,3])
    
    config_ls.append(hold)
    layers += 5

models = []
optimizers = []
schedulers = []
loss_fn = nn.CrossEntropyLoss()
for config in config_ls:
    model_hold = MyConvNetwork(config,width=28).to(device)
    models.append(model_hold)
    optimizers.append(torch.optim.Adam(params = model_hold.parameters(),lr=0.01))
    schedulers.append(torch.optim.lr_scheduler.StepLR(optimizers[-1], step_size=10, gamma=0.1))

model_losses_train = []
model_accuracy_train = []
model_losses_test = []
model_accuracy_test = []



tensor([[ 0.0204,  0.0118,  0.0175,  0.0179, -0.0210, -0.0068,  0.0113, -0.0140,
         -0.0102,  0.0129]], device='cuda:0', grad_fn=<AddmmBackward0>)

## I will try 2 approaches
1) Train all the models for the same epochs and compare the performance vs time
2) Train all the models till their performance flatlines and compare their performances. 

In [199]:
# models,optimizers, schedulers
current_model = 0
epochs = 100
train_loss_ls = []
train_acc_ls = []
test_loss_ls = []
test_acc_ls = []

for epoch in range(epochs):
    
    models[current_model].train()
    y_pred_train = models[current_model](X_train)
    y_pred_labels = y_pred_train.argmax(dim=1)
    correct = (y_pred_labels == y_train).sum().item()
    acc = 100 * correct / len(y_train)

    loss = loss_fn(y_pred_train,y_train)

    train_loss_ls.append(loss.item())
    train_acc_ls.append(acc)

    optimizers[current_model].zero_grad()
    loss.backward()
    optimizers[current_model].step()
    schedulers[current_model].step()

    models[current_model].eval()
    with torch.inference_mode():
        y_pred_test = models[current_model](X_test)
        loss = loss_fn(y_pred_test,y_test)
        y_pred_labels = y_pred_test.argmax(dim=1)
        correct = (y_pred_labels == y_test).sum().item()
        acc = 100 * correct / len(y_test)

        test_loss_ls.append(loss.item())
        test_acc_ls.append(acc)
    if epoch%5==0:
        print(f"{epoch} :: Train loss : {train_loss_ls[-1]} Train Acc : {train_acc_ls[-1]} -------------------------- Test loss : {test_loss_ls[-1]}  Test Acc : {test_acc_ls[-1]}")


    

OutOfMemoryError: CUDA out of memory. Tried to allocate 72.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.47 GiB is allocated by PyTorch, and 279.67 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [200]:
torch.cuda.memory_allocated()

11236888064

In [157]:
model_losses_train.append(train_loss_ls)
model_losses_test.append(test_loss_ls)
model_accuracy_train.append(train_acc_ls)
model_accuracy_test.append(test_acc_ls)

In [None]:
ls = model_losses_train
colors = ['r','g','b','yellow','orange']
y_label = 'Loss'
x_label = 'epochs'

plt.figure(figsize=(10,7))
plt.plot()