### **VGG16 Training on CIFAR-100 (SGD Optimizer)**

In [7]:
# Import libraries
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
import torch.nn.functional as F

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [8]:
# Number of samples per mini-batch
batch_size = 64


### **Dataset Preparation**
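Load the CIFAR-100 training and test splits and apply the same preprocessing to both: resize each image to 224×224, convert it to a tensor, and normalize each channel.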


In [9]:
# Per-channel (R, G, B) mean and standard deviation used for normalization
# (presumably CIFAR-100 training-set statistics -- TODO confirm the source).
cifar100_mean = (0.5071, 0.4867, 0.4408)
cifar100_std = (0.2675, 0.2565, 0.2761)

# Preprocessing pipeline, applied identically to the train and test splits:
#   1. resize every image to 224x224,
#   2. convert it to a tensor,
#   3. normalize each channel with the statistics above.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # Use the public `transforms.Normalize`; the previous
    # `transforms.transforms.Normalize` only worked by reaching through the
    # package's internal `transforms` submodule.
    transforms.Normalize(mean=cifar100_mean, std=cifar100_std),
])

# Training split: downloaded into ./data if missing, reshuffled every epoch.
train_dataset = datasets.CIFAR100(root="./data", train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Test split: same preprocessing, iterated in a fixed order.
test_dataset = datasets.CIFAR100(root="./data", train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### **Model Architecture**
A VGG16 architecture with Batch Normalization after every convolution, which helps accelerate convergence.

In [10]:
class VGG16(nn.Module):
    """VGG16-style convolutional network with BatchNorm after every convolution.

    The feature extractor has five stages of 3x3 convolutions
    (2, 2, 3, 3 and 3 layers with 64, 128, 256, 512 and 512 output channels),
    each stage ending in a 2x2 max-pool. The head adaptively average-pools the
    feature map to 7x7, flattens it, and applies three fully connected layers
    with ReLU and dropout between them.

    Args:
        num_classes: Number of outputs of the final linear layer (default 100).
    """

    @staticmethod
    def _conv_bn(in_channels, out_channels):
        """Create a 3x3, stride-1, padding-1 convolution and its BatchNorm2d."""
        conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        norm = nn.BatchNorm2d(out_channels)
        return conv, norm

    def __init__(self, num_classes=100):
        super().__init__()

        # Stage 1: 3 -> 64 -> 64
        self.C1, self.B1 = self._conv_bn(3, 64)
        self.C2, self.B2 = self._conv_bn(64, 64)
        self.S3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 2: 64 -> 128 -> 128
        self.C4, self.B4 = self._conv_bn(64, 128)
        self.C5, self.B5 = self._conv_bn(128, 128)
        self.S6 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 3: 128 -> 256 -> 256 -> 256
        self.C7, self.B7 = self._conv_bn(128, 256)
        self.C8, self.B8 = self._conv_bn(256, 256)
        self.C9, self.B9 = self._conv_bn(256, 256)
        self.S10 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 4: 256 -> 512 -> 512 -> 512
        self.C11, self.B11 = self._conv_bn(256, 512)
        self.C12, self.B12 = self._conv_bn(512, 512)
        self.C13, self.B13 = self._conv_bn(512, 512)
        self.S14 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 5: 512 -> 512 -> 512 -> 512
        self.C15, self.B15 = self._conv_bn(512, 512)
        self.C16, self.B16 = self._conv_bn(512, 512)
        self.C17, self.B17 = self._conv_bn(512, 512)
        self.S18 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Classifier head
        self.pool7 = nn.AdaptiveAvgPool2d(output_size=(7, 7))
        self.drop = nn.Dropout(p=0.5)
        self.F1 = nn.Linear(in_features=512 * 7 * 7, out_features=4096)
        self.F2 = nn.Linear(in_features=4096, out_features=4096)
        self.F3 = nn.Linear(in_features=4096, out_features=num_classes)

    def forward(self, x):
        """Compute class logits of shape (batch, num_classes) for a batch of 3-channel images."""
        # Each entry pairs a stage's (conv, batch-norm) layers with its closing max-pool.
        stages = (
            (((self.C1, self.B1), (self.C2, self.B2)), self.S3),
            (((self.C4, self.B4), (self.C5, self.B5)), self.S6),
            (((self.C7, self.B7), (self.C8, self.B8), (self.C9, self.B9)), self.S10),
            (((self.C11, self.B11), (self.C12, self.B12), (self.C13, self.B13)), self.S14),
            (((self.C15, self.B15), (self.C16, self.B16), (self.C17, self.B17)), self.S18),
        )

        for conv_norm_pairs, pool in stages:
            for conv, norm in conv_norm_pairs:
                x = F.relu(norm(conv(x)))
            x = pool(x)

        # Head: fixed 7x7 spatial size, flatten per sample, then the MLP classifier.
        x = self.pool7(x)
        x = x.view(x.shape[0], -1)
        hidden = self.drop(F.relu(self.F1(x)))
        hidden = self.drop(F.relu(self.F2(hidden)))
        return self.F3(hidden)


### **Training Setup**

The model is trained on the CIFAR-100 dataset using a VGG16 architecture with 100 output classes.  
**Stochastic Gradient Descent (SGD)** with momentum is used as the optimizer to provide stable updates and improved convergence behavior.

A learning rate of 0.005 is used, which is higher than the values typically chosen for adaptive optimizers such as Adam; momentum (0.9) smooths the parameter updates and accelerates training.


In [11]:
# Hyperparameters
num_classes = 100       # number of output classes for the classifier
num_epochs = 5          # full passes over the training set
learning_rate = 0.005   # SGD step size

# Network, moved to the selected device
model = VGG16(num_classes)
model = model.to(device)

# Classification loss
criterion = nn.CrossEntropyLoss()

# SGD with momentum over all model parameters
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

# Number of mini-batches per training epoch
total_step = len(train_loader)

### **Training Loop**

The model is trained epoch by epoch and evaluated on the test set after each epoch.


In [12]:
# Train for `num_epochs` epochs; after each epoch evaluate on the test set and
# print the average loss and accuracy of both splits.
for epoch in range(num_epochs):
    # ---- Train ----
    model.train()  # enable dropout and batch-statistics BatchNorm
    running_loss = 0.0  # sum of per-sample losses over the epoch
    correct_train = 0
    total_train = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_count = labels.size(0)
        # `criterion` is assumed to return the mean loss over the batch (the
        # nn.CrossEntropyLoss default). Weight it by the batch size so that a
        # smaller final batch is not over-weighted in the epoch average.
        running_loss += loss.item() * batch_count

        # Predicted class = index of the largest logit
        _, predicted = torch.max(outputs, 1)
        correct_train += (predicted == labels).sum().item()
        total_train += batch_count

    train_acc = 100.0 * correct_train / total_train
    avg_train_loss = running_loss / total_train  # per-sample average

    # ---- Test ----
    model.eval()  # disable dropout, use BatchNorm running statistics
    test_loss = 0.0  # sum of per-sample losses over the test set
    correct_test = 0
    total_test = 0

    with torch.no_grad():  # no gradients needed for evaluation
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            batch_count = labels.size(0)
            test_loss += loss.item() * batch_count

            _, predicted = torch.max(outputs, 1)
            correct_test += (predicted == labels).sum().item()
            total_test += batch_count

    avg_test_loss = test_loss / total_test  # per-sample average
    test_acc = 100.0 * correct_test / total_test

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"Test  Loss: {avg_test_loss:.4f} | Test  Acc: {test_acc:.2f}%")

Epoch [1/5]
Train Loss: 4.0017 | Train Acc: 7.87%
Test  Loss: 3.5219 | Test  Acc: 16.68%
Epoch [2/5]
Train Loss: 3.3512 | Train Acc: 18.46%
Test  Loss: 2.9820 | Test  Acc: 26.11%
Epoch [3/5]
Train Loss: 2.9127 | Train Acc: 26.77%
Test  Loss: 2.6463 | Test  Acc: 32.38%
Epoch [4/5]
Train Loss: 2.5359 | Train Acc: 34.21%
Test  Loss: 2.4335 | Test  Acc: 37.32%
Epoch [5/5]
Train Loss: 2.2326 | Train Acc: 40.28%
Test  Loss: 2.1116 | Test  Acc: 43.60%
