## VGG-11 Initial Training with CIFAR-100

In [24]:
import torch
from torch.utils.data import DataLoader, random_split
import torchvision
import torchvision.transforms as transforms
from torchvision import models
from torch.optim.lr_scheduler import CosineAnnealingLR
import random
import numpy as np
import os

In [25]:
def fix_random_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f"Fixed random seed: {seed}")

fix_random_seed(42)

# For deterministic DataLoader behavior
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

Fixed random seed: 42


In [26]:
vgg11 = models.vgg11(weights=models.VGG11_Weights.IMAGENET1K_V1)
vgg11.classifier[6] = torch.nn.Linear(4096, 100)
vgg11.classifier[5] = torch.nn.Dropout(p=0.5) # Dropout
device = "cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
vgg11 = vgg11.to(device)

base_lr = 0.04
weight_decay = 5e-4
num_epochs = 5
optim = torch.optim.SGD(
    vgg11.parameters(),
    lr=base_lr,
    momentum=0.9,
    weight_decay=weight_decay
)

scheduler = CosineAnnealingLR(optim, T_max=num_epochs, eta_min=1e-5)

criterion = torch.nn.CrossEntropyLoss()

In [27]:
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]
BATCH_SIZE = 128
NUM_WORKERS = 4

g = torch.Generator()
g.manual_seed(42)

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.9, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(imagenet_mean, imagenet_std),
])

test_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(imagenet_mean, imagenet_std),
])

full_train = torchvision.datasets.CIFAR100(root='./data', train=True, download=True)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True)
train_subset, val_subset = random_split(full_train, [45000, 5000], generator=g)

train_subset.dataset.transform = train_transform
val_subset.dataset.transform = test_transform
test_dataset.transform = test_transform

print("Training set size:", len(train_subset))
print("Validation set size:", len(val_subset))
print("Test set size:", len(test_dataset))

def get_loader(dataset, shuffle):
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=NUM_WORKERS,
        pin_memory=True,
        worker_init_fn=seed_worker,
        generator=g
    )

train_loader = get_loader(train_subset, shuffle=True)
val_loader = get_loader(val_subset, shuffle=False)
test_loader = get_loader(test_dataset, shuffle=False)

Training set size: 45000
Validation set size: 5000
Test set size: 10000


In [28]:
def train_epoch(model, loader, criterion, optimizer, device):
    model.to(device)
    model.train()
    running_loss, correct, total = 0.0, 0, 0

    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    avg_loss = running_loss / total
    accuracy = 100. * correct / total
    return avg_loss, accuracy

In [29]:
def evaluate(model, test_loader, device):
    model.to(device)
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            preds = model(images)
            preds = torch.argmax(preds, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total * 100
    return accuracy

Results from 6epoch run. Our baseline model is 5epoch run. Can be reproduced by simply running all the cells. 

In [30]:
best_val = 0
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(vgg11, train_loader, criterion, optim, device)
    scheduler.step()
    val_acc = evaluate(vgg11, val_loader, device)
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | Val Acc: {val_acc:.2f}%")
    if val_acc > best_val:
      best_val = val_acc
      torch.save(vgg11.state_dict(), "vgg11_cifar100_baseline_6e.pt")

Epoch 1/6 | Loss: 2.7691 | Train Acc: 30.47% | Val Acc: 47.24%
Epoch 2/6 | Loss: 1.7645 | Train Acc: 51.67% | Val Acc: 56.52%
Epoch 3/6 | Loss: 1.1977 | Train Acc: 65.60% | Val Acc: 61.12%
Epoch 4/6 | Loss: 0.6544 | Train Acc: 79.94% | Val Acc: 65.66%
Epoch 5/6 | Loss: 0.2376 | Train Acc: 92.42% | Val Acc: 69.26%
Epoch 6/6 | Loss: 0.0832 | Train Acc: 97.36% | Val Acc: 70.92%


In [31]:
acc = evaluate(vgg11, test_loader, device)
print(f"Accuracy: {acc:.2f}%")

Accuracy: 70.85%
