# ResNet10 with CIFAR10 dataset - Study Optimization



In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

try:
    from torchsummary import summary
except ModuleNotFoundError:
    !pip install torchsummary
    from torchsummary import summary

from torchvision import datasets, transforms
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import torchvision

import time

In [2]:
train_transforms = transforms.Compose([
                                       transforms.RandomAffine(degrees=10, shear = 10),
                                       transforms.ToTensor(),
                                       transforms.Normalize((0.1307,), (0.3081,))
                                       ])

# Test Phase transformations
test_transforms = transforms.Compose([
                                      #  transforms.Resize((28, 28)),
                                      #  transforms.ColorJitter(brightness=0.10, contrast=0.1, saturation=0.10, hue=0.1),
                                       transforms.ToTensor(),
                                       transforms.Normalize((0.1307,), (0.3081,))
                                       ])

train = datasets.CIFAR10(root = './data', train=True, download=True, transform=train_transforms)
test = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transforms)

# Do we have CUDA drivers for us?
cuda = torch.cuda.is_available()
print ("Cuda Available?", cuda)

dataloader_args = dict(shuffle=True, batch_size=2048, num_workers=2, pin_memory=True) if cuda else dict(shuffle=True, batch_size=64)

# Dataloaders
train_loader = torch.utils.data.DataLoader(dataset=train, **dataloader_args)
test_loader = torch.utils.data.DataLoader(dataset=test, **dataloader_args)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:05<00:00, 29.8MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Cuda Available? True


In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv01 = nn.Conv2d(3, 16, 3, bias=False, padding=1)
        self.batch01 = nn.BatchNorm2d(num_features=16)

        # ---- Lets take a skip connection
        self.skip_conv1 = nn.Conv2d(16, 16, 3, padding=0, dilation=2)

        self.conv02 = nn.Conv2d(16, 16, 3, bias=False,padding=1)
        self.batch02 = nn.BatchNorm2d(num_features=16)
        self.conv03 = nn.Conv2d(16, 16, 3, bias=False,padding=1)
        self.batch03 = nn.BatchNorm2d(num_features=16)
        self.conv04 = nn.Conv2d(16, 16, 3, bias=False,padding=1)
        self.batch04 = nn.BatchNorm2d(num_features=16)
        self.pool01 = nn.MaxPool2d(2, 2)                                #O=16
        self.conv05 = nn.Conv2d(16, 16, 1, bias=False)

        self.conv11 = nn.Conv2d(16, 32, 3, bias=False, padding=1)
        self.batch11 = nn.BatchNorm2d(num_features=32)
        self.conv12 = nn.Conv2d(32, 32, 3, bias=False, padding=1)
        self.batch12 = nn.BatchNorm2d(num_features=32)
        self.conv13 = nn.Conv2d(32, 32, 3, bias=False, padding=1)
        self.batch13 = nn.BatchNorm2d(num_features=32)
        self.conv14 = nn.Conv2d(32, 32, 3, bias=False, padding=1)
        self.batch14 = nn.BatchNorm2d(num_features=32)
        self.pool11 = nn.MaxPool2d(2, 2)                                #O=8
        self.conv15 = nn.Conv2d(32, 32, 1, bias=False)

        self.conv21 = nn.Conv2d(32, 64, 3, bias=False, padding=1)
        self.batch21 = nn.BatchNorm2d(num_features=64)
        self.conv22 = nn.Conv2d(64, 64, 3, bias=False, padding=1)
        self.batch22 = nn.BatchNorm2d(num_features=64)
        self.conv23 = nn.Conv2d(64, 64, 3, bias=False, padding=1)
        self.batch23 = nn.BatchNorm2d(num_features=64)
        self.conv24 = nn.Conv2d(64, 64, 3, bias=False, padding=1)
        self.batch24 = nn.BatchNorm2d(num_features=64)
        self.pool21 = nn.MaxPool2d(2, 2)                                #O=4
        self.conv25 = nn.Conv2d(64, 64, 1, bias=False)

        self.conv31 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, groups=64, bias = False, padding = 1)
        self.convPV1= nn.Conv2d(in_channels=64, out_channels=128, kernel_size=1, bias = False, padding = 0)
        self.batch31 = nn.BatchNorm2d(num_features=128)
        self.conv32 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, groups=128, bias = False, padding = 1)
        self.convPV2= nn.Conv2d(in_channels=128, out_channels=256, kernel_size=1, bias = False, padding = 0)
        self.batch32 = nn.BatchNorm2d(num_features=256)


        self.avg_pool = nn.AvgPool2d(kernel_size=4)
        self.convx3 = nn.Conv2d(256, 10, 1, bias=False, padding=0)

    def forward(self, x):
        x = self.batch01(F.relu(self.conv01(x)))

        # ---- Lets take a skip connection
        skip_channels = self.skip_conv1(self.skip_conv1(self.skip_conv1(self.skip_conv1(x))))

        x = self.batch02(F.relu(self.conv02(x)))
        x = self.batch03(F.relu(self.conv03(x)))
        x = self.batch04(F.relu(self.conv04(x)))
        x = self.pool01(x)
        x = self.conv05(x)
        # ----------------------------------------------------------

        # ---- Lets add the skip connection here
        x = skip_channels + x

        x = self.batch11(F.relu(self.conv11(x)))
        x = self.batch12(F.relu(self.conv12(x)))
        x = self.batch13(F.relu(self.conv13(x)))
        x = self.batch14(F.relu(self.conv14(x)))
        x = self.pool11(x)
        x = self.conv15(x)
        # ----------------------------------------------------------

        x = self.batch21(F.relu(self.conv21(x)))
        x = self.batch22(F.relu(self.conv22(x)))
        x = self.batch23(F.relu(self.conv23(x)))
        x = self.batch24(F.relu(self.conv24(x)))
        x = self.pool21(x)
        x = self.conv25(x)
        # ----------------------------------------------------------

        x = self.batch31(F.relu(self.convPV1(F.relu(self.conv31(x)))))
        x = self.batch32(F.relu(self.convPV2(F.relu(self.conv32(x)))))


        x = self.avg_pool(x)
        x = self.convx3(x)
        x = x.view(-1, 10)                           # Don't want 10x1x1..
        return F.log_softmax(x, dim=1)  # Added dim=1 parameter)

In [4]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = Net().to(device)
summary(model, input_size=(3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 32, 32]             432
       BatchNorm2d-2           [-1, 16, 32, 32]              32
            Conv2d-3           [-1, 16, 28, 28]           2,320
            Conv2d-4           [-1, 16, 24, 24]           2,320
            Conv2d-5           [-1, 16, 20, 20]           2,320
            Conv2d-6           [-1, 16, 16, 16]           2,320
            Conv2d-7           [-1, 16, 32, 32]           2,304
       BatchNorm2d-8           [-1, 16, 32, 32]              32
            Conv2d-9           [-1, 16, 32, 32]           2,304
      BatchNorm2d-10           [-1, 16, 32, 32]              32
           Conv2d-11           [-1, 16, 32, 32]           2,304
      BatchNorm2d-12           [-1, 16, 32, 32]              32
        MaxPool2d-13           [-1, 16, 16, 16]               0
           Conv2d-14           [-1, 16,

In [5]:
from tqdm import tqdm

train_losses = []
test_losses = []
train_acc = []
test_acc = []
time_taken = []

def train(model, device, train_loader, optimizer, epoch):
    model.train()
    pbar = tqdm(train_loader)

    correct = 0
    processed = 0
    time_taken.clear()

    for batch_idx, (data, target) in enumerate (pbar):
        t0 = time.time()

        data, target = data.to(device), target.to(device)

        #Don't want history of gradients
        optimizer.zero_grad()

        y_predict = model(data)

        # Caluclate loss
        loss = F.nll_loss(y_predict, target)
        train_losses.append(loss)

        # Back propogate error
        loss.backward()

        # Take a optimzer step
        optimizer.step()

        torch.cuda.synchronize()
        t1 = time.time()

        time_taken.append((t1-t0))

        pred = y_predict.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()
        processed += len(data)

        # print(f'Loss={loss.item()} Batch_id={batch_idx} Accuracy={100*correct/processed:0.2f} Time taken per iter = {dt :.2f}ms')
        pbar.set_description(desc= f'Loss={loss.item()} Batch_id={batch_idx} Accuracy={100*correct/processed:0.2f}')
        train_acc.append(100*correct/processed)


def test (model, device, test_loader):
    model.eval()

    test_loss = 0
    correct = 0

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)

            output = model(data)

            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    test_losses.append(test_loss)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

    test_acc.append(100. * correct / len(test_loader.dataset))


model =  Net().to(device)
criteria = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum=0.9)

EPOCHS = 5
for epoch in range(EPOCHS):
    train(model, device, train_loader, optimizer, epoch)
    print(f" --> EPOCH: {epoch}, Avg Time Taken = {(sum(time_taken)/len(time_taken))*1000:.2f}ms")
    # scheduler.step()
    test(model, device, test_loader)

Loss=1.806193470954895 Batch_id=24 Accuracy=21.56: 100%|██████████| 25/25 [00:25<00:00,  1.02s/it]

 --> EPOCH: 0, Avg Time Taken = 337.22ms






Test set: Average loss: 2.3172, Accuracy: 1002/10000 (10.02%)



Loss=1.5318331718444824 Batch_id=24 Accuracy=37.21: 100%|██████████| 25/25 [00:17<00:00,  1.43it/s]

 --> EPOCH: 1, Avg Time Taken = 310.68ms






Test set: Average loss: 1.6515, Accuracy: 3741/10000 (37.41%)



Loss=1.4526479244232178 Batch_id=24 Accuracy=45.36: 100%|██████████| 25/25 [00:17<00:00,  1.41it/s]

 --> EPOCH: 2, Avg Time Taken = 310.49ms






Test set: Average loss: 1.4293, Accuracy: 4796/10000 (47.96%)



Loss=1.2338448762893677 Batch_id=24 Accuracy=51.04: 100%|██████████| 25/25 [00:17<00:00,  1.42it/s]

 --> EPOCH: 3, Avg Time Taken = 311.87ms






Test set: Average loss: 1.2852, Accuracy: 5269/10000 (52.69%)



Loss=1.181266188621521 Batch_id=24 Accuracy=55.43: 100%|██████████| 25/25 [00:17<00:00,  1.40it/s]

 --> EPOCH: 4, Avg Time Taken = 313.12ms






Test set: Average loss: 1.2248, Accuracy: 5603/10000 (56.03%)



### Check if BF16 is available or not. If not available, progress with FP16

In [6]:
import torch

def check_bf16_support():
    if torch.cuda.is_available():
        compute_capability = torch.cuda.get_device_capability()
        # Ampere (8.x) and newer GPUs support BF16
        return compute_capability[0] >= 8
    return False

check_bf16_support()

False

In [7]:
from torch.amp import autocast, GradScaler  # Updated import
torch.backends.cudnn.benchmark = True
n_epochs = 5

# 1. Setup model and optimizer with FP16 support
def setup_fp16_model():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = torch.compile(Net().to(device))
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.1, steps_per_epoch=len(train_loader), epochs=n_epochs)
    # Updated GradScaler initialization
    scaler = GradScaler('cuda')

    return model, optimizer, scaler, device

# 2. Modified training loop
def train(model, device, train_loader, optimizer, scaler, epoch):
    model.train()
    pbar = tqdm(train_loader)

    correct = 0
    processed = 0
    time_taken.clear()

    for batch_idx, (data, target) in enumerate(pbar):
        t0 = time.time()

        # Convert data to FP16 before moving to GPU
        data = data.half()  # Convert to FP16

        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()

        # Updated autocast
        with autocast('cuda', dtype=torch.float16):
            y_predict = model(data)
            loss = F.nll_loss(y_predict, target)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # scheduler.step()

        torch.cuda.synchronize()
        t1 = time.time()

        time_taken.append((t1-t0))

        pred = y_predict.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        processed += len(data)

        pbar.set_description(desc=f'Loss={loss.item()} Batch_idx={batch_idx} Accuracy={100*correct/processed:0.2f}')
        train_acc.append(100*correct/processed)

# 3. Modified test loop
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        for data, target in test_loader:

            # Convert data to FP16 before moving to GPU
            data = data.half()  # Convert to FP16

            data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True)

            # Updated autocast
            with autocast('cuda'):
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction='sum').item()

            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    test_losses.append(test_loss)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

    test_acc.append(100. * correct / len(test_loader.dataset))

# 4. Training setup and execution
model, optimizer, scaler, device = setup_fp16_model()

# Your training loop
for epoch in range(1, n_epochs + 1):
    # print("EPOCH:", epoch)
    train(model, device, train_loader, optimizer, scaler, epoch)
    print(f" --> EPOCH: {epoch}, Avg Time Taken = {(sum(time_taken)/len(time_taken))*1000:.2f}ms")
    test(model, device, test_loader)

Loss=1.9276012182235718 Batch_idx=24 Accuracy=18.97: 100%|██████████| 25/25 [01:54<00:00,  4.59s/it]

 --> EPOCH: 1, Avg Time Taken = 4122.79ms






Test set: Average loss: 2.3071, Accuracy: 1000/10000 (10.00%)



Loss=1.5672558546066284 Batch_idx=24 Accuracy=35.26: 100%|██████████| 25/25 [00:22<00:00,  1.13it/s]

 --> EPOCH: 2, Avg Time Taken = 191.65ms






Test set: Average loss: 1.6947, Accuracy: 3696/10000 (36.96%)



Loss=1.38279390335083 Batch_idx=24 Accuracy=44.84: 100%|██████████| 25/25 [00:17<00:00,  1.44it/s]

 --> EPOCH: 3, Avg Time Taken = 124.75ms






Test set: Average loss: 1.4096, Accuracy: 4870/10000 (48.70%)



Loss=1.3067739009857178 Batch_idx=24 Accuracy=50.84: 100%|██████████| 25/25 [00:16<00:00,  1.52it/s]

 --> EPOCH: 4, Avg Time Taken = 124.67ms






Test set: Average loss: 1.3127, Accuracy: 5226/10000 (52.26%)



Loss=1.1495048999786377 Batch_idx=24 Accuracy=54.93: 100%|██████████| 25/25 [00:16<00:00,  1.51it/s]

 --> EPOCH: 5, Avg Time Taken = 123.98ms






Test set: Average loss: 1.2624, Accuracy: 5433/10000 (54.33%)



#### Code for BFP16 support

In [None]:
# 1. Setup model and optimizer with BF16 support
# def setup_bf16_model(model):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#     # Enable BF16 autocast
#     torch.set_float32_matmul_precision('medium')

#     # Convert model to BF16
#     model = Net().to(device).to(torch.bfloat16)
#     model = torch.compile(model)

#     optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

#     return model, optimizer, device

# # 2. Modified training loop for BF16
# def train(model, device, train_loader, optimizer, epoch):
#     model.train()
#     pbar = tqdm(train_loader)

#     correct = 0
#     processed = 0
#     time_taken.clear()

#     for batch_idx, (data, target) in enumerate(pbar):
#         t0 = time.time()

#         # Convert input to BF16
#         data = data.to(torch.bfloat16)
#         data, target = data.to(device), target.to(device)

#         optimizer.zero_grad(set_to_none=True)

#         # Use automatic mixed precision with BF16
#         with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
#             output = model(data)
#             loss = F.nll_loss(output, target)

#         loss.backward()
#         optimizer.step()

#         torch.cuda.synchronize()
#         t1 = time.time()

#         time_taken.append((t1-t0))

#         pred = output.argmax(dim=1, keepdim=True)
#         correct += pred.eq(target.view_as(pred)).sum().item()
#         processed += len(data)

#         pbar.set_description(
#             desc=f'Loss={loss.item():.4f} Batch_id={batch_idx} Accuracy={100*correct/processed:0.2f}'
#         )
#         train_acc.append(100*correct/processed)

# # 3. Modified test loop for BF16
# def test(model, device, test_loader):
#     model.eval()
#     test_loss = 0
#     correct = 0

#     with torch.no_grad():
#         for data, target in test_loader:
#             # Convert input to BF16
#             data = data.to(torch.bfloat16)
#             data, target = data.to(device), target.to(device)

#             with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
#                 output = model(data)
#                 test_loss += F.nll_loss(output, target, reduction='sum').item()

#             pred = output.argmax(dim=1, keepdim=True)
#             correct += pred.eq(target.view_as(pred)).sum().item()

#     test_loss /= len(test_loader.dataset)
#     test_losses.append(test_loss)

#     print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
#         test_loss, correct, len(test_loader.dataset),
#         100. * correct / len(test_loader.dataset)))

#     test_acc.append(100. * correct / len(test_loader.dataset))

# # 4. Training setup and execution
# model, optimizer, device = setup_bf16_model(model)

# # Training loop
# for epoch in range(1, n_epochs + 1):
#     train(model, device, train_loader, optimizer, epoch)
#     print(f" --> EPOCH: {epoch}, Avg Time Taken = {(sum(time_taken)/len(time_taken))*1000:.2f}ms")
#     test(model, device, test_loader)