<a href="https://colab.research.google.com/github/mayanjabbaale/Model-Optimization-Using-Quantization/blob/main/Model_Quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import time
import warnings
import numpy as np
from packaging import version
import torch
import torchvision
import torch.nn as nn
from torchvision import datasets, transforms, models
from torch.quantization import quantize_dynamic
from torch.ao.quantization import get_default_qconfig, QConfigMapping
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
from torch.utils.data import DataLoader, Subset

warnings.filterwarnings("ignore", message=".*TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support.*")
warnings.filterwarnings("ignore", message=".*erase_node(.*) on an already erased node.*")

print(f"Using PyTorch version --> {torch.__version__}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Working on --> {device}')

skip_cpu = False
print(f'Skip CPU evaluations --> {skip_cpu}')

Using PyTorch version --> 2.9.0+cu126
Working on --> cuda
Skip CPU evaluations --> False


In [2]:
transform = transforms.Compose([
    transforms.Resize((32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), ((0.5,)))
])

train_data = datasets.CIFAR10("./train", train=True, transform=transform, download=True)
test_data = datasets.CIFAR10("./eval", train=False, download=True, transform=transform)

train_dataloader = DataLoader(train_data,
                              batch_size=128,
                              shuffle=True)

test_dataloader = DataLoader(test_data,
                              batch_size=128,
                              shuffle=True,
                              num_workers=2,
                              drop_last=True)

calibration_dataset = Subset(train_data, range(256))
calibration_loader = DataLoader(calibration_dataset, batch_size=128, shuffle=False)

100%|██████████| 170M/170M [00:05<00:00, 28.6MB/s]
100%|██████████| 170M/170M [00:08<00:00, 20.5MB/s]


In [3]:
def resnet18_for_CIFAR10():
  model = models.resnet18(weights=None, num_classes=10)
  model.conv1 = nn.Conv2d(3, 64, 3, 1, 1, bias=False)
  model.maxpool = nn.Identity()

  return model.to(device)

model_to_quantize = resnet18_for_CIFAR10()

In [11]:
def train(model, loader, epochs, lr=0.01, save_path="model.pth", silent=False):
  try:
    model.train()
  except NotImplementedError:
    torch.ao.quantization.move_exported_model_to_train(model)

  if os.path.exists(save_path):
    if not silent:
      print(f'Model already trained. Loading from --> {save_path}')
    model.load_state_dict(torch.load(save_path))
    return

  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.SGD(params=model.parameters(), lr=lr, momentum=0.9)

  for epoch in range(epochs):
    for X, y in loader:
      X, y = X.to(device), y.to(device)

      logits = model(X)
      loss = criterion(logits, y)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    if not silent:
      print(f'Epoch {epoch+1} Loss={loss:.4f}')
      evaluate(model, f'Epoch {epoch+1}')

      try:
        model.train()
      except NotImplementedError:
        torch.ao.quantization.move_exported_model_to_train(model)

  if save_path:
    torch.save(model.state_dict(), save_path)
    if not silent:
      print(f'Training complete, model saved to --> {save_path}')

def evaluate(model, tag):
  try:
    model.eval()
  except NotImplementedError:
    model = torch.ao.quantization.move_exported_model_to_eval(model)

  model.to(device)
  correct = total = 0

  with torch.no_grad():
    for x, y in test_dataloader:
      x, y = x.to(device), y.to(device)
      preds = model(x).argmax(1)
      correct += (preds == y).sum().item()
      total += y.size(0)
    accuracy = correct / total
    print(f'Accuracy ({tag}): {(accuracy*100):.2f}%')

In [5]:
class Timer:
    """
    A simple timer utility for measuring elapsed time in milliseconds.

    Supports both GPU and CPU timing:
    - If CUDA is available, uses torch.cuda.Event for accurate GPU timing.
    - Otherwise, falls back to wall-clock CPU timing via time.time().

    Methods:
        start(): Start the timer.
        stop(): Stop the timer and return the elapsed time in milliseconds.
    """

    def __init__(self):
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.starter = torch.cuda.Event(enable_timing=True)
            self.ender = torch.cuda.Event(enable_timing=True)

    def start(self):
        if self.use_cuda:
            self.starter.record()
        else:
            self.start_time = time.time()

    def stop(self):
        if self.use_cuda:
            self.ender.record()
            torch.cuda.synchronize()
            return self.starter.elapsed_time(self.ender)  # ms
        else:
            return (time.time() - self.start_time) * 1000  # ms

def estimate_latency(model, example_inputs, repetitions=50):
    """
    Returns avg and std inference latency (ms) over given runs.
    """

    timer = Timer()
    timings = np.zeros((repetitions, 1))

    # warm-up
    for _ in range(5):
        _ = model(example_inputs)

    with torch.no_grad():
        for rep in range(repetitions):
            timer.start()
            _ = model(example_inputs)
            elapsed = timer.stop()
            timings[rep] = elapsed

    return np.mean(timings), np.std(timings)

def estimate_latency_full(model, tag, skip_cpu):
    """
    Prints model latency on GPU and (optionally) CPU.
    """

    # estimate latency on CPU
    if not skip_cpu:
        example_input = torch.rand(128, 3, 32, 32).cpu()
        model.cpu()
        latency_mu, latency_std = estimate_latency(model, example_input)
        print(f"Latency ({tag}, on CPU): {latency_mu:.2f} ± {latency_std:.2f} ms")

    # estimate latency on GPU
    example_input = torch.rand(128, 3, 32, 32).cuda()
    model.cuda()
    latency_mu, latency_std = estimate_latency(model, example_input)
    print(f"Latency ({tag}, on GPU): {latency_mu:.2f} ± {latency_std:.2f} ms")

def print_size_of_model(model, tag=""):
    """
    Prints model size (MB).
    """

    torch.save(model.state_dict(), "temp.p")
    size_mb_full = os.path.getsize("temp.p") / 1e6
    print(f"Size ({tag}): {size_mb_full:.2f} MB")
    os.remove("temp.p")

In [12]:
train(model_to_quantize, train_dataloader, epochs=15, save_path="full_model.pth")


Epoch 1 Loss=0.0003
Accuracy (Epoch 1): 80.13%
Epoch 2 Loss=0.0001
Accuracy (Epoch 2): 80.17%
Epoch 3 Loss=0.0003
Accuracy (Epoch 3): 80.18%
Epoch 4 Loss=0.0001
Accuracy (Epoch 4): 80.22%
Epoch 5 Loss=0.0001
Accuracy (Epoch 5): 80.36%
Epoch 6 Loss=0.0003
Accuracy (Epoch 6): 80.37%
Epoch 7 Loss=0.0001
Accuracy (Epoch 7): 80.34%
Epoch 8 Loss=0.0022
Accuracy (Epoch 8): 80.15%
Epoch 9 Loss=0.0002
Accuracy (Epoch 9): 80.21%
Epoch 10 Loss=0.0004
Accuracy (Epoch 10): 80.37%
Epoch 11 Loss=0.0001
Accuracy (Epoch 11): 80.25%
Epoch 12 Loss=0.0001
Accuracy (Epoch 12): 80.33%
Epoch 13 Loss=0.0001
Accuracy (Epoch 13): 80.24%
Epoch 14 Loss=0.0001
Accuracy (Epoch 14): 80.34%
Epoch 15 Loss=0.0001
Accuracy (Epoch 15): 80.32%
Training complete, model saved to --> full_model.pth


In [13]:
# get full model size
print_size_of_model(model_to_quantize, "full")

# evaluate full accuracy
accuracy_full = evaluate(model_to_quantize, 'full')

# estimate full model latency
estimate_latency_full(model_to_quantize, 'full', skip_cpu)


Size (full): 44.77 MB
Accuracy (full): 80.31%
Latency (full, on CPU): 1513.80 ± 193.57 ms
Latency (full, on GPU): 27.41 ± 0.42 ms
