# **CNN Comparison of Different Quantization Techniques**

In this workshop we will walk through how to implement various quantization modes to a CNN model.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torch.quantization as quant
from torch.utils.data import DataLoader
import time

torch.manual_seed(24)  # For reproducibility

<torch._C.Generator at 0x7c791c272ff0>

## **Measuring Inference Time**

Function that tests testing time for each quantization method

In [None]:
def measure_inference_time(model, dataloader, device='cpu'):
  model.to(device)
  model.eval()
  start_time=time.time()
  with torch.no_grad():
    for inputs, _ in dataloader:
      inputs = inputs.to(device)
      outputs = model(inputs)
    return time.time() - start_time

## **Measuring Test Accuracy**

Defining a function to test accuracy of each altered model

In [None]:
# test accuracy before and after quantization

def test_model(model):
  model.eval()
  correct = 0
  total = 0

  with torch.no_grad():
    for inputs, labels in testloader:
      inputs, labels = inputs.to(device), labels.to(device)
      outputs = model(inputs)
      _, predicted = torch.max(outputs, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()
  return correct / total

## **Original CNN Class**

In [None]:
# define CNN model
class CustomCNN(nn.Module):
  def __init__(self):
    super(CustomCNN, self).__init__()
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
    self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
    self.relu = nn.ReLU()
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    self.fc1 = nn.Linear(32 * 7 * 7, 10)

  def forward(self, x):
    x = self.pool(self.relu(self.conv1(x)))
    x = self.pool(self.relu(self.conv2(x)))
    x = torch.flatten(x, 1)
    x = self.fc1(x)
    return x

# **Loading MNIST Dataset**

In [None]:
# load MNIST dataset/dataloader
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,),(0.3081))])

trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
trainloader = DataLoader(trainset, batch_size=32, shuffle=False)

testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
testloader = DataLoader(testset, batch_size=32, shuffle=False)


100%|██████████| 9.91M/9.91M [00:00<00:00, 54.9MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 1.76MB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 14.6MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 5.83MB/s]


# **Preparing Original Model**

In [None]:
# initialize model, loss function, optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CustomCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# train
epochs = 10

print("Training model...")
for epoch in range(epochs):
  running_loss = 0.0
  for inputs, labels in trainloader:
    inputs, labels = inputs.to(device), labels.to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()

  print(f"Epoch {epoch+1}, Loss: {running_loss/len(trainloader):.4f}")

print("Training completed")
torch.save(model.state_dict(), "mnist_cnn_workshop.pth")

Training model...
Epoch 1, Loss: 0.1448
Epoch 2, Loss: 0.0534
Epoch 3, Loss: 0.0385
Epoch 4, Loss: 0.0291
Epoch 5, Loss: 0.0229
Epoch 6, Loss: 0.0180
Epoch 7, Loss: 0.0149
Epoch 8, Loss: 0.0118
Epoch 9, Loss: 0.0105
Epoch 10, Loss: 0.0090
Training completed


# **CNN with Post Training Quantization (PTQ)**
Post Training Quantization (PTQ) involves calibrating the model by passing batches of sample data through it to obtain activation distributions. These distributions are used to determine scaling factors for weights and activations.

In [None]:
# load original model weights

model = CustomCNN()
model.load_state_dict(torch.load("mnist_cnn_workshop.pth", map_location=torch.device("cpu")))
model.eval()

# apply dynamic PTQ
quantized_model_PTDQ = quant.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8
)
#time_PTDQ = measure_inference_time(quantized_model_PTDQ, testloader,device='cpu')

torch.save(quantized_model_PTDQ.state_dict(), "mnist_cnn_quantizedPTDQ_workshop.pth")
print("Dynamic Quantization completed")

Dynamic Quantization completed


In [None]:
# load original model weights
model = CustomCNN()
model.load_state_dict(torch.load("mnist_cnn_workshop.pth", map_location=torch.device("cpu")))
model.eval()

# prepare the model for static PTQ
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(model, inplace=True)

# calibrate the model with a few batches
num_calibration_batches = 10  # adjust as needed
for inputs, _ in iter(testloader): # use a subset of the test data for calibration
    if num_calibration_batches == 0:
      break
    model(inputs)
    num_calibration_batches -= 1

# convert the model to quantized form
quantized_model_PTSQ = torch.quantization.convert(model, inplace=True)

# Measure inference time of the statically quantized model
#time_PTSQ = measure_inference_time(quantized_model_PTSQ, testloader, device='cuda') # BACKEND SUPPORT MISSING

# Save the statically quantized model
torch.save(quantized_model_PTSQ.state_dict(), "mnist_cnn_quantizedPTSQ_workshop.pth")

print("Static Quantization completed")



Static Quantization completed


# **QAT**

In [None]:
class QAT_CNN(nn.Module):
  def __init__(self):
    super(QAT_CNN, self).__init__()
    self.quant = quant.QuantStub() # convert tensors from floating point to quantized
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
    self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
    self.relu = nn.ReLU()
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    self.fc1 = nn.Linear(32 * 7 * 7, 10)
    self.dequant = quant.DeQuantStub() # convert tensors from quantized to floating point

  def forward(self, x):
    x = self.quant(x) # convert input to INT8
    x = self.pool(self.relu(self.conv1(x)))
    x = self.pool(self.relu(self.conv2(x)))
    x = torch.flatten(x, 1)
    x = self.fc1(x)
    x = self.dequant(x) # convert back to FP32
    return x

In [None]:
# load original model weights
model = QAT_CNN()
model.qconfig = quant.get_default_qat_qconfig("fbgemm")
quant.prepare_qat(model, inplace=True)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

epochs = 10

print("Starting QAT Training...")

# train
for epoch in range(epochs):
  for inputs, labels in trainloader:
    inputs, labels = inputs.to(device), labels.to(device)
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
  print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")
print("QAT Training completed")

# save
model.eval()
quantized_model_QAT = quant.convert(model) # convert to INT8
torch.save(quantized_model_QAT.state_dict(), "mnist_cnn_quantizedQAT_workshop.pth")
print("QAT model saved")

Epoch 1: Loss = 0.0302
Epoch 2: Loss = 0.0039
Epoch 3: Loss = 0.0014
Epoch 4: Loss = 0.0006
Epoch 5: Loss = 0.0011
Epoch 6: Loss = 0.0000
Epoch 7: Loss = 0.0000
Epoch 8: Loss = 0.0001
Epoch 9: Loss = 0.0001
Epoch 10: Loss = 0.0001
QAT Training completed
QAT model saved


In [None]:
import os

# check model file sizes
original_size = os.path.getsize("mnist_cnn_workshop.pth") / 1024 # KB
quantized_size_PTSQ = os.path.getsize("mnist_cnn_quantizedPTSQ_workshop.pth") / 1024 # KB
quantized_size_PTDQ = os.path.getsize("mnist_cnn_quantizedPTDQ_workshop.pth") / 1024 # KB
quantized_size_QAT = os.path.getsize("mnist_cnn_quantizedQAT_workshop.pth") / 1024 # KB

print(f"Original model size: {original_size:.2f} KB")
print(f"Quantized model size (PTSQ): {quantized_size_PTSQ:.2f} KB")
print(f"Compression Ratio (PTSQ): {original_size / quantized_size_PTSQ:.2f}x")
print(f"Quantized model size (PTDQ): {quantized_size_PTDQ:.2f} KB")
print(f"Compression Ratio (PTDQ): {original_size / quantized_size_PTDQ:.2f}x")
print(f"Quantized model size (QAT): {quantized_size_QAT:.2f} KB")
print(f"Compression Ratio (QAT): {original_size / quantized_size_QAT:.2f}x")


Original model size: 82.74 KB
Quantized model size (PTSQ): 27.44 KB
Compression Ratio (PTSQ): 3.01x
Quantized model size (PTDQ): 37.86 KB
Compression Ratio (PTDQ): 2.19x
Quantized model size (QAT): 27.96 KB
Compression Ratio (QAT): 2.96x


In [None]:
# test accuracies before and after quantization
original_model = CustomCNN()
original_model.load_state_dict(torch.load("mnist_cnn_workshop.pth", map_location=torch.device("cpu")))
original_accuracy = test_model(model)

quantized_model_PTSQ = CustomCNN()
quantized_model_PTSQ.load_state_dict(torch.load("mnist_cnn_quantizedPTSQ_workshop.pth", map_location=torch.device("cpu")), strict=False)
quantized_model_PTSQ.eval()
quantized_model_PTSQ.to('cpu')

quantized_model_PTDQ = CustomCNN()
quantized_model_PTDQ.load_state_dict(torch.load("mnist_cnn_quantizedPTDQ_workshop.pth", map_location=torch.device("cpu")), strict=False)
quantized_model_PTDQ.eval()
quantized_model_PTDQ.to('cpu')

quantized_model_QAT = QAT_CNN()
quantized_model_QAT.load_state_dict(torch.load("mnist_cnn_quantizedQAT_workshop.pth", map_location=torch.device("cpu")))
quantized_model_QAT.eval()
quantized_model_QAT.to('cpu')


quantized_accuracy_PTSQ = test_model(quantized_model_PTSQ)

quantized_accuracy_PTDQ = test_model(quantized_model_PTDQ)

quantized_accuracy_QAT = test_model(quantized_model_QAT)

print(f"Original Model Accuracy: {original_accuracy * 100:.2f}%")
print(f"Quantized Model Accuracy (PTSQ): {quantized_accuracy_PTSQ * 100:.2f}%")
print(f"Quantized Model Accuracy (PTDQ): {quantized_accuracy_PTDQ * 100:.2f}%")
print(f"Quantized Model Accuracy (QAT): {quantized_accuracy_QAT * 100:.2f}%")

RuntimeError: Error(s) in loading state_dict for CustomCNN:
	While copying the parameter named "conv1.weight", whose dimensions in the model are torch.Size([16, 1, 3, 3]) and whose dimensions in the checkpoint are torch.Size([16, 1, 3, 3]), an exception occurred : ('Copying from quantized Tensor to non-quantized Tensor is not allowed, please use dequantize to get a float Tensor from a quantized Tensor',).
	While copying the parameter named "conv2.weight", whose dimensions in the model are torch.Size([32, 16, 3, 3]) and whose dimensions in the checkpoint are torch.Size([32, 16, 3, 3]), an exception occurred : ('Copying from quantized Tensor to non-quantized Tensor is not allowed, please use dequantize to get a float Tensor from a quantized Tensor',).

In [None]:
print(f"Dynamic Quantization Inference Time: {time_PTDQ:.4f} seconds")
print(f"Static Quantization Inference Time: {time_PTSQ:.4f} seconds")
print(f"Quantization Aware Training Inference Time: {time_QAT:.4f} seconds")