In [None]:
# Fetch the project repository; later cells import `models`, `utils` and `conf` from it.
!git clone https://github.com/nxquang-al/pytorch-cifar100.git

Cloning into 'pytorch-cifar100'...
remote: Enumerating objects: 1037, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 1037 (delta 0), reused 1 (delta 0), pack-reused 1033[K
Receiving objects: 100% (1037/1037), 498.95 KiB | 5.20 MiB/s, done.
Resolving deltas: 100% (649/649), done.


In [None]:
# Work from the repository root so relative paths such as ./checkpoints and ./data resolve inside it.
%cd pytorch-cifar100

/content/pytorch-cifar100


In [None]:
!mkdir checkpoints

In [None]:
# # Train the baseline model
# !python train.py -net resnet34 -gpu

# Load model

In [None]:
!pip install gdown

In [None]:
import gdown

# Google Drive file id of the ResNet-34 baseline checkpoint.
# Named explicitly so the builtin `id` is not shadowed for the rest of the notebook.
baseline_ckpt_id = "1b3Xls00FQXokbYnhKEFm7A2QWFg04_Q1"
output = "./checkpoints/resnet34-baseline.pth"
gdown.download(id=baseline_ckpt_id, output=output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1b3Xls00FQXokbYnhKEFm7A2QWFg04_Q1
To: /content/pytorch-cifar100/checkpoints/resnet34-baseline.pth
100%|██████████| 85.5M/85.5M [00:05<00:00, 16.8MB/s]


'./checkpoints/resnet34-baseline.pth'

In [None]:
# Download the TorchScript static-quantized int8 model.
# A later cell loads ./checkpoints/resnet34-static-int8-jit.pt and nothing else in this
# notebook creates that file, so the download must run on a fresh kernel.
import os

static_jit_id = "1UGl3aJRzqMaN-IcUpCoOtpKHSV7HE-KN"
static_jit_path = "./checkpoints/resnet34-static-int8-jit.pt"
if not os.path.exists(static_jit_path):
    # Skip the network round-trip when the file is already present (cell re-run).
    gdown.download(id=static_jit_id, output=static_jit_path, quiet=False)

In [None]:
import os
import sys

# Make the cloned repository importable. The earlier `%cd pytorch-cifar100` made it the
# working directory, so use that instead of a hard-coded Colab path (/content/...).
repo_root = os.getcwd()
if repo_root not in sys.path:
    sys.path.append(repo_root)

In [None]:
import torch
import copy
import os
import copy
import time
from models.resnet import resnet34

In [None]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

# Device handles shared by the evaluation, measurement and quantization cells below.
cuda_device = torch.device("cuda:0")
cpu_device = torch.device("cpu:0")

True


In [None]:
# Build the ResNet-34 architecture, restore the downloaded baseline weights,
# then move the model to the GPU (the module repr is shown as the cell output).
baseline = resnet34()
baseline.load_state_dict(torch.load("./checkpoints/resnet34-baseline.pth"))
baseline.to(cuda_device)

ResNet(
  (conv1): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (conv2_x): Sequential(
    (0): BasicBlock(
      (residual_function): Sequential(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (shortcut): Sequential()
      (add_relu): FloatFunctional(
        (activation_post_process): Identity()
      )
    )
    (1): BasicBlock(
      (residual_function): Sequential(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias

In [None]:
# Load the pre-built TorchScript static-quantized int8 model onto the CPU.
# The file ./checkpoints/resnet34-static-int8-jit.pt must already exist at this point.
static_int8_jit = torch.jit.load(
    "./checkpoints/resnet34-static-int8-jit.pt",
    map_location=cpu_device
    )

In [None]:
# Sanity check: the baseline's parameters live on the CUDA device.
assert next(baseline.parameters()).device == cuda_device

# CIFAR-100 Dataloader Preparation

In [None]:
from utils import get_test_dataloader, get_training_dataloader
from conf import settings

# Test split. batch_size=1 means each batch is one image, which is what the
# per-sample timing utilities below rely on.
test_loader = get_test_dataloader(
        settings.CIFAR100_TRAIN_MEAN,
        settings.CIFAR100_TRAIN_STD,
        num_workers=4,
        batch_size=1,
        shuffle=True
    )

# Training split. The static-quantization calibration and the QAT cells reference
# `train_loader`, which was previously never defined anywhere in the notebook.
# NOTE(review): assumes the repo's utils.py exposes `get_training_dataloader` with the
# same signature as `get_test_dataloader` — confirm against the cloned repository.
train_loader = get_training_dataloader(
        settings.CIFAR100_TRAIN_MEAN,
        settings.CIFAR100_TRAIN_STD,
        num_workers=4,
        batch_size=128,
        shuffle=True
    )

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169001437/169001437 [00:05<00:00, 29095386.85it/s]


Extracting ./data/cifar-100-python.tar.gz to ./data




# Metrics Evaluation

In [None]:
@torch.no_grad()
def evaluate(model, test_loader, device=cuda_device):
    """Evaluate model accuracy, top-1 error rate, and top-5 error rate.

    Args:
        model: Callable model; it is switched to eval mode but NOT moved to
            `device`, so it must already live there.
        test_loader: Iterable of (images, labels) batches exposing `.dataset`.
        device: Device the image and label batches are moved to.

    Returns:
        Tuple `(acc, top_1_error, top_5_error)` of 0-dim float tensors on `device`.
    """
    model.eval()

    # Tensor accumulators keep the result type identical for empty and non-empty
    # loaders (a plain Python float has no `.float()` and used to crash here).
    correct_1 = torch.zeros((), dtype=torch.float, device=device)
    correct_5 = torch.zeros((), dtype=torch.float, device=device)

    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)

        # Indices of the 5 highest-scoring classes per sample, best first.
        top5_preds = torch.topk(outputs, 5, 1, largest=True, sorted=True)[1]
        labs = labels.view(labels.size(0), -1).expand_as(top5_preds)
        corrects = torch.eq(top5_preds, labs).float()
        correct_5 += torch.sum(corrects[:, :5])
        correct_1 += torch.sum(corrects[:, :1])

    num_samples = len(test_loader.dataset)
    acc = correct_1 / num_samples
    top_1_error = 1 - correct_1 / num_samples
    top_5_error = 1 - correct_5 / num_samples
    print('Evaluating Network.....')
    print('Test set: Accuracy: {:.4f}, Top-1 Error: {:.4f}, Top-5 Error: {:.4f}'.format(
        acc, top_1_error, top_5_error
    ))

    return acc, top_1_error, top_5_error

# UTILS: Memory and Inference Time Measurements

## 1. Memory Usage

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity

def cpu_memory_measure(model, input_size=(1, 3, 32, 32)):
    """Profile CPU memory of one inference forward pass and print the top operators.

    Args:
        model: Model to profile. A deep copy is moved to the CPU, so the caller's
            model keeps its device.
        input_size: Shape of the random input tensor; defaults to one 32x32 RGB image.

    Returns:
        None. The profiler table (sorted by CPU memory usage, 10 rows) is printed.
    """
    model = copy.deepcopy(model).to(cpu_device)
    # Profile inference, not training: eval mode and no autograd bookkeeping.
    model.eval()
    inputs = torch.randn(*input_size, dtype=torch.float).to(cpu_device)

    with torch.no_grad():
        with profile(activities=[ProfilerActivity.CPU],
                     profile_memory=True, record_shapes=True) as prof:
            model(inputs)

    print(prof.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))

def cuda_memory_measure(model, input_size=(1, 3, 32, 32)):
    """Profile CUDA memory of one inference forward pass and print the top operators.

    Args:
        model: Model to profile. A deep copy is moved to the GPU, so the caller's
            model keeps its device.
        input_size: Shape of the random input tensor. Defaults to one 32x32 RGB
            image, matching the input used by the other measurement helpers.

    Returns:
        None. The profiler table (sorted by CUDA memory usage, 10 rows) is printed.
    """
    model = copy.deepcopy(model).to(cuda_device)
    # Profile inference, not training: eval mode and no autograd bookkeeping.
    model.eval()
    inputs = torch.randn(*input_size, dtype=torch.float).to(cuda_device)

    # CPU activity is recorded as well because memory is attributed to operators,
    # which the profiler tracks on the CPU side.
    # NOTE(review): confirm per-operator rows appear as expected on the target PyTorch version.
    with torch.no_grad():
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                     profile_memory=True, record_shapes=True) as prof:
            model(inputs)

    # This is a memory report, so rank by memory rather than by kernel time.
    print(prof.key_averages().table(sort_by="cuda_memory_usage", row_limit=10))

## 2. Inference Time

In [None]:
# GPU Inference Time (parallel)
import numpy as np

def gpu_time_measure(model, warmup=10, repetitions=300):
    """Measure mean GPU inference latency for a single 32x32 RGB input.

    Args:
        model: Model to time. A deep copy is moved to the GPU, so the caller's
            model keeps its device.
        warmup: Number of untimed forward passes run before measuring.
        repetitions: Number of timed forward passes.

    Returns:
        Mean latency in milliseconds (CUDA events report elapsed time in ms).
    """
    model = copy.deepcopy(model).to(cuda_device)
    # Time inference behaviour: eval mode, and no autograd for warm-up or measurement.
    model.eval()
    dummy_input = torch.randn(1, 3, 32, 32, dtype=torch.float).to(cuda_device)

    # CUDA events time the work on the device itself.
    starter = torch.cuda.Event(enable_timing=True)
    ender = torch.cuda.Event(enable_timing=True)
    timings = np.zeros((repetitions, 1))

    with torch.no_grad():
        # GPU warm-up so one-off initialisation cost is not measured.
        for _ in range(warmup):
            _ = model(dummy_input)
        torch.cuda.synchronize()

        for rep in range(repetitions):
            starter.record()
            _ = model(dummy_input)
            ender.record()
            # Wait for the GPU to finish before reading the elapsed time.
            torch.cuda.synchronize()
            timings[rep] = starter.elapsed_time(ender)

    mean_syn = np.sum(timings) / repetitions
    return mean_syn

In [None]:
def cpu_time_measure_cifar(model, test_loader, device=cpu_device, warmup=10, max_batches=100):
    """Measure mean wall-clock inference time per batch on real CIFAR batches.

    Args:
        model: Model to time. It is moved to `device` in place (nn.Module.to
            modifies the module itself) and switched to eval mode.
        test_loader: Iterable of (images, labels) batches. With batch_size=1 the
            result is a per-sample latency.
        device: Device used for the model and the inputs.
        warmup: Number of untimed forward passes on a random 32x32 input.
        max_batches: Maximum number of batches to time.

    Returns:
        Mean forward-pass time in seconds over the batches actually timed.

    Raises:
        ValueError: If no batch was timed (empty loader or max_batches <= 0).
    """
    model = model.to(device)
    model.eval()
    # Only synchronise when work is really queued on a GPU; on CPU-only machines
    # an unconditional torch.cuda.synchronize() raises.
    uses_cuda = torch.device(device).type == "cuda"

    elapsed_time = 0.0
    measured = 0

    with torch.no_grad():
        dummy_input = torch.randn(1, 3, 32, 32, dtype=torch.float).to(device)
        for _ in range(warmup):
            _ = model(dummy_input)

        for images, _labels in test_loader:
            if measured >= max_batches:
                break

            images = images.to(device)

            if uses_cuda:
                torch.cuda.synchronize()
            start = time.perf_counter()
            model(images)
            if uses_cuda:
                torch.cuda.synchronize()
            elapsed_time += time.perf_counter() - start
            measured += 1

    if measured == 0:
        raise ValueError("no batches were timed: test_loader is empty or max_batches <= 0")

    # Divide by the number of batches timed, not by the last loop index.
    mean_syn = elapsed_time / measured

    return mean_syn


# UTILS: Save model

In [None]:
def save_model(model, save_dir, filename):
    """Write `model.state_dict()` to `save_dir/filename`, creating `save_dir` if needed.

    Args:
        model: Object exposing `state_dict()`.
        save_dir: Target directory; created (with parents) when it does not exist.
        filename: Name of the file written inside `save_dir`.
    """
    destination = os.path.join(save_dir, filename)
    directory_missing = not os.path.exists(save_dir)
    if directory_missing:
        os.makedirs(save_dir)
    weights = model.state_dict()
    torch.save(weights, destination)

In [None]:
# TorchScript Serialization

def save_jit_model(model, save_dir, filename):
    """Script `model` with TorchScript and write it to `save_dir/filename`.

    Args:
        model: Module passed to `torch.jit.script`.
        save_dir: Target directory; created (with parents) when it does not exist.
        filename: Name of the file written inside `save_dir`.
    """
    directory_missing = not os.path.exists(save_dir)
    if directory_missing:
        os.makedirs(save_dir)
    destination = os.path.join(save_dir, filename)
    scripted = torch.jit.script(model)
    torch.jit.save(scripted, destination)

def load_jit_model(model_path, device):
    """Load a TorchScript model from `model_path`, mapping its storages to `device`."""
    scripted = torch.jit.load(model_path, map_location=device)
    return scripted

In [None]:
def get_model_size(model):
    """Return the serialized size, in bytes, of `model.state_dict()`.

    The state dict is saved into a private temporary directory that is removed
    afterwards, even if saving fails. This avoids the fixed `/tmp/temp` path, which
    could clobber an existing file and does not exist on every platform.

    Args:
        model: Object exposing `state_dict()`.

    Returns:
        File size in bytes as an int.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        state_path = os.path.join(scratch_dir, "model_state.pt")
        torch.save(model.state_dict(), state_path)
        size = os.path.getsize(state_path)

    return size

# QUANTIZATION

## 1. Dynamic Quantization

In [None]:
# CPU copy of the baseline used as the source for dynamic quantization; `baseline` itself is left untouched.
baseline_cpu = copy.deepcopy(baseline).to(cpu_device)

In [None]:
# Dynamically quantize the CPU baseline to float16.
# NOTE(review): the profiler table recorded later for this model still lists
# aten::conv2d, which suggests the Conv2d entry is not converted by dynamic
# quantization — confirm against the PyTorch quantization docs.
dynamic_fp16_model = torch.quantization.quantize_dynamic(
    baseline_cpu,
    {torch.nn.Linear, torch.nn.Conv2d},
    dtype=torch.float16
)

# Evaluate the float16 model on the CPU (quantized above from a CPU copy).
evaluate(dynamic_fp16_model, test_loader, cpu_device)

In [None]:
# Dynamically quantize the CPU baseline to int8.
# NOTE(review): the profiler table recorded later for this model still lists
# aten::conv2d, which suggests the Conv2d entry is not converted by dynamic
# quantization — confirm against the PyTorch quantization docs.
dynamic_int8_model = torch.quantization.quantize_dynamic(
    baseline_cpu,
     {torch.nn.Linear, torch.nn.Conv2d},
    dtype=torch.qint8
)

# Evaluate the int8 model on the CPU (quantized above from a CPU copy).
evaluate(dynamic_int8_model, test_loader, cpu_device)

## 2. Post-training Static Quantization

In [None]:
# Imported here because this cell runs before the cell that otherwise imports
# these names; without them the class definition raises NameError on Run All.
import torch.nn as nn
from torch.ao.quantization import QuantStub, DeQuantStub


class QuantizedResNet34(nn.Module):
    """Wrap a float model between a QuantStub and a DeQuantStub.

    The stubs mark where tensors enter and leave the region to be quantized.

    Args:
        model_fp32: The float model to wrap; stored as `self.model`.
    """

    def __init__(self, model_fp32):
        super().__init__()
        self.quant = QuantStub()
        self.dequant = DeQuantStub()
        self.model = model_fp32

    def forward(self, x):
        """Run `x` through quant stub -> wrapped model -> dequant stub."""
        x = self.quant(x)
        x = self.model(x)
        x = self.dequant(x)
        return x

In [None]:
from torch.ao.quantization import QuantStub, DeQuantStub
import torch.nn as nn

def calibrate_model(model, dataloader, device="cpu:0", max_batches=None):
    """Run forward passes over `dataloader` so observers in `model` see real data.

    Args:
        model: Model to run; moved to `device` in place and switched to eval mode.
        dataloader: Iterable of (images, labels) batches; labels are ignored.
        device: Device used for the model and the image batches.
        max_batches: Optional cap on the number of batches; None uses all of them.

    Returns:
        None.
    """
    model.to(device)
    model.eval()

    # Only forward passes are needed, so skip building autograd graphs.
    with torch.no_grad():
        for batch_index, (images, _labels) in enumerate(dataloader):
            if max_batches is not None and batch_index >= max_batches:
                break
            model(images.to(device))

def quantize_static(model, calibration_loader=None):
    """Post-training static quantization (int8, fbgemm) of a ResNet-style model.

    Steps: fuse conv/bn/relu groups, wrap with quant/dequant stubs, insert
    observers, calibrate them on data, then convert to a quantized model.

    Args:
        model: Float model to quantize. It is deep-copied first, so the caller's
            model keeps its device and train/eval mode.
        calibration_loader: Iterable of (images, labels) batches used for
            calibration. When None, the notebook-level `train_loader` is used,
            as before.

    Returns:
        The converted, statically quantized model on the CPU.
    """
    if calibration_loader is None:
        # Backward-compatible fallback to the global this function always used.
        calibration_loader = train_loader

    device = torch.device("cpu:0")
    # Copy before moving: nn.Module.to() acts in place and would otherwise drag
    # the caller's model to the CPU.
    fused_model = copy.deepcopy(model).to(device)
    fused_model.eval()

    for module_name, module in fused_model.named_children():
        if "conv1" in module_name:
            # Stem: conv -> bn -> relu
            torch.ao.quantization.fuse_modules(module, [["0", "1", "2"]], inplace=True)
        elif "conv" in module_name:
            for basic_block_name, basic_block in module.named_children():
                for sub_block_name, sub_block in basic_block.named_children():
                    if sub_block_name == "residual_function":
                        # conv-bn-relu followed by conv-bn
                        torch.ao.quantization.fuse_modules(sub_block, [["0", "1", "2"], ["3", "4"]], inplace=True)
                    elif sub_block_name == "shortcut" and len(sub_block):
                        # Non-empty shortcut: conv-bn
                        torch.ao.quantization.fuse_modules(sub_block, [["0", "1"]], inplace=True)
    print("After fusion: ", fused_model)

    quantized_model = QuantizedResNet34(fused_model)
    quantized_model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
    print(quantized_model.qconfig)

    torch.ao.quantization.prepare(quantized_model, inplace=True)
    calibrate_model(
        model=quantized_model, dataloader=calibration_loader, device=device
    )
    quantized_model = torch.ao.quantization.convert(quantized_model, inplace=True)

    return quantized_model

In [None]:
# Build the statically quantized int8 model from the float baseline.
static_int8_model = quantize_static(baseline)

In [None]:
# Evaluate the statically quantized model on the CPU.
static_int8_model.eval()
evaluate(static_int8_model, test_loader, "cpu:0")

## 3. Quantization-Aware Training

In [None]:
def train_model(model, train_loader, test_loader, device, learning_rate=0.1, num_epochs=100):
    """Train `model` with SGD and a multi-step LR schedule, evaluating every epoch.

    Args:
        model: Model to train; moved to `device` in place.
        train_loader: Iterable of (inputs, labels) training batches exposing `.dataset`.
        test_loader: Loader handed to `evaluate` after each epoch.
        device: Device used for the model and the batches.
        learning_rate: Initial SGD learning rate.
        num_epochs: Number of passes over `train_loader`.

    Returns:
        The same `model` object after training.
    """
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    sgd = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=5e-4)
    lr_schedule = torch.optim.lr_scheduler.MultiStepLR(sgd, milestones=settings.MILESTONES, gamma=0.2)

    for epoch in range(num_epochs):
        model.train()
        loss_sum = 0
        hit_count = 0

        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            # Clear gradients left over from the previous step.
            sgd.zero_grad()

            # Forward pass, loss, backward pass, parameter update.
            logits = model(batch_inputs)
            predicted = logits.max(dim=1)[1]
            batch_loss = criterion(logits, batch_labels)
            batch_loss.backward()
            sgd.step()

            # Weight the batch-mean loss by batch size to get a dataset-level mean below.
            loss_sum += batch_loss.item() * batch_inputs.size(0)
            hit_count += (predicted == batch_labels).sum()

        dataset_size = len(train_loader.dataset)
        train_loss = loss_sum / dataset_size
        train_acc = hit_count / dataset_size

        # Per-epoch evaluation on the held-out loader.
        model.eval()
        val_acc, top1_error, top5_error = evaluate(model=model, test_loader=test_loader, device=device)

        # Advance the LR schedule once per epoch.
        lr_schedule.step()

        epoch_report = "Epoch: {:03d} Train Loss: {:.3f} Train Acc: {:.3f} Val Acc: {:.3f} Top-5 Error: {:.3f}".format(
            epoch,
            train_loss,
            train_acc,
            val_acc,
            top5_error,
        )
        print(epoch_report)

    return model

In [None]:
def quantized_aware_training(model, train_loader, test_loader, learning_rate=0.1, num_epochs=10):
    """Quantization-aware training (fbgemm) of a ResNet-style model, then int8 conversion.

    Steps: fuse conv/bn/relu groups on a copy, wrap with quant/dequant stubs, insert
    fake-quantization modules, fine-tune on the GPU, then convert on the CPU.

    Args:
        model: Float model to start from. As before, it is moved to cuda:0 and set to
            eval mode in place; the training itself happens on a deep copy.
        train_loader: Training batches passed to `train_model`.
        test_loader: Evaluation batches passed to `train_model`.
        learning_rate: Initial learning rate for fine-tuning.
            NOTE(review): 0.1 is kept as the default for compatibility; it is high for
            fine-tuning a pretrained model — confirm the intended value.
        num_epochs: Number of fine-tuning epochs.

    Returns:
        The converted int8 model on the CPU.
    """
    device = torch.device("cuda:0")
    # nn.Module.to() and .eval() act in place; this side effect on the caller's
    # model is preserved from the original implementation.
    model = model.to(device)
    fused_model = copy.deepcopy(model)
    model.eval()
    fused_model.eval()

    # Fuse the copy in place (fusion here is done in eval mode).
    for module_name, module in fused_model.named_children():
        if "conv1" in module_name:
            torch.ao.quantization.fuse_modules(module, [["0", "1", "2"]], inplace=True)
        elif "conv" in module_name:
            for basic_block_name, basic_block in module.named_children():
                for sub_block_name, sub_block in basic_block.named_children():
                    if sub_block_name == "residual_function":
                        torch.ao.quantization.fuse_modules(sub_block, [["0", "1", "2"], ["3", "4"]], inplace=True)
                    elif sub_block_name == "shortcut" and len(sub_block):
                        torch.ao.quantization.fuse_modules(sub_block, [["0", "1"]], inplace=True)

    quantized_model = QuantizedResNet34(fused_model)
    # Use the QAT qconfig: it inserts fake-quantize modules, whereas the post-training
    # qconfig only attaches observers and training would never see quantization error.
    quantized_model.qconfig = torch.ao.quantization.get_default_qat_qconfig("fbgemm")

    # prepare_qat expects a model in training mode.
    quantized_model.train()
    torch.ao.quantization.prepare_qat(quantized_model, inplace=True)

    train_model(
        quantized_model,
        train_loader,
        test_loader,
        device=device,
        learning_rate=learning_rate,
        num_epochs=num_epochs,
    )

    # Conversion to real int8 modules happens on the CPU, in eval mode.
    quantized_model.to(torch.device("cpu:0"))
    quantized_model.eval()
    quantized_model = torch.ao.quantization.convert(quantized_model, inplace=True)

    return quantized_model


In [None]:
# Fine-tune with quantization-aware training and convert the result to int8.
qat_model = quantized_aware_training(baseline, train_loader, test_loader)

In [None]:
# Evaluate the converted QAT model on the CPU.
qat_model.eval()
evaluate(qat_model, test_loader, "cpu:0")

# MEASUREMENTS

## 1. Baseline

In [None]:
# Accuracy, top-1 error rate and top-5 error rate of the float baseline, evaluated on the GPU.
evaluate(baseline, test_loader, device=cuda_device)

In [None]:
# CPU memory profile of one forward pass through the float baseline.
cpu_memory_measure(baseline)

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::empty         1.18%     613.000us         1.18%     613.000us       2.128us      14.19 Mb      14.19 Mb           288  
                    aten::conv2d         0.62%     321.000us        85.80%      44.407ms       1.234ms       3.91 Mb           0 b            36  
               aten::convolution         1.01%     525.000us        85.18%      44.086ms       1.225ms       3.91 Mb           0 b            36  
              aten::_convolution         2.09%       1.080ms        84.16%      43.561ms       1.210ms       3.91 Mb  

In [None]:
# Mean CPU latency of the baseline; the helper returns seconds, hence the * 1000 for ms.
print("Baseline inference time using CPU: {:.4f} ms/sample".format(cpu_time_measure_cifar(baseline, test_loader) * 1000))



Baseline inference time using GPU: 53.5507 ms/sample


## 2. Dynamic Quantized Float16

In [None]:
# Accuracy and error rates of the dynamic float16 model on the CPU.
evaluate(dynamic_fp16_model, test_loader, device=cpu_device)

In [None]:
# CPU memory profile of one forward pass through the dynamic float16 model.
cpu_memory_measure(dynamic_fp16_model)

----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                              Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       aten::empty         1.12%     488.000us         1.12%     488.000us       1.689us      14.18 Mb      14.18 Mb           289  
                      aten::conv2d         0.43%     186.000us        90.32%      39.288ms       1.091ms       3.91 Mb           0 b            36  
                 aten::convolution         1.26%     546.000us        89.90%      39.102ms       1.086ms       3.91 Mb           0 b            36  
                aten::_convolution         0.97%     423.000us        88.64%      38.556ms       1.071ms  

In [None]:
# The timing helper runs on the CPU, so the label must say CPU (it previously said GPU).
# The helper returns seconds, hence the * 1000 for ms.
print("Dynamic Float16 inference time using CPU: {:.4f} ms/sample".format(cpu_time_measure_cifar(dynamic_fp16_model, test_loader) * 1000))

Dynamic Float16 inference time using GPU: 47.1648 ms/sample


## 3. Dynamic Quantized Int8

In [None]:
# Accuracy and error rates of the dynamic int8 model on the CPU.
evaluate(dynamic_int8_model, test_loader, device=cpu_device)

In [None]:
# CPU memory profile of one forward pass through the dynamic int8 model.
cpu_memory_measure(dynamic_int8_model)

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::empty         1.37%     681.000us         1.37%     681.000us       2.348us      14.07 Mb      14.07 Mb           290  
                aten::empty_like         0.26%     129.000us         0.47%     233.000us       6.297us       3.91 Mb     512.00 Kb            37  
                    aten::conv2d         2.43%       1.209ms        86.07%      42.804ms       1.189ms       3.91 Mb     128.00 Kb            36  
               aten::convolution         1.15%     573.000us        85.60%      42.569ms       1.182ms       3.91 Mb  

In [None]:
# The timing helper runs on the CPU, so the label must say CPU (it previously said GPU).
# The helper returns seconds, hence the * 1000 for ms.
print("Dynamic Int8 inference time using CPU: {:.4f} ms/sample".format(cpu_time_measure_cifar(dynamic_int8_model, test_loader) * 1000))

Dynamic Int8 inference time using GPU: 49.1005 ms/sample


## 4. Static Quantized Int8

In [None]:
# Accuracy and error rates of the TorchScript static int8 model loaded from disk
# (`static_int8_jit`, not the `static_int8_model` built in this notebook).
evaluate(static_int8_jit, test_loader, cpu_device)

In [None]:
# CPU memory profile of one forward pass through the TorchScript static int8 model.
cpu_memory_measure(static_int8_jit)

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                      aten::empty         0.34%      67.000us         0.34%      67.000us       1.763us       3.84 Mb       3.84 Mb            38  
    aten::_empty_affine_quantized         0.72%     144.000us         0.72%     144.000us       2.618us       1.41 Mb       1.41 Mb            55  
           quantized::conv2d_relu        45.32%       9.006ms        45.97%       9.137ms     537.471us     501.00 Kb      -1.97 Mb            17  
                quantized::conv2d        47.74%       9.488ms        48.16%       9.571ms     503.737us      24.

In [None]:
# The timing helper runs on the CPU, so the label must say CPU (it previously said GPU).
# The helper returns seconds, hence the * 1000 for ms.
print("Static Int8 JIT inference time using CPU: {:.4f} ms/sample".format(cpu_time_measure_cifar(static_int8_jit, test_loader) * 1000))

Static Int8 JIT inference time using GPU: 20.5449 ms/sample


# SAVE MODELS

In [None]:
# Save the baseline state dict. This is the same path the checkpoint was
# downloaded to earlier, so the downloaded file is overwritten.
save_model(model=baseline, save_dir="./checkpoints", filename="resnet34-baseline.pth")
# Save the baseline as a scripted TorchScript model.
save_jit_model(model=baseline, save_dir="./checkpoints", filename="resnet34-baseline-jit.pth")

In [None]:
# Save the state dict of the dynamically quantized float16 model.
save_model(model=dynamic_fp16_model, save_dir="./checkpoints", filename="resnet34-dynamic-float16.pth")

In [None]:
# Save the state dict of the dynamically quantized int8 model.
save_model(model=dynamic_int8_model, save_dir="./checkpoints", filename="resnet34-dynamic-int8.pth")