In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from speech_command_dataset import SpeechCommandDataset
import numpy as np
import matplotlib.pyplot as plt
from model import M5
import time

In [15]:
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True 
torch.backends.cudnn.benchmark = False

device = torch.device("cpu")
print(device)

cpu


In [16]:
# declare dataloader
calib_params = {"batch_size": 512,
                "shuffle": True,
                "drop_last": True,
                "num_workers": 1}

testing_params = {"batch_size": 512,
                       "shuffle": False,
                       "drop_last": True,
                       "num_workers": 1}

calib_set = SpeechCommandDataset()
calib_loader = DataLoader(calib_set, **calib_params)

test_set = SpeechCommandDataset(is_training=False)
test_loader = DataLoader(test_set, **testing_params)

In [17]:
def test(model, epoch):
    model.eval()
    correct = 0
    for data, target in test_loader:

        data = data.to(device)
        target = target.to(device)

        #forward
        output = model(data)

        pred = output.argmax(dim=-1)
        correct += pred.squeeze().eq(target).sum().item()
        
    # print testing stats
    test_acc = 100.0 * float(correct) / len(test_set)
    print('Epoch: %3d' % epoch, '|test accuracy: %.2f' % test_acc)
    return test_acc

In [18]:
# load model
model_path = './log/best_model_clean.pth.tar'

print("=> loading checkpoint '{}'".format(model_path))
checkpoint = torch.load(model_path, map_location = device)

model = M5(cfg = checkpoint['cfg']).to(device)
model.load_state_dict(checkpoint['state_dict'])

=> loading checkpoint './log/best_model_clean.pth.tar'


<All keys matched successfully>

In [19]:
print(model)

M5(
  (features): Sequential(
    (0): Conv1d(1, 128, kernel_size=(40,), stride=(2,), padding=(19,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
    (4): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
    (8): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
    (9): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
    (12): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (13): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): ReLU()
    (

In [20]:
print('\nbytes per element:', model.features[0].weight.element_size())



bytes per element: 4


### Static quantization of a model consists of the following steps:

1. Fuse modules
2. Insert Quant/DeQuant Stubs
3. Prepare the fused module (insert observers before and after layers)
4. Calibrate the prepared module (pass it representative data)
5. Convert the calibrated module (replace with quantized version)

### 1.Fuse modules

In [21]:
model.eval()

_ = torch.quantization.fuse_modules(model.features, ['0','1','2'], inplace=True)
_ = torch.quantization.fuse_modules(model.features, ['4','5','6'], inplace=True)
_ = torch.quantization.fuse_modules(model.features, ['8','9','10'], inplace=True)
_ = torch.quantization.fuse_modules(model.features, ['12','13','14'], inplace=True)

print(model)


M5(
  (features): Sequential(
    (0): ConvReLU1d(
      (0): Conv1d(1, 128, kernel_size=(40,), stride=(2,), padding=(19,))
      (1): ReLU()
    )
    (1): Identity()
    (2): Identity()
    (3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
    (4): ConvReLU1d(
      (0): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): ReLU()
    )
    (5): Identity()
    (6): Identity()
    (7): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
    (8): ConvReLU1d(
      (0): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): ReLU()
    )
    (9): Identity()
    (10): Identity()
    (11): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
    (12): ConvReLU1d(
      (0): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): ReLU()
    )
    (13): Identity()
    (14): Identity()
    (15): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=Fal

### 2. Insert Quant/DeQuant Stubs

In [22]:
"""Insert stubs"""
model = nn.Sequential(torch.quantization.QuantStub(), 
                  *model.features,
                   model.avgpool,
                   model.flatten,
                   model.fc,
                   torch.quantization.DeQuantStub())

print(model)

Sequential(
  (0): QuantStub()
  (1): ConvReLU1d(
    (0): Conv1d(1, 128, kernel_size=(40,), stride=(2,), padding=(19,))
    (1): ReLU()
  )
  (2): Identity()
  (3): Identity()
  (4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (5): ConvReLU1d(
    (0): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
  )
  (6): Identity()
  (7): Identity()
  (8): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (9): ConvReLU1d(
    (0): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
  )
  (10): Identity()
  (11): Identity()
  (12): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (13): ConvReLU1d(
    (0): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
  )
  (14): Identity()
  (15): Identity()
  (16): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (17): AdaptiveAvgPool1d(output_size=1)
  (18): F

### 3. Prepare the fused module (insert observers before and after layers)

In [23]:
backend = "fbgemm"  # running on a x86 CPU. Use "qnnpack" if running on ARM.

model.qconfig = torch.quantization.get_default_qconfig(backend)
torch.quantization.prepare(model, inplace=True)

print(model)


Sequential(
  (0): QuantStub(
    (activation_post_process): HistogramObserver()
  )
  (1): ConvReLU1d(
    (0): Conv1d(1, 128, kernel_size=(40,), stride=(2,), padding=(19,))
    (1): ReLU()
    (activation_post_process): HistogramObserver()
  )
  (2): Identity()
  (3): Identity()
  (4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (5): ConvReLU1d(
    (0): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (activation_post_process): HistogramObserver()
  )
  (6): Identity()
  (7): Identity()
  (8): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (9): ConvReLU1d(
    (0): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (activation_post_process): HistogramObserver()
  )
  (10): Identity()
  (11): Identity()
  (12): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (13): ConvReLU1d(
    (0): Conv1d(256, 512, kernel_size=(3,), stride=(1,)

### 4. Calibrate the prepared module (pass it representative data)

In [24]:
iterator = iter(calib_loader)

NUM_CALIB_BATCH = 10

with torch.inference_mode():
    for _ in range(NUM_CALIB_BATCH):
        inputs, labels = next(iterator)
        inputs = inputs.cpu()
        labels = labels.cpu()
        outputs = model(inputs)

### 5. Convert the calibrated module (replace with quantized version)

In [25]:
"""Convert"""
quantized_model = torch.quantization.convert(model, inplace=False)

print(quantized_model)

  src_bin_begin // dst_bin_width, 0, self.dst_nbins - 1
  src_bin_end // dst_bin_width, 0, self.dst_nbins - 1


Sequential(
  (0): Quantize(scale=tensor([0.0157]), zero_point=tensor([64]), dtype=torch.quint8)
  (1): QuantizedConvReLU1d(1, 128, kernel_size=(40,), stride=(2,), scale=0.07932328432798386, zero_point=0, padding=(19,))
  (2): Identity()
  (3): Identity()
  (4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (5): QuantizedConvReLU1d(128, 128, kernel_size=(3,), stride=(1,), scale=0.09427855908870697, zero_point=0, padding=(1,))
  (6): Identity()
  (7): Identity()
  (8): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (9): QuantizedConvReLU1d(128, 256, kernel_size=(3,), stride=(1,), scale=0.058342449367046356, zero_point=0, padding=(1,))
  (10): Identity()
  (11): Identity()
  (12): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (13): QuantizedConvReLU1d(256, 512, kernel_size=(3,), stride=(1,), scale=0.13897468149662018, zero_point=0, padding=(1,))
  (14): Identity()
  (15): Identity()
  (16): MaxPoo

In [26]:
# print(quantized_model[0].weight().element_size())
print('\nbytes per element:', quantized_model[1].weight().element_size())


bytes per element: 1


In [27]:
test_acc = test(quantized_model, 0)

Epoch:   0 |test accuracy: 85.96


## run benchmark

In [32]:
# load model
best_path = './log/best_model_clean.pth.tar'
fine_path = './log/fine_grained_2_63.5_5_08_07_batchsize_256.pth.tar'
coarse_path = './log/coarse_1_5_07_17_batchsize_512.pth.tar'

best_checkpoint = torch.load(best_path, map_location = device)
best_model = M5(cfg = best_checkpoint['cfg']).to(device)
best_model.load_state_dict(best_checkpoint['state_dict'])

fine_checkpoint = torch.load(fine_path, map_location = device)
fine_model = M5(cfg = fine_checkpoint['cfg']).to(device)
fine_model.load_state_dict(fine_checkpoint['state_dict'])

coarse_checkpoint = torch.load(coarse_path, map_location = device)
coarse_model = M5(cfg = coarse_checkpoint['cfg']).to(device)
coarse_model.load_state_dict(coarse_checkpoint['state_dict'])

<All keys matched successfully>

In [33]:
def run_benchmark(model, num_batch):
    model.eval()
    elapsed = 0
    
    for i, (data, target) in enumerate(test_loader):

        data = data.to(device)
        #forward
        start = time.perf_counter()
        output = model(data)
        end = time.perf_counter()
        elapsed = elapsed + (end-start)
        
        if i == num_batch-1:
            break
    print('inference time: %.3f s' % (elapsed))

In [34]:
NUM_BATCH = 100

In [35]:
run_benchmark(quantized_model, NUM_BATCH)

inference time: 8.134 s


In [36]:
run_benchmark(best_model, NUM_BATCH)

inference time: 150.174 s


In [37]:
run_benchmark(fine_model, NUM_BATCH)

inference time: 145.694 s


In [38]:
run_benchmark(coarse_model, NUM_BATCH)

inference time: 15.019 s
