# Introduction

This notebook evalutes different quantized methods on CNN mdoel to predict the 6 human activities walking, walking upstairs, walking downstairs, sitting, standing or laying from accelerometer and gyroscope readings.

In [1]:
!gdown --id 167Jwj7RQF9Ngf-FQOPDWWQ3XCbLdTdNZ
!unzip -q uci_har_dataset.zip

Downloading...
From: https://drive.google.com/uc?id=167Jwj7RQF9Ngf-FQOPDWWQ3XCbLdTdNZ
To: /content/uci_har_dataset.zip
61.0MB [00:01, 52.3MB/s]


In [2]:
import re
import os
import copy
import time
import random
import itertools
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

warnings.filterwarnings("ignore")

random_state = 42

activity = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING', 'LAYING']
activity_map = {
    1: 'WALKING', 
    2:'WALKING_UPSTAIRS',
    3:'WALKING_DOWNSTAIRS',
    4:'SITTING', 
    5:'STANDING',
    6:'LAYING'
}

In [3]:
# load a single file as a numpy array
def read_raw_data_feature(filepath):
    dataframe = pd.read_csv(filepath, header=None, delim_whitespace=True)
    return dataframe.values

# load a list of files and return as a 3d numpy array
def read_raw_data_feature_group(filenames, prefix=''):
    loaded = list()
    for name in filenames:
        data = read_raw_data_feature(prefix + name)
        loaded.append(data)
    # stack group so that features are the 3rd dimension
    loaded = np.dstack(loaded)
    return loaded

# load a dataset group, such as train or test
def read_raw_data(group, prefix=''):
    filepath = prefix + group + '/Inertial Signals/'
    # load all 9 files as a single array
    filenames = list()
    # total acceleration
    filenames += ['total_acc_x_'+group+'.txt', 'total_acc_y_'+group+'.txt', 'total_acc_z_'+group+'.txt']
    # body acceleration
    filenames += ['body_acc_x_'+group+'.txt', 'body_acc_y_'+group+'.txt', 'body_acc_z_'+group+'.txt']
    # body gyroscope
    filenames += ['body_gyro_x_'+group+'.txt', 'body_gyro_y_'+group+'.txt', 'body_gyro_z_'+group+'.txt']
    # load input data
    X = read_raw_data_feature_group(filenames, filepath)
    # load class output
    y = read_raw_data_feature(prefix + group + '/y_'+group+'.txt')
    return X, y

# load the dataset, returns train and test X and y elements
def prepare_raw_data(prefix='uci_har_dataset/'):
    # load all train
    X_train, y_train = read_raw_data('train', prefix)
    # load all test
    X_test, y_test = read_raw_data('test', prefix)
    # zero-offset class values
    y_train = y_train - 1
    y_test = y_test - 1

    X = np.concatenate([X_train, X_test])
    y = np.concatenate([y_train, y_test])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state)
    X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=random_state)
    
    return X_train, X_valid, X_test, y_train, y_valid, y_test

X_train, X_valid, X_test, y_train, y_valid, y_test = prepare_raw_data(prefix="uci_har_dataset/")

In [4]:
scaler = preprocessing.StandardScaler()
# https://stackoverflow.com/questions/53870113/using-standardscaler-on-3d-data
def scale_data(data, is_train=False):
    num_instances, num_time_steps, num_features = data.shape
    data = np.reshape(data, newshape=(-1, num_features))
    if is_train:
        data = scaler.fit_transform(data)
    else:
        data = scaler.transform(data)
    data = np.reshape(data, newshape=(num_instances, num_time_steps, num_features))
    return data

X_train = scale_data(X_train, is_train=True)
X_valid = scale_data(X_valid)
X_test = scale_data(X_test)
X_train.shape

(6179, 128, 9)

In [5]:
class CNNDataset(object):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __getitem__(self, idx):
        data = self.X[idx]
        target = self.y[idx][0]
        return data, target

    def __len__(self):
        return len(self.X)

def prepare_cnn_dataloader(X_train, X_valid, X_test, y_train, y_valid, y_test):
    traindataset = CNNDataset(X_train, y_train)

    trainloader = torch.utils.data.DataLoader(
        traindataset, 
        batch_size=100, 
        shuffle=True, 
        num_workers=4,
    )

    validdataset = CNNDataset(X_valid, y_valid)

    validloader = torch.utils.data.DataLoader(
        validdataset, 
        batch_size=100, 
        shuffle=True, 
        num_workers=4,
    )

    testdataset = CNNDataset(X_test, y_test)

    testloader = torch.utils.data.DataLoader(
        testdataset, 
        batch_size=100, 
        shuffle=True, 
        num_workers=4,
    )

    return trainloader, validloader, testloader

In [6]:
class CNN(nn.Module):

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(9, 32, 3)
        self.relu1 = nn.ReLU()
        self.dp1 = nn.Dropout(0.6)

        self.pool1 = nn.MaxPool1d(2)
        self.flat1 = nn.Flatten()
        self.dp2 = nn.Dropout(0.2)

        self.fc1 = nn.Linear(2016, 256)
        self.relu2 = nn.ReLU()
        self.dp3 = nn.Dropout(0.2)

        self.fc2 = nn.Linear(256, 128)
        self.dp4 = nn.Dropout(0.2)

        self.fc3 = nn.Linear(128, 6)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.dp1(x)

        x = self.pool1(x)
        x = self.flat1(x)
        x = self.dp2(x)

        x = self.fc1(x)
        x = self.relu2(x)
        x = self.dp3(x)

        x = self.fc2(x)
        x = self.dp4(x)

        x = self.fc3(x)
        
        return x

trainloader, validloader, testloader = prepare_cnn_dataloader(X_train, X_valid, X_test, y_train, y_valid, y_test)
inputs, labels = next(iter(trainloader))
inputs = inputs.permute(0, 2, 1)
model = CNN()
outputs = model(inputs.float())
labels[:1], outputs[:1]

(tensor([0]), tensor([[ 0.1476, -0.1000,  0.0312, -0.0573,  0.0859,  0.0753]],
        grad_fn=<SliceBackward>))

In [7]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            上次验证集损失值改善后等待几个epoch
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            如果是True，为每个验证集损失值改善打印一条信息
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            监测数量的最小变化，以符合改进的要求
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''
        Saves model when validation loss decrease.
        验证损失减少时保存模型。
        '''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        self.val_loss_min = val_loss

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix'):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.figure(figsize=(15,8))
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cmap='Reds'
        print("Normalized Confusion Matrix")
    else:
        cmap='Greens'
        print('Confusion Matrix Without Normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

def plot_avg_loss_per_epoch(avg_train_losses, avg_valid_losses):
    # visualize the loss as the network trained
    fig = plt.figure(figsize=(10,8))
    plt.plot(range(1,len(avg_train_losses)+1), avg_train_losses, label='Training Loss')
    plt.plot(range(1,len(avg_valid_losses)+1), avg_valid_losses,label='Validation Loss')

    # find position of lowest validation loss
    minposs = avg_train_losses.index(min(avg_train_losses))+1 
    plt.axvline(minposs, linestyle='--', color='r',label='Early Stopping Checkpoint')

    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.ylim(0, 2) # consistent scale
    plt.xlim(0, len(avg_train_losses)+1) # consistent scale
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

def plot_accuracy_per_epoch(train_accuracies, valid_accuracies):
    # visualize the loss as the network trained
    fig = plt.figure(figsize=(10,8))
    plt.plot(range(1,len(train_accuracies)+1), train_accuracies, label='Train Accuracy')
    plt.plot(range(1,len(valid_accuracies)+1), valid_accuracies,label='Valid Accuracy')

    plt.xlabel('epochs')
    plt.ylabel('accuracy')
    plt.ylim(0, 1) # consistent scale
    plt.xlim(0, len(train_accuracies)+1) # consistent scale
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

In [8]:
class QCNN(nn.Module):
    def __init__(self, model_fp32):
        super(QCNN, self).__init__()
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()
        self.model_fp32 = model_fp32

    def forward(self, x):
        x = self.quant(x)
        x = self.model_fp32(x)
        x = self.dequant(x)
        return x

In [9]:
def set_random_seeds(random_seed=0):

    torch.manual_seed(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)

def prepare_dataloader():

    trainloader, validloader, testloader = prepare_cnn_dataloader(X_train, X_valid, X_test, y_train, y_valid, y_test)

    return trainloader, validloader, testloader

def evaluate_model(model, test_loader, device, criterion=None):

    model.eval()
    model.to(device)

    running_loss = 0
    running_corrects = 0

    for inputs, labels in test_loader:

        inputs = inputs.to(device)
        labels = labels.to(device)
        inputs = inputs.permute(0, 2, 1)
        outputs = model(inputs.float())
        _, preds = torch.max(outputs, 1)

        if criterion is not None:
            loss = criterion(outputs, labels).item()
        else:
            loss = 0

        running_loss += loss * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data)

    eval_loss = running_loss / len(test_loader.dataset)
    eval_accuracy = running_corrects / len(test_loader.dataset)

    return eval_loss, eval_accuracy

def train_model(model, train_loader, test_loader, device, learning_rate=1e-1, num_epochs=10, patience=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    early_stopping = EarlyStopping(patience=patience)
    model.to(device)

    # Evaluation
    model.eval()
    eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)
    print("Epoch: {:02d} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(-1, eval_loss, eval_accuracy))

    for epoch in range(num_epochs):

        # Training
        model.train()

        running_loss = 0
        running_corrects = 0

        for inputs, labels in train_loader:

            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            inputs = inputs.permute(0, 2, 1)
            outputs = model(inputs.float())
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # statistics
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

        train_loss = running_loss / len(train_loader.dataset)
        train_accuracy = running_corrects / len(train_loader.dataset)

        # Evaluation
        model.eval()
        eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)

        print("Epoch: {:03d} Train Loss: {:.3f} Train Acc: {:.3f} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(epoch, train_loss, train_accuracy, eval_loss, eval_accuracy))
        
        early_stopping(eval_loss, model)
        
        if early_stopping.early_stop:
            break
    return model

def calibrate_model(model, loader, device=torch.device("cpu:0")):

    model.to(device)
    model.eval()

    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        inputs = inputs.permute(0, 2, 1)
        _ = model(inputs.float())

def measure_inference_latency(model, device, input_size=(1,3,32,32), num_samples=100):

    model.to(device)
    model.eval()

    x = torch.rand(size=input_size).to(device)
    x = x.permute(0, 2, 1)

    start_time = time.time()
    for _ in range(num_samples):
        _ = model(x.float())
    end_time = time.time()
    elapsed_time = end_time - start_time
    elapsed_time_ave = elapsed_time / num_samples

    return elapsed_time_ave

def save_model(model, model_dir, model_filename):

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_filepath = os.path.join(model_dir, model_filename)
    torch.save(model.state_dict(), model_filepath)

def load_model(model, model_filepath, device):

    model.load_state_dict(torch.load(model_filepath, map_location=device))

    return model

def save_torchscript_model(model, model_dir, model_filename):

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_filepath = os.path.join(model_dir, model_filename)
    torch.jit.save(torch.jit.script(model), model_filepath)

def load_torchscript_model(model_filepath, device):

    model = torch.jit.load(model_filepath, map_location=device)

    return model

def create_model():

    return CNN()

def model_equivalence(model_1, model_2, device, rtol=1e-05, atol=1e-08, num_tests=100, input_size=(1,3,32,32)):

    model_1.to(device)
    model_2.to(device)

    for _ in range(num_tests):
        x = torch.rand(size=input_size).to(device)
        x = x.permute(0, 2, 1)
        _ = model(x.float())        
        y1 = model_1(x).detach().cpu().numpy()
        y2 = model_2(x).detach().cpu().numpy()
        if np.allclose(a=y1, b=y2, rtol=rtol, atol=atol, equal_nan=False) == False:
            print("Model equivalence test sample failed: ")
            print(y1)
            print(y2)
            return False

    return True


# Model Training

In [10]:
random_seed = 0
num_classes = 10
cuda_device = torch.device("cuda:0")
cpu_device = torch.device("cpu:0")

model_dir = "saved_models"
model_filename = "hapt_model.pt"
quantized_model_filename = "hapt_model_quantized.pt"
model_filepath = os.path.join(model_dir, model_filename)
quantized_model_filepath = os.path.join(model_dir, quantized_model_filename)

set_random_seeds(random_seed=random_seed)

# Create an untrained model.
model = create_model()

train_loader, valid_loader, test_loader = prepare_dataloader()

# Train model.
print("Training Model...")
model = train_model(model=model, train_loader=train_loader, test_loader=valid_loader, device=cuda_device, learning_rate=1e-1, num_epochs=1000, patience=10)

Training Model...
Epoch: -1 Eval Loss: 1.802 Eval Acc: 0.127
Epoch: 000 Train Loss: 1.024 Train Acc: 0.552 Eval Loss: 0.506 Eval Acc: 0.817
Epoch: 001 Train Loss: 0.421 Train Acc: 0.828 Eval Loss: 0.345 Eval Acc: 0.893
Epoch: 002 Train Loss: 0.304 Train Acc: 0.882 Eval Loss: 0.246 Eval Acc: 0.919
Epoch: 003 Train Loss: 0.226 Train Acc: 0.913 Eval Loss: 0.215 Eval Acc: 0.935
Epoch: 004 Train Loss: 0.209 Train Acc: 0.916 Eval Loss: 0.196 Eval Acc: 0.943
Epoch: 005 Train Loss: 0.180 Train Acc: 0.927 Eval Loss: 0.176 Eval Acc: 0.946
Epoch: 006 Train Loss: 0.158 Train Acc: 0.938 Eval Loss: 0.165 Eval Acc: 0.949
Epoch: 007 Train Loss: 0.148 Train Acc: 0.939 Eval Loss: 0.159 Eval Acc: 0.946
Epoch: 008 Train Loss: 0.146 Train Acc: 0.944 Eval Loss: 0.149 Eval Acc: 0.950
Epoch: 009 Train Loss: 0.143 Train Acc: 0.941 Eval Loss: 0.155 Eval Acc: 0.947
EarlyStopping counter: 1 out of 10
Epoch: 010 Train Loss: 0.128 Train Acc: 0.946 Eval Loss: 0.144 Eval Acc: 0.949
Epoch: 011 Train Loss: 0.129 Train 

# Quantized Aware Training

In [11]:
save_model(model=model, model_dir=model_dir, model_filename=model_filename)
model = load_model(model=model, model_filepath=model_filepath, device=cuda_device)
model.to(cpu_device)

# Make a copy of the model for layer fusion
fused_model = copy.deepcopy(model)

model.train()
fused_model.train()

# Fuse the model in place rather manually.
fused_model = torch.quantization.fuse_modules(fused_model,
    [["conv1", "relu1"], ["fc1", "relu2"]])

# Print FP32 model.
print(model)
# Print fused model.
print(fused_model)

# Model and fused model should be equivalent.
model.eval()
fused_model.eval()
inputs, labels = next(iter(valid_loader))
assert model_equivalence(model_1=model, model_2=fused_model, device=cpu_device, rtol=1e-03, atol=1e-06, num_tests=100, input_size=inputs.shape), "Fused model is not equivalent to the original model!"

# Prepare the model for quantization aware training. This inserts observers in
# the model that will observe activation tensors during calibration.
qatmodel = QCNN(model_fp32=fused_model)
quantization_config = torch.quantization.get_default_qconfig("fbgemm")
qatmodel.qconfig = quantization_config

# Print quantization configurations
print(qatmodel.qconfig)

# https://pytorch.org/docs/stable/_modules/torch/quantization/quantize.html#prepare_qat
torch.quantization.prepare_qat(qatmodel, inplace=True)

# # Use training data for calibration.
print("Training QAT Model...")
qatmodel.train()
train_model(model=qatmodel, train_loader=train_loader, test_loader=valid_loader, device=cuda_device, learning_rate=1e-3, num_epochs=1000, patience=10)
qatmodel.to(cpu_device)

qatmodel = torch.quantization.convert(qatmodel, inplace=True)

qatmodel.eval()

# Print quantized model.
print(qatmodel)

# Save quantized model.
save_torchscript_model(model=qatmodel, model_dir=model_dir, model_filename=quantized_model_filename)

# Load quantized model.
quantized_jit_model = load_torchscript_model(model_filepath=quantized_model_filepath, device=cpu_device)

_, fp32_eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=cpu_device, criterion=None)
_, int8_eval_accuracy = evaluate_model(model=quantized_jit_model, test_loader=test_loader, device=cpu_device, criterion=None)

# Skip this assertion since the values might deviate a lot.
# assert model_equivalence(model_1=model, model_2=quantized_jit_model, device=cpu_device, rtol=1e-01, atol=1e-02, num_tests=100, input_size=(1,3,32,32)), "Quantized model deviates from the original model too much!"

print("FP32 evaluation accuracy: {:.3f}".format(fp32_eval_accuracy))
print("INT8 evaluation accuracy: {:.3f}".format(int8_eval_accuracy))

inputs, labels = next(iter(train_loader))

fp32_cpu_inference_latency = measure_inference_latency(model=model, device=cpu_device, input_size=inputs.shape, num_samples=100)
int8_cpu_inference_latency = measure_inference_latency(model=qatmodel, device=cpu_device, input_size=inputs.shape, num_samples=100)
int8_jit_cpu_inference_latency = measure_inference_latency(model=quantized_jit_model, device=cpu_device, input_size=inputs.shape, num_samples=100)
fp32_gpu_inference_latency = measure_inference_latency(model=model, device=cuda_device, input_size=inputs.shape, num_samples=100)

print("FP32 CPU Inference Latency: {:.2f} ms / sample".format(fp32_cpu_inference_latency * 1000))
print("FP32 CUDA Inference Latency: {:.2f} ms / sample".format(fp32_gpu_inference_latency * 1000))
print("INT8 CPU Inference Latency: {:.2f} ms / sample".format(int8_cpu_inference_latency * 1000))
print("INT8 JIT CPU Inference Latency: {:.2f} ms / sample".format(int8_jit_cpu_inference_latency * 1000))



CNN(
  (conv1): Conv1d(9, 32, kernel_size=(3,), stride=(1,))
  (relu1): ReLU()
  (dp1): Dropout(p=0.6, inplace=False)
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flat1): Flatten(start_dim=1, end_dim=-1)
  (dp2): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=2016, out_features=256, bias=True)
  (relu2): ReLU()
  (dp3): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (dp4): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=128, out_features=6, bias=True)
)
CNN(
  (conv1): ConvReLU1d(
    (0): Conv1d(9, 32, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (relu1): Identity()
  (dp1): Dropout(p=0.6, inplace=False)
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flat1): Flatten(start_dim=1, end_dim=-1)
  (dp2): Dropout(p=0.2, inplace=False)
  (fc1): LinearReLU(
    (0): Linear(in_features=2016, out_features=256, bias=True)
    (1): ReLU(

# Static Quantization

In [12]:
fused_model = copy.deepcopy(model)

model.cpu().eval()
# The model has to be switched to evaluation mode before any layer fusion.
# Otherwise the quantization will not work correctly.
fused_model.eval()

# Fuse the model in place rather manually.
fused_model = torch.quantization.fuse_modules(fused_model, [["conv1", "relu1"], ["fc1", "relu2"]], inplace=True)
sqmodel = QCNN(model_fp32=fused_model)
quantization_config = torch.quantization.get_default_qconfig("fbgemm")
sqmodel.qconfig = quantization_config
torch.quantization.prepare(sqmodel, inplace=True)
calibrate_model(model=sqmodel, loader=trainloader)
sqmodel = torch.quantization.convert(sqmodel, inplace=True)

# Save quantized model.
save_torchscript_model(model=sqmodel, model_dir=model_dir, model_filename=quantized_model_filename)

# Load quantized model.
quantized_jit_model = load_torchscript_model(model_filepath=quantized_model_filepath, device=cpu_device)

_, fp32_eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=cpu_device, criterion=None)
_, int8_eval_accuracy = evaluate_model(model=quantized_jit_model, test_loader=test_loader, device=cpu_device, criterion=None)

# Skip this assertion since the values might deviate a lot.
# assert model_equivalence(model_1=model, model_2=quantized_jit_model, device=cpu_device, rtol=1e-01, atol=1e-02, num_tests=100, input_size=(1,3,32,32)), "Quantized model deviates from the original model too much!"

print("FP32 evaluation accuracy: {:.3f}".format(fp32_eval_accuracy))
print("INT8 evaluation accuracy: {:.3f}".format(int8_eval_accuracy))

inputs, labels = next(iter(train_loader))

fp32_cpu_inference_latency = measure_inference_latency(model=model, device=cpu_device, input_size=inputs.shape, num_samples=100)
int8_cpu_inference_latency = measure_inference_latency(model=sqmodel, device=cpu_device, input_size=inputs.shape, num_samples=100)
int8_jit_cpu_inference_latency = measure_inference_latency(model=quantized_jit_model, device=cpu_device, input_size=inputs.shape, num_samples=100)
fp32_gpu_inference_latency = measure_inference_latency(model=model, device=cuda_device, input_size=inputs.shape, num_samples=100)

print("FP32 CPU Inference Latency: {:.2f} ms / sample".format(fp32_cpu_inference_latency * 1000))
print("FP32 CUDA Inference Latency: {:.2f} ms / sample".format(fp32_gpu_inference_latency * 1000))
print("INT8 CPU Inference Latency: {:.2f} ms / sample".format(int8_cpu_inference_latency * 1000))
print("INT8 JIT CPU Inference Latency: {:.2f} ms / sample".format(int8_jit_cpu_inference_latency * 1000))

FP32 evaluation accuracy: 0.964
INT8 evaluation accuracy: 0.950
FP32 CPU Inference Latency: 5.83 ms / sample
FP32 CUDA Inference Latency: 0.57 ms / sample
INT8 CPU Inference Latency: 3.31 ms / sample
INT8 JIT CPU Inference Latency: 3.06 ms / sample


# Dynamic Quantization

In [13]:
model.cpu().eval()
dqmodel = torch.quantization.quantize_dynamic(
    model,  # the original model
    {nn.Conv1d, nn.ReLU, nn.Dropout, nn.MaxPool1d, nn.Flatten, nn.Linear, nn.Softmax},  # a set of layers to dynamically quantize
    dtype=torch.qint8)  # the target dtype for quantized weights


# Save quantized model.
save_torchscript_model(model=dqmodel, model_dir=model_dir, model_filename=quantized_model_filename)

# Load quantized model.
quantized_jit_model = load_torchscript_model(model_filepath=quantized_model_filepath, device=cpu_device)

_, fp32_eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=cpu_device, criterion=None)
_, int8_eval_accuracy = evaluate_model(model=quantized_jit_model, test_loader=test_loader, device=cpu_device, criterion=None)

# Skip this assertion since the values might deviate a lot.
# assert model_equivalence(model_1=model, model_2=quantized_jit_model, device=cpu_device, rtol=1e-01, atol=1e-02, num_tests=100, input_size=(1,3,32,32)), "Quantized model deviates from the original model too much!"

print("FP32 evaluation accuracy: {:.3f}".format(fp32_eval_accuracy))
print("INT8 evaluation accuracy: {:.3f}".format(int8_eval_accuracy))

inputs, labels = next(iter(train_loader))

fp32_cpu_inference_latency = measure_inference_latency(model=model, device=cpu_device, input_size=inputs.shape, num_samples=100)
int8_cpu_inference_latency = measure_inference_latency(model=dqmodel, device=cpu_device, input_size=inputs.shape, num_samples=100)
int8_jit_cpu_inference_latency = measure_inference_latency(model=quantized_jit_model, device=cpu_device, input_size=inputs.shape, num_samples=100)
fp32_gpu_inference_latency = measure_inference_latency(model=model, device=cuda_device, input_size=inputs.shape, num_samples=100)

print("FP32 CPU Inference Latency: {:.2f} ms / sample".format(fp32_cpu_inference_latency * 1000))
print("FP32 CUDA Inference Latency: {:.2f} ms / sample".format(fp32_gpu_inference_latency * 1000))
print("INT8 CPU Inference Latency: {:.2f} ms / sample".format(int8_cpu_inference_latency * 1000))
print("INT8 JIT CPU Inference Latency: {:.2f} ms / sample".format(int8_jit_cpu_inference_latency * 1000))

FP32 evaluation accuracy: 0.964
INT8 evaluation accuracy: 0.967
FP32 CPU Inference Latency: 6.05 ms / sample
FP32 CUDA Inference Latency: 0.45 ms / sample
INT8 CPU Inference Latency: 5.48 ms / sample
INT8 JIT CPU Inference Latency: 5.21 ms / sample


# Conclusion

All three quantizated model acheive comparatively high accuracies that is less by at most 1%. The quantized awared trained model achieve the shortest inference latency. The dynamic quantization method could only quantized Linear and LSTM layers only, and therefore, do not have much difference in latency and accuracy as compared to the original.

In [14]:
for name in qatmodel.state_dict():
    print(name)
    print(qatmodel.state_dict()[name])

quant.scale
tensor([0.1374])
quant.zero_point
tensor([64])
model_fp32.conv1.weight
tensor([[[-0.0966,  0.0332, -0.2084],
         [-0.1993, -0.1359, -0.0906],
         [ 0.1238,  0.2356,  0.0574],
         [ 0.0302, -0.0332,  0.0332],
         [-0.1480, -0.1027, -0.3111],
         [ 0.1993,  0.1208,  0.1420],
         [ 0.2054,  0.0362,  0.0362],
         [ 0.3836,  0.1269,  0.3806],
         [ 0.1027,  0.1933,  0.3081]],

        [[-0.2917, -0.1725, -0.0031],
         [-0.0909,  0.1850, -0.0878],
         [-0.0251, -0.0220, -0.0533],
         [-0.1380,  0.2760,  0.3983],
         [-0.0031,  0.0188, -0.0282],
         [-0.1725, -0.2289, -0.1443],
         [-0.1599,  0.0251, -0.0125],
         [ 0.1129,  0.0094, -0.0408],
         [ 0.2603,  0.0878,  0.0690]],

        [[ 0.0873, -0.2229, -0.1425],
         [-0.2620, -0.2367, -0.1930],
         [ 0.0069, -0.0046, -0.2068],
         [ 0.0850, -0.0138, -0.2941],
         [-0.0620, -0.0046,  0.1011],
         [ 0.0643, -0.2781, -0.2298],
 