In [11]:
import torch 
from ultralytics import YOLO
import os 

In [28]:
# Load the original YOLOv8-large checkpoint from disk. Later cells index the result with
# ['model'] to reach the network object stored inside it.
# NOTE(review): torch.load unpickles arbitrary Python objects, so only point it at trusted files.
# NOTE(review): newer PyTorch releases may default to weights_only=True and refuse a fully
# pickled model object — confirm against the installed version.
model = torch.load('/home/airi/yolo/ONNX-TensorRT-Pytorch-Tensorflow-Face-Detection-Models-Quantization/models/yolov8l.pt')

In [25]:
def print_model_size(mdl):
    """Print the serialized size of a model in megabytes (1 MB = 1e6 bytes).

    The model is written with ``torch.save`` to a uniquely named temporary
    file, the file size is printed as ``"<size> MB"`` with two decimals, and
    the temporary file is removed again.

    Args:
        mdl: Either a checkpoint dict whose ``'model'`` entry holds the object
            to measure, or the object to measure itself (e.g. a module that
            was loaded or quantized directly).

    Raises:
        Whatever ``torch.save`` raises if the object cannot be serialized; the
        temporary file is still cleaned up in that case.
    """
    import tempfile

    # Checkpoint dicts carry the network under 'model'; anything else is measured as-is.
    target = mdl['model'] if isinstance(mdl, dict) else mdl

    # A unique temp file avoids clobbering (and then deleting) a real "tmp.pt"
    # that might already exist in the working directory.
    fd, tmp_path = tempfile.mkstemp(suffix=".pt")
    os.close(fd)
    try:
        torch.save(target, tmp_path)
        print("%.2f MB" % (os.path.getsize(tmp_path) / 1e6))
    finally:
        # Always remove the scratch file, even when saving fails part-way.
        os.remove(tmp_path)

In [26]:
# Print the serialized size of the network held in the loaded checkpoint dict.
print_model_size(model)

87.61 MB


In [27]:
# Collect the underlying tensor of every parameter in the checkpoint's network.
Weights = [param.data for param in model['model'].parameters()]
print(f"Model have {len(Weights)} layers with trainable parameters.\nExample first layer parameters:")
# Show one slice of the first parameter tensor and the dtype it is stored in.
first_slice = Weights[0][0][0]
print(first_slice)
print("Type:", first_slice.dtype)

Model have 304 layers with trainable parameters.
Example first layer parameters:
tensor([[ 0.0300,  0.0829, -0.1095],
        [ 0.0609,  0.0059, -0.0840],
        [-0.0634, -0.1497,  0.2061]], dtype=torch.float16)
Type: torch.float16


### Post Training Dynamic Quantization

Dynamic quantization converts the weights of a model from 32-bit floating-point numbers to 8-bit integers ahead of time, but it does not convert the activations to int8 until just before the computation that uses them. To apply it, simply call `torch.quantization.quantize_dynamic`:

In [43]:
from torch.quantization import quantize_dynamic
import pickle

In [44]:
# Reload the original checkpoint so quantization starts from untouched weights.
# NOTE(review): torch.load unpickles arbitrary objects — only use it on trusted files.
model = torch.load('/home/airi/yolo/ONNX-TensorRT-Pytorch-Tensorflow-Face-Detection-Models-Quantization/models/yolov8l.pt')
# Dynamically quantize the nn.Linear modules of the network (put into eval mode first).
# dtype must be torch.qint8 (or torch.float16) for quantize_dynamic; torch.quint8 is not a
# supported weight dtype for dynamic quantization.
# NOTE(review): the parameter dump printed after reloading the saved file still lists the same
# 304 float16 tensors as the original model, which suggests this network has no nn.Linear
# modules for the call to replace — confirm, and widen qconfig_spec or use static quantization
# if the convolutional layers are the real target.
model['model'] = quantize_dynamic(model['model'].eval(), qconfig_spec={torch.nn.Linear}, dtype=torch.qint8, mapping=None, inplace=False)
# Persist the whole module object (not a state_dict); it is read back with torch.load.
torch.save(model['model'], '/home/airi/yolo/ONNX-TensorRT-Pytorch-Tensorflow-Face-Detection-Models-Quantization/quant_models/yolov8l_int8.pt', pickle_module=pickle)

In [45]:
# Read back the module pickled by the dynamic-quantization cell. From here on `model` is the
# module itself (the following cell calls model.parameters()), not a checkpoint dict.
# NOTE(review): torch.load unpickles arbitrary objects — only use it on trusted files.
model = torch.load('/home/airi/yolo/ONNX-TensorRT-Pytorch-Tensorflow-Face-Detection-Models-Quantization/quant_models/yolov8l_int8.pt')

In [46]:
# Collect the underlying tensor of every parameter of the reloaded module.
Weights = [param.data for param in model.parameters()]
print(f"Model have {len(Weights)} layers with trainable parameters.\nExample first layer parameters:")
# Show one slice of the first parameter tensor and the dtype it is stored in.
first_slice = Weights[0][0][0]
print(first_slice)
print("Type:", first_slice.dtype)

Model have 304 layers with trainable parameters.
Example first layer parameters:
tensor([[ 0.0300,  0.0829, -0.1095],
        [ 0.0609,  0.0059, -0.0840],
        [-0.0634, -0.1497,  0.2061]], dtype=torch.float16)
Type: torch.float16


### Post Training Static Quantization

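This method converts both the weights and the activations to 8-bit integers ahead of time, so no on-the-fly conversion of the activations happens during inference (as it does with dynamic quantization), which improves performance significantly. Because the activation ranges must be known in advance, the prepared model has to be calibrated on representative inputs before it is converted.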

In [47]:
# Load the original checkpoint again; `model` is once more a dict whose 'model' entry is the
# network used by the static-quantization cell.
# NOTE(review): torch.load unpickles arbitrary objects — only use it on trusted files.
model = torch.load('/home/airi/yolo/ONNX-TensorRT-Pytorch-Tensorflow-Face-Detection-Models-Quantization/models/yolov8l.pt')

In [48]:
# Use ONE backend for both the qconfig and the quantized engine; a qconfig built for one
# backend combined with a different runtime engine is a mismatch. "fbgemm" is kept because it
# is the engine this cell selected.
# NOTE(review): fbgemm targets x86 CPUs; switch `backend` to "qnnpack" for ARM/mobile — confirm.
backend = "fbgemm"
torch.backends.quantized.engine = backend
# Post-training static quantization is meant to be applied to a model in eval mode.
model['model'].eval()
model['model'].qconfig = torch.quantization.get_default_qconfig(backend)
# prepare() returns a copy of the network with observers attached (inplace=False).
model_static_quantized = torch.quantization.prepare(model['model'], inplace=False)
# TODO(review): no calibration data is passed through the prepared model before convert(), so
# the observers have not seen any activations. Run representative inputs through
# `model_static_quantized` here to obtain meaningful quantization parameters.
model_static_quantized = torch.quantization.convert(model_static_quantized, inplace=False)
# The output name refers to yolov8l, matching the checkpoint that was actually quantized.
torch.save(model_static_quantized, '/home/airi/yolo/ONNX-TensorRT-Pytorch-Tensorflow-Face-Detection-Models-Quantization/quant_models/yolov8l_static_int8.pt', pickle_module=pickle)

In [57]:
# Collect the underlying tensor of every parameter left on the statically quantized module.
Weights = [param.data for param in model_static_quantized.parameters()]
print(f"Model have {len(Weights)} layers with trainable parameters.\nExample first layer parameters:")
# Show the first element of the first parameter tensor and the dtype it is stored in.
first_entry = Weights[0][0]
print(first_entry)
print("Type:", first_entry.dtype)

Model have 194 layers with trainable parameters.
Example first layer parameters:
tensor(3.3926, dtype=torch.float16)
Type: torch.float16
