# YOLO-X Tiny Quant example

## Prepare the ENV & Args

In [1]:
import logging
import sys
# Configure the root logger once for the whole notebook: records at INFO level
# and above are written to stdout, formatted as "[LEVEL] message".
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s] %(message)s',
    stream=sys.stdout
)

In [2]:
import torch
import argparse
import itertools
from trainer import Trainer
from yolo_x_tiny_exp import Exp


[32m
[QUARK-INFO]: C++ kernel compilation check start.[0m
[32m
[QUARK-INFO]: C++ kernel build directory /home/haoliang/.cache/torch_extensions/py39_cu124/kernel_ext[0m
[32m
[QUARK-INFO]: C++ kernel loading. First-time compilation may take a few minutes...[0m
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
[32m
[QUARK-INFO]: C++ kernel compilation is already complete. Ending the C++ kernel compilation check. Total time: 1.1662 seconds[0m
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the argument namespace consumed by `Exp` and `Trainer`.
# The parser is fed an empty argv (`parse_args([])`) so that every option takes
# its default value instead of reading the Jupyter kernel's own command line.
parser = argparse.ArgumentParser()

# Model / data / bookkeeping options.
parser.add_argument("--ckpt", default="./yolox_tiny.pth", type=str, help="pre train checkpoint")
parser.add_argument("--batch-size", type=int, default=64, help="batch size")
parser.add_argument('--random_size_range', type=int, default=3, help='random_size')
parser.add_argument("--experiment_name", type=str, default="0", help="exp name")
parser.add_argument('--data_dir', default='./coco_data', help='Data set directory.')

# Optimisation options (help texts fixed: they previously said "batch size" / "reate").
parser.add_argument("--min_lr_ratio", type=float, default=0.01, help="min learning rate ratio")
parser.add_argument("--ema_decay", type=float, default=0.9995, help="ema decay rate.")

# Output, data-loading and schedule options.
parser.add_argument('--output_dir', default='./YOLOX_outputs', help='Experiments results save path.')
parser.add_argument('--workers', default=4, type=int, help='Number of data loading workers to be used.')
parser.add_argument('--multiscale_range', default=5, type=int, help='multiscale_range.')
parser.add_argument("--start_epoch", type=int, default=280, help="epoch to start training from")
args = parser.parse_args([])

### Init the experiments & trainer

In [4]:
# Build the experiment description from the parsed arguments, then the trainer
# object that the rest of the notebook uses for evaluation, quantization and training.
exp = Exp(args)
trainer = Trainer(exp, args)

  self.scaler = torch.cuda.amp.GradScaler(enabled=False)


## Prepare FP32 model & test accuracy

In [5]:
# Log the parsed arguments and the experiment settings so the run configuration
# is recorded in the notebook output.
logging.info("args: {}".format(trainer.args))
logging.info("exp value:\n{}".format(trainer.exp))

[INFO] args: Namespace(ckpt='./yolox_tiny.pth', batch_size=64, random_size_range=3, experiment_name='0', data_dir='./coco_data', min_lr_ratio=0.01, ema_decay=0.9995, output_dir='./YOLOX_outputs', workers=4, multiscale_range=5, start_epoch=280)
[INFO] exp value:
╒═══════════════════╤════════════════════════════╕
│ keys              │ values                     │
╞═══════════════════╪════════════════════════════╡
│ seed              │ None                       │
├───────────────────┼────────────────────────────┤
│ output_dir        │ './YOLOX_outputs'          │
├───────────────────┼────────────────────────────┤
│ print_interval    │ 10                         │
├───────────────────┼────────────────────────────┤
│ eval_interval     │ 1                          │
├───────────────────┼────────────────────────────┤
│ dataset           │ None                       │
├───────────────────┼────────────────────────────┤
│ num_classes       │ 80                         │
├───────────────────┼───

In [6]:
# Instantiate the network defined by the experiment, move it to the trainer's
# device, and load the pre-trained checkpoint weights into it.
model = trainer.exp.get_model()
model.to(trainer.device)
model = trainer.load_pretrain_weight(model)
# Register the model on the trainer; later cells access it as `trainer.model`.
trainer.model = model

[INFO] loading pre-trained checkpoint


  ckpt = torch.load(ckpt_file, map_location=self.device)["model"]


In [7]:
# Build the evaluator with half of the configured batch size
# (64 / 2 = 32 with the default `--batch-size`).
trainer.evaluator = trainer.exp.get_evaluator(batch_size=int(trainer.args.batch_size / 2))


loading annotations into memory...
Done (t=0.72s)
creating index...
index created!


### Evaluate the FP32 model on the COCO val dataset 

In [8]:
# Evaluate the FP32 model; only the last element of the returned sequence
# (the text summary) is kept, the leading elements are discarded.
*_, summary = trainer.evaluator.evaluate(trainer.model)

100%|██████████| 157/157 [00:17<00:00,  8.76it/s]

[INFO] Evaluate in main process...



  statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])


Loading and preparing results...
DONE (t=1.86s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=25.46s).
Accumulating evaluation results...
DONE (t=5.32s).


In [9]:
# Show the FP32 baseline summary (timing plus AP/AR table).
print(summary)

Average forward time: 1.05 ms, Average NMS time: 0.73 ms, Average inference time: 1.78 ms
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.326
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.500
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.346
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.135
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.358
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.499
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.281
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.437
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.463
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.207
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.522
 Average Recall     (AR) @[ Io

## Perform PTQ & evaluate the accuracy

### Prepare Quantization config & Quantizer


In [8]:
from quark.torch import ModelQuantizer, ModelExporter
from quark.torch.quantization.config.config import QuantizationSpec, QuantizationConfig, Config
from quark.torch.quantization.config.type import Dtype, QSchemeType, ScaleType, RoundType, QuantizationMode
from quark.torch.quantization.observer.observer import PerTensorPowOf2MinMSEObserver


In [9]:
# One int8, symmetric, per-tensor, static (non-dynamic) quantization spec,
# observed with `PerTensorPowOf2MinMSEObserver` and rounded half-to-even.
INT8_PER_WEIGHT_TENSOR_SPEC = QuantizationSpec(
    dtype=Dtype.int8,
    qscheme=QSchemeType.per_tensor,
    observer_cls=PerTensorPowOf2MinMSEObserver,
    symmetric=True,
    scale_type=ScaleType.float,
    round_method=RoundType.half_even,
    is_dynamic=False,
)

# The same spec is applied to weights, biases, and input/output tensors.
global_quant_config = QuantizationConfig(
    weight=INT8_PER_WEIGHT_TENSOR_SPEC,
    input_tensors=INT8_PER_WEIGHT_TENSOR_SPEC,
    output_tensors=INT8_PER_WEIGHT_TENSOR_SPEC,
    bias=INT8_PER_WEIGHT_TENSOR_SPEC,
)

# Top-level config: quantize in FX graph mode using the global config above.
quant_config = Config(
    global_quant_config=global_quant_config,
    quant_mode=QuantizationMode.fx_graph_mode,
)
trainer.quantizer = ModelQuantizer(quant_config)

[32m
[QUARK-INFO]: Configuration checking start.[0m
[32m
[QUARK-INFO]: Configuration checking end. The configuration is effective. This is weight quantization and activation static quantization.[0m


### Prepare calibration Dataset & Fx graph model

In [10]:
# Calibration set: only the first batch yielded by the evaluator's dataloader.
# Element 0 of each batch is moved to the trainer's device
# (presumably the image tensor -- TODO confirm against the dataloader).
first_batches = itertools.islice(trainer.evaluator.dataloader, 1)
calib_data = [batch[0].to(trainer.device) for batch in first_batches]

# Random single-image, 3-channel tensor at the experiment's input size, moved to the trainer's device.
input_shape = (1, 3, *trainer.exp.input_size)
dummy_input = torch.randn(input_shape).to(trainer.device)

# Put the model in evaluation mode.
trainer.model = trainer.model.eval()

NOTE: In the original YOLO-X Tiny repo, the loss calculation and bounding-box decoding code are integrated into the YOLO-X Tiny `forward`. We modified the code so that `trainer.model.base_model` contains only the backbone network; only this part of the model needs to be quantized.

In [11]:
# Capture `base_model` as an exported graph, tracing it with the dummy input,
# and take the runnable module out of the exported program.
graph_model = torch.export.export_for_training(trainer.model.base_model, (dummy_input, )).module()
# Re-wrap the captured graph in a fresh torch.fx.GraphModule.
graph_model = torch.fx.GraphModule(graph_model, graph_model.graph)
# Replace the original backbone with its graph form so the full model runs through it.
trainer.model.base_model = graph_model

### Perform PTQ & evaluate the quantized model

In [12]:
# Run post-training quantization on the graph model using the calibration batch.
quantized_model = trainer.quantizer.quantize_model(graph_model, calib_data)


[32m
[QUARK-INFO]: Quantizing with the quantization configuration:
Config(
    global_quant_config=QuantizationConfig(
        input_tensors=QuantizationSpec(
            dtype=Dtype.int8,
            observer_cls=<class 'quark.torch.quantization.observer.observer.PerTensorPowOf2MinMSEObserver'>,
            is_dynamic=False,
            qscheme=QSchemeType.per_tensor,
            ch_axis=None,
            group_size=None,
            is_mx_scale_constraint=None,
            symmetric=True,
            round_method=RoundType.half_even,
            scale_type=ScaleType.float,
            scale_format=None,
            scale_calculation_mode=None,
            qat_spec=None,
            mx_element_dtype=None,
            zero_point_type=ZeroPointType.int32,
            is_scale_quant=False,
        ),
        output_tensors=QuantizationSpec(
            dtype=Dtype.int8,
            observer_cls=<class 'quark.torch.quantization.observer.observer.PerTensorPowOf2MinMSEObserver'>,
         

In [13]:
# Plug the quantized backbone into the full model so evaluation uses it.
trainer.model.base_model = quantized_model

In [14]:
# Evaluate the PTQ model; as before, keep only the trailing text summary.
*_, summary = trainer.evaluator.evaluate(trainer.model)


[32m
[QUARK-INFO]: Freeze bn_stats.[0m
[32m
[QUARK-INFO]: Disable observer.[0m
[32m
[QUARK-INFO]: Total find: 447 PerTensorPowOf2MinMSEObserver, clear 447 of thems observed tensors[0m
[32m
[QUARK-INFO]: Enable fake quant.[0m
[32m
[QUARK-INFO]: Whether find Droutout: False, change mode to: False[0m
100%|██████████| 157/157 [00:33<00:00,  4.66it/s]

[INFO] Evaluate in main process...



  statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])


Loading and preparing results...
DONE (t=1.55s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=21.15s).
Accumulating evaluation results...
DONE (t=4.34s).


In [15]:
# Show the PTQ summary for comparison with the FP32 baseline.
print(summary)

Average forward time: 4.67 ms, Average NMS time: 0.67 ms, Average inference time: 5.35 ms
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.251
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.428
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.266
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.095
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.280
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.392
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.233
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.359
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.376
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.154
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.418
 Average Recall     (AR) @[ Io

## Perform QAT based on PTQ results

1. Starting from the PTQ results, we perform QAT: the weights and biases are adjusted through training,
which can yield higher accuracy.
2. We adopt the training code from the original YOLO-X Tiny repo and train the model starting from epoch 280. Because of limited development time, and because our work focuses mainly on the Quark FX QAT tool, we tried only a few hyperparameter settings. Unlike the original setup, we use a single GPU for training, which greatly reduces the training complexity. Users can try other hyperparameters to get better results.

### Prepare the Dataloader & Optimizer etc.

In [18]:
from trainer import ModelEMA
from data import DataPrefetcher

In [19]:
batch_size = trainer.args.batch_size

# Augmentation is turned off when the start epoch already lies within the
# final `no_aug_epochs` epochs of the schedule.
first_no_aug_epoch = trainer.max_epoch - trainer.exp.no_aug_epochs
trainer.no_aug = trainer.start_epoch >= first_no_aug_epoch

trainer.train_loader = trainer.exp.get_data_loader(
    batch_size=batch_size,
    no_aug=trainer.no_aug,
    cache_img=None,
)
logging.info("init prefetcher, this might take one minute or less...")
trainer.prefetcher = DataPrefetcher(trainer.train_loader)

# One epoch is one full pass over the training loader.
trainer.max_iter = len(trainer.train_loader)

# The learning rate passed to the scheduler is the per-image rate times the batch size.
scaled_lr = trainer.exp.basic_lr_per_img * batch_size
trainer.lr_scheduler = trainer.exp.get_lr_scheduler(scaled_lr, trainer.max_iter)
trainer.optimizer = trainer.exp.get_optimizer(batch_size)

# Optionally keep an EMA copy of the model, with its update counter set as if
# `start_epoch` full epochs had already been run.
if trainer.use_model_ema:
    trainer.ema_model = ModelEMA(trainer.model, trainer.args.ema_decay)
    trainer.ema_model.updates = trainer.max_iter * trainer.start_epoch

loading annotations into memory...


Done (t=15.37s)
creating index...
index created!
[INFO] init prefetcher, this might take one minute or less...


### Perform training to further improve accuracy
NOTE: We train for only one epoch for demonstration.

In [20]:
logging.info("Training start...")
# Uncomment to dump the full (quantized) model structure to the log:
# logging.info("\n{}".format(trainer.model))
# NOTE(review): 280 duplicates the `--start_epoch` default; presumably it should
# track `trainer.start_epoch` -- confirm before changing.
trainer.epoch = 280
# The log shows the 1-based epoch number (281 here).
logging.info("---> start train epoch{}".format(trainer.epoch + 1))


[INFO] Training start...
[INFO] ---> start train epoch281


**NOTE**: in the function `train_in_iter`:
  1. We disable the observers, so the quantization scales do not change during training;
  2. Empirically, we found that disabling the `bn` statistics update during training gives higher accuracy.

In [21]:
# Run the training iterations of the current epoch (QAT fine-tuning).
trainer.train_in_iter()

[32m
[QUARK-INFO]: Enable update bn_stats.[0m
[32m
[QUARK-INFO]: Enable observer.[0m
[32m
[QUARK-INFO]: Enable fake quant.[0m
[32m
[QUARK-INFO]: Whether find Droutout: False, change mode to: True[0m
[32m
[QUARK-INFO]: Disable observer.[0m
[32m
[QUARK-INFO]: Enable fake quant.[0m
[32m
[QUARK-INFO]: Freeze bn_stats.[0m
  with torch.cuda.amp.autocast(enabled=self.amp_training):
  with torch.cuda.amp.autocast(enabled=False):
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


[INFO] epoch: 281/300, iter: 10/1849, gpu mem: 8149Mb, mem: 106.9Gb, iter_time: 1.577s, data_time: 0.002s, total_loss: 6.3, iou_loss: 2.5, l1_loss: 0.0, conf_loss: 2.7, cls_loss: 1.1, lr: 1.078e-04, size: 416, ETA: 16:11:53
[INFO] epoch: 281/300, iter: 20/1849, gpu mem: 8149Mb, mem: 106.9Gb, iter_time: 1.400s, data_time: 0.005s, total_loss: 6.2, iou_loss: 2.6, l1_loss: 0.0, conf_loss: 2.5, cls_loss: 1.2, lr: 1.078e-04, size: 384, ETA: 15:16:56
[INFO] epoch: 281/300, iter: 30/1849, gpu mem: 8149Mb, mem: 107.1Gb, iter_time: 1.392s, data_time: 0.003s, total_loss: 6.0, iou_loss: 2.5, l1_loss: 0.0, conf_loss: 2.3, cls_loss: 1.2, lr: 1.077e-04, size: 352, ETA: 14:56:51
[INFO] epoch: 281/300, iter: 40/1849, gpu mem: 8151Mb, mem: 107.2Gb, iter_time: 1.514s, data_time: 0.003s, total_loss: 5.9, iou_loss: 2.5, l1_loss: 0.0, conf_loss: 2.4, cls_loss: 1.0, lr: 1.077e-04, size: 416, ETA: 15:05:29
[INFO] epoch: 281/300, iter: 50/1849, gpu mem: 12124Mb, mem: 107.2Gb, iter_time: 1.758s, data_time: 0.00

### Evaluate the model

To simplify, we directly load the fine-tuned weights to test the accuracy.

In [16]:
# Load the fine-tuned (QAT) weights saved by an earlier training run.
# NOTE(review): the run directory name below is specific to that run; point it
# at your own `best_ckpt.pth` under the output directory.
finetuned_ckpt_path = "./YOLOX_outputs/yolo_x_tiny_exp_3031/best_ckpt.pth"

# `map_location` remaps the stored tensors onto the trainer's device, so the
# load does not depend on which device the checkpoint was saved from. This
# matches how the trainer loads the pre-trained checkpoint.
finetuned_ckpt = torch.load(finetuned_ckpt_path, map_location=trainer.device)

# Kept as the last expression so the cell still displays the key-matching result.
trainer.model.load_state_dict(finetuned_ckpt["model"])

  trainer.model.load_state_dict(torch.load("./YOLOX_outputs/yolo_x_tiny_exp_3031/best_ckpt.pth")['model'])


<All keys matched successfully>

In [17]:
# Evaluate the model with the fine-tuned weights; keep only the trailing text summary.
*_, summary = trainer.evaluator.evaluate(trainer.model)

[32m
[QUARK-INFO]: Freeze bn_stats.[0m
[32m
[QUARK-INFO]: Disable observer.[0m
[32m
[QUARK-INFO]: Enable fake quant.[0m
[32m
[QUARK-INFO]: Whether find Droutout: False, change mode to: False[0m
100%|██████████| 157/157 [00:31<00:00,  4.96it/s]

[INFO] Evaluate in main process...





Loading and preparing results...
DONE (t=1.37s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=20.63s).
Accumulating evaluation results...
DONE (t=4.00s).


In [18]:
# Show the post-QAT summary for comparison with the FP32 and PTQ results.
print(summary)

Average forward time: 4.61 ms, Average NMS time: 0.55 ms, Average inference time: 5.16 ms
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.303
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.482
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.325
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.124
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.328
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.469
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.264
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.401
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.420
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.181
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.467
 Average Recall     (AR) @[ Io

### Freeze model & export to onnx


#### Freeze model
For better deployment on AMD NPU devices, we apply several hardware-oriented optimizations (e.g. adjusting the scales and inserting multiply nodes to perform adjustments for the hardware).

In [19]:

# Freeze the quantized backbone (in eval mode) and put the frozen module back
# into the full model for export.
# NOTE(review): `freezeded_model` is a misspelling of "frozen"; the name is kept
# in case other cells reference it.
freezeded_model = trainer.quantizer.freeze(trainer.model.base_model.eval())
trainer.model.base_model = freezeded_model

[32m
[QUARK-INFO]: Freeze bn_stats.[0m
[32m
[QUARK-INFO]: Disable observer.[0m
[32m
[QUARK-INFO]: Enable fake quant.[0m
[32m
[QUARK-INFO]: Whether find Droutout: False, change mode to: False[0m
[32m
[QUARK-INFO]: Freeze model start.[0m
[32m
[QUARK-INFO]: Freeze quantized torch.fx.GraphModule [0m
[33m
[32m
[QUARK-INFO]: Running 1_th pass ConvertClip2ReLUQOPass[0m
[32m
[QUARK-INFO]: Running 2_th pass ApplyConstrain2ConcatQOPass[0m
[32m
[QUARK-INFO]: Node: cat_8 input quantizer: fake_quantizer_66 scale change from 0.03125 to 0.0625[0m
[32m
[QUARK-INFO]: Node: cat_14 input quantizer: fake_quantizer_233 scale change from 0.03125 to 0.25[0m
[32m
[QUARK-INFO]: Node: cat_15 input quantizer: fake_quantizer_235 scale change from 0.0078125 to 0.03125[0m
[32m
[QUARK-INFO]: Node: cat_15 input quantizer: fake_quantizer_236 scale change from 0.0078125 to 0.03125[0m
[32m
[QUARK-INFO]: Node: cat_16 input quantizer: fake_quantizer_254 scale change from 0.03125 to 0.25[0m
[32m

#### Export to ONNX

In [20]:
from quark.torch.export.config.config import ExporterConfig, JsonExporterConfig


In [21]:
# Create an exporter that writes its artifacts under ./export_onnx/.
config = ExporterConfig(json_export_config=JsonExporterConfig())
exporter = ModelExporter(config=config, export_dir="./export_onnx/")
# NOTE: for NPU compilation, a batch size of 1 is recommended for better compatibility.
# The example input is a random 1x3x416x416 tensor on the trainer's device.
example_inputs = (torch.rand(1, 3, 416, 416).to(trainer.device), )
# Export the full model (not only `base_model`), tracing it with the single example tensor.
exporter.export_onnx_model(trainer.model, example_inputs[0])

[32m
[QUARK-INFO]: Start exporting quantized onnx model ...[0m


[32m
[QUARK-INFO]: Quantized onnx model exported to export_onnx/quark_model.onnx successfully.[0m


#### Simplify the ONNX model and visualize

In [32]:
from onnxsim import simplify
import onnx

# Load the exported quantized model and run onnx-simplifier on it.
quant_model = onnx.load("./export_onnx/quark_model.onnx")
model_simp, check = simplify(quant_model)

# `simplify` also returns a validation flag; refuse to save a simplified model
# that failed validation instead of silently ignoring the flag.
if not check:
    raise RuntimeError("Simplified ONNX model could not be validated")

onnx.save_model(model_simp, "./export_onnx/sample_quark_model.onnx")

Use `netron` to visualize the model (optional):
```shell
$netron  ./export_onnx/sample_quark_model.onnx
```