# Static Quantization example with `optimum` for `distilbert`

In [4]:
!pip install -r requirements.txt
!pip install protobuf==3.20.1



Collecting protobuf==3.20.1
  Downloading protobuf-3.20.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.21.1
    Uninstalling protobuf-4.21.1:
      Successfully uninstalled protobuf-4.21.1
Successfully installed protobuf-3.20.1


## Configurations

Let's define our `model_id` and `dataset_id`, which will be used to statically quantize the model.

In [2]:
# Model checkpoint and dataset identifiers used throughout the notebook.
model_id = "philschmid/DistilBERT-Banking77"
dataset_id = "banking77"
# Passed as the `feature` argument when the ORTQuantizer is created.
task = "sequence-classification"

In [3]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# Build the quantizer for the configured checkpoint and task.
quantizer = ORTQuantizer.from_pretrained(
    model_id,
    feature=task,
)
# Static (calibration-based) quantization settings from the AVX512-VNNI preset,
# with per-channel quantization turned off.
qconfig = AutoQuantizationConfig.avx512_vnni(
    per_channel=False,
    is_static=True,
)

Downloading:   0%|          | 0.00/318 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.83k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/251M [00:00<?, ?B/s]

In [6]:
from functools import partial


def preprocess_fn(ex, tokenizer):
    """Tokenize the ``"text"`` field of a dataset example.

    Args:
        ex: Mapping that holds the raw input under the ``"text"`` key.
        tokenizer: Callable applied to that text.

    Returns:
        Whatever ``tokenizer`` returns for the text.
    """
    text = ex["text"]
    return tokenizer(text)

# Bind the quantizer's tokenizer so the dataset only has to supply each example.
tokenize_example = partial(preprocess_fn, tokenizer=quantizer.tokenizer)

# Request 200 samples from the train split to serve as calibration data.
calibration_dataset = quantizer.get_calibration_dataset(
    dataset_id,
    num_samples=200,
    dataset_split="train",
    preprocess_function=tokenize_example,
)

Downloading builder script:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset banking77/default (download: 1.03 MiB, generated: 897.51 KiB, post-processed: Unknown size, total: 1.91 MiB) to /home/ubuntu/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b...


Downloading data:   0%|          | 0.00/158k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3080 [00:00<?, ? examples/s]

Dataset banking77 downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
from optimum.onnxruntime.configuration import AutoCalibrationConfig

# Calibration settings: the min-max preset, built from the calibration dataset.
calibration_config = AutoCalibrationConfig.minmax(calibration_dataset)

# Run the calibration step on "model.onnx"; the returned activation
# quantization ranges are kept for the export step.
ranges = quantizer.fit(
    onnx_model_path="model.onnx",
    dataset=calibration_dataset,
    calibration_config=calibration_config,
    operators_to_quantize=qconfig.operators_to_quantize,
)

2022-06-01 19:30:29.382279202 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_205_ReduceMax
2022-06-01 19:30:29.382325915 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_205_ReduceMin
2022-06-01 19:30:29.384088828 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output input.24_ReduceMax
2022-06-01 19:30:29.384116558 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output input.24_ReduceMin
2022-06-01 19:30:29.385124997 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_320_ReduceMax
2022-06-01 19:30:29.385150792 [W:on

2022-06-01 19:30:29.568954963 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_676_ReduceMax
2022-06-01 19:30:29.568981862 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_676_ReduceMin
2022-06-01 19:30:29.569002009 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_466_ReduceMax
2022-06-01 19:30:29.569010442 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_466_ReduceMin
2022-06-01 19:30:29.569019085 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1,1} for output onnx::MatMul_744_ReduceMax
2022-06-01 19:30:29.

2022-06-01 19:30:29.792332244 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Sqrt_696_ReduceMax
2022-06-01 19:30:29.792350081 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Sqrt_696_ReduceMin
2022-06-01 19:30:29.792359020 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_607_ReduceMax
2022-06-01 19:30:29.792373084 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_607_ReduceMin
2022-06-01 19:30:29.792384397 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_668_ReduceMax
2022-06-01 19:30:29.792

2022-06-01 19:30:29.967520758 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_662_ReduceMin
2022-06-01 19:30:29.967529391 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Reshape_620_ReduceMax
2022-06-01 19:30:29.967538366 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Reshape_620_ReduceMin
2022-06-01 19:30:29.967546872 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_573_ReduceMax
2022-06-01 19:30:29.967555668 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_573_ReduceMin
2022-06-01 19:30:

2022-06-01 19:30:30.158669030 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_694_ReduceMin
2022-06-01 19:30:30.158675983 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1,1} for output onnx::MatMul_288_ReduceMax
2022-06-01 19:30:30.158691823 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1,1} for output onnx::MatMul_288_ReduceMin
2022-06-01 19:30:30.158702946 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1,1} for output onnx::Shape_178_ReduceMax
2022-06-01 19:30:30.158713677 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1,1} for output onnx::Shape_178_ReduceMin
2022-06

2022-06-01 19:30:30.412575854 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Sqrt_696_ReduceMax
2022-06-01 19:30:30.412598610 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Sqrt_696_ReduceMin
2022-06-01 19:30:30.412607864 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_607_ReduceMax
2022-06-01 19:30:30.412621597 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_607_ReduceMin
2022-06-01 19:30:30.412633440 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_668_ReduceMax
2022-06-01 19:30:30.412

20IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

2022-06-01 19:30:32.969723625 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_320_ReduceMax
2022-06-01 19:30:32.969737859 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_320_ReduceMin
2022-06-01 19:30:32.971536720 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_445_ReduceMax
2022-06-01 19:30:32.971550950 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSiz

2022-06-01 19:30:33.201383075 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Sqrt_696_ReduceMax
2022-06-01 19:30:33.201410103 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Sqrt_696_ReduceMin
2022-06-01 19:30:33.201421480 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_607_ReduceMax
2022-06-01 19:30:33.201435048 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_607_ReduceMin
2022-06-01 19:30:33.201448321 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_668_ReduceMax
2022-06-01 19:30:33.201

2022-06-01 19:30:33.386310927 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Reshape_392_ReduceMin
2022-06-01 19:30:33.386320737 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_243_ReduceMax
2022-06-01 19:30:33.386329443 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_243_ReduceMin
2022-06-01 19:30:33.386338040 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Reshape_709_ReduceMax
2022-06-01 19:30:33.386347249 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Reshape_709_ReduceMin
2022-06-01 19

2022-06-01 19:30:33.614488339 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Reshape_506_ReduceMax
2022-06-01 19:30:33.614500404 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Reshape_506_ReduceMin
2022-06-01 19:30:33.614511288 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_566_ReduceMax
2022-06-01 19:30:33.614536729 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_566_ReduceMin
2022-06-01 19:30:33.614554899 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Reshape_380_ReduceMax
2022-06-01 19

2022-06-01 19:30:33.836159872 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Sqrt_696_ReduceMax
2022-06-01 19:30:33.836184464 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Sqrt_696_ReduceMin
2022-06-01 19:30:33.836195229 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_607_ReduceMax
2022-06-01 19:30:33.836207858 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_607_ReduceMin
2022-06-01 19:30:33.836216462 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_668_ReduceMax
2022-06-01 19:30:33.836

2022-06-01 19:30:34.026888214 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::MatMul_203_ReduceMax
2022-06-01 19:30:34.026898439 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::MatMul_203_ReduceMin
2022-06-01 19:30:34.026909383 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_694_ReduceMax
2022-06-01 19:30:34.026917294 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1} for output onnx::Add_694_ReduceMin
2022-06-01 19:30:34.026925905 [W:onnxruntime:, execution_frame.cc:806 VerifyOutputSizes] Expected shape from model of {} does not match actual shape of {1,1,1,1} for output onnx::MatMul_288_ReduceMax
2022-06-01 19:

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Export the quantized model, passing the calibration ranges and the
# quantization config; the result is written to "model-quantized.onnx".
quantizer.export(
    quantization_config=qconfig,
    calibration_tensors_range=ranges,
    onnx_model_path="model.onnx",
    onnx_quantized_model_output_path="model-quantized.onnx",
)