# Static Quantization example with `optimum` for `distilroberta`

In [None]:
!pip install -r requirements.txt
!pip install git+https://github.com/huggingface/optimum.git@main "onnx" "onnxruntime>=1.9.0" "datasets>=1.2.1" "protobuf==3.20.1"
!pip install evaluate sklearn

## Configurations

Let's define the `model_id` and `dataset_id` that will be used to statically quantize the model.

In [2]:
from pathlib import Path

# Hub checkpoint to quantize, and the dataset used for calibration and evaluation
model_id = "mrm8488/distilroberta-finetuned-banking77"
dataset_id = "banking77"
# Local directory that receives the exported ONNX model and the tokenizer files
onnx_path = Path("onnx")
# Feature/task name handed to the optimum quantizer and optimizer
task = "sequence-classification"

## Converting the model to onnx

In [4]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# Load the vanilla transformers checkpoint (converted to ONNX on load) and its tokenizer
model = ORTModelForSequenceClassification.from_pretrained(
    model_id,
    from_transformers=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Write the ONNX checkpoint and the tokenizer into the same directory; the tokenizer
# call stays last so its return value is what the cell displays
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

Downloading: 100%|██████████| 6.59k/6.59k [00:00<00:00, 5.72MB/s]


('onnx/tokenizer_config.json',
 'onnx/special_tokens_map.json',
 'onnx/vocab.json',
 'onnx/merges.txt',
 'onnx/added_tokens.json',
 'onnx/tokenizer.json')

## Configuring static quantization

In [48]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from onnxruntime.quantization import QuantFormat, QuantizationMode, QuantType

# Build the quantizer for the checkpoint/task pair
quantizer = ORTQuantizer.from_pretrained(model_id, feature=task)
# Static quantization config for AVX512-VNNI: QOperator format, QLinearOps mode,
# per-channel quantization disabled
qconfig = AutoQuantizationConfig.avx512_vnni(
    is_static=True,
    format=QuantFormat.QOperator,
    mode=QuantizationMode.QLinearOps,
    per_channel=False,
)

## Running Calibration of quantization ranges

In [49]:
import os
from functools import partial
from optimum.onnxruntime.configuration import AutoCalibrationConfig

def preprocess_fn(ex, tokenizer):
    """Tokenize the ``"text"`` entry of ``ex``, padding to the longest sequence.

    Args:
        ex: Mapping that provides the raw text under the ``"text"`` key.
        tokenizer: Callable invoked as ``tokenizer(text, padding="longest")``.

    Returns:
        Whatever ``tokenizer`` returns for that text.
    """
    text = ex["text"]
    return tokenizer(text, padding="longest")

# Create the calibration dataset: `calibration_samples` examples from the train split,
# tokenized with `preprocess_fn` bound to the quantizer's own tokenizer
calibration_samples=256
calibration_dataset = quantizer.get_calibration_dataset(
    dataset_id,
    preprocess_function=partial(preprocess_fn, tokenizer=quantizer.tokenizer),
    num_samples=calibration_samples,
    dataset_split="train",
)
# Create the calibration configuration containing the parameters related to calibration.
# Percentile calibration with percentile=99.994.
calibration_config = AutoCalibrationConfig.percentiles(calibration_dataset,percentile=99.994)
# Perform the calibration step: computes the activations quantization ranges.
# The dataset is fed in `shards` pieces, one partial_fit call per piece, with a batch size
# of calibration_samples//shards (256//16 = 16).
# NOTE(review): sharding is presumably there to bound memory during calibration — confirm.
shards=16
for i in range(shards):
    shard = calibration_dataset.shard(shards, i)
    quantizer.partial_fit(
        dataset=shard,
        calibration_config=calibration_config,
        onnx_model_path=onnx_path / "model.onnx",
        operators_to_quantize=qconfig.operators_to_quantize,
        batch_size=calibration_samples//shards,
        use_external_data_format=False,
    )
# Turn the statistics accumulated over all partial_fit calls into the final ranges
ranges = quantizer.compute_ranges()

# remove temp augmented model again (expected in the current working directory;
# presumably written during calibration — os.remove raises if it is missing)
os.remove("augmented_model.onnx")


Using custom data configuration default
Reusing dataset banking77 (/home/ubuntu/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b)
Loading cached shuffled indices for dataset at /home/ubuntu/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b/cache-879d5f95a9fbe7e7.arrow
100%|██████████| 1/1 [00:00<00:00, 47.33ba/s]


Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Collecting tensor data and making histogram ...
Finding optimal threshold for each tensor using percentile algorithm ...
Number of tensors : 233
Number of histogram bins : 2048
Percentile : (0.006000000000000227,99.994)


## Running quantization with calibrated ranges

In [50]:
from optimum.onnxruntime.preprocessors import QuantizationPreprocessor
from optimum.onnxruntime.preprocessors.passes import (
    ExcludeGeLUNodes,
    ExcludeLayerNormNodes,
    ExcludeNodeAfter,
    ExcludeNodeFollowedBy,
)

# Preprocessor that collects the passes deciding which nodes to exclude from quantization
quantization_preprocessor = QuantizationPreprocessor()

# Passes are registered in the order listed here
exclusion_passes = (
    ExcludeLayerNormNodes(),                  # nodes constituting LayerNorm
    ExcludeGeLUNodes(),                       # nodes constituting GELU
    ExcludeNodeAfter("Add", "Add"),           # residual connection Add nodes
    ExcludeNodeAfter("Gather", "Add"),        # Add nodes following the Gather operator
    ExcludeNodeFollowedBy("Add", "Softmax"),  # Add nodes followed by the Softmax operator
)
for exclusion_pass in exclusion_passes:
    quantization_preprocessor.register_pass(exclusion_pass)

In [51]:
# Apply static quantization to the exported model using the calibrated ranges and `qconfig`;
# the quantized graph is written next to the original as model-quantized.onnx.
# NOTE(review): `quantization_preprocessor` is built in the previous cell but is not passed
# here (the argument below is commented out), so its node exclusions are not applied —
# confirm whether that is intentional.
quantizer.export(
    onnx_model_path=onnx_path / "model.onnx",
    onnx_quantized_model_output_path=onnx_path / "model-quantized.onnx",
    calibration_tensors_range=ranges,
    quantization_config=qconfig,
    # preprocessor=quantization_preprocessor,
)

PosixPath('onnx/model-quantized.onnx')

In [39]:
import os

# On-disk size of the exported and the quantized model, converted from bytes to MB (2**20 bytes)
size = os.path.getsize(onnx_path / "model.onnx") / 2**20
quantized_model = os.path.getsize(onnx_path / "model-quantized.onnx") / 2**20

print(f"Model file size: {size:.2f} MB")
print(f"Quantized Model file size: {quantized_model:.2f} MB")

Model file size: 313.54 MB
Quantized Model file size: 192.03 MB


## Test quantized model

In [40]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer

# Load the statically quantized graph from the export directory
model = ORTModelForSequenceClassification.from_pretrained(
    onnx_path,
    file_name="model-quantized.onnx",
)

# Wrap it in a text-classification pipeline that reuses the quantizer's tokenizer
clx = pipeline(
    "text-classification",
    model=model,
    tokenizer=quantizer.tokenizer,
)

In [41]:
# Smoke test: run the quantized pipeline on a single utterance and display its prediction
clx("Please transfer 500€")

[{'label': 'transfer_into_account', 'score': 0.12022580206394196}]

## Evaluate

In [52]:
from transformers import pipeline
import evaluate
from datasets import load_dataset


# Test split that every model variant is scored on, plus the accuracy metric
test_dataset = load_dataset(dataset_id, split="test")
metric = evaluate.load("accuracy")

def evaluate_model(pipe,dataset,metric,text_column="text",label_column="label"):
    """Score a text-classification pipeline on ``dataset`` with ``metric``.

    Args:
        pipe: Callable pipeline; ``pipe(text)`` must return a list whose first item
            is a dict with a ``"label"`` key. ``pipe.model.config.label2id`` maps
            that label name to the integer class id.
        dataset: Object exposing ``map(fn)``; each sample provides ``text_column``
            and ``label_column``. The mapped result must expose ``"pred"`` and
            ``"ref"`` columns.
        metric: Object exposing ``compute(references=..., predictions=...)``.
        text_column: Name of the sample field holding the input text.
        label_column: Name of the sample field holding the reference class id.

    Returns:
        Whatever ``metric.compute`` returns for the collected predictions.
    """
    label2id = pipe.model.config.label2id

    def _predict(sample):
        # The pipeline output key is always "label"; `label_column` only names the
        # dataset column, so the two must not be conflated.
        top_prediction = pipe(sample[text_column])[0]
        return {"pred": label2id[top_prediction["label"]], "ref": sample[label_column]}

    # Map over the dataset that was passed in, not over a notebook-level global
    res = dataset.map(_predict)
    return metric.compute(references=res["ref"], predictions=res["pred"])



original model

In [20]:
# Baseline: the original transformers checkpoint behind a text-classification pipeline
pipe = pipeline("text-classification",model=model_id)

# Evaluate as the last expression so the baseline accuracy is displayed by this cell
evaluate_model(pipe,test_dataset,metric)



100%|██████████| 3080/3080 [01:05<00:00, 47.28ex/s]


{'accuracy': 0.8961038961038961}

quantized model

In [53]:
# Score the statically quantized pipeline on the test split.
# NOTE(review): the output recorded for this cell (accuracy of about 0.42) is far below the
# roughly 0.90 recorded for the original model in this notebook — investigate before use.
evaluate_model(clx,test_dataset,metric)

100%|██████████| 3080/3080 [00:19<00:00, 161.24ex/s]


{'accuracy': 0.41883116883116883}

## Optimize the quantized model

In [54]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# Optimization settings first: level 99 enables all optimizations
optimization_config = OptimizationConfig(optimization_level=99)
# Optimizer for the same checkpoint/task pair
optimizer = ORTOptimizer.from_pretrained(model_id, feature=task)

# Optimize the quantized graph and write the result next to it
optimizer.export(
    onnx_model_path=onnx_path / "model-quantized.onnx",
    onnx_optimized_model_output_path=onnx_path / "model-optimized.onnx",
    optimization_config=optimization_config,
)

2022-06-02 15:37:08.900401438 [W:onnxruntime:, inference_session.cc:1546 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.


PosixPath('onnx/model-optimized.onnx')

In [14]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer

# Load the optimized (and quantized) graph from the export directory
model = ORTModelForSequenceClassification.from_pretrained(
    onnx_path,
    file_name="model-optimized.onnx",
)

# Text-classification pipeline around it, reusing the quantizer's tokenizer
optimized_clx = pipeline(
    "text-classification",
    model=model,
    tokenizer=quantizer.tokenizer,
)

Before measuring latency, let's extend our input text to a more meaningful sequence length of roughly 128 tokens.

In [15]:
# Short example request; `payload` is reassigned to a longer text in the latency cell below
payload="What do I need to do to get my new card which I have requested 2 weeks ago?"

127

In [12]:
from time import perf_counter
import numpy as np 

# Benchmark input: the same message repeated twice to reach a longer sequence
payload = "Hello my name is Philipp. I am getting in touch with you because i didn't get a response from you. What do I need to do to get my new card which I have requested 2 weeks ago? Please help me and answer this email in the next 7 days. Best regards and have a nice weekend " * 2
# Report how many token ids the tokenizer produces for it
payload_ids = tokenizer(payload)["input_ids"]
print(f'Payload sequence length: {len(payload_ids)}')

def measure_latency(pipe, text=None, warmup_runs=10, timed_runs=300):
    """Time repeated calls of ``pipe`` on one input and summarise the latencies.

    Args:
        pipe: Callable invoked as ``pipe(text)``.
        text: Input handed to ``pipe``. When ``None`` (the default) the
            notebook-level ``payload`` variable is used.
        warmup_runs: Number of untimed calls made before measuring.
        timed_runs: Number of timed calls; must be at least 1.

    Returns:
        A ``(summary, time_avg_ms)`` tuple: a human-readable string with the P95,
        mean and standard deviation in milliseconds, and the mean on its own.

    Raises:
        ValueError: If ``timed_runs`` is smaller than 1.
    """
    if timed_runs < 1:
        raise ValueError("timed_runs must be at least 1")
    sample = payload if text is None else text

    # warm up
    for _ in range(warmup_runs):
        pipe(sample)

    # Timed run
    latencies = []
    for _ in range(timed_runs):
        start_time = perf_counter()
        pipe(sample)
        latencies.append(perf_counter() - start_time)

    # Compute run statistics (seconds -> milliseconds)
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies, 95)
    # The backslash is escaped so the literal stays valid on Python versions that warn
    # about unknown escape sequences; the rendered text is unchanged.
    summary = f"P95 latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\\- {time_std_ms:.2f};"
    return summary, time_avg_ms


Payload sequence length: 127


In [17]:


# Benchmark the three pipelines in the same order: baseline, quantized, quantized + optimized
vanilla_model = measure_latency(pipe)
quantized_model = measure_latency(clx)
optimized_model = measure_latency(optimized_clx)

# Each result is a (summary string, mean latency in ms) pair
vanilla_summary, vanilla_avg_ms = vanilla_model
quantized_summary, quantized_avg_ms = quantized_model
optimized_summary, optimized_avg_ms = optimized_model

print(f"Vanilla model: {vanilla_summary}")
print(f"Quantized model: {quantized_summary}")
print(f"Optimized model: {optimized_summary}")
# Speed-up factors relative to the baseline mean latency
print(f"Improvement through quantization: {round(vanilla_avg_ms/quantized_avg_ms,2)}x")
print(f"Improvement through optimization: {round(vanilla_avg_ms/optimized_avg_ms,2)}x")

Vanilla model: P95 latency (ms) - 66.20852575069875; Average latency (ms) - 56.69 +\- 9.12;
Quantized model: P95 latency (ms) - 24.14308940024057; Average latency (ms) - 23.89 +\- 0.34;
Optimized model: P95 latency (ms) - 23.938274949250626; Average latency (ms) - 23.73 +\- 0.65;
Improvement through quantization: 2.37x
Improvement through optimization: 2.39x


## Dynamic quantization counterpart



In [7]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer



# Separate output directory so the dynamic-quantization artifacts do not overwrite the static ones
dynamic_onnx_path=Path("dynamic_onnx")

# load vanilla transformers and convert to onnx
model = ORTModelForSequenceClassification.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# save onnx checkpoint and tokenizer
model.save_pretrained(dynamic_onnx_path)
tokenizer.save_pretrained(dynamic_onnx_path)

# create ORTOptimizer and define optimization configuration
dynamic_optimizer = ORTOptimizer.from_pretrained(model_id, feature=task)
dynamic_optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations

# apply the optimization configuration to the model
# Read the model that was just saved to `dynamic_onnx_path`, so this section does not
# depend on the export directory of the static-quantization section.
dynamic_optimizer.export(
    onnx_model_path=dynamic_onnx_path / "model.onnx",
    onnx_optimized_model_output_path=dynamic_onnx_path / "model-optimized.onnx",
    optimization_config=dynamic_optimization_config,
)

# create ORTQuantizer and define quantization configuration
# (dynamic: is_static=False, so no calibration step is run here)
dynamic_quantizer = ORTQuantizer.from_pretrained(model_id, feature=task)
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)

# apply the quantization configuration to the optimized model
dynamic_quantizer.export(
    onnx_model_path=dynamic_onnx_path / "model-optimized.onnx",
    onnx_quantized_model_output_path=dynamic_onnx_path / "model-quantized.onnx",
    quantization_config=dqconfig,
)

import os
# get model file size (bytes -> MB)
size = os.path.getsize(dynamic_onnx_path / "model.onnx")/(1024*1024)
print(f"Vanilla Onnx Model file size: {size:.2f} MB")
size = os.path.getsize(dynamic_onnx_path / "model-quantized.onnx")/(1024*1024)
print(f"Quantized Onnx Model file size: {size:.2f} MB")

from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer

# load the dynamically quantized graph and wrap it in a pipeline
model = ORTModelForSequenceClassification.from_pretrained(dynamic_onnx_path,file_name="model-quantized.onnx")

dynamic_clx = pipeline("text-classification",model=model, tokenizer=dynamic_quantizer.tokenizer)

# last expression: the accuracy of the dynamically quantized model is the cell output
evaluate_model(dynamic_clx,test_dataset,metric)


Downloading: 100%|██████████| 6.59k/6.59k [00:00<00:00, 5.14MB/s]
2022-06-02 13:46:50.475389962 [W:onnxruntime:, inference_session.cc:1546 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.


Vanilla Onnx Model file size: 313.54 MB
Quantized Onnx Model file size: 222.52 MB


100%|██████████| 3080/3080 [00:19<00:00, 159.13ex/s]


{'accuracy': 0.8951298701298701}

In [14]:
# Latency summary string (element 0 of the result) for the dynamically quantized pipeline
measure_latency(dynamic_clx)[0]


'P95 latency (ms) - 39.63735170000291; Average latency (ms) - 39.48 +\\- 0.45;'