# Static Quantization example with `optimum` for `distilbert`

In [None]:
!pip install -r requirements.txt
!pip install git+https://github.com/huggingface/optimum.git@main "onnx" "onnxruntime>=1.9.0" "datasets>=1.2.1" "protobuf==3.20.1"
!pip install evaluate sklearn

## Configurations

Let's define our `model_id` and `dataset_id`, which will be used to statically quantize the model.

In [55]:
from pathlib import Path

# Checkpoint that gets converted to ONNX and quantized
model_id = "mrm8488/distilroberta-finetuned-banking77"
# Dataset used for calibration and for the accuracy evaluation
dataset_id = "banking77"
# Directory that receives the ONNX model files and the tokenizer
onnx_path = Path("onnx")
# Passed as `feature=` when creating the quantizer and the optimizer
task = "sequence-classification"

## Converting the model to onnx

In [43]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# load the vanilla transformers checkpoint and convert it to onnx on the fly
# (`from_transformers=True`); the tokenizer comes from the same checkpoint
model = ORTModelForSequenceClassification.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# save onnx checkpoint and tokenizer side by side in `onnx_path`;
# later cells read `onnx_path / "model.onnx"` from this directory
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

Downloading: 100%|██████████| 5.83k/5.83k [00:00<00:00, 2.86MB/s]
Downloading: 100%|██████████| 319/319 [00:00<00:00, 312kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 878kB/s] 
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 1.18MB/s]
Downloading: 100%|██████████| 5.70k/5.70k [00:00<00:00, 5.98MB/s]
Downloading: 100%|██████████| 256M/256M [00:05<00:00, 47.7MB/s] 


('onnx/tokenizer_config.json',
 'onnx/special_tokens_map.json',
 'onnx/vocab.txt',
 'onnx/added_tokens.json',
 'onnx/tokenizer.json')

## Configuring static quantization

In [44]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# create ORTQuantizer and define quantization configuration
# `is_static=True` selects static quantization, which needs the calibration step below;
# `per_channel=False` turns per-channel quantization off
quantizer = ORTQuantizer.from_pretrained(model_id, feature=task)
qconfig = AutoQuantizationConfig.avx512_vnni(is_static=True, per_channel=False)

## Running Calibration of quantization ranges

In [45]:
import os
from functools import partial
from optimum.onnxruntime.configuration import AutoCalibrationConfig

def preprocess_fn(ex, tokenizer):
    """Apply `tokenizer` to the ``"text"`` entry of `ex` and return the result."""
    text = ex["text"]
    return tokenizer(text)

# Create the calibration dataset: 200 samples from the train split,
# tokenized with the quantizer's own tokenizer via `preprocess_fn`
calibration_dataset = quantizer.get_calibration_dataset(
    dataset_id,
    preprocess_function=partial(preprocess_fn, tokenizer=quantizer.tokenizer),
    num_samples=200,
    dataset_split="train",
)


# Create the calibration configuration containing the parameters related to calibration.
calibration_config = AutoCalibrationConfig.minmax(calibration_dataset)
# Perform the calibration step: computes the activations quantization ranges
# for the ONNX model saved earlier; `ranges` is consumed by the export step
ranges = quantizer.fit(
    dataset=calibration_dataset,
    calibration_config=calibration_config,
    onnx_model_path=onnx_path / "model.onnx",
    operators_to_quantize=qconfig.operators_to_quantize,
)
# remove temp augmented model again (file is expected in the current working directory)
os.remove("augmented_model.onnx")


Using custom data configuration default
Reusing dataset banking77 (/home/ubuntu/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b)
Loading cached shuffled indices for dataset at /home/ubuntu/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b/cache-879d5f95a9fbe7e7.arrow
100%|██████████| 1/1 [00:00<00:00, 68.60ba/s]


## Running quantization with calibrated ranges

In [47]:
# Quantize the exported ONNX model with the calibrated activation ranges and
# write the result next to the original as `model-quantized.onnx`
quantizer.export(
    onnx_model_path=onnx_path / "model.onnx",
    onnx_quantized_model_output_path=onnx_path / "model-quantized.onnx",
    calibration_tensors_range=ranges,
    quantization_config=qconfig,
)

PosixPath('onnx/model-quantized.onnx')

In [48]:
import os

# Compare the on-disk size of both ONNX files, converted from bytes to MB
size = (onnx_path / "model.onnx").stat().st_size / (1024 * 1024)
quantized_model = (onnx_path / "model-quantized.onnx").stat().st_size / (1024 * 1024)

print(f"Model file size: {size:.2f} MB")
print(f"Quantized Model file size: {quantized_model:.2f} MB")

Model file size: 255.68 MB
Quantized Model file size: 134.16 MB


## Test quantized model

In [49]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer

# Load the quantized graph that was exported into `onnx_path`
model = ORTModelForSequenceClassification.from_pretrained(
    onnx_path,
    file_name="model-quantized.onnx",
)

# Text-classification pipeline backed by the quantized model
clx = pipeline(
    "text-classification",
    model=model,
    tokenizer=quantizer.tokenizer,
)

In [50]:
# Smoke test: classify a single example request with the quantized pipeline
clx("Please transfer 500€")

[{'label': 'transfer_fee_charged', 'score': 0.3978312015533447}]

## Evaluate

In [52]:
from transformers import pipeline
import evaluate
from datasets import load_dataset


# Evaluation uses the test split of the same dataset and the `accuracy` metric
test_dataset= load_dataset(dataset_id,split="test")
metric = evaluate.load("accuracy")

def evaluate_model(pipe, dataset, metric, text_column="text", label_column="label"):
    """Score a text-classification pipeline on `dataset` with `metric`.

    Args:
        pipe: Callable pipeline; called with one text, it returns a list whose first
            item is a dict with a ``"label"`` key. Must expose
            ``pipe.model.config.label2id`` to map that label name to a class id.
        dataset: Object with a ``map(fn)`` method whose result can be indexed by
            column name (``"pred"`` / ``"ref"``).
        metric: Object with ``compute(references=..., predictions=...)``.
        text_column: Name of the sample field holding the input text.
        label_column: Name of the sample field holding the reference class id.

    Returns:
        Whatever ``metric.compute`` returns.
    """
    label2id = pipe.model.config.label2id

    def predict(sample):
        prediction = pipe(sample[text_column])
        # The pipeline output is keyed by "label" regardless of how the
        # dataset's label column is named.
        return {"pred": label2id[prediction[0]["label"]], "ref": sample[label_column]}

    # Map over the dataset passed in, not over a notebook-level global.
    res = dataset.map(predict)
    return metric.compute(references=res["ref"], predictions=res["pred"])

# Baseline pipeline built directly from the original checkpoint `model_id`
pipe = pipeline("text-classification",model=model_id)

# Accuracy of the baseline on the test split
evaluate_model(pipe,test_dataset,metric)

Using custom data configuration default
Reusing dataset banking77 (/home/ubuntu/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b)
Couldn't find a directory or a metric named 'accuracy' in this version. It was picked from the master branch on github instead.


original model

In [53]:
# Baseline pipeline from the original checkpoint; `pipe` is reused for the latency benchmark
pipe = pipeline("text-classification",model=model_id)

# Accuracy of the original (non-quantized) model on the test split
evaluate_model(pipe,test_dataset,metric)



100%|██████████| 3080/3080 [00:59<00:00, 51.70ex/s]


{'accuracy': 0.3012987012987013}

quantized model

In [54]:
# Accuracy of the quantized pipeline `clx` on the same test split and metric
evaluate_model(clx,test_dataset,metric)

100%|██████████| 3080/3080 [00:17<00:00, 179.52ex/s]


{'accuracy': 0.2837662337662338}

## Optimize the quantized model

In [96]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# create ORTOptimizer and define optimization configuration
# (the optimizer is created from `model_id`; the ONNX file to optimize is passed to `export`)
optimizer = ORTOptimizer.from_pretrained(model_id, feature=task)
optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations

# apply the optimization configuration to the quantized model and
# write the result to `model-optimized.onnx` in the same directory
optimizer.export(
    onnx_model_path=onnx_path / "model-quantized.onnx",
    onnx_optimized_model_output_path=onnx_path / "model-optimized.onnx",
    optimization_config=optimization_config,
)

2022-06-02 13:11:28.726616837 [W:onnxruntime:, inference_session.cc:1546 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.
Model producer not matched: Expect pytorch, Got onnx.quantize 0.1.0. Please specify correct --model_type parameter.
Attention not fused


PosixPath('onnx/model-optimized.onnx')

In [97]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer

# Load the optimized graph that was written into `onnx_path`
model = ORTModelForSequenceClassification.from_pretrained(
    onnx_path,
    file_name="model-optimized.onnx",
)

# Text-classification pipeline backed by the quantized + optimized model
optimized_clx = pipeline(
    "text-classification",
    model=model,
    tokenizer=quantizer.tokenizer,
)

Before measuring latency, let's extend our input text to a more meaningful sequence length of 128 tokens.

In [85]:
# Short example request; NOTE(review): `payload` is reassigned in the following cell
payload="What do I need to do to get my new card which I have requested 2 weeks ago?"

In [87]:
# Longer message, repeated twice (`*2`), used as the input for the latency benchmark
payload="Hello my name is Philipp. I am getting in touch with you because i didn't get a response from you. What do I need to do to get my new card which I have requested 2 weeks ago? Please help me and answer this email in the next 7 days. Best regards and have a nice weekend "*2
# Number of input ids the tokenizer produces for this payload
len(tokenizer(payload)["input_ids"])

128

In [98]:
from time import perf_counter
import numpy as np 

def measure_latency(pipe, text=None, warmup_runs=10, timed_runs=1000):
    """Time repeated calls of `pipe` on a single input.

    Args:
        pipe: Callable that takes the input text.
        text: Input passed to `pipe`. When ``None``, the notebook-level
            `payload` variable is used, so existing ``measure_latency(pipe)``
            calls keep working.
        warmup_runs: Number of untimed calls made before measuring.
        timed_runs: Number of timed calls; must be at least 1.

    Returns:
        Tuple ``(summary, time_avg_ms)``: a formatted string with the P95,
        average and standard deviation in milliseconds, and the average
        latency in milliseconds.

    Raises:
        ValueError: If `timed_runs` is smaller than 1.
    """
    if timed_runs < 1:
        raise ValueError("timed_runs must be at least 1")
    sample = payload if text is None else text
    # warm up
    for _ in range(warmup_runs):
        _ = pipe(sample)
    # Timed run
    latencies = []
    for _ in range(timed_runs):
        start_time = perf_counter()
        _ = pipe(sample)
        latencies.append(perf_counter() - start_time)
    # Compute run statistics (seconds -> milliseconds)
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies, 95)
    # `\\-` emits the same `+\-` text as before without an invalid escape sequence
    return f"P95 latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\\- {time_std_ms:.2f};", time_avg_ms


# Each result is a (summary string, average latency in ms) tuple
vanilla_model = measure_latency(pipe)
quantized_model = measure_latency(clx)
optimized_model = measure_latency(optimized_clx)

# Speed-up factors relative to the vanilla pipeline, based on average latency
quantization_speedup = round(vanilla_model[1] / quantized_model[1], 2)
optimization_speedup = round(vanilla_model[1] / optimized_model[1], 2)

print(f"Vanilla model: {vanilla_model[0]}")
print(f"Quantized model: {quantized_model[0]}")
print(f"Optimized model: {optimized_model[0]}")
print(f"Improvement through quantization: {quantization_speedup}x")
print(f"Improvement through optimization: {optimization_speedup}x")

Vanilla model: P95 latency (ms) - 46.81796969998686; Average latency (ms) - 45.68 +\- 1.95;
Quantized model: P95 latency (ms) - 38.94509134897817; Average latency (ms) - 26.37 +\- 4.91;
Optimized model: P95 latency (ms) - 24.78116525007863; Average latency (ms) - 24.40 +\- 0.25;
Improvement through quantization: 1.73x
Improvement through optimization: 1.87x
