In [15]:
# BKM from https://www.philschmid.de/optimizing-vision-transformer

!pip install "optimum[onnxruntime]==1.5.0" evaluate[evaluator] sklearn mkl-include mkl --upgrade




In [16]:
from optimum.onnxruntime import ORTModelForImageClassification
from transformers import AutoFeatureExtractor
from pathlib import Path


model_id="nateraw/vit-base-beans"
onnx_path = Path("onnx")

# load vanilla transformers and convert to onnx
model = ORTModelForImageClassification.from_pretrained(model_id, from_transformers=True)
preprocessor = AutoFeatureExtractor.from_pretrained(model_id)

# save onnx checkpoint and tokenizer
model.save_pretrained(onnx_path)
preprocessor.save_pretrained(onnx_path)


Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
  if num_channels != self.num_channels:
  if height != self.image_size[0] or width != self.image_size[1]:


['onnx/preprocessor_config.json']

In [17]:
from transformers import pipeline

vanilla_clf = pipeline("image-classification", model=model, feature_extractor=preprocessor)
print(vanilla_clf("https://datasets-server.huggingface.co/assets/beans/--/default/validation/30/image/image.jpg"))


[{'score': 0.9579877257347107, 'label': 'healthy'}, {'score': 0.021719953045248985, 'label': 'bean_rust'}, {'score': 0.020292332395911217, 'label': 'angular_leaf_spot'}]


In [18]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# create ORTQuantizer and define quantization configuration
dynamic_quantizer = ORTQuantizer.from_pretrained(model)
dqconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)

# apply the quantization configuration to the model
model_quantized_path = dynamic_quantizer.quantize(
    save_dir=onnx_path,
    quantization_config=dqconfig,
)


In [19]:
import os

# get model file size
size = os.path.getsize(onnx_path / "model.onnx")/(1024*1024)
quantized_model = os.path.getsize(onnx_path / "model_quantized.onnx")/(1024*1024)

print(f"Model file size: {size:.2f} MB")
print(f"Quantized Model file size: {quantized_model:.2f} MB")
#   Model file size: 330.27 MB
#   Quantized Model file size: 84.50 MB


Model file size: 327.56 MB
Quantized Model file size: 84.76 MB


In [20]:
from optimum.onnxruntime import ORTModelForImageClassification
from transformers import pipeline, AutoFeatureExtractor

model = ORTModelForImageClassification.from_pretrained(onnx_path, file_name="model_quantized.onnx")
preprocessor = AutoFeatureExtractor.from_pretrained(onnx_path)

q8_clf = pipeline("image-classification", model=model, feature_extractor=preprocessor)
print(q8_clf("https://datasets-server.huggingface.co/assets/beans/--/default/validation/30/image/image.jpg"))


[{'score': 0.9565075039863586, 'label': 'healthy'}, {'score': 0.022508859634399414, 'label': 'bean_rust'}, {'score': 0.020983604714274406, 'label': 'angular_leaf_spot'}]


In [21]:
from evaluate import evaluator
from datasets import load_dataset

e = evaluator("image-classification")
eval_dataset = load_dataset("beans",split=["test"])[0]

results = e.compute(
    model_or_pipeline=q8_clf,
    data=eval_dataset,
    metric="accuracy",
    input_column="image",
    label_column="labels",
    label_mapping=model.config.label2id,
    strategy="simple",
)

print(f"Vanilla model: 96.88%")
print(f"Quantized model: {results['accuracy']*100:.2f}%")
print(f"The quantized model achieves {round(results['accuracy']/0.9688,4)*100:.2f}% accuracy of the fp32 model")

#    Vanilla model: 96.88%
#    Quantized model: 96.88%
#    The quantized model achieves 99.99% accuracy of the fp32 model




  0%|          | 0/1 [00:00<?, ?it/s]

Vanilla model: 96.88%
Quantized model: 96.88%
The quantized model achieves 99.99% accuracy of the fp32 model


In [22]:
from time import perf_counter
import numpy as np
from PIL import Image
import requests

payload="https://datasets-server.huggingface.co/assets/beans/--/default/validation/30/image/image.jpg"

def measure_latency(pipe):
    # prepare date
    image = Image.open(requests.get(payload, stream=True).raw)
    inputs = pipe.feature_extractor(images=image, return_tensors="pt")
    latencies = []
    # warm up
    for _ in range(10):
        _ = pipe.model(**inputs)
    # Timed run
    for _ in range(200):
        start_time = perf_counter()
        _ =  pipe.model(**inputs)
        latency = perf_counter() - start_time
        latencies.append(latency)
    # Compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies,95)
    return f"P95 latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\- {time_std_ms:.2f};", time_p95_ms


vanilla_model=measure_latency(vanilla_clf)
quantized_model=measure_latency(q8_clf)

print(f"Vanilla model: {vanilla_model[0]}")
print(f"Quantized model: {quantized_model[0]}")
print(f"Improvement through quantization: {round(vanilla_model[1]/quantized_model[1],2)}x")

#    Vanilla model: P95 latency (ms) - 165.06651640004284; Average latency (ms) - 149.00 +\- 11.22;
#    Quantized model: P95 latency (ms) - 63.56140074997256; Average latency (ms) - 62.81 +\- 2.18;
#    Improvement through quantization: 2.6x


Vanilla model: P95 latency (ms) - 43.17981459462317; Average latency (ms) - 39.07 +\- 1.96;
Quantized model: P95 latency (ms) - 23.400302302616183; Average latency (ms) - 23.07 +\- 0.26;
Improvement through quantization: 1.85x
