## Overview
In this guide, we show how to run a PyTorch model with OpenVINO and how to optimize a Vision Transformer (ViT) model through quantization.

1. Setup Development Environment
2. Convert the PyTorch model to ONNX model
3. Apply Bf16 quantization using OpenVINO
4. Apply Int8 quantization using OpenVINO

## Setup Development Environment

In [1]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Convert the PyTorch model to ONNX model

In [2]:
from transformers import ViTImageProcessor, ViTForImageClassification
from PIL import Image
import requests
import torch
from pathlib import Path

# Sample image used only to build an example input for tracing the model
# during the ONNX export.
url = 'https://datasets-server.huggingface.co/assets/beans/--/default/validation/30/image/image.jpg'
# Bound the request and surface HTTP errors directly, instead of hanging
# forever or letting PIL fail later on an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

model_id="nateraw/vit-base-beans"
model_name="vit-base-beans"
onnx_path = Path("onnx")

image_processor = ViTImageProcessor.from_pretrained(model_id)
model = ViTForImageClassification.from_pretrained(model_id)

inputs = image_processor(images=image, return_tensors="pt")

# Export with named input/output tensors; dimension 0 of both is declared
# dynamic under the name 'batch'.
torch.onnx.export(model, inputs["pixel_values"], model_name+'.onnx',
    input_names=["input"], output_names=["output"],
    dynamic_axes={'input': {0:'batch'}, 'output': {0:'batch'}})

print("Convert success!")

  from .autonotebook import tqdm as notebook_tqdm
  if num_channels != self.num_channels:
  if height != self.image_size[0] or width != self.image_size[1]:


Convert success!


## Apply Bf16 quantization using OpenVINO

### Test the performance (latency) of quantized model 

In [3]:
import numpy as np
import openvino.runtime as ov
from time import perf_counter
import numpy as np
from PIL import Image
import requests
from datasets import load_dataset

# Same sample image URL as the one used for the ONNX export earlier in the notebook.
payload="https://datasets-server.huggingface.co/assets/beans/--/default/validation/30/image/image.jpg"
# Bound the request and raise on HTTP errors so a failed download is reported
# as such rather than as an unreadable-image error from PIL.
payload_response = requests.get(payload, stream=True, timeout=30)
payload_response.raise_for_status()
image = Image.open(payload_response.raw)

def measure_latency(model, inputs, warmup=10, runs=1000):
    """Time repeated synchronous calls of ``model(inputs)``.

    Args:
        model: Callable invoked as ``model(inputs)``; its result is discarded.
        inputs: Object passed unchanged to every call.
        warmup: Number of untimed calls made before measuring.
        runs: Number of timed calls; must be at least 1.

    Returns:
        A ``(summary, p95_ms)`` tuple: a printable string with the P95 latency
        plus the mean and standard deviation, and the P95 latency in
        milliseconds.

    Raises:
        ValueError: If ``runs`` is smaller than 1 (no statistics can be computed).
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    # Warm up: these calls are not included in the statistics.
    for _ in range(warmup):
        model(inputs)

    # Timed run
    latencies = []
    for _ in range(runs):
        start_time = perf_counter()
        model(inputs)
        latencies.append(perf_counter() - start_time)

    # perf_counter() reports seconds; convert the statistics to milliseconds.
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies, 95)

    # The backslash is escaped explicitly: the printed text is unchanged, but
    # the literal no longer relies on an invalid escape sequence.
    summary = (
        f"P95 latency (ms) - {time_p95_ms}; "
        f"Average latency (ms) - {time_avg_ms:.2f} +\\- {time_std_ms:.2f};"
    )
    return summary, time_p95_ms

core = ov.Core()
onnx_file = f"{model_name}.onnx"

# Compile the same ONNX file twice; only the CPU precision hint differs.
# NOTE(review): nothing in this cell resets the hint afterwards, so later
# compile_model() calls on `core` presumably still see "bf16" - confirm.
core.set_property("CPU", {"INFERENCE_PRECISION_HINT": "f32"})
fp32_model = core.compile_model(onnx_file, "AUTO")
core.set_property("CPU", {"INFERENCE_PRECISION_HINT": "bf16"})
bf16_model = core.compile_model(onnx_file, "AUTO")

# Preprocess the sample image and wrap the resulting array in an OpenVINO tensor.
inputs = image_processor(image, return_tensors="pt")
ov_inputs = inputs["pixel_values"].numpy()
input_tensor = ov.Tensor(array=ov_inputs, shape=[1, 3, 224, 224])

print("benchmark with models:")
rtn_fp32_model = measure_latency(fp32_model, input_tensor)
rtn_bf16_model = measure_latency(bf16_model, input_tensor)

# Each result is a (summary string, P95 latency in ms) pair.
fp32_summary, fp32_p95_ms = rtn_fp32_model
bf16_summary, bf16_p95_ms = rtn_bf16_model

print(f"fp32_model: {fp32_summary}")
print(f"bf16_model: {bf16_summary}")
print(f"Improvement through quantization: {round(fp32_p95_ms/bf16_p95_ms, 2)}x")


benchmark with models:
fp32_model: P95 latency (ms) - 16.660605580545962; Average latency (ms) - 15.12 +\- 1.59;
bf16_model: P95 latency (ms) - 8.549499016953632; Average latency (ms) - 8.00 +\- 1.64;
Improvement through quantization: 1.95x


### Test the accuracy of quantized model

In [4]:
from sklearn.metrics import accuracy_score
import numpy as np
import os
from datasets import load_dataset

# Test split of the beans dataset, used for every accuracy measurement below.
eval_dataset = load_dataset("beans", split="test")

def compute_metrics(eval_pred):
    """Compute classification accuracy from raw model outputs.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores along the last axis; ``labels`` holds the
            ground-truth class ids.

    Returns:
        ``{"accuracy": <fraction of samples whose argmax matches the label>}``.
    """
    predictions, labels = eval_pred
    predicted_classes = np.argmax(predictions, axis=-1)
    # scikit-learn documents the signature as accuracy_score(y_true, y_pred);
    # pass the ground truth first so the call matches the API contract.
    return dict(accuracy=accuracy_score(labels, predicted_classes))

def predict(model, image):
    """Preprocess ``image`` with the notebook's ``image_processor`` and run ``model`` on it.

    Args:
        model: Callable invoked with a single OpenVINO tensor.
        image: Image accepted by ``image_processor``.

    Returns:
        Whatever ``model`` returns for the preprocessed tensor.
    """
    pixel_values = image_processor(image, return_tensors="pt")["pixel_values"]
    # Keep the NumPy array in a local so it outlives the tensor built on top of it.
    pixel_array = pixel_values.numpy()
    return model(ov.Tensor(array=pixel_array, shape=[1, 3, 224, 224]))

# Read each column once. The original indexed eval_dataset["image"] inside the
# comprehension, i.e. once per sample, which with Hugging Face `datasets` can
# mean materialising the whole image column on every iteration.
eval_images = eval_dataset["image"]
eval_labels = eval_dataset["labels"]
size = len(eval_images)

# For every evaluation image keep the model's first output; pair the list with
# the ground-truth labels in the (predictions, labels) form compute_metrics expects.
fp32_output = fp32_model.output(0)
bf16_output = bf16_model.output(0)
fp32_eval_pred = ([predict(fp32_model, img)[fp32_output] for img in eval_images], eval_labels)
bf16_eval_pred = ([predict(bf16_model, img)[bf16_output] for img in eval_images], eval_labels)

fp32_accuracy = compute_metrics(fp32_eval_pred)
bf16_accuracy = compute_metrics(bf16_eval_pred)

print(f"fp32_accuracy: {fp32_accuracy['accuracy']*100:.2f}%")
print(f"bf16_accuracy: {bf16_accuracy['accuracy']*100:.2f}%")
print(f"The quantized model achieves {round(bf16_accuracy['accuracy']/fp32_accuracy['accuracy'],4)*100:.2f}% accuracy of the fp32 model")

Found cached dataset beans (/home/marvin/.cache/huggingface/datasets/beans/default/0.0.0/90c755fb6db1c0ccdad02e897a37969dbf070bed3755d4391e269ff70642d791)
100%|██████████| 1/1 [00:00<00:00, 725.41it/s]


fp32_accuracy: 96.88%
bf16_accuracy: 96.88%
The quantized model achieves 100.00% accuracy of the fp32 model


## Apply Int8 quantization using OpenVINO



### Prepare the calibration dataset

In [5]:

import os
from pathlib import Path
import warnings

import torch
from torchvision import transforms as T
from torchvision.datasets import CIFAR10

import matplotlib.pyplot as plt
import numpy as np

from openvino.runtime import Core, Tensor

# Suppress all warnings from here on.
warnings.filterwarnings("ignore")

# Names and sizes used by the INT8 flow
MODEL_NAME = 'vit-base-beans'
CALIB_SET_SIZE = 300

# Working directories (relative to the notebook's working directory)
MODEL_DIR = 'model'
CALIB_DIR = 'calib'
CIFAR_DIR = 'data/datasets/beans'

# Create every working directory up front; existing ones are left untouched.
for _directory in (MODEL_DIR, CALIB_DIR, CIFAR_DIR):
    os.makedirs(_directory, exist_ok=True)

Download the dataset

In [6]:
import cv2
from datasets import load_dataset

ds = load_dataset('beans')['train']
lbs = ds['labels']

# Running number used to give every saved file a unique name.
_index = 0
# Remaining quota of images to save per class id: 3 classes x 100 images,
# i.e. 300 files in total (the value of CALIB_SET_SIZE).
_label_index = [100, 100, 100]

for info in ds:
    # Once every quota is used up there is nothing left to save, so stop
    # instead of walking through the rest of the training split.
    if not any(_label_index):
        break
    im = info["image"]
    label = info["labels"]
    if _label_index[label] > 0:
        # Store the image at 224x224 as '<label>_<running index>.jpg'.
        im = im.resize((224, 224))
        im.save(Path(CALIB_DIR) / f'{label}_{_index}.jpg')
        _label_index[label] = _label_index[label] - 1
        _index = _index + 1

Found cached dataset beans (/home/marvin/.cache/huggingface/datasets/beans/default/0.0.0/90c755fb6db1c0ccdad02e897a37969dbf070bed3755d4391e269ff70642d791)
100%|██████████| 3/3 [00:00<00:00, 1218.92it/s]


In [7]:
# ONNX file exported earlier, and the IR file names (same stem, .xml/.bin)
# expected inside MODEL_DIR.
onnx_model_path = Path('vit-base-beans.onnx')
ir_model_xml = Path(MODEL_DIR, f'{onnx_model_path.stem}.xml')
ir_model_bin = Path(MODEL_DIR, f'{onnx_model_path.stem}.bin')

Now, convert this model into the OpenVINO IR using Model Optimizer:



In [8]:
# Run Model Optimizer (`mo`) on the exported ONNX file and write the resulting
# OpenVINO IR files into MODEL_DIR.
!mo -m $onnx_model_path  --output_dir $MODEL_DIR


[ INFO ] The model was converted to IR v11, the latest model format that corresponds to the source DL framework input/output format. While IR v11 is backwards compatible with OpenVINO Inference Engine API v1.0, please use API v2.0 (as of 2022.1) to take advantage of the latest improvements in IR v11.
Find more information about API v2.0 and IR v11 at https://docs.openvino.ai/latest/openvino_2_0_transition_guide.html
[ SUCCESS ] Generated IR version 11 model.
[ SUCCESS ] XML file: /home/marvin/workspace/greennet/vit/model/vit-base-beans.xml
[ SUCCESS ] BIN file: /home/marvin/workspace/greennet/vit/model/vit-base-beans.bin


Compress the model with the following command:

`pot -q default -m <path_to_xml> -w <path_to_bin> --engine simplified --data-source <path_to_data>`

In [9]:
# Quantize the IR with POT: default quantization (-q default) through the
# simplified engine, with the images in CALIB_DIR as data source and
# `compressed` as output directory.
# NOTE(review): the simplified engine is only handed a folder of JPEG files, so
# the calibration inputs presumably do not pass through image_processor's
# preprocessing - worth confirming, given the INT8 accuracy reported at the end
# of this notebook.
!pot -q default -m $ir_model_xml -w $ir_model_bin --engine simplified --data-source $CALIB_DIR --output-dir compressed --direct-dump --name $MODEL_NAME

INFO:openvino.tools.pot.app.run:Output log dir: compressed
INFO:openvino.tools.pot.app.run:Creating pipeline:
 Algorithm: DefaultQuantization
 Parameters:
	preset                     : performance
	stat_subset_size           : 300
	target_device              : ANY
	model_type                 : None
	dump_intermediate_model    : False
	inplace_statistics         : True
	exec_log_dir               : compressed
INFO:openvino.tools.pot.data_loaders.image_loader:Layout value is set [N,C,H,W]
INFO:openvino.tools.pot.pipeline.pipeline:Inference Engine version:                2022.3.0-9052-9752fafe8eb-releases/2022/3
INFO:openvino.tools.pot.pipeline.pipeline:Model Optimizer version:                 2022.3.0-9052-9752fafe8eb-releases/2022/3
INFO:openvino.tools.pot.pipeline.pipeline:Post-Training Optimization Tool version: 2022.3.0-9052-9752fafe8eb-releases/2022/3
INFO:openvino.tools.pot.statistics.collector:Start computing statistics for algorithms : DefaultQuantization
INFO:openvino.tools.pot.

### Test the performance (latency) of quantized model


In [10]:
# Expected location of the quantized IR files for MODEL_NAME under the POT
# output directory.
optimized_model_path = Path('compressed') / 'optimized'
optimized_model_xml = optimized_model_path / f'{MODEL_NAME}.xml'
optimized_model_bin = optimized_model_path / f'{MODEL_NAME}.bin'

In [15]:
# Compile the quantized IR on the shared `core` and benchmark it next to the
# fp32 and bf16 models with the same input tensor.
# NOTE(review): `core` last had its CPU precision hint set to "bf16"; this
# compilation presumably inherits that setting - confirm it is intended.
int8_model = core.compile_model(str(optimized_model_xml))

print("benchmark with models:")
rtn_fp32_model = measure_latency(fp32_model, input_tensor)
rtn_bf16_model = measure_latency(bf16_model, input_tensor)
rtn_int8_model = measure_latency(int8_model, input_tensor)

# Each result is a (summary string, P95 latency in ms) pair.
for _tag, _stats in (("fp32", rtn_fp32_model), ("bf16", rtn_bf16_model), ("int8", rtn_int8_model)):
    print(f"{_tag}_model: {_stats[0]}")

# Speed-ups are ratios of P95 latencies relative to the fp32 model.
_fp32_p95_ms = rtn_fp32_model[1]
print(f"Improvement through bf16 quantization: {round(_fp32_p95_ms/rtn_bf16_model[1], 2)}x")
print(f"Improvement through int8 quantization: {round(_fp32_p95_ms/rtn_int8_model[1], 2)}x")

benchmark with models:
fp32_model: P95 latency (ms) - 16.08027223846875; Average latency (ms) - 15.07 +\- 1.23;
bf16_model: P95 latency (ms) - 8.533450739923865; Average latency (ms) - 7.91 +\- 1.04;
int8_model: P95 latency (ms) - 9.595101361628622; Average latency (ms) - 8.52 +\- 0.91;
Improvement through bf16 quantization: 1.88x
Improvement through int8 quantization: 1.68x


In [12]:
# Benchmark the FP32 model (OpenVINO IR) with benchmark_app on the CPU device,
# using the async API and batch size 1.
!benchmark_app -m $ir_model_xml -d CPU -api async -b 1

[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 130.23 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     input (node: input) : f32 / [...] / [?,3,224,224]
[ INFO ] Model outputs:
[ INFO ]     output (node: output) : f32 / [...] / [?,3]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[ INFO ] Reshaping model: 'input': [1,3,224,224]
[ INFO ] Reshape model took 6.33 ms
[Step 6/11] Configuring input of the model
[ INFO ] Model inputs:
[ INFO ]     input (node

In [13]:
# Benchmark the POT-optimized IR with benchmark_app on the CPU device, using
# the async API and batch size 1.
!benchmark_app -m $optimized_model_xml -d CPU -api async -b 1


[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2022.3.0-9052-9752fafe8eb-releases/2022/3
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 74.60 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     input (node: input) : f32 / [...] / [?,3,224,224]
[ INFO ] Model outputs:
[ INFO ]     output (node: output) : f32 / [...] / [?,3]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[ INFO ] Reshaping model: 'input': [1,3,224,224]
[ INFO ] Reshape model took 10.80 ms
[Step 6/11] Configuring input of the model
[ INFO ] Model inputs:
[ INFO ]     input (node

### Test the accuracy of quantized model

In [14]:
# The quantized IR is compiled on a fresh Core instance for the accuracy check.
ie = Core()

int8_model = ie.compile_model(str(optimized_model_xml))

# Read each column once. The original indexed eval_dataset["image"] inside the
# comprehension, i.e. once per sample, which with Hugging Face `datasets` can
# mean materialising the whole image column on every iteration.
eval_images = eval_dataset["image"]
eval_labels = eval_dataset["labels"]
size = len(eval_images)

# For every evaluation image keep the model's first output; pair the list with
# the ground-truth labels in the (predictions, labels) form compute_metrics expects.
fp32_output = fp32_model.output(0)
int8_output = int8_model.output(0)
fp32_eval_pred = ([predict(fp32_model, img)[fp32_output] for img in eval_images], eval_labels)
int8_eval_pred = ([predict(int8_model, img)[int8_output] for img in eval_images], eval_labels)

fp32_accuracy = compute_metrics(fp32_eval_pred)
int8_accuracy = compute_metrics(int8_eval_pred)

print(f"fp32_accuracy: {fp32_accuracy['accuracy']*100:.2f}%")
print(f"int8_accuracy: {int8_accuracy['accuracy']*100:.2f}%")
print(f"The quantized model achieves {round(int8_accuracy['accuracy']/fp32_accuracy['accuracy'],4)*100:.2f}% accuracy of the fp32 model")


fp32_accuracy: 96.88%
int8_accuracy: 42.19%
The quantized model achieves 43.55% accuracy of the fp32 model
