# Exporting & optimiazing 🤗 transformers model to ONNX

* [Example from transformers](https://github.com/huggingface/transformers/blob/master/notebooks/04-onnx-export.ipynb)


Under the hood the process is sensibly the following:

converting:
1. Allocate the model from transformers (PyTorch or TensorFlow)
2. Forward dummy inputs through the model this way ONNX can record the set of operations executed
3. Optionally define dynamic axes on input and output tensors
4. Save the graph along with the network parameters

In [1]:
model_id="bert-base-cased-finetuned-mrpc"
pipeline="sentiment-analysis"
save_path="onnx"
max_length=128
opset_version=11
export_model_path=f"{save_path}/{model_id}.onnx"

In [3]:
from os import environ
from psutil import cpu_count

# Constants from the performance optimization available in onnxruntime
# It needs to be done before importing onnxruntime
environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
environ["OMP_WAIT_POLICY"] = 'ACTIVE'

from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers

class OnnxModel:
    def __init__(self, model_path: str, provider: str):
        assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"
        self.options = self._set_options()
        self.model=InferenceSession(model_path, self.options, providers=[provider])
        # Load the model as a graph and prepare the CPU backend 
        self.model.disable_fallback()

    def __call__(self, input):
        return self.model.run(None, input)[0]
  
    def _set_options(self):
        # Few properties that might have an impact on performances (provided by MS)
        options = SessionOptions()
        options.intra_op_num_threads = 1
        options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        return options

# Converting model to onnx

## automatic conversion, build into `transformers`

In [9]:
!rm -rf onnx/
from pathlib import Path
from transformers.convert_graph_to_onnx import convert

# Handles all the above steps for you
convert(model=model_id,
        output=Path(f"transformers/{export_model_path}"),
        pipeline_name=pipeline,
        opset=opset_version,
        framework='pt'
)

ONNX opset version set to: 11
Loading pipeline (model: bert-base-cased-finetuned-mrpc, tokenizer: bert-base-cased-finetuned-mrpc)
Creating folder onnx
Using framework PyTorch: 1.7.1+cpu
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']


## manual conversion with `torch.onnx.export`

1. create sample input
2. `forward()` to get outputs
3. create `input_names` and `output_names`
3. create `dynamic_axes` -> input/output tensors where the shape might change -> basically every input and output.
4. convert model with `torch.onnx.export``

In [34]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.file_utils import ModelOutput

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
# Setup some example inputs
sequence_0 = "The company HuggingFace is based in New York City"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=max_length, padding='longest', truncation=True, return_tensors="pt")

outputs = model(**paraphrase) 

In [35]:
# generate input and output names/keys
input_names = list(paraphrase.keys())
output_names = [f"output_{i}" for i in range(len(outputs))]

# Generate dynamic axes, with batching -> inputs/outputs with potential dynamic shape
symbolic_names = {0: 'batch_size', 1: 'sequence'} #TODO: Exaplain

input_dynamic_axes = {input_key: symbolic_names for input_key in input_names}
output_dynamic_axes = {output_key: {0: 'batch_size'} for output_key in output_names}
dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes)

print(f"created input_names:")
print(input_names)
print(f"created output_names:")
print(output_names)
print(f"created dynamic_axes:")
print(dynamic_axes)

created input_names:
['input_ids', 'token_type_ids', 'attention_mask']
created output_names:
['output_0']
created dynamic_axes:
{'input_ids': {0: 'batch_size', 1: 'sequence'}, 'token_type_ids': {0: 'batch_size', 1: 'sequence'}, 'attention_mask': {0: 'batch_size', 1: 'sequence'}, 'output_0': {0: 'batch_size'}}


In [46]:
import os
import torch

os.makedirs(export_model_path.replace('.onnx',''),exist_ok=True)

device = torch.device("cpu")
model.eval()
model.to(device)

with torch.no_grad():
    torch.onnx.export(model,                               # model being run
                      args=tuple(paraphrase.values()),     # model input (or a tuple for multiple inputs)
                      f=export_model_path,                 # where to save the model (can be a file or file-like object)
                      opset_version=opset_version,         # the ONNX version to export the model to
                      do_constant_folding=True,            # whether to execute constant folding for optimization
                      enable_onnx_checker=True, 
                      use_external_data_format=False,
                      input_names=input_names,             # the model's input names  'input_ids', 'token_type_ids', 'attention_mask'
                      output_names=output_names,           # the model's output names 'output_0'
                      dynamic_axes=dynamic_axes)           # inputs/outputs with potential dynamic shape -> mostly all

print("Model exported at ", export_model_path)

Model exported at  onnx/bert-base-cased-finetuned-mrpc.onnx


# Optimize exported model

Optimizations are basically of three kinds:

1. Constant Folding: Convert static variables to constants in the graph
2. Deadcode Elimination: Remove nodes never accessed in the graph
3. Operator Fusing: Merge multiple instruction into one (Linear -> ReLU can be fused to be LinearReLU)


Optimizer script managed bei onnx: https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/optimizer.py

## Optimizing

the optimize method we have in `transformers` only rely on online optimizations when loading the ONNX graph. Any offline transformations, like `SkipLayerNorm`, `EmbedLayerNorm`, `Attention`, `FastGeLU` might not be applied from the online version

Offline also allows to convert fp32 models to AMP on GPU


### manual offline optimization with `optimizer.optimize_model` 

In [None]:
# all optimization layers
# optimization_options.enable_gelu = False
# optimization_options.enable_layer_norm = False
# optimization_options.enable_attention = False
# optimization_options.enable_skip_layer_norm = False
# optimization_options.enable_embed_layer_norm = False
# optimization_options.enable_bias_skip_layer_norm = False
# optimization_options.enable_bias_gelu = False
# optimization_options.enable_gelu_approximation = True
# optimization_options.use_raw_attention_mask(False)
# optimization_options.disable_attention_mask()

In [37]:
# optimize transformer-based models with onnxruntime-tools
from onnxruntime.transformers import optimizer
from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions

# disable embedding layer norm optimization for better model size reduction
optimized_options = BertOptimizationOptions('bert')
optimized_options.enable_embed_layer_norm = False

optimized_model = optimizer.optimize_model(
    export_model_path,
    'bert', 
    num_heads=12,
    hidden_size=768,
    opt_level=99,
    optimization_options=optimized_options)

optimized_model.save_model_to_file(f"{save_path}/optimized_{model_id}.onnx")

In [41]:
optimized_model = optimizer.optimize_model(export_model_path, opt_level=1, use_gpu=False, only_onnxruntime=True)


In [42]:
import os
print('Size (MB):', os.path.getsize(f"{save_path}/optimized_{model_id}.onnx")/1e6)

Size (MB): 216.647809


### automatic opmtimization, build into `transformers`

In [51]:
from pathlib import Path
from transformers.convert_graph_to_onnx import optimize

optimize(Path(f"transformers/{export_model_path}"))

import os
print('Size (MB):', os.path.getsize(f"transformers/{export_model_path}")/1e6)

Optimized model has been written at transformers/onnx/bert-base-cased-finetuned-mrpc-optimized.onnx: ✔
/!\ Optimized model contains hardware specific operators which might not be portable. /!\
Size (MB): 433.319523


# Quantize

### manual conversion with `quantize_helper.quantize_onnx_model` 

In [55]:
from onnxruntime.transformers.quantize_helper import QuantizeHelper

onnx_model_path = f"transformers/onnx/bert-base-cased-finetuned-mrpc-optimized.onnx"
quantized_model_path = f"transformers/onnx/bert-base-cased-finetuned-mrpc-optimized-quantized.onnx"

QuantizeHelper.quantize_onnx_model(onnx_model_path, quantized_model_path)

         Please use quantize_static for static quantization, quantize_dynamic for dynamic quantization.


### automatic opmtimization, build into `transformers`

In [None]:
from pathlib import Path
from transformers.convert_graph_to_onnx import quantize

quantize(Path(f"transformers/{export_model_path}"))

import os
print('Size (MB):', os.path.getsize(f"transformers/{export_model_path}")/1e6)

## Test inference with exported model

In [52]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

provider = "CPUExecutionProvider"
#onnx_model = OnnxModel(f"{save_path}/optimized_{model_id}.onnx", provider)
onnx_model = OnnxModel(f"transformers/onnx/bert-base-cased-finetuned-mrpc-optimized.onnx", provider)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Setup some example inputs
sequence_0 = "The company HuggingFace is based in New York City"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=max_length, padding='longest', truncation=True, return_tensors="pt")

#### onnx model

In [53]:
inputs_onnx = {k: v.cpu().detach().numpy() for k, v in paraphrase.items()}

outputs = onnx_model(inputs_onnx)
outputs[0].argmax().item()

1

#### pytorch model

In [45]:
model = AutoModelForSequenceClassification.from_pretrained(model_id)

outputs = model(**paraphrase) 
outputs[0][0].argmax().item()

1

#### model size comparison

In [21]:
import os
print('Size (MB):', os.path.getsize(export_model_path)/1e6)


Size (MB): 433.319523


### benchmarking

In [27]:
from transformers import AutoTokenizer
from datasets import load_dataset,load_metric
import time
import torch

task = "mrpc"
split="validation"
all_datasets = load_dataset("glue", task)
metric = load_metric("glue", task)
dataset= all_datasets[split]

tokenizer = AutoTokenizer.from_pretrained(model_id)
max_length=128
padding='longest'

def preprocess_function(examples):
    # Tokenize the texts
    texts = (examples['sentence1'], examples['sentence2'])
    result = tokenizer(*texts, padding=padding, max_length=max_length, truncation=True,return_tensors="pt")
    result["labels"] = examples["label"]
    return result


def do_test(name='',raw_dataset=None,model=None,model_type='',samples=None):
    processed_dataset = raw_dataset.map(preprocess_function)
    processed_dataset = processed_dataset.select(range(samples))
    model_start = time.perf_counter()
    if model_type == 'onnx':
        for step, batch in enumerate(processed_dataset):
            destructed_dict = {'input_ids': torch.tensor(batch['input_ids']),
                               'attention_mask': torch.tensor(batch['attention_mask']),
                               'token_type_ids': torch.tensor(batch['token_type_ids'])
                              }
            inputs_onnx = {k: v.cpu().detach().numpy() for k, v in destructed_dict.items()}
            outputs = model(inputs_onnx)
            predictions = outputs[0].argmax().item()
            metric.add_batch(predictions=[predictions],references=[batch["labels"]])
    else:
        with torch.no_grad():
            for step, batch in enumerate(processed_dataset):
                input_ids = torch.tensor(batch['input_ids'])
                attention_mask = torch.tensor(batch['attention_mask'])
                token_type_ids = torch.tensor(batch['token_type_ids'])
                outputs = model(*[input_ids,attention_mask,token_type_ids])
                predictions = outputs[0][0].argmax().item()
                metric.add_batch(predictions=[predictions],references=[batch["labels"]])
        
    eval_metric = metric.compute()
    model_stop = time.perf_counter()
    total_time = round(model_stop - model_start,4)*1000
    average_time =  round(total_time/len(processed_dataset),4)
    return {'name':name,
            'model_type':model_type,
            **eval_metric,
            'total_time':f"{total_time}ms",
            'average_time':f"{average_time}ms",
            'max_length':max_length,
            'samples': len(processed_dataset),
            'task': task
           }

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [28]:
samples=100
model_res=do_test(name='pytorch',
               raw_dataset=dataset,
               model=model,
               model_type='pytorch',
               samples=samples)
    
               
model_onnx_res=do_test(name='onnx',
               raw_dataset=dataset,
               model=onnx_model,
               model_type='onnx',
               samples=samples)

model_onnx_optimized = do_test(name='onnx_optimized',
                               raw_dataset=dataset,
                               model=optimized_model,
                               model_type='onnx',
                               samples=samples)

import pandas as pd
df = pd.DataFrame([model_res,model_onnx_res])
df.to_csv('first-test.csv')
df.head()



HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))




# Optimize exported model

Optimizations are basically of three kinds:

1. Constant Folding: Convert static variables to constants in the graph
2. Deadcode Elimination: Remove nodes never accessed in the graph
3. Operator Fusing: Merge multiple instruction into one (Linear -> ReLU can be fused to be LinearReLU)


Optimizer script managed bei onnx: https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/optimizer.py

# Scripting

In [2]:

!rm -rf models/ && mkdir models

!cd models && python -m transformers.convert_graph_to_onnx \
        --pipeline {pipeline} \
        --model {model_id} \
        --framework 'pt' \
        --opset 11 \
        --check-loading \
        --quantize \
        "cli"


ONNX opset version set to: 11
Loading pipeline (model: bert-base-cased-finetuned-mrpc, tokenizer: bert-base-cased-finetuned-mrpc)
Using framework PyTorch: 1.7.1+cpu
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']
  position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
  input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors

2021-05-09 19:45:30.883663800 [W:onnxruntime:, inference_session.cc:1256 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED. The generated model may contain hardware and execution provider

In [6]:
cli_export_model_path='models/cli-optimized-quantized'

In [7]:
import os
print('Size (MB):', os.path.getsize(f"{cli_export_model_path}")/1e6)

Size (MB): 110.503015


In [31]:
samples=1000
provider = "CPUExecutionProvider"
optimized_model = OnnxModel(cli_export_model_path, provider)

model_cli_onnx_optimized = do_test(name='onnx_optimized_cli_transformers',
                               raw_dataset=dataset,
                               model=optimized_model,
                               model_type='onnx',
                               samples=samples)

import pandas as pd
df = pd.DataFrame([model_cli_onnx_optimized])
df.head()

HBox(children=(FloatProgress(value=0.0, max=408.0), HTML(value='')))




Unnamed: 0,name,model_type,accuracy,f1,total_time,average_time,max_length,samples,task
0,onnx_optimized_cli_transformers,onnx,0.845,0.894198,95909.7ms,95.9097ms,128,1000,mrpc
