# Putting Transformers into Production with ONNX

In [1]:
!pip uninstall -y transformers
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install sentence-transformers termcolor IProgress

Found existing installation: transformers 4.4.2
Uninstalling transformers-4.4.2:
  Successfully uninstalled transformers-4.4.2
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-6wtmcy5l
  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-6wtmcy5l
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (PEP 517) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.5.0.dev0-py3-none-any.whl size=2059073 sha256=c36c407572a03ce3ad49a12ec81f563ed7b9b622d9cbb602c0aa8d0ceae33886
  Stored in directory: /tmp/pip-ephem-wheel-cache-6329cr5a/wheels/05/0a/97/64ae47c27ba95fae2cb5838e7b4b7247a34d4a8ba5f7092de2
Successfully built transformers
Installing co

In [2]:
!echo "ONNX Runtime Execution Providers: " && python -c "import onnxruntime as ort; print(ort.get_available_providers())"

ONNX Runtime Execution Providers: 
['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']


In [3]:
!export ORT_TENSORRT_MAX_BATCH_SIZE=10
!export ORT_TENSORRT_MAX_WORKSPACE_SIZE=4294967296
!export ORT_TENSORRT_MAX_PARTITION_ITERATIONS=20
!export ORT_TENSORRT_MIN_SUBGRAPH_SIZE=5

In [8]:
import torch
import json
import numpy as np
import onnx
import onnxruntime as rt
import multiprocessing
import transformers

from termcolor import colored
from transformers import convert_graph_to_onnx
from pathlib import Path
from sentence_transformers import SentenceTransformer
from onnxruntime_customops import get_library_path

print(onnx.__version__)
print(rt.__version__)
print(transformers.__version__)

span = "Hello my friends!"

1.8.1
1.7.1
4.5.0.dev0


In [9]:
ONNX_PROVIDERS = ["CUDAExecutionProvider", "CPUExecutionProvider",]

opt = rt.SessionOptions()
opt.register_custom_ops_library(get_library_path())
opt.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
opt.log_severity_level = 4
opt.intra_op_num_threads = multiprocessing.cpu_count()
opt.execution_mode = rt.ExecutionMode.ORT_SEQUENTIAL

In [10]:
print(colored(f"GPU available {torch.cuda.is_available()}", "green"))
print(colored(f"GPU Name: {torch.cuda.get_device_name(0)}", "green"))
print(colored(f"GPU Count: {torch.cuda.device_count()}", "green"))
print(colored(f"CORE Count: {multiprocessing.cpu_count()}", "green"))

[32mGPU available True[0m
[32mGPU Name: Tesla V100-SXM2-32GB[0m
[32mGPU Count: 1[0m
[32mCORE Count: 48[0m


## 1. Simple Export

In [11]:
model_name = "sentence-transformers/bert-base-nli-stsb-mean-tokens"
pipeline_name = "feature-extraction"
model_pth = Path(f"encoder/bert-base-nli-stsb-mean-tokens.onnx")

nlp = transformers.pipeline(pipeline_name, model=model_name, tokenizer=model_name, device=0)
tokenizer = nlp.tokenizer

if not model_pth.exists():
    convert_graph_to_onnx.convert(
        framework="pt",
        model=model_name,
        output=model_pth,
        opset=12,
        tokenizer=model_name,
        use_external_format= False,
        pipeline_name= pipeline_name,
    )

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

ONNX opset version set to: 12
Loading pipeline (model: sentence-transformers/bert-base-nli-stsb-mean-tokens, tokenizer: sentence-transformers/bert-base-nli-stsb-mean-tokens)
Creating folder encoder
Using framework PyTorch: 1.8.1+cu102
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Found output output_1 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']


  assert all(


In [12]:
model_raw = SentenceTransformer("bert-base-nli-stsb-mean-tokens")

  0%|          | 0.00/405M [00:00<?, ?B/s]

In [13]:
%%timeit
nlp(span)[0]

9.86 ms ± 914 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
%%timeit
model_raw.encode(span)[0]

12.4 ms ± 623 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
sess = rt.InferenceSession(str(model_pth), opt, providers=ONNX_PROVIDERS)

In [16]:
%%timeit
model_input = tokenizer.encode_plus(span)
model_input = {name : np.atleast_2d(value) for name, value in model_input.items()}
onnx_result = sess.run(None, model_input)

2.28 ms ± 164 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
model_input = tokenizer.encode_plus(span)
model_input = {name : np.atleast_2d(value) for name, value in model_input.items()}
onnx_result = sess.run(None, model_input)

print(onnx_result[0].shape)
print(onnx_result[1].shape)

(1, 6, 768)
(1, 768)


## 2. Custom Export

In [18]:
def print_transformers_shape_inference(name_or_path: str):
    """Prints the transformers shape inference for onnx."""
    res = {}
    
    model_pipeline = transformers.FeatureExtractionPipeline(
        model=transformers.AutoModel.from_pretrained(name_or_path),
        tokenizer=transformers.AutoTokenizer.from_pretrained(
            name_or_path, use_fast=True
        ),
        framework="pt",
        device=-1,
    )

    with torch.no_grad():
        (
            input_names,
            output_names,
            dynamic_axes,
            tokens,
        ) = convert_graph_to_onnx.infer_shapes(model_pipeline, "pt")
        ordered_input_names, model_args = convert_graph_to_onnx.ensure_valid_input(
            model_pipeline.model, tokens, input_names
        )

    res["input_names"] = input_names
    res["output_names"] = output_names
    res["dynamic_axes"] = dynamic_axes
    res["tokens"] = tokens
    res["exemplary_input"] = model_args
    
    print()
    print(f"Inferred shapes for {name_or_path}")
    print(f"Input names: {input_names}")
    print(f"Output names: {output_names}")
    print(f"Dynamic Axes:\n{json.dumps(dynamic_axes,sort_keys=True, indent=4)}")
    print(f"Tokens:{tokens}")
    print(f"Ordered input names: {ordered_input_names}")
    print(f"Arguments: {model_args}")
    
    return res

model_args = print_transformers_shape_inference(model_name)

Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Found output output_1 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']

Inferred shapes for sentence-transformers/bert-base-nli-stsb-mean-tokens
Input names: ['input_ids', 'token_type_ids', 'attention_mask']
Output names: ['output_0', 'output_1']
Dynamic Axes:
{
    "attention_mask": {
        "0": "batch",
        "1": "sequence"
    },
    "input_ids": {
        "0": "batch",
        "1": "sequence"
    },
    "output_0": {
        "0": "batch",
        "1": "sequence"
    },
    "output_1": {
        "0": "batch"
    },
    "token_type_ids": {
        "0": "batch",
        "1": "sequence"

In [19]:
model_args

{'input_names': ['input_ids', 'token_type_ids', 'attention_mask'],
 'output_names': ['output_0', 'output_1'],
 'dynamic_axes': {'input_ids': {0: 'batch', 1: 'sequence'},
  'token_type_ids': {0: 'batch', 1: 'sequence'},
  'attention_mask': {0: 'batch', 1: 'sequence'},
  'output_0': {0: 'batch', 1: 'sequence'},
  'output_1': {0: 'batch'}},
 'tokens': {'input_ids': tensor([[ 101, 2023, 2003, 1037, 7099, 6434,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])},
 'exemplary_input': (tensor([[ 101, 2023, 2003, 1037, 7099, 6434,  102]]),
  tensor([[1, 1, 1, 1, 1, 1, 1]]),
  tensor([[0, 0, 0, 0, 0, 0, 0]]))}

In [20]:
class SentenceTransformer(transformers.BertModel):
    def __init__(self, config):
        super().__init__(config)
        # Naming alias for ONNX output specification
        # Makes it easier to identify the layer
        self.sentence_embedding = torch.nn.Identity()

    def forward(self, input_ids, token_type_ids, attention_mask):
        # Get the token embeddings from the base model
        token_embeddings = super().forward(
            input_ids, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids
        )[0]
        # Stack the pooling layer on top of it
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return self.sentence_embedding(sum_embeddings / sum_mask)

# Create the new model based on the config of the original pipeline
model = SentenceTransformer(config=nlp.model.config).from_pretrained(model_name)

In [21]:
assert np.allclose(
    model_raw.encode(span),
    model(**tokenizer(span, return_tensors="pt")).squeeze().detach().numpy(),
    atol=1e-6,
)

In [22]:
del model_args["dynamic_axes"]["output_0"] # Delete unused output
del model_args["dynamic_axes"]["output_1"] # Delete unused output
model_args["dynamic_axes"]["sentence_embedding"] = {0: "batch"}

model_args["output_names"] = ["sentence_embedding"]

In [23]:
model_args

{'input_names': ['input_ids', 'token_type_ids', 'attention_mask'],
 'output_names': ['sentence_embedding'],
 'dynamic_axes': {'input_ids': {0: 'batch', 1: 'sequence'},
  'token_type_ids': {0: 'batch', 1: 'sequence'},
  'attention_mask': {0: 'batch', 1: 'sequence'},
  'sentence_embedding': {0: 'batch'}},
 'tokens': {'input_ids': tensor([[ 101, 2023, 2003, 1037, 7099, 6434,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])},
 'exemplary_input': (tensor([[ 101, 2023, 2003, 1037, 7099, 6434,  102]]),
  tensor([[1, 1, 1, 1, 1, 1, 1]]),
  tensor([[0, 0, 0, 0, 0, 0, 0]]))}

In [24]:
outdir = Path("encoder")
output = outdir / "bert-base-nli-stsb-mean-tokens-pooling.onnx"
outdir.mkdir(parents=True, exist_ok=True)

torch.onnx.export(
    model,
    model_args["exemplary_input"],
    f=output.as_posix(),
    input_names=model_args["input_names"],
    output_names=model_args["output_names"],
    dynamic_axes=model_args["dynamic_axes"],
    do_constant_folding=True,
    use_external_data_format=False,
    enable_onnx_checker=True,
    opset_version=12,
)

In [25]:
sess = rt.InferenceSession(str(output), opt, providers=ONNX_PROVIDERS)

In [26]:
model_input = tokenizer.encode_plus(span)
model_input = {name : np.atleast_2d(value) for name, value in model_input.items()}
onnx_result = sess.run(None, model_input)[0]

In [27]:
print(onnx_result.shape)

(1, 768)


In [28]:
%%timeit
model_input = tokenizer.encode_plus(span)
model_input = {name : np.atleast_2d(value) for name, value in model_input.items()}
onnx_result = sess.run(None, model_input)[0]

2.38 ms ± 86.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [29]:
assert np.allclose(
    model_raw.encode(span),
    onnx_result,
    atol=1e-6,
)

## 3. Export with ORT Custom-OPS

In [30]:
!bash use5.sh

--2021-04-05 19:07:36--  https://storage.googleapis.com/tfhub-modules/google/universal-sentence-encoder-large/5.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 2a00:1450:4001:80f::2010, 2a00:1450:4001:827::2010, 2a00:1450:4001:828::2010, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|2a00:1450:4001:80f::2010|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 548921135 (523M) [application/x-tar]
Saving to: ‘5.tar.gz’


2021-04-05 19:07:44 (62.8 MB/s) - ‘5.tar.gz’ saved [548921135/548921135]

./
./assets/
./variables/
./variables/variables.index
./variables/variables.data-00000-of-00001
./saved_model.pb
2021-04-05 19:07:51.421900: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-04-05 19:07:52.981167: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-05 19:07:52.982154: I tensorflow/stream

In [32]:
sess = rt.InferenceSession("encoder/universal-sentence-encoder-5.onnx", opt, providers=ONNX_PROVIDERS)

sess.run(
    output_names=["outputs"],
    input_feed={"inputs:0": [span]},
)[0]

array([[ 6.34950260e-03,  2.62679085e-02, -1.07614197e-01,
        -1.99584328e-02,  4.63480456e-03,  8.49447493e-03,
         2.51034517e-02, -2.90801805e-02,  2.21730396e-02,
        -1.00490581e-02,  9.49494448e-03, -3.26025523e-02,
        -1.32153835e-02, -1.85077582e-02, -6.01154454e-02,
         1.85450036e-02,  1.17757544e-02, -5.08113876e-02,
        -4.71790917e-02,  2.50959080e-02,  3.06968554e-03,
        -1.85236465e-02, -6.13590591e-02, -5.80388913e-03,
         7.58706704e-02,  2.40540244e-02, -1.60349458e-02,
        -7.69246221e-02,  2.79430393e-02,  2.23552212e-02,
         2.06381734e-02, -2.15326883e-02, -4.23278026e-02,
        -9.65398476e-02,  1.82510875e-02,  1.71697009e-02,
         1.02297226e-02, -7.17197824e-03,  5.71188442e-02,
        -6.49892986e-02,  5.56984451e-03, -2.10656691e-02,
        -1.31348064e-02,  4.22912091e-02,  5.61305657e-02,
        -9.58824530e-03, -4.63117845e-02, -6.88697724e-03,
         2.34971046e-02, -9.47435200e-02, -5.15654758e-0

## 4. Exporting and optimizing GPT-Neo

In [33]:
model_name = "EleutherAI/gpt-neo-1.3B"
pipeline_name = "text-generation"

model_pth = Path(f"gpt_neo/gpt_neo_13b.onnx")
model_pth.parent.mkdir(exist_ok=True, parents=True)

nlp = transformers.pipeline(pipeline_name, model=model_name, tokenizer=model_name, device=0)
tokenizer = nlp.tokenizer
model = nlp.model

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [34]:
%%timeit
inp = tokenizer(span, return_tensors="pt")
for key, value in inp.items():
    inp[key] = value.to("cuda")
with torch.no_grad():
    out = nlp.model.forward(**inp).logits

105 ms ± 9.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [35]:
with torch.no_grad():
    (
        input_names,
        output_names,
        dynamic_axes,
        tokens,
    ) = convert_graph_to_onnx.infer_shapes(nlp, "pt")
    ordered_input_names, model_args = convert_graph_to_onnx.ensure_valid_input(
        nlp.model, tokens, input_names
    )

RuntimeError: Input, output and indices must be on the current device

In [None]:
class GPTNeoSent(transformers.GPTNeoForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.sentence_embedding = torch.nn.Identity()

    def forward(self, input_ids, attention_mask):
        return self.sentence_embedding(
            super().forward(input_ids, attention_mask=attention_mask).logits
        )
# Create the new model based on the config of the original pipeline
model = GPTNeoSent(config=nlp.model.config).from_pretrained(model_name)

In [None]:
%%timeit
model.forward(**nlp.tokenizer([span], return_tensors="pt"))

# Warning! This step may take an hour or even more. Use ate own discretion!

In [36]:
# We are using a pre-exported model here.
!tar -xvzf gpt_neo.tar.gz

gpt_neo/
gpt_neo/transformer.h.4.attn.attention.bias
gpt_neo/154931
gpt_neo/150389
gpt_neo/transformer.h.1.attn.attention.out_proj.bias
gpt_neo/transformer.h.23.ln_1.bias
gpt_neo/154935
gpt_neo/161766
gpt_neo/148101
gpt_neo/145833
gpt_neo/transformer.h.15.ln_1.weight
gpt_neo/143554
gpt_neo/transformer.h.19.ln_1.bias
gpt_neo/159501
gpt_neo/136712
gpt_neo/transformer.wpe.weight
gpt_neo/152654
gpt_neo/152666
gpt_neo/transformer.h.23.ln_2.bias
gpt_neo/transformer.h.4.attn.attention.out_proj.bias
gpt_neo/transformer.h.17.attn.attention.out_proj.bias
gpt_neo/152670
gpt_neo/transformer.h.23.ln_2.weight
gpt_neo/transformer.h.9.mlp.c_proj.bias
gpt_neo/141267
gpt_neo/transformer.h.11.ln_2.weight
gpt_neo/150380
gpt_neo/transformer.h.20.mlp.c_proj.bias
gpt_neo/148110
gpt_neo/transformer.h.21.mlp.c_fc.bias
gpt_neo/157214
gpt_neo/transformer.h.22.attn.attention.out_proj.bias
gpt_neo/transformer.h.10.attn.attention.bias
gpt_neo/transformer.h.6.ln_1.weight
gpt_neo/transformer.h.13.ln_1.weight
gpt_neo/

In [37]:
%%time
sess = rt.InferenceSession(
    str(model_pth), 
    opt,
    providers=ONNX_PROVIDERS
)

CPU times: user 2min 56s, sys: 3.64 s, total: 3min
Wall time: 2min 55s


In [38]:
%%timeit
model_input = tokenizer.encode_plus(span)
model_input = {name : np.atleast_2d(value) for name, value in model_input.items()}
onnx_result = sess.run(None, model_input)

310 ms ± 8.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
del sess