# Putting Transformers into Production with ONNX

In [1]:
# Remove any installed transformers package, then reinstall it from the
# default branch of the Hugging Face GitHub repository.
!pip uninstall -y transformers
!pip install -U git+https://github.com/huggingface/transformers.git
# Extra packages; termcolor and nltk are imported in the import cell below.
!pip install sentence-transformers termcolor IProgress nltk

Found existing installation: transformers 4.5.0.dev0
Uninstalling transformers-4.5.0.dev0:
  Successfully uninstalled transformers-4.5.0.dev0
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-7w9lb23_
  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-7w9lb23_
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (PEP 517) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.5.0.dev0-py3-none-any.whl size=2059073 sha256=e1b66c84ddc3fe9a0848e3d02e3c8002660b6add8c175b686c6807c18df364fa
  Stored in directory: /tmp/pip-ephem-wheel-cache-b9uh74ey/wheels/05/0a/97/64ae47c27ba95fae2cb5838e7b4b7247a34d4a8ba5f7092de2
Successfully built transformer

In [2]:
!echo "ONNX Runtime Execution Providers: " && python -c "import onnxruntime as ort; print(ort.get_available_providers())"

ONNX Runtime Execution Providers: 
['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']


In [3]:
# Configure the TensorRT execution provider through environment variables.
#
# These must be set in the kernel's own process: a `!export ...` line runs in
# a short-lived subshell, so the variable disappears as soon as that shell
# exits and onnxruntime never sees it. Writing to os.environ makes the values
# visible to the sessions created later in this notebook.
import os

os.environ.update(
    {
        "ORT_TENSORRT_MAX_BATCH_SIZE": "10",
        "ORT_TENSORRT_MAX_WORKSPACE_SIZE": "4294967296",  # 4 GiB, in bytes
        "ORT_TENSORRT_MAX_PARTITION_ITERATIONS": "20",
        "ORT_TENSORRT_MIN_SUBGRAPH_SIZE": "5",
    }
)

In [4]:
import torch
import json
import numpy as np
import onnx
import onnxruntime as rt
import multiprocessing
import transformers
import time
import nltk

from termcolor import colored
from transformers import convert_graph_to_onnx
from pathlib import Path
from onnxruntime_customops import get_library_path

# Fetch the Brown corpus used as benchmark input further down.
nltk.download("brown")

# Report the onnx, onnxruntime and transformers versions, one per line.
for library in (onnx, rt, transformers):
    print(library.__version__)

1.8.1
1.7.1
4.5.0.dev0


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [5]:
# Session options shared by every InferenceSession created in this notebook.
opt = rt.SessionOptions()

# Run operators sequentially, with one intra-op thread per CPU core.
opt.execution_mode = rt.ExecutionMode.ORT_SEQUENTIAL
opt.intra_op_num_threads = multiprocessing.cpu_count()

# Extended graph optimizations; log severity threshold 4
# (presumably fatal-only -- confirm against the onnxruntime docs).
opt.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
opt.log_severity_level = 4

# Register the shared library that onnxruntime_customops points to.
custom_ops_library = get_library_path()
opt.register_custom_ops_library(custom_ops_library)

In [6]:
def _print_green(message):
    """Print ``message`` to stdout, coloured green via termcolor."""
    print(colored(message, "green"))


# Each value is looked up right before its line is printed, in this order.
_print_green(f"GPU available {torch.cuda.is_available()}")
_print_green(f"GPU Name: {torch.cuda.get_device_name(0)}")
_print_green(f"GPU Count: {torch.cuda.device_count()}")
_print_green(f"CORE Count: {multiprocessing.cpu_count()}")

[32mGPU available True[0m
[32mGPU Name: Tesla V100-SXM2-32GB[0m
[32mGPU Count: 1[0m
[32mCORE Count: 48[0m


## Simple Export

In [7]:
# Checkpoint and task to export, and the path the ONNX graph is written to.
model_name = "bert-base-uncased"
pipeline_name = "feature-extraction"
model_pth = Path(f"encoder/{model_name}.onnx")

# Build the reference pipeline on device 0 and keep handles to its parts.
nlp = transformers.pipeline(
    pipeline_name,
    model=model_name,
    tokenizer=model_name,
    device=0,
)
model, tokenizer = nlp.model, nlp.tokenizer

# Export only when no file exists at model_pth yet, so re-running the cell
# does not repeat the conversion.
if not model_pth.exists():
    export_args = dict(
        framework="pt",
        model=model_name,
        output=model_pth,
        opset=12,
        tokenizer=model_name,
        use_external_format=False,
        pipeline_name=pipeline_name,
    )
    convert_graph_to_onnx.convert(**export_args)

In [8]:
MAX_SENTENCES = 10_000

# Join every tokenised Brown sentence into one space-separated string, then
# keep the first MAX_SENTENCES of them.
sents = list(map(" ".join, nltk.corpus.brown.sents()))[:MAX_SENTENCES]

## Baseline: Torch

In [9]:
# Baseline: push every sentence through the transformers pipeline one at a
# time and measure the throughput.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for sent in sents:
    _ = nlp(sent)

# Keep the elapsed time as a float: truncating to whole seconds skews the
# throughput and divides by zero for runs shorter than one second.
duration = time.perf_counter() - start
# Count the sentences actually processed rather than the requested maximum.
speed = len(sents) / duration if duration > 0 else float("inf")
print(f"encoding {len(sents)} sentences took {duration:.0f}s at {speed:.0f} sentences/s.")

encoding 10000 sentences took 106s at 94 sentences/s.


In [10]:
# `model` was bound to `nlp.model` when the pipeline was created, so deleting
# `nlp` alone leaves the network (and its weights on the GPU) alive.
# Move the weights to host memory first, drop the pipeline, then return the
# blocks held by PyTorch's caching allocator to the driver so the ONNX Runtime
# sessions created next are not sharing the GPU with the torch baseline.
model.cpu()
del nlp
torch.cuda.empty_cache()

## CUDAExecutionProvider + CPUExecutionProvider

In [11]:
# Providers in the order handed to the session: CUDA first, then CPU.
providers = [
    "CUDAExecutionProvider",
    "CPUExecutionProvider",
]
sess = rt.InferenceSession(
    str(model_pth),
    sess_options=opt,
    providers=providers,
)

In [12]:
# Time tokenisation + ONNX Runtime inference, one sentence per call.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for sent in sents:
    model_input = tokenizer.encode_plus(sent)
    # atleast_2d turns each 1-D field into a 2-D array with a leading axis of
    # size 1 (presumably the batch dimension the exported graph expects).
    model_input = {name: np.atleast_2d(value) for name, value in model_input.items()}
    _ = sess.run(None, model_input)

# Keep the elapsed time as a float: truncating to whole seconds skews the
# throughput and divides by zero for runs shorter than one second.
duration = time.perf_counter() - start
# Count the sentences actually processed rather than the requested maximum.
speed = len(sents) / duration if duration > 0 else float("inf")
print(f"encoding {len(sents)} sentences took {duration:.0f}s at {speed:.0f} sentences/s.")

encoding 10000 sentences took 26s at 384 sentences/s.


## CUDAExecutionProvider

In [13]:
# Request the CUDA execution provider only.
providers = [
    "CUDAExecutionProvider",
]
sess = rt.InferenceSession(
    str(model_pth),
    sess_options=opt,
    providers=providers,
)

In [14]:
# Time tokenisation + ONNX Runtime inference, one sentence per call.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for sent in sents:
    model_input = tokenizer.encode_plus(sent)
    # atleast_2d turns each 1-D field into a 2-D array with a leading axis of
    # size 1 (presumably the batch dimension the exported graph expects).
    model_input = {name: np.atleast_2d(value) for name, value in model_input.items()}
    _ = sess.run(None, model_input)

# Keep the elapsed time as a float: truncating to whole seconds skews the
# throughput and divides by zero for runs shorter than one second.
duration = time.perf_counter() - start
# Count the sentences actually processed rather than the requested maximum.
speed = len(sents) / duration if duration > 0 else float("inf")
print(f"encoding {len(sents)} sentences took {duration:.0f}s at {speed:.0f} sentences/s.")

encoding 10000 sentences took 27s at 370 sentences/s.


## TensorrtExecutionProvider + CUDAExecutionProvider + CPUExecutionProvider

In [15]:
# Providers in the order handed to the session: TensorRT, then CUDA, then CPU.
providers = [
    "TensorrtExecutionProvider",
    "CUDAExecutionProvider",
    "CPUExecutionProvider",
]
sess = rt.InferenceSession(
    str(model_pth),
    sess_options=opt,
    providers=providers,
)

In [16]:
# Warmup
# Run the first 50 sentences through the session before timing starts;
# the outputs are discarded.
for warmup_sent in sents[:50]:
    encoded = tokenizer.encode_plus(warmup_sent)
    ort_inputs = {field: np.atleast_2d(ids) for field, ids in encoded.items()}
    _ = sess.run(None, ort_inputs)

In [17]:
# Time tokenisation + ONNX Runtime inference, one sentence per call.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for sent in sents:
    model_input = tokenizer.encode_plus(sent)
    # atleast_2d turns each 1-D field into a 2-D array with a leading axis of
    # size 1 (presumably the batch dimension the exported graph expects).
    model_input = {name: np.atleast_2d(value) for name, value in model_input.items()}
    _ = sess.run(None, model_input)

# Keep the elapsed time as a float: truncating to whole seconds skews the
# throughput and divides by zero for runs shorter than one second.
duration = time.perf_counter() - start
# Count the sentences actually processed rather than the requested maximum.
speed = len(sents) / duration if duration > 0 else float("inf")
print(f"encoding {len(sents)} sentences took {duration:.0f}s at {speed:.0f} sentences/s.")

encoding 10000 sentences took 109s at 91 sentences/s.


## TensorrtExecutionProvider

In [18]:
# Request the TensorRT execution provider only.
providers = [
    "TensorrtExecutionProvider",
]
sess = rt.InferenceSession(
    str(model_pth),
    sess_options=opt,
    providers=providers,
)

In [19]:
# Warmup
# Run the first 50 sentences through the session before timing starts;
# the outputs are discarded.
for warmup_sent in sents[:50]:
    encoded = tokenizer.encode_plus(warmup_sent)
    ort_inputs = {field: np.atleast_2d(ids) for field, ids in encoded.items()}
    _ = sess.run(None, ort_inputs)

In [20]:
# Time tokenisation + ONNX Runtime inference, one sentence per call.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for sent in sents:
    model_input = tokenizer.encode_plus(sent)
    # atleast_2d turns each 1-D field into a 2-D array with a leading axis of
    # size 1 (presumably the batch dimension the exported graph expects).
    model_input = {name: np.atleast_2d(value) for name, value in model_input.items()}
    _ = sess.run(None, model_input)

# Keep the elapsed time as a float: truncating to whole seconds skews the
# throughput and divides by zero for runs shorter than one second.
duration = time.perf_counter() - start
# Count the sentences actually processed rather than the requested maximum.
speed = len(sents) / duration if duration > 0 else float("inf")
print(f"encoding {len(sents)} sentences took {duration:.0f}s at {speed:.0f} sentences/s.")

encoding 10000 sentences took 109s at 91 sentences/s.
