# Putting Transformers into Production with ONNX

In [1]:
!pip uninstall -y transformers
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install sentence-transformers termcolor IProgress nltk
!pip install onnxruntime-gpu

Found existing installation: transformers 4.4.2
Uninstalling transformers-4.4.2:
  Successfully uninstalled transformers-4.4.2
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-vdtpwnfh
  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-vdtpwnfh
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (PEP 517) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.5.0.dev0-py3-none-any.whl size=2059073 sha256=76b22162e22013b9b66ec2e3c75e629b83bae4050e9859f4ad6930cc83964b79
  Stored in directory: /tmp/pip-ephem-wheel-cache-z87apk99/wheels/05/0a/97/64ae47c27ba95fae2cb5838e7b4b7247a34d4a8ba5f7092de2
Successfully built transformers
Installing co

In [2]:
!echo "ONNX Runtime Execution Providers: " && python -c "import onnxruntime as ort; print(ort.get_available_providers())"

ONNX Runtime Execution Providers: 
['CUDAExecutionProvider', 'CPUExecutionProvider']


In [3]:
import torch
import json
import numpy as np
import onnx
import onnxruntime as rt
import multiprocessing
import transformers
import time
import nltk

from termcolor import colored
from transformers import convert_graph_to_onnx
from pathlib import Path
from onnxruntime_customops import get_library_path

nltk.download("brown")

print(onnx.__version__)
print(rt.__version__)
print(transformers.__version__)

[nltk_data] Downloading package brown to /root/nltk_data...


1.8.1
1.7.0
4.5.0.dev0


[nltk_data]   Unzipping corpora/brown.zip.


In [4]:
opt = rt.SessionOptions()
opt.register_custom_ops_library(get_library_path())
opt.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
opt.log_severity_level = 4
opt.intra_op_num_threads = multiprocessing.cpu_count()
opt.execution_mode = rt.ExecutionMode.ORT_SEQUENTIAL

In [5]:
print(colored(f"GPU available {torch.cuda.is_available()}", "green"))
print(colored(f"GPU Name: {torch.cuda.get_device_name(0)}", "green"))
print(colored(f"GPU Count: {torch.cuda.device_count()}", "green"))
print(colored(f"CORE Count: {multiprocessing.cpu_count()}", "green"))

[32mGPU available True[0m
[32mGPU Name: Tesla V100-SXM2-32GB[0m
[32mGPU Count: 1[0m
[32mCORE Count: 48[0m


## Simple Export

In [9]:
model_name = "EleutherAI/gpt-neo-1.3B"
pipeline_name = "text-generation"

model_pth = Path(f"gpt_neo/gpt_neo_13b.onnx")
model_pth.parent.mkdir(exist_ok=True, parents=True)

In [10]:
nlp = transformers.pipeline(pipeline_name, model=model_name, tokenizer=model_name, device=-1)
with torch.no_grad():
    (
        input_names,
        output_names,
        dynamic_axes,
        tokens,
    ) = convert_graph_to_onnx.infer_shapes(nlp, "pt")
    ordered_input_names, model_args = convert_graph_to_onnx.ensure_valid_input(
        nlp.model, tokens, input_names
    )

Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Found output output_1 with shape: {0: 'batch', 2: 'sequence'}
Found output output_1 with shape: {0: 'batch', 2: 'sequence'}
Found output output_2 with shape: {0: 'batch', 1: 'sequence'}
Found output output_3 with shape: {0: 'batch', 2: 'sequence'}
Found output output_3 with shape: {0: 'batch', 2: 'sequence'}
Found output output_4 with shape: {0: 'batch', 1: 'sequence'}
Found output output_5 with shape: {0: 'batch', 2: 'sequence'}
Found output output_5 with shape: {0: 'batch', 2: 'sequence'}
Found output output_6 with shape: {0: 'batch', 1: 'sequence'}
Found output output_7 with shape: {0: 'batch', 2: 'sequence'}
Found output output_7 with shape: {0: 'batch', 2: 'sequence'}
Found output output_8 with shape: {0: 'batch', 1: 'sequence'}
Found output output_9 with shape: {0: 'batch', 2: 'sequence'}
Fou

In [11]:
nlp = transformers.pipeline(pipeline_name, model=model_name, tokenizer=model_name, device=0)
tokenizer = nlp.tokenizer
model = nlp.model

In [14]:
MAX_SENTENCES = 1_000

sents = [" ".join(sent) for sent in nltk.corpus.brown.sents()][:MAX_SENTENCES]

## Baseline: Torch

In [15]:
# We just want the logits, not the full pipeline
start = time.time()
for sent in sents:
    inp = tokenizer(sent, return_tensors="pt")
    for key, value in inp.items():
        inp[key] = value.to("cuda")
    with torch.no_grad():
        _ = nlp.model.forward(**inp).logits

duration = int(time.time() - start)
speed = int(MAX_SENTENCES / duration)
print(f"predicting {MAX_SENTENCES} sentences took {duration}s at {speed} sentences/s.")

predicting 1000 sentences took 53s at 18 sentences/s.


## Build custom model to ease exporting

This step may take really long!! Beware that this may take over **1 hour**. I am using a pre-stored model I exported over night.

In [None]:
class GPTNeoSent(transformers.GPTNeoForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.sentence_embedding = torch.nn.Identity()

    def forward(self, input_ids, attention_mask):
        return self.sentence_embedding(
            super().forward(input_ids, attention_mask=attention_mask).logits
        )
# Create the new model based on the config of the original pipeline
model = GPTNeoSent(config=nlp.model.config).from_pretrained(model_name)

encoding = nlp.tokenizer(["hello my friends!"], return_tensors="pt")

if not model_pth.exists():
    torch.onnx.export(
        model,
        (encoding["input_ids"], encoding["attention_mask"]),
        f=model_pth.as_posix(),
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=dynamic_axes,
        do_constant_folding=True,
        use_external_data_format=True, # Needed because of model size
        enable_onnx_checker=True,
        opset_version=12,
    )

In [16]:
!tar -xvzf gpt_neo.tar.gz

gpt_neo/
gpt_neo/transformer.h.4.attn.attention.bias
gpt_neo/154931
gpt_neo/150389
gpt_neo/transformer.h.1.attn.attention.out_proj.bias
gpt_neo/transformer.h.23.ln_1.bias
gpt_neo/154935
gpt_neo/161766
gpt_neo/148101
gpt_neo/145833
gpt_neo/transformer.h.15.ln_1.weight
gpt_neo/143554
gpt_neo/transformer.h.19.ln_1.bias
gpt_neo/159501
gpt_neo/136712
gpt_neo/transformer.wpe.weight
gpt_neo/152654
gpt_neo/152666
gpt_neo/transformer.h.23.ln_2.bias
gpt_neo/transformer.h.4.attn.attention.out_proj.bias
gpt_neo/transformer.h.17.attn.attention.out_proj.bias
gpt_neo/152670
gpt_neo/transformer.h.23.ln_2.weight
gpt_neo/transformer.h.9.mlp.c_proj.bias
gpt_neo/141267
gpt_neo/transformer.h.11.ln_2.weight
gpt_neo/150380
gpt_neo/transformer.h.20.mlp.c_proj.bias
gpt_neo/148110
gpt_neo/transformer.h.21.mlp.c_fc.bias
gpt_neo/157214
gpt_neo/transformer.h.22.attn.attention.out_proj.bias
gpt_neo/transformer.h.10.attn.attention.bias
gpt_neo/transformer.h.6.ln_1.weight
gpt_neo/transformer.h.13.ln_1.weight
gpt_neo/

## CUDAExecutionProvider + CPUExecutionProvider

In [19]:
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
sess = rt.InferenceSession(str(model_pth), opt, providers=providers)

In [20]:
start = time.time()
for _ in range(len(sents)):
    model_input = tokenizer.encode_plus("hello my friends!")
    model_input = {name : np.atleast_2d(value) for name, value in model_input.items()}
    _ = sess.run(None, model_input)

duration = int(time.time() - start)
speed = int(MAX_SENTENCES / duration)
print(f"encoding {MAX_SENTENCES} sentences took {duration}s at {speed} sentences/s.")

encoding 1000 sentences took 313s at 3 sentences/s.
