In [1]:
import os
import json
from typing import Dict
from pathlib import Path
import numpy as np

import torch
from torch import nn
from transformers import AutoConfig, AutoModel
from transformers import XLMRobertaTokenizerFast
import onnx
import tensorrt as trt

from tensorrt_inference.backend import (
    build_engine, save_engine, load_engine
)
from tensorrt_inference.benchmark import (
    generate_random_input_for_transformers, run_inference, compute_mean_discrepency
)
from tensorrt_inference.transformers import MinMaxCalibratorTransformers
from tensorrt_inference.onnx import convert_to_onnx

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Home directory of the current user as a plain string; the exported model
# artifacts in this notebook are written underneath it.
home_dir = os.fspath(Path.home())
home_dir

'/home/g.racic'

## Create model

In [3]:
# Hugging Face configuration for an xlm-roberta-base sized encoder.
# Key order is kept as-is because it determines the layout of the JSON file
# this dict is dumped to.
config = {
    "_name_or_path": "xlm-roberta-base",
    "architectures": ["XLMRobertaForMaskedLM"],
    "attention_probs_dropout_prob": 0.1,
    "bos_token_id": 0,
    "classifier_dropout": None,
    "eos_token_id": 2,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-5,
    "max_position_embeddings": 514,
    "model_type": "xlm-roberta",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "output_past": True,
    "pad_token_id": 1,
    "position_embedding_type": "absolute",
    "transformers_version": "4.15.0",
    "type_vocab_size": 1,
    "use_cache": True,
    "vocab_size": 250002,
}

In [4]:
# Persist the configuration in the working directory; the Model class below
# reads it back from "config.json".
with open("config.json", mode="w") as config_file:
    config_file.write(json.dumps(config))

In [5]:
class Model(nn.Module):
    """Transformer encoder wrapper that returns the pooled text embedding.

    The encoder is instantiated with ``AutoModel.from_config`` from a
    configuration file, i.e. from the architecture description only.

    Args:
        n_categories: Number of rows of the ``category_embeddings`` table.
        config_path: Location handed to ``AutoConfig.from_pretrained``.
            Defaults to ``"config.json"`` (the file written earlier in this
            notebook).
    """

    def __init__(self, n_categories, config_path="config.json"):
        super().__init__()
        self.n_categories = n_categories
        config = AutoConfig.from_pretrained(config_path)
        self.base_model = AutoModel.from_config(config)
        # Size the table from the encoder's hidden size rather than a
        # hard-coded 768 so it stays consistent if the config changes.
        # NOTE: this table is not referenced by forward(); it is kept so the
        # module's parameters stay the same as before.
        self.category_embeddings = torch.nn.Embedding(
            num_embeddings=self.n_categories, embedding_dim=config.hidden_size
        )

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Run the encoder and return its ``pooler_output``.

        Args:
            input_ids: Token ids (integer tensor) forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The ``pooler_output`` attribute of the encoder's output.
        """
        text_embedding = self.base_model(
            input_ids=input_ids, attention_mask=attention_mask
        ).pooler_output
        return text_embedding

In [6]:
# Build the network with 233 categories and switch it to evaluation mode
# (eval() returns the module itself, so it can be chained).
model = Model(233).eval()
model

Model(
  (base_model): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

## Create data

In [7]:
# Fixed (batch, sequence) shape used for the benchmark inputs and for the
# TensorRT optimisation profiles below.
batch_size = 32
seq_max_length = 96

In [8]:
# Fast tokenizer for the same "xlm-roberta-base" name used in `config`.
tokenizer = XLMRobertaTokenizerFast.from_pretrained(
    "xlm-roberta-base"
)

In [9]:
# Sample paragraph that is tokenized below to obtain example model inputs.
# The text (including its leading/trailing newlines) is kept verbatim.
data = """
Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's
standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make
a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting,
remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing
Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions
of Lorem Ipsum. Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of
classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at
Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem
Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source.
Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and
Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the 
Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.
The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. 
Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum et Malorum" by Cicero are also reproduced in their 
exact original form, accompanied by English versions from the 1914 translation by H. Rackham.
"""

In [10]:
# Tokenize the sample text as a one-element batch, padded/truncated to
# exactly `seq_max_length` positions and returned as PyTorch tensors.
tokens = tokenizer(
    [data],
    truncation=True,
    padding="max_length",
    max_length=seq_max_length,
    return_tensors="pt",
)

In [11]:
# Ten random input batches of shape parameters (batch_size, seq_max_length),
# returned both as tensor inputs (for PyTorch) and numpy inputs (for TensorRT).
random_tensor_inputs, random_numpy_inputs = generate_random_input_for_transformers(
    batch_size=batch_size,
    seq_len=seq_max_length,
    n_inputs=10,
    include_token_ids=False,
)

In [12]:
# CPU copies of every tensor batch, used for the PyTorch CPU baseline.
# (The un-copied batches are later fed to the CUDA model, so they presumably
# live on the GPU -- confirm in generate_random_input_for_transformers.)
random_tensor_inputs_cpu = [
    dict((name, tensor.to("cpu")) for name, tensor in batch.items())
    for batch in random_tensor_inputs
]

In [13]:
# PyTorch CPU baseline: gradients are disabled while the batches are timed.
with torch.no_grad():
    pytorch_cpu_outputs, pytorch_cpu_time_buffer = run_inference(
        inputs=random_tensor_inputs_cpu,
        inference_fn=lambda batch: model(**batch),
        n_measures=50,
    )

In [14]:
# Move the model's parameters and buffers to the current CUDA device for the
# GPU baseline.
model = model.cuda()

In [15]:
# PyTorch GPU baseline: same measurement as the CPU run, on the original
# (non-CPU-copied) tensor batches.
with torch.no_grad():
    pytorch_gpu_outputs, pytorch_gpu_time_buffer = run_inference(
        inputs=random_tensor_inputs,
        inference_fn=lambda batch: model(**batch),
        n_measures=50,
    )

## Convert model to ONNX

In [16]:
# Destination of the exported ONNX model, inside the user's home directory.
onnx_model_path = str(Path(home_dir) / "onnx-xlm-roberta-base")

In [17]:
# Export from the CPU: nn.Module.to() moves the module in place and returns
# it, so `model` is on the CPU after this cell.
model = model.to("cpu")
convert_to_onnx(model, onnx_model_path, tokens, False)

## FP-16 quantization with TensorRT

In [18]:
# Destination of the serialized FP16 TensorRT engine.
trt_model_path = str(Path(home_dir) / "trt-xlm-roberta-base")

In [19]:
# TensorRT logger created with INFO severity; it is also passed to
# build_engine() via `logger=` further down.
trt_logger = trt.Logger(trt.Logger.INFO)
# Runtime shared by the build_engine() / load_engine() calls in this notebook.
runtime = trt.Runtime(trt_logger)

In [20]:
# Build the FP16 engine from the ONNX export. min/optimal/max shapes are all
# the same (batch_size, seq_max_length) pair, i.e. a single static shape.
engine_fp16 = build_engine(
    onnx_file_path=onnx_model_path,
    runtime=runtime,
    logger=trt_logger,
    min_shape=(batch_size, seq_max_length),
    optimal_shape=(batch_size, seq_max_length),
    max_shape=(batch_size, seq_max_length),
    workspace_size=10_000 * 1024 ** 2,  # 10000 MiB expressed in bytes
    int8=False,
    fp16=True,
)

In [21]:
# Write the FP16 engine to disk so it can be reloaded below.
save_engine(
    engine_file_path=trt_model_path,
    engine=engine_fp16,
)

In [22]:
# Reload the saved FP16 engine; the returned object is used directly as the
# inference function in the next cell.
trt_fp16_model = load_engine(engine_file_path=trt_model_path, runtime=runtime)

In [23]:
# Benchmark the FP16 engine on the numpy version of the random batches.
trt_fp16_outputs, trt_fp16_time_buffer = run_inference(
    inputs=random_numpy_inputs,
    inference_fn=trt_fp16_model,
    n_measures=50,
)

## INT-8 quantization with TensorRT

In [24]:
# Destination of the serialized INT8 TensorRT engine.
trt_int8_model_path = str(Path(home_dir) / "trt-int8-xlm-roberta-base")

In [25]:
# Build the INT8 engine (with FP16 also enabled) from the same ONNX export
# and static shape as the FP16 engine. The calibrator is fed the same random
# numpy batches that are later used for benchmarking.
engine_int8 = build_engine(
    onnx_file_path=onnx_model_path,
    runtime=runtime,
    logger=trt_logger,
    min_shape=(batch_size, seq_max_length),
    optimal_shape=(batch_size, seq_max_length),
    max_shape=(batch_size, seq_max_length),
    workspace_size=10_000 * 1024 ** 2,  # 10000 MiB expressed in bytes
    calibrator=MinMaxCalibratorTransformers(random_numpy_inputs),
    int8=True,
    fp16=True,
)

In [26]:
# Write the INT8 engine to disk so it can be reloaded below.
save_engine(
    engine_file_path=trt_int8_model_path,
    engine=engine_int8,
)

In [27]:
# Reload the saved INT8 engine; the returned object is used directly as the
# inference function in the next cell.
trt_int8_model = load_engine(engine_file_path=trt_int8_model_path, runtime=runtime)

In [28]:
# Benchmark the INT8 engine on the numpy version of the random batches.
trt_int8_outputs, trt_int8_time_buffer = run_inference(
    inputs=random_numpy_inputs,
    inference_fn=trt_int8_model,
    n_measures=50,
)

## Output comparison

In [29]:
# Bring all four sets of outputs to comparable numpy arrays.
# NOTE: the two TensorRT lists are rebound to element 0 of each of their
# entries, so this cell is not idempotent -- re-run the TensorRT inference
# cells before executing it a second time.
trt_fp16_outputs = [engine_result[0] for engine_result in trt_fp16_outputs]
trt_int8_outputs = [engine_result[0] for engine_result in trt_int8_outputs]
pytorch_cpu_outputs_numpy = [cpu_out.detach().numpy() for cpu_out in pytorch_cpu_outputs]
pytorch_gpu_outputs_numpy = [gpu_out.detach().cpu().numpy() for gpu_out in pytorch_gpu_outputs]

In [30]:
# PyTorch CPU vs. PyTorch GPU outputs.
compute_mean_discrepency(
    pytorch_cpu_outputs_numpy,
    pytorch_gpu_outputs_numpy,
    0.001,
)

3.7059348e-07

In [31]:
# Small differences between TRT fp16 model outputs and TRT int8 model outputs
compute_mean_discrepency(
    trt_int8_outputs,
    trt_fp16_outputs,
    0.001,
)

0.0005077717

In [32]:
# PyTorch CPU vs. TensorRT FP16 outputs.
compute_mean_discrepency(
    pytorch_cpu_outputs_numpy,
    trt_fp16_outputs,
    0.001,
)

0.0006866513

## Latency comparison

In [33]:
# Average of the timings recorded for the PyTorch CPU run.
mean_latency_pytorch_cpu_secs = np.asarray(pytorch_cpu_time_buffer).mean()
mean_latency_pytorch_cpu_secs

0.3879566197283566

In [34]:
# Average of the timings recorded for the PyTorch GPU run.
mean_latency_pytorch_gpu_secs = np.asarray(pytorch_gpu_time_buffer).mean()
mean_latency_pytorch_gpu_secs

0.05832288501784205

In [35]:
# Average of the timings recorded for the TensorRT FP16 engine.
mean_latency_trt_fp16_secs = np.asarray(trt_fp16_time_buffer).mean()
mean_latency_trt_fp16_secs

0.011639913546387106

In [36]:
# Average of the timings recorded for the TensorRT INT8 engine.
mean_latency_trt_int8_secs = np.asarray(trt_int8_time_buffer).mean()
mean_latency_trt_int8_secs

0.01163815296953544