In [1]:
import onnx
from onnxruntime.transformers.fusion_options import FusionOptions
from onnxruntime.transformers import optimizer
from onnxruntime import InferenceSession
from onnx import shape_inference

In [2]:
import os
from typing import Dict
import json
import time
import itertools

from torchvision import transforms

from PIL import Image

import torch
import tensorrt as trt

import numpy as np

import transformers

from tensorrt_inference.backend import (
    build_engine, save_engine, load_engine
)

import torch_tensorrt
import pycuda.driver as cuda

In [3]:
# Load the TorchScript U2NETP checkpoint, switch it to eval mode and move it to "cuda:0".
model_path = "/home/g.racic/sync/removebg_u2netp_192_v1_standalone.pth"
u2netp = torch.jit.load(model_path).eval()
u2netp.to("cuda:0")

RecursiveScriptModule(
  original_name=U2NETP
  (stage1): RecursiveScriptModule(
    original_name=RSU7
    (rebnconvin): RecursiveScriptModule(
      original_name=REBNCONV
      (conv_s1): RecursiveScriptModule(original_name=Conv2d)
      (bn_s1): RecursiveScriptModule(original_name=BatchNorm2d)
      (relu_s1): RecursiveScriptModule(original_name=ReLU)
    )
    (rebnconv1): RecursiveScriptModule(
      original_name=REBNCONV
      (conv_s1): RecursiveScriptModule(original_name=Conv2d)
      (bn_s1): RecursiveScriptModule(original_name=BatchNorm2d)
      (relu_s1): RecursiveScriptModule(original_name=ReLU)
    )
    (pool1): RecursiveScriptModule(original_name=MaxPool2d)
    (rebnconv2): RecursiveScriptModule(
      original_name=REBNCONV
      (conv_s1): RecursiveScriptModule(original_name=Conv2d)
      (bn_s1): RecursiveScriptModule(original_name=BatchNorm2d)
      (relu_s1): RecursiveScriptModule(original_name=ReLU)
    )
    (pool2): RecursiveScriptModule(original_name=MaxPool2d

## Inputs

In [4]:
# Five sample images; they all live in the same directory.
image_dir = "/home/g.racic/sync/cailimage/pix_images/images"
image_names = (
    "526_95bd2ba559f6b06a26d34e6aa0730a12.jpg",
    "101_dd5783e2de4e0129850c5c347c00b6c4.jpg",
    "102_5673a8240902ac026c2b1faf553b4999.jpg",
    "103_85540ed2f1683e7a89cf5b729e7ae71b.jpg",
    "105_289028da5128a3e7c6882f37590978de.jpg",
)
img_paths = [f"{image_dir}/{image_name}" for image_name in image_names]

In [5]:
# Preprocessing pipeline: resize to 192x192, convert to a tensor, normalize per channel.
fts = transforms.Compose([
    transforms.Resize(size=(192, 192)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(.485, .456, .406), std=(.229, .224, .225)),
])
fts

Compose(
    Resize(size=(192, 192), interpolation=bilinear, max_size=None, antialias=None)
    ToTensor()
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
)

In [6]:
def preprocess_img(img_path: str):
    """Open an image file, make sure it is in RGB mode and run it through ``fts``.

    Args:
        img_path: Path of the image file to open.

    Returns:
        Whatever the module-level ``fts`` transform pipeline returns for the image.
    """
    picture = Image.open(img_path)
    # Only convert when needed; images already in RGB mode are passed through as-is.
    rgb_picture = picture if picture.mode == 'RGB' else picture.convert('RGB')
    return fts(rgb_picture)

In [7]:
# Preprocess every sample image, stack the results into one batch and move it to the GPU.
img_tensors = torch.stack([preprocess_img(path) for path in img_paths]).to("cuda")
img_tensors.shape

torch.Size([5, 3, 192, 192])

In [8]:
def benchmark(model_fn, input_data, batch_size, nwarmup=50, nruns=1000):
    """Time repeated calls of ``model_fn`` and print latency and throughput.

    ``input_data`` is cycled endlessly, so it may hold fewer items than
    ``nwarmup + nruns``. Every call runs under ``torch.no_grad()``.

    Args:
        model_fn: Callable invoked with one item of ``input_data`` per call.
        input_data: Non-empty iterable of inputs fed to ``model_fn``.
        batch_size: Number of examples in one input; only used to turn the
            mean batch time into examples per second.
        nwarmup: Number of untimed calls made before measuring.
        nruns: Number of timed calls; must be at least 1.

    Raises:
        ValueError: If ``nruns`` is smaller than 1, because no average can be
            computed without at least one timed call.
    """
    if nruns < 1:
        raise ValueError("nruns must be at least 1")

    # Only synchronize when a CUDA device exists so the function also works on CPU-only hosts.
    cuda_available = torch.cuda.is_available()
    inputs = itertools.cycle(input_data)

    print("Warm up ...")
    with torch.no_grad():
        for _ in range(nwarmup):
            model_fn(next(inputs))
    if cuda_available:
        torch.cuda.synchronize()

    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(1, nruns + 1):
            # perf_counter is a monotonic, high-resolution clock meant for measuring intervals.
            start_time = time.perf_counter()
            model_fn(next(inputs))
            # Wait for queued GPU work so the measured interval covers the whole call.
            if cuda_available:
                torch.cuda.synchronize()
            timings.append(time.perf_counter() - start_time)
            if i % 100 == 0:
                print('Iteration %d/%d, avg batch time %.2f ms'%(i, nruns, np.mean(timings)*1000))

    print('Average throughput: %.2f example/second'%(batch_size/np.mean(timings)))

## ONNX export

In [9]:
def convert_to_onnx(
    model_pytorch, output_path: str, inputs_pytorch: Dict[str, torch.Tensor], quantization: bool
) -> None:
    """Export a PyTorch model to an ONNX file with a dynamic batch dimension.

    Axis 0 of every input and of the output named ``"output"`` is declared
    dynamic under the name ``"batch_size"``.

    Args:
        model_pytorch: Model handed to ``torch.onnx.export``.
        output_path: Destination of the ONNX file.
        inputs_pytorch: Example inputs; the keys become the ONNX input names and
            the values are passed positionally to the model, in dict order.
        quantization: When true, ``TensorQuantizer.use_fb_fake_quant`` is switched
            on for the duration of the export.

    Raises:
        ImportError: If ``quantization`` is true and ``pytorch_quantization``
            cannot be imported.
    """
    if quantization:
        try:
            from pytorch_quantization.nn import TensorQuantizer
        except ImportError as err:
            raise ImportError(
                "It seems that pytorch-quantization is not yet installed. "
                "It is required when you enable the quantization flag and use CUDA device. "
                "Please find installation instructions on "
                "https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization or use:\n"
                "pip3 install git+ssh://git@github.com/NVIDIA/TensorRT#egg=pytorch-quantization\\&"
                "subdirectory=tools/pytorch-quantization/"
            ) from err

        TensorQuantizer.use_fb_fake_quant = True

    dynamic_axis = {input_name: {0: "batch_size"} for input_name in inputs_pytorch}
    dynamic_axis["output"] = {0: "batch_size"}
    try:
        with torch.no_grad():
            torch.onnx.export(
                model_pytorch,
                args=tuple(inputs_pytorch.values()),
                f=output_path,
                opset_version=12,
                do_constant_folding=True,
                input_names=list(inputs_pytorch.keys()),
                output_names=["output"],
                dynamic_axes=dynamic_axis,
                training=torch.onnx.TrainingMode.EVAL,
                verbose=False,
            )
    finally:
        # Reset the class-level flag even when the export fails, so a failed
        # export does not leave fake-quant mode switched on.
        if quantization:
            TensorQuantizer.use_fb_fake_quant = False

In [10]:
# Location of the ONNX export of the U2NETP model (hard-coded absolute path).
u2netp_onnx_path = "/home/g.racic/u2netp_onnx.onnx"

In [11]:
# Example inputs for the export: the key is used as the ONNX input name.
sample_image = dict(image=img_tensors)

In [12]:
# Export the model without quantization.
convert_to_onnx(
    model_pytorch=u2netp,
    output_path=u2netp_onnx_path,
    inputs_pytorch=sample_image,
    quantization=False,
)



## ONNX inference

In [13]:
# Open the exported model with ONNX Runtime and show the shape of its first input.
sess = InferenceSession(u2netp_onnx_path)
first_input = sess.get_inputs()[0]
print("The model expects input shape: ", first_input.shape)

The model expects input shape:  ['batch_size', 3, 192, 192]


In [None]:
# Run the ONNX session on the sample batch; the tensors are copied to host memory as NumPy first.
sess.run(None, {"image": img_tensors.cpu().numpy()})
# 150 ms ± 456 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)
# NOTE(review): the figure above looks like pasted %timeit output — confirm how it was measured.

In [15]:
# Load the exported graph so that its nodes can be inspected from Python.
onnx_model = onnx.load(u2netp_onnx_path)

## TRT Compilation

In [16]:
# TensorRT logger at VERBOSE severity, and the runtime created from it.
trt_logger = trt.Logger(trt.Logger.VERBOSE)
runtime = trt.Runtime(trt_logger)

In [29]:
# Just for debugging purpose: parse the exported ONNX file into a TensorRT network
# definition and list the ONNX node names that have no layer of the same name.
with trt.Builder(trt_logger) as builder, \
builder.create_network(
    flags=1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
) as network_definition, \
trt.OnnxParser(network_definition, trt_logger) as parser:
    with open(u2netp_onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            # Report every error recorded by the parser, not only the first one.
            for error_index in range(parser.num_errors):
                print(parser.get_error(error_index))
        else:
            print(f"Num layers: {network_definition.num_layers}")
            print(f"Num inputs: {network_definition.num_inputs}")
            print(f"Num outputs: {network_definition.num_outputs}")
            trt_all_nodes = {network_definition.get_layer(n).name for n in range(network_definition.num_layers)}
            onnx_all_nodes = {n.name for n in onnx_model.graph.node}
            print("Nodes in ONNX graph which are not in the network definition: ")
            print(onnx_all_nodes - trt_all_nodes)

Num layers: 371
Num inputs: 1
Num outputs: 7
Nodes in ONNX graph which are not in the network definition: 
{'Constant_44', 'Constant_24', 'Constant_331', 'Constant_286', 'Constant_280', 'Constant_373', 'Constant_238', 'Constant_39', 'Constant_320', 'Constant_275', 'Constant_87', 'Constant_395', 'Constant_220', 'Constant_72', 'Constant_112', 'Constant_149', 'Constant_122', 'Constant_29', 'Constant_368', 'Constant_34', 'Constant_315', 'Constant_389', 'Constant_117', 'Constant_363', 'Constant_270', 'Constant_386', 'Constant_310', 'Constant_325', 'Constant_243', 'Constant_358', 'Constant_197', 'Constant_82', 'Constant_398', 'Constant_392', 'Constant_144', 'Constant_378', 'Constant_249', 'Constant_77'}


In [17]:
# Min, optim, max shapes used for TRT optimizer; only the batch dimension (1, 5, 10) differs.
image_tensor_shapes = [(batch, 3, 192, 192) for batch in (1, 5, 10)]

In [18]:
# File the TensorRT engine is saved to and later loaded from.
# NOTE(review): the name says "vit", but the engine is built from the U2NETP ONNX export.
trt_vit_path = "/home/g.racic/trt_u2netp3"

In [19]:
# Build a TensorRT engine from the ONNX export, using the min / optimal / max
# input shapes stored in image_tensor_shapes.
engine = build_engine(
    runtime=runtime,
    onnx_file_path=u2netp_onnx_path,
    logger=trt_logger,
    min_shape=image_tensor_shapes[0],
    optimal_shape=image_tensor_shapes[1],
    max_shape=image_tensor_shapes[2],
    # 10000 * 1024 * 1024; presumably a size in bytes (about 9.8 GiB) — TODO confirm against build_engine.
    workspace_size=10000 * 1024 * 1024,
    fp16=True,
    int8=False
)

In [20]:
# Persist the built engine at trt_vit_path.
save_engine(engine=engine, engine_file_path=trt_vit_path)

## Throughput comparison

In [21]:
# Load the saved engine; the returned object is passed to benchmark() as the callable to time.
trt_model = load_engine(
    runtime=runtime, engine_file_path=trt_vit_path
)

In [22]:
# Host-side NumPy copy of the batch, keyed by the input name "image" used for the ONNX export.
img_inputs_numpy = [{"image": img_tensors.cpu().numpy()}]

In [23]:
# Time the TensorRT engine on the batch of 5 images.
benchmark(model_fn=trt_model, input_data=[img_inputs_numpy[0]], batch_size=5)

Warm up ...
Start timing ...
Iteration 100/1000, avg batch time 7.36 ms
Iteration 200/1000, avg batch time 7.35 ms
Iteration 300/1000, avg batch time 7.35 ms
Iteration 400/1000, avg batch time 7.36 ms
Iteration 500/1000, avg batch time 7.37 ms
Iteration 600/1000, avg batch time 7.37 ms
Iteration 700/1000, avg batch time 7.37 ms
Iteration 800/1000, avg batch time 7.36 ms
Iteration 900/1000, avg batch time 7.36 ms
Iteration 1000/1000, avg batch time 7.36 ms
Average throughput: 679.41 example/second


In [24]:
# Time the original TorchScript model on the same batch of 5 images.
benchmark(model_fn=u2netp, input_data=[img_tensors], batch_size=5)

Warm up ...
Start timing ...
Iteration 100/1000, avg batch time 14.29 ms
Iteration 200/1000, avg batch time 14.31 ms
Iteration 300/1000, avg batch time 14.30 ms
Iteration 400/1000, avg batch time 14.29 ms
Iteration 500/1000, avg batch time 14.30 ms
Iteration 600/1000, avg batch time 14.29 ms
Iteration 700/1000, avg batch time 14.29 ms
Iteration 800/1000, avg batch time 14.29 ms
Iteration 900/1000, avg batch time 14.29 ms
Iteration 1000/1000, avg batch time 14.29 ms
Average throughput: 349.87 example/second


In [26]:
# Compilation with Torch-TensorRT, using a half-precision copy of the batch as
# the example input and half precision as the only enabled precision.
trt_u2netp = torch_tensorrt.compile(
    u2netp,
    inputs=[img_tensors.half()],
    enabled_precisions={torch.half},
)

In [27]:
# Time the Torch-TensorRT module on a half-precision copy of the batch of 5 images.
benchmark(model_fn=trt_u2netp, input_data=[img_tensors.half()], batch_size=5)

Warm up ...
Start timing ...
Iteration 100/1000, avg batch time 8.22 ms
Iteration 200/1000, avg batch time 8.22 ms
Iteration 300/1000, avg batch time 8.21 ms
Iteration 400/1000, avg batch time 8.22 ms
Iteration 500/1000, avg batch time 8.22 ms
Iteration 600/1000, avg batch time 8.22 ms
Iteration 700/1000, avg batch time 8.22 ms
Iteration 800/1000, avg batch time 8.22 ms
Iteration 900/1000, avg batch time 8.22 ms
Iteration 1000/1000, avg batch time 8.22 ms
Average throughput: 608.46 example/second
