In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
import os

In [2]:
# Delete any model repository left over from a previous run; the next cell recreates the directory tree.
!rm -rf "../triton_model_repository"

In [3]:
# Create one "<model>/1" version directory per exported format inside the
# Triton model repository. exist_ok=True keeps the cell idempotent, so it can
# be re-run without first deleting the repository.
for model_dir in ("resnet18_torch", "resnet18_onnx", "resnet18_trt_fp32", "resnet18_trt_fp16"):
    os.makedirs(f"../triton_model_repository/{model_dir}/1", exist_ok=True)

In [4]:
# Parameters
BATCH_SIZE = 8                 # batch size of the example input used for TorchScript tracing
MODEL_NAME = 'resnet18'        # prefix of the checkpoint file and of every model directory below
NUM_CLASSES = 4                # number of output classes; must match the model's final Linear layer
INPUT_SHAPE = (3, 224, 224)    # per-sample input shape (channels, height, width), no batch dimension
CHANNEL_LAST = False           # True: move the model and example inputs to torch.channels_last

# All artefact paths are derived from MODEL_NAME so they cannot drift apart.
# NOTE: the makedirs cell above and the models.resnet18(...) call below still
# name the architecture explicitly; update them too when changing MODEL_NAME.
MODEL_REPOSITORY = '../triton_model_repository'
PY_MODEL_PATH = f'./saved/{MODEL_NAME}.pt'                                    # trained state_dict checkpoint
JIT_MODEL_PATH = f'{MODEL_REPOSITORY}/{MODEL_NAME}_torch/1/model.pt'          # TorchScript export
ONNX_MODEL_PATH = f'{MODEL_REPOSITORY}/{MODEL_NAME}_onnx/1/model.onnx'        # ONNX export
TRT_MODEL_PATH = f'{MODEL_REPOSITORY}/{MODEL_NAME}_trt_fp32/1/model.plan'     # TensorRT FP32 engine
TRT_MODEL_PATH_FP16 = f'{MODEL_REPOSITORY}/{MODEL_NAME}_trt_fp16/1/model.plan'  # TensorRT FP16 engine

In [5]:
# Pin the visible GPU *before* the first CUDA query. These variables are read
# when CUDA is initialised, so setting them after torch.cuda.is_available()
# may have no effect on which device is used.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Fall back to the CPU when no CUDA device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load PyTorch Model

In [6]:
# ResNet-18 backbone (no pretrained weights; they come from the checkpoint)
# whose final fully-connected layer is replaced by a two-layer classifier head.
model = models.resnet18(pretrained=False)
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 512),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    nn.Linear(512, NUM_CLASSES),   # output width follows the NUM_CLASSES parameter
)

if CHANNEL_LAST:
    model = model.to(device, memory_format=torch.channels_last)
else:
    model = model.to(device)

# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU still loads when the notebook falls back to the CPU.
model.load_state_dict(torch.load(PY_MODEL_PATH, map_location=device))

# Inference mode for the exports below (Dropout off, BatchNorm uses running
# statistics); as the last expression it also displays the module structure.
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

# Export to TorchScript

In [7]:
# Random example batch on the target device; it is the input that
# torch.jit.trace runs through the model to record the graph.
example = torch.randn(BATCH_SIZE, *INPUT_SHAPE, dtype=torch.float32, device=device)
if CHANNEL_LAST:
    # Same values and logical shape, stored in channels-last memory format.
    example = example.to(memory_format=torch.channels_last)

# Trace the model with the example batch and write the result to JIT_MODEL_PATH.
script = torch.jit.trace(model, example)
script.save(JIT_MODEL_PATH)

# Export to ONNX

In [8]:
# A single random sample is used as the export input; the batch axis is
# declared dynamic below.
x = torch.randn(1, *INPUT_SHAPE, dtype=torch.float32, device=device)
if CHANNEL_LAST:
    x = x.to(memory_format=torch.channels_last)

torch.onnx.export(
    model,                          # model being run
    x,                              # model input (or a tuple for multiple inputs)
    ONNX_MODEL_PATH,                # path of the saved ONNX model
    export_params=True,             # store the trained weights inside the model file
    opset_version=13,               # ONNX opset version to export with
    input_names=['input'],          # name of the model input
    output_names=['output'],        # name of the model output
    # Axis 0 of both tensors is exported as the variable-length 'batch_size' axis.
    dynamic_axes={tensor_name: {0: 'batch_size'} for tensor_name in ('input', 'output')},
)

# Export to TensorRT

TensorRT is a library for running an already-trained network quickly and efficiently on NVIDIA GPUs. However, some pre-processing steps may be required before converting the ONNX model to a TensorRT inference engine.

## Using Polygraphy

Polygraphy is a toolkit designed to assist in running and debugging deep learning models. It can run inference across different model formats, convert models to other formats, and compare the performance of models, all from the command line.

The `surgeon sanitize` subtool can be used to fold constants in graphs and remove unused nodes. In cases where shapes are statically known, it can also simplify subgraphs involving shape operations. A simple example is shown below:

Suppose you are computing `output = input + ((a + b) + c)`, where `a`, `b` and `c` are constants. Running the command given below makes `polygraphy` collapse `a`, `b` and `c` into a single constant tensor `d`, simplifying the equation to `output = input + d`.

Polygraphy's surgeon tool provides a constant folding function, which is an important step for newer models before converting the ONNX model into TensorRT.

In [9]:
## Use this for EfficientNetV2
#!polygraphy surgeon sanitize $ONNX_MODEL_PATH --fold-constants -o $ONNX_MODEL_PATH

You can use the `run` subtool to compare the outputs of the ONNX model when it is run with TensorRT and with ONNX Runtime.

In [10]:
# Run the exported ONNX model with both the TensorRT and the ONNX Runtime backends so their outputs can be compared.
!polygraphy run $ONNX_MODEL_PATH --trt --onnxrt

[W] 'colored' module is not installed, will not use colors when logging. To enable colors, please install the 'colored' module: python3 -m pip install colored
[I] trt-runner-N0-01/14/22-19:33:44     | Activating and starting inference
[01/14/2022-19:33:46] [TRT] [W] parsers/onnx/onnx2trt_utils.cpp:364: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[W]     Input tensor: input (dtype=DataType.FLOAT, shape=(-1, 3, 224, 224)) | No shapes provided; Will use shape: [1, 3, 224, 224] for min/opt/max in profile.
[W]     This will cause the tensor to have a static shape. If this is incorrect, please set the range of shapes for this input tensor.
[I]     Configuring with profiles: [Profile().add(input, min=[1, 3, 224, 224], opt=[1, 3, 224, 224], max=[1, 3, 224, 224])]
[I] Building engine with configuration:
    Workspace            | 16777216 bytes (16.00 MiB)
    Precision            | TF32: False, FP16: F

## Exporting to TensorRT inference engine

Finally, the model is converted to a TensorRT inference engine using `trtexec`, a command-line tool for working with TensorRT. The various flags used here are explained below:

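The `explicitBatch` flag tells TensorRT that the batch dimension is an explicit part of the network's tensor shapes, which is what allows a dynamic batch size. With the TensorRT version used here (8.2) the flag is deprecated and has no effect, because explicit batch is enabled automatically for ONNX models. `minShapes` and `maxShapes`, as their names suggest, are the minimum and maximum input shapes that you want to pass for inference, while `optShapes` is the preferred shape that the engine is optimized for.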

### FP32 Conversion

In [11]:
# Build the FP32 engine from the ONNX model with a dynamic batch dimension
# (min 1, optimised for 8, max 128) and save it to TRT_MODEL_PATH.
# --explicitBatch is omitted: TensorRT 8.2 reports it as deprecated with no
# effect, since explicit batch is enabled automatically for ONNX models.
!trtexec \
  --onnx=$ONNX_MODEL_PATH \
  --workspace=16382 \
  --minShapes=input:1x3x224x224 \
  --optShapes=input:8x3x224x224 \
  --maxShapes=input:128x3x224x224 \
  --saveEngine=$TRT_MODEL_PATH

&&&& RUNNING TensorRT.trtexec [TensorRT v8201] # trtexec --onnx=../triton_model_repository/resnet50_onnx/1/model.onnx --explicitBatch --workspace=16382 --optShapes=input:8x3x224x224 --maxShapes=input:128x3x224x224 --minShapes=input:1x3x224x224 --saveEngine=../triton_model_repository/resnet50_trt_fp32/1/model.plan
[01/14/2022-19:34:05] [W] --explicitBatch flag has been deprecated and has no effect!
[01/14/2022-19:34:05] [W] Explicit batch dim is automatically enabled if input model is ONNX or if dynamic shapes are provided when the engine is built.
[01/14/2022-19:34:05] [I] === Model Options ===
[01/14/2022-19:34:05] [I] Format: ONNX
[01/14/2022-19:34:05] [I] Model: ../triton_model_repository/resnet50_onnx/1/model.onnx
[01/14/2022-19:34:05] [I] Output:
[01/14/2022-19:34:05] [I] === Build Options ===
[01/14/2022-19:34:05] [I] Max batch: explicit batch
[01/14/2022-19:34:05] [I] Workspace: 16382 MiB
[01/14/2022-19:34:05] [I] minTiming: 1
[01/14/2022-19:34:05] [I] avgTiming: 8
[01/14/2022-1

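**If `CHANNEL_LAST` is enabled**, pass the input shapes in NHWC order instead:

```
!trtexec \
  --onnx=$ONNX_MODEL_PATH \
  --explicitBatch \
  --workspace=16382 \
  --optShapes=input:8x224x224x3 \
  --maxShapes=input:128x224x224x3 \
  --minShapes=input:1x224x224x3 \
  --saveEngine=$TRT_MODEL_PATH
```

Note: `torch.channels_last` changes the memory layout of a tensor, not its logical (N, C, H, W) shape, so check the input shape of the exported ONNX model before using these NHWC dimensions.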

### FP16 Conversion

As lower precision tends to run faster, we can convert the ONNX model to FP16 precision by simply passing the flag `--fp16`.

In [12]:
# Build the FP16 engine: same dynamic batch profile as the FP32 build
# (min 1, optimised for 8, max 128), with --fp16 enabling half precision.
# --explicitBatch is omitted: TensorRT 8.2 reports it as deprecated with no
# effect, since explicit batch is enabled automatically for ONNX models.
!trtexec \
  --onnx=$ONNX_MODEL_PATH \
  --workspace=16382 \
  --minShapes=input:1x3x224x224 \
  --optShapes=input:8x3x224x224 \
  --maxShapes=input:128x3x224x224 \
  --saveEngine=$TRT_MODEL_PATH_FP16 \
  --fp16

&&&& RUNNING TensorRT.trtexec [TensorRT v8201] # trtexec --onnx=../triton_model_repository/resnet50_onnx/1/model.onnx --explicitBatch --workspace=16382 --optShapes=input:8x3x224x224 --maxShapes=input:128x3x224x224 --minShapes=input:1x3x224x224 --saveEngine=../triton_model_repository/resnet50_trt_fp16/1/model.plan --fp16
[01/14/2022-19:34:37] [W] --explicitBatch flag has been deprecated and has no effect!
[01/14/2022-19:34:37] [W] Explicit batch dim is automatically enabled if input model is ONNX or if dynamic shapes are provided when the engine is built.
[01/14/2022-19:34:37] [I] === Model Options ===
[01/14/2022-19:34:37] [I] Format: ONNX
[01/14/2022-19:34:37] [I] Model: ../triton_model_repository/resnet50_onnx/1/model.onnx
[01/14/2022-19:34:37] [I] Output:
[01/14/2022-19:34:37] [I] === Build Options ===
[01/14/2022-19:34:37] [I] Max batch: explicit batch
[01/14/2022-19:34:37] [I] Workspace: 16382 MiB
[01/14/2022-19:34:37] [I] minTiming: 1
[01/14/2022-19:34:37] [I] avgTiming: 8
[01/14

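**If `CHANNEL_LAST` is enabled**, pass the input shapes in NHWC order instead:

```
!trtexec \
  --onnx=$ONNX_MODEL_PATH \
  --explicitBatch \
  --workspace=16382 \
  --optShapes=input:8x224x224x3 \
  --maxShapes=input:128x224x224x3 \
  --minShapes=input:1x224x224x3 \
  --saveEngine=$TRT_MODEL_PATH_FP16 --fp16
```

Note: `torch.channels_last` changes the memory layout of a tensor, not its logical (N, C, H, W) shape, so check the input shape of the exported ONNX model before using these NHWC dimensions.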

Test the TensorRT model on dummy data:

In [13]:
# Load the saved FP32 engine and run trtexec on it with a batch of 8 inputs of shape 3x224x224.
!trtexec --loadEngine=$TRT_MODEL_PATH --shapes=input:8x3x224x224

&&&& RUNNING TensorRT.trtexec [TensorRT v8201] # trtexec --loadEngine=../triton_model_repository/resnet50_trt_fp32/1/model.plan --shapes=input:8x3x224x224
[01/14/2022-19:35:47] [I] === Model Options ===
[01/14/2022-19:35:47] [I] Format: *
[01/14/2022-19:35:47] [I] Model: 
[01/14/2022-19:35:47] [I] Output:
[01/14/2022-19:35:47] [I] === Build Options ===
[01/14/2022-19:35:47] [I] Max batch: explicit batch
[01/14/2022-19:35:47] [I] Workspace: 16 MiB
[01/14/2022-19:35:47] [I] minTiming: 1
[01/14/2022-19:35:47] [I] avgTiming: 8
[01/14/2022-19:35:47] [I] Precision: FP32
[01/14/2022-19:35:47] [I] Calibration: 
[01/14/2022-19:35:47] [I] Refit: Disabled
[01/14/2022-19:35:47] [I] Sparsity: Disabled
[01/14/2022-19:35:47] [I] Safe mode: Disabled
[01/14/2022-19:35:47] [I] DirectIO mode: Disabled
[01/14/2022-19:35:47] [I] Restricted mode: Disabled
[01/14/2022-19:35:47] [I] Save engine: 
[01/14/2022-19:35:47] [I] Load engine: ../triton_model_repository/resnet50_trt_fp32/1/model.plan
[01/14/2022-19:35