In [5]:
# Assert that you are running on tensorrt container
! which trtexec
# If you want to check trtexec running options, run trtexec -h 
# ! trtexec -h

In [14]:
# prepare resnet-18 model
import timm
import torch
import inspect

model = timm.create_model("resnet18").cpu()

# check model forward's input parameter name
IN_SHAPE = (3, 224, 224)
OUT_SHAPE = (1000,)
signature = inspect.signature(model.forward)
print(signature)

# export model to onnx format
# For more details, please refer to https://pytorch.org/docs/stable/onnx.html
dummy_input = (torch.randn(*((1,)+IN_SHAPE)).cpu(),)
input_names = ["x"]
output_names = ["outputs"]

model = model.eval() # Need to set model to eval

torch.onnx.export(
    model,
    dummy_input,
    "resnet18.onnx",
    dynamic_axes={
        "x" : {0: "batch"}, # To support dynamic shape on target axis
    },
    verbose=True,
    input_names=input_names, # Need to be aligned with actual parameter name in forward function
    output_names=output_names # Required to match with number of actual output 
)

(x)
Exported graph: graph(%x : Float(*, 3, 224, 224, strides=[150528, 50176, 224, 1], requires_grad=0, device=cpu),
      %fc.weight : Float(1000, 512, strides=[512, 1], requires_grad=1, device=cpu),
      %fc.bias : Float(1000, strides=[1], requires_grad=1, device=cpu),
      %onnx::Conv_193 : Float(64, 3, 7, 7, strides=[147, 49, 7, 1], requires_grad=0, device=cpu),
      %onnx::Conv_194 : Float(64, strides=[1], requires_grad=0, device=cpu),
      %onnx::Conv_196 : Float(64, 64, 3, 3, strides=[576, 9, 3, 1], requires_grad=0, device=cpu),
      %onnx::Conv_199 : Float(64, 64, 3, 3, strides=[576, 9, 3, 1], requires_grad=0, device=cpu),
      %onnx::Conv_202 : Float(64, 64, 3, 3, strides=[576, 9, 3, 1], requires_grad=0, device=cpu),
      %onnx::Conv_208 : Float(128, 64, 3, 3, strides=[576, 9, 3, 1], requires_grad=0, device=cpu),
      %onnx::Conv_209 : Float(128, strides=[1], requires_grad=0, device=cpu),
      %onnx::Conv_211 : Float(128, 128, 3, 3, strides=[1152, 9, 3, 1], requires_gr

In [7]:
# visualize model graph with netron
# ! netron resnet18.onnx -b

In [8]:
# check onnx integrity

import onnx
import onnxruntime as ort
from tqdm import tqdm 

NUM_TEST = 10
B = 16

onnx_model = onnx.load("resnet18.onnx")
sess = ort.InferenceSession(
    onnx_model.SerializeToString(), 
    providers=["CPUExecutionProvider"]
)

mean_diff = 0
with torch.no_grad():
    for _ in tqdm(range(NUM_TEST)):
        input_dict = {"x" : torch.randn(*((B,) + SHAPE))}
        torch_output = model(**input_dict)
        onnx_output = sess.run(output_names, {k : v.numpy() for k, v in input_dict.items()})
        mean_diff += (torch_output - torch.from_numpy(onnx_output[0])).square().mean()


2023-07-26 05:36:56.142927328 [E:onnxruntime:Default, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 313729, index: 1, mask: {2, 26, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.
2023-07-26 05:36:56.142923523 [E:onnxruntime:Default, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 313728, index: 0, mask: {1, 25, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.
2023-07-26 05:36:56.145980903 [E:onnxruntime:Default, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 313731, index: 3, mask: {4, 28, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.
2023-07-26 05:36:56.163261299 [E:onnxruntime:Default, env.cc:251 ThreadMain] pthread_setaffinity_np failed for thread: 313732, index: 4, mask: {5, 29, }, error code: 22 error msg: Invalid argument. S

In [9]:
print(mean_diff / NUM_TEST)

tensor(5.3039e-16)


In [12]:
# change onnx model to tensorrt using trtexec

! /opt/tensorrt/bin/trtexec --onnx=resnet18.onnx --minShapes=x:1x3x224x224 --optShapes=x:16x3x224x224 --maxShapes=x:32x3x224x224 --useCudaGraph --saveEngine=resnet18.plan --verbose=true 


&&&& RUNNING TensorRT.trtexec [TensorRT v8601] # /opt/tensorrt/bin/trtexec --onnx=resnet18.onnx --minShapes=x:1x3x224x224 --optShapes=x:16x3x224x224 --maxShapes=x:32x3x224x224 --useCudaGraph --saveEngine=resnet18.plan --verbose=true
[07/26/2023-05:37:37] [I] === Model Options ===
[07/26/2023-05:37:37] [I] Format: ONNX
[07/26/2023-05:37:37] [I] Model: resnet18.onnx
[07/26/2023-05:37:37] [I] Output:
[07/26/2023-05:37:37] [I] === Build Options ===
[07/26/2023-05:37:37] [I] Max batch: explicit batch
[07/26/2023-05:37:37] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[07/26/2023-05:37:37] [I] minTiming: 1
[07/26/2023-05:37:37] [I] avgTiming: 8
[07/26/2023-05:37:37] [I] Precision: FP32
[07/26/2023-05:37:37] [I] LayerPrecisions: 
[07/26/2023-05:37:37] [I] Layer Device Types: 
[07/26/2023-05:37:37] [I] Calibration: 
[07/26/2023-05:37:37] [I] Refit: Disabled
[07/26/2023-05:37:37] [I] Version Compatible: Disabled
[07/26/2023-05:37:37] [I] T

In [17]:
# type: ignore
from tensorrt_handson_lab.tensorrt_utils import common
import numpy as np
from typing import Dict
import tensorrt as trt
from cuda import cudart
import time

def infer(input_bindings, output_bindings, context, batch: Dict[str, np.ndarray]):
    # Copy given input to device memory (GPU memory)
    
    st = time.time()

    allocations = []
    for k, bindings in input_bindings.items():
        allocations.append(bindings["allocation"])

    for k, bindings in output_bindings.items():
        allocations.append(bindings["allocation"])

    for k, val in batch.items():
        if input_bindings[k]["shape"][0] > val.shape[0]:
            padded = np.zeros(dtype=input_bindings[k]["dtype"], shape=input_bindings[k]["shape"])
            padded[: len(val)] = val
        common.memcpy_host_to_device(
            input_bindings[k]["allocation"],
            np.ascontiguousarray(val.astype(input_bindings[k]["dtype"])),
        )
    
    # execute model with tensorrt runtime
    context.execute_v2(allocations)

    # prepare host memory
    output_dict = {}
    for k, ob in output_bindings.items():
        host_output = np.zeros(dtype=ob["dtype"], shape=ob["shape"])
        common.memcpy_device_to_host(host_output, ob["allocation"])
        output_dict[k] = host_output
        

    cost = time.time() - st
    return output_dict, cost

torch_load_time = 0
trt_load_time = 0

with open("resnet18.plan", "rb")as f, trt.Runtime(trt.Logger(trt.Logger.INFO)) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

context = engine.create_execution_context()

in_name = engine.get_tensor_name(0)
in_dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(in_name)))
in_shape = list((B,) + IN_SHAPE)
in_size = in_dtype.itemsize * np.prod(in_shape)
input_bindings = {
    in_name : {
        "index" : 0,
        "name" : in_name,
        "dtype" : in_dtype,
        "shape" : in_shape,
        "allocation" : common.cuda_call(cudart.cudaMalloc(in_size))
    }
}
context.set_binding_shape(0, in_shape) # Need to specify binding shape

out_name = engine.get_tensor_name(1)
out_dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(out_name)))
out_shape = list((B,) + OUT_SHAPE)
out_size = out_dtype.itemsize * np.prod(out_shape)
output_bindings = {
    out_name : {
        "index" : 1,
        "name" : out_name,
        "dtype" : out_dtype,
        "shape" : out_shape,
        "allocation" : common.cuda_call(cudart.cudaMalloc(out_size))
    }
}

print(input_bindings)
print(output_bindings)

model.cpu()
st = time.time()
model = model.cuda().eval()
trt_load_time = time.time() - st

mean_diff = 0
torch_cost = 0
trt_cost = 0
with torch.no_grad():
    for _ in tqdm(range(NUM_TEST)):
        st = time.time()
        input_dict = {"x" : torch.randn(*((B,) + IN_SHAPE)).cuda()}
        torch_output = model(**input_dict)
        torch_cost += time.time() - st
        trt_output, trt_cost_batch = infer(
            input_bindings=input_bindings,
            output_bindings=output_bindings, 
            context=context,
            batch={k:v.cpu().numpy() for k,v in input_dict.items()})
        trt_cost += trt_cost_batch
        mean_diff += (torch_output.cpu() - torch.from_numpy(trt_output["outputs"])).square().mean()

  context.set_binding_shape(0, in_shape) # Need to specify binding shape


[07/26/2023-05:39:01] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
[07/26/2023-05:39:01] [TRT] [I] Loaded engine size: 45 MiB
[07/26/2023-05:39:01] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +45, now: CPU 0, GPU 250 (MiB)
[07/26/2023-05:39:01] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +161, now: CPU 0, GPU 411 (MiB)
[07/26/2023-05:39:01] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading
{'x': {'index': 0, 'name': 'x', 'dtype': dtype('float32'), 'shape': [16, 3, 224, 224], 'allocation': 1399

100%|██████████| 10/10 [00:00<00:00, 21.90it/s]


In [18]:
print(mean_diff / NUM_TEST)

print(torch_cost / NUM_TEST, trt_cost /  NUM_TEST)

tensor(0.0190)
0.025162768363952637 0.011265873908996582
