Before running inference:
1. Run [3.2_Model_conversion_trt.ipynb](3.2_Model_conversion_trt.ipynb) to convert the models' .onnx files into .trt engine files.
2. Make sure TensorRT [binaries](https://developer.nvidia.com/tensorrt/download/10x) are present and added to [Path](https://medium.com/@guillermovc/how-to-install-tensorrt-in-windows-10-71a4033c4407).
3. Make sure pycuda and tensorrt are installed via pip.

In [1]:
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from PIL import Image
import torchvision.transforms as transforms
from IPython.display import display
import tensorrt as trt
import cv2

from helpers.processing import display_img_with_map

In [2]:
# https://stackoverflow.com/questions/59280745/inference-with-tensorrt-engine-file-on-python
class TensorRTInference:
    """Run inference with a serialized TensorRT engine via PyCUDA buffers.

    On construction the engine file is deserialized, an execution context is
    created, and one page-locked host buffer plus one device buffer is
    allocated for every I/O tensor of the engine. ``infer`` then reuses those
    buffers for every call.

    Only engines with static tensor shapes are supported, and ``infer`` feeds
    only the first input tensor of the engine.
    """

    def __init__(self, engine_path):
        """Load the engine at ``engine_path`` and allocate all I/O buffers.

        Args:
            engine_path: Path to a serialized TensorRT engine file.

        Raises:
            RuntimeError: If the engine cannot be deserialized or no
                execution context can be created.
            ValueError: If the engine has a tensor with a dynamic dimension.
        """
        # initialize
        self.logger = trt.Logger(trt.Logger.ERROR)
        self.runtime = trt.Runtime(self.logger)

        # setup
        self.engine = self.load_engine(engine_path)
        self.context = self.engine.create_execution_context()
        if self.context is None:
            # fail here rather than with an AttributeError inside infer()
            raise RuntimeError(
                f"Failed to create a TensorRT execution context for {engine_path!r}"
            )

        # allocate buffers
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers(
            self.engine
        )

    def load_engine(self, engine_path):
        """Deserialize and return the engine stored at ``engine_path``.

        Raises:
            RuntimeError: If TensorRT cannot deserialize the file content
                (``deserialize_cuda_engine`` signals this by returning None).
        """
        with open(engine_path, "rb") as f:
            engine = self.runtime.deserialize_cuda_engine(f.read())
        if engine is None:
            raise RuntimeError(
                f"Failed to deserialize TensorRT engine from {engine_path!r}"
            )
        return engine

    class HostDeviceMem:
        """Pair of host/device buffers belonging to one I/O tensor."""

        def __init__(self, host_mem, device_mem, shape):
            # keeping track of addresses
            self.host = host_mem
            self.device = device_mem
            # keeping track of shape to un-flatten it later
            self.shape = shape

    def allocate_buffers(self, engine):
        """Allocate host and device buffers for every I/O tensor of ``engine``.

        Returns:
            A tuple ``(inputs, outputs, bindings, stream)`` where ``inputs``
            and ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` holds
            the device addresses in I/O tensor index order, and ``stream`` is
            a new CUDA stream.

        Raises:
            ValueError: If a tensor has a dynamic (negative) dimension, since
                its buffer size cannot be derived from the engine alone.
        """
        inputs, outputs, bindings = [], [], []
        stream = cuda.Stream()

        for i in range(engine.num_io_tensors):
            tensor_name = engine.get_tensor_name(i)
            shape = engine.get_tensor_shape(tensor_name)
            if any(dim < 0 for dim in shape):
                # a negative dim would give a meaningless volume below
                raise ValueError(
                    f"Tensor {tensor_name!r} has a dynamic shape {shape}; "
                    "only static shapes are supported"
                )
            size = trt.volume(shape)
            dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))

            # allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            # append the device buffer address to device bindings
            bindings.append(int(device_mem))

            # append to the appropriate input/output list
            buffer = self.HostDeviceMem(host_mem, device_mem, shape)
            if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
                inputs.append(buffer)
            else:
                outputs.append(buffer)

        return inputs, outputs, bindings, stream

    def infer(self, input_data):
        """Run one inference and return the outputs as NumPy arrays.

        Args:
            input_data: Array-like whose element count equals that of the
                engine's first input tensor. It is flattened before the copy.

        Returns:
            A list with one array per output tensor, reshaped to the tensor
            shape reported by the engine. Each array is an independent copy,
            so it stays valid after later ``infer`` calls.

        Raises:
            ValueError: If ``input_data`` has the wrong number of elements.
            RuntimeError: If TensorRT fails to enqueue the inference.
        """
        input_buffer = self.inputs[0]
        flat_input = np.asarray(input_data).ravel()
        # np.copyto would silently broadcast a size-1 input over the whole
        # buffer, so check the element count explicitly
        if flat_input.size != input_buffer.host.size:
            raise ValueError(
                f"Input has {flat_input.size} elements, "
                f"but the engine expects {input_buffer.host.size}"
            )

        # transfer input data to device
        np.copyto(input_buffer.host, flat_input)
        cuda.memcpy_htod_async(input_buffer.device, input_buffer.host, self.stream)

        # set tensor address
        for i in range(self.engine.num_io_tensors):
            self.context.set_tensor_address(
                self.engine.get_tensor_name(i), self.bindings[i]
            )

        # run inference; a False result means nothing was enqueued and the
        # output buffers would only hold stale data
        if not self.context.execute_async_v3(stream_handle=self.stream.handle):
            raise RuntimeError("TensorRT failed to enqueue the inference")

        # transfer predictions back
        for output_buffer in self.outputs:
            cuda.memcpy_dtoh_async(
                output_buffer.host, output_buffer.device, self.stream
            )

        # synchronize the stream
        self.stream.synchronize()

        # un-flatten the outputs; copy because the host buffers are reused
        # and overwritten by the next call
        return [
            output_buffer.host.reshape(output_buffer.shape).copy()
            for output_buffer in self.outputs
        ]

In [3]:
# Build the inference wrapper for the first engine file.
# NOTE(review): judging by the filename this is the FP32 EfficientNetV2-S engine — confirm.
engine_path = 'efficientnet_v2_s_downscaled_pcb_fp32.trt'
trt_inference = TensorRTInference(engine_path)

In [4]:
# Load the sample image and turn it into the tensor fed to the engines.
image_path = '../docs/sample_data/01_short_04_1926_1070.png'
image = cv2.imread(image_path, cv2.IMREAD_COLOR)
# cv2.imread returns None instead of raising when the file is missing or
# unreadable; fail here with a clear message rather than inside cvtColor.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
# OpenCV loads images in BGR channel order; convert to RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
transform = transforms.Compose([transforms.ToTensor()])
# unsqueeze(0) adds a leading dimension of size 1 (presumably the batch axis).
tensor = transform(image).unsqueeze(0)

In [None]:
%%timeit
# Time one full infer() call; the tensor.numpy() conversion is part of the timed body.
trt_inference.infer(tensor.numpy())

In [None]:
# Run one inference and display the first two outputs together with the RGB image.
# NOTE(review): the meaning of outputs[0] / outputs[1] depends on the engine's
# output order — confirm against helpers.processing.display_img_with_map.
outputs = trt_inference.infer(tensor.numpy())
display_img_with_map(outputs[0], outputs[1], image)

In [9]:
# Rebind engine_path / trt_inference to a wrapper around the next engine file.
# NOTE(review): judging by the filename this is the FP16 EfficientNetV2-S engine — confirm.
engine_path = 'efficientnet_v2_s_downscaled_pcb_fp16.trt'
trt_inference = TensorRTInference(engine_path)

In [None]:
%%timeit
# Time one full infer() call; the tensor.numpy() conversion is part of the timed body.
trt_inference.infer(tensor.numpy())

In [None]:
# Run one inference and display the first two outputs together with the RGB image.
# NOTE(review): the meaning of outputs[0] / outputs[1] depends on the engine's
# output order — confirm against helpers.processing.display_img_with_map.
outputs = trt_inference.infer(tensor.numpy())
display_img_with_map(outputs[0], outputs[1], image)

In [20]:
# Rebind engine_path / trt_inference to a wrapper around the next engine file.
# NOTE(review): judging by the filename this is the FP32 ViT-Tiny (patch 16, 224) engine — confirm.
engine_path = 'vit_tiny_patch16_224.augreg_in21k_ft_in1k_pcb_fp32.trt'
trt_inference = TensorRTInference(engine_path)

In [None]:
%%timeit
# Time one full infer() call; the tensor.numpy() conversion is part of the timed body.
trt_inference.infer(tensor.numpy())

In [None]:
# Run one inference and display the first two outputs together with the RGB image.
# NOTE(review): the meaning of outputs[0] / outputs[1] depends on the engine's
# output order — confirm against helpers.processing.display_img_with_map.
outputs = trt_inference.infer(tensor.numpy())
display_img_with_map(outputs[0], outputs[1], image)

In [23]:
# Rebind engine_path / trt_inference to a wrapper around the next engine file.
# NOTE(review): judging by the filename this is the FP16 ViT-Tiny (patch 16, 224) engine — confirm.
engine_path = 'vit_tiny_patch16_224.augreg_in21k_ft_in1k_pcb_fp16.trt'
trt_inference = TensorRTInference(engine_path)

In [None]:
%%timeit
# Time one full infer() call; the tensor.numpy() conversion is part of the timed body.
trt_inference.infer(tensor.numpy())

In [None]:
# Run one inference and display the first two outputs together with the RGB image.
# NOTE(review): the meaning of outputs[0] / outputs[1] depends on the engine's
# output order — confirm against helpers.processing.display_img_with_map.
outputs = trt_inference.infer(tensor.numpy())
display_img_with_map(outputs[0], outputs[1], image)