# Resnet50 on webcam - tensorrt

## Import libraries

In [7]:
import torch
import torchvision
import cv2
import numpy as np
import time
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import matplotlib.pyplot as plt
import os
import skimage.transform as skt
from torch.onnx import OperatorExportTypes

## Download model

In [2]:
model = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)

## Labels

In [3]:
# dict with ImageNet labels
with open('imagenet_labels.txt') as f:
    labels = eval(f.read())

## Export model to ONNX

In [4]:
BATCH_SIZE=1
# BATCH_SIZE=32

In [8]:
onnx_file = f"resnet50_pytorch_BS{BATCH_SIZE}.onnx"
if not os.path.exists(onnx_file):
    dummy_input=torch.randn(BATCH_SIZE, 3, 224, 224)
    torch.onnx.export(model, dummy_input, onnx_file, verbose=False, input_names=["input"], output_names=["output"], operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK)
else:
    print(f"{onnx_file} file already exists")

## Export model to TRT

In [6]:
USE_FP16 = True
target_dtype = np.float16 if USE_FP16 else np.float32

In [7]:
tensorrt_file = f"resnet50_engine_pytorch_BS{BATCH_SIZE}.trt"
if not os.path.exists(tensorrt_file):
    if USE_FP16:
        !/usr/src/tensorrt/bin/trtexec --onnx=resnet50_pytorch_BS{BATCH_SIZE}.onnx --saveEngine=resnet50_engine_pytorch_BS{BATCH_SIZE}.trt  --explicitBatch --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16
    else:
        !/usr/src/tensorrt/bin/trtexec --onnx=resnet50_pytorch_BS{BATCH_SIZE}.onnx --saveEngine=resnet50_engine_pytorch_BS{BATCH_SIZE}.trt  --explicitBatch
else:
    print(f"{tensorrt_file} engine already exists")

resnet50_engine_pytorch_BS1.trt engine already exists


## Export model to ENGINE

In [8]:
USE_FP16 = True
target_dtype = np.float16 if USE_FP16 else np.float32

In [9]:
tensorrt_file = f"resnet50_engine_pytorch_BS{BATCH_SIZE}.engine"
if not os.path.exists(tensorrt_file):
    if USE_FP16:
        # !/usr/src/tensorrt/bin/trtexec --onnx=resnet50_pytorch_BS{BATCH_SIZE}.onnx --saveEngine={tensorrt_file}  --explicitBatch --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16
        !/usr/src/tensorrt/bin/trtexec --onnx=../../tensorrt/resnet50_pytorch_BS1.onnx --saveEngine=resnet50_engine_pytorch_BS1.engine --explicitBatch --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16
    else:
        !/usr/src/tensorrt/bin/trtexec --onnx=resnet50_pytorch_BS{BATCH_SIZE}.onnx --saveEngine={tensorrt_file}  --explicitBatch
else:
    print(f"{tensorrt_file} engine already exists")

&&&& RUNNING TensorRT.trtexec [TensorRT v8403] # /usr/src/tensorrt/bin/trtexec --onnx=resnet50_pytorch_BS1.onnx --saveEngine=resnet50_engine_pytorch_BS1.engine --explicitBatch --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16
[11/30/2022-09:30:40] [W] --explicitBatch flag has been deprecated and has no effect!
[11/30/2022-09:30:40] [W] Explicit batch dim is automatically enabled if input model is ONNX or if dynamic shapes are provided when the engine is built.
[11/30/2022-09:30:40] [I] === Model Options ===
[11/30/2022-09:30:40] [I] Format: ONNX
[11/30/2022-09:30:40] [I] Model: resnet50_pytorch_BS1.onnx
[11/30/2022-09:30:40] [I] Output:
[11/30/2022-09:30:40] [I] === Build Options ===
[11/30/2022-09:30:40] [I] Max batch: explicit batch
[11/30/2022-09:30:40] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[11/30/2022-09:30:40] [I] minTiming: 1
[11/30/2022-09:30:40] [I] avgTiming: 8
[11/30/2022-09:30:40] [I] Precision: FP32+F

## Config cuda an TRT model

In [55]:
f = open(f"resnet50_engine_pytorch_BS{BATCH_SIZE}.trt", "rb")
runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING)) 

engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

In [81]:
# Full HD resolution
CAPTURE_WIDTH = 1920
CAPTURE_HEIGHT = 1080
resize = True
if resize:
    # 360p resolution
    RESIZE_WIDTH = 360
    RESIZE_HEIGHT = 360

In [82]:
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, CAPTURE_WIDTH)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, CAPTURE_HEIGHT)

True

In [83]:
ret, frame = cap.read()
img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
print(f"Original image shape: {img.shape}, dtype: {img.dtype}")
# resize = False
if resize:
    img = skt.resize(img, (RESIZE_WIDTH, RESIZE_HEIGHT))
    print(f"Resized image shape: {img.shape}, dtype: {img.dtype}")
else:
    img = skt.resize(img, (CAPTURE_WIDTH, CAPTURE_HEIGHT))
    print(f"Resized image shape: {img.shape}, dtype: {img.dtype}")
img = np.expand_dims(np.array(img, dtype=np.float32), axis=0) # Expand image to have a batch dimension
print(f"Expanded image shape: {img.shape}, dtype: {img.dtype}")
input_batch = np.array(np.repeat(img, BATCH_SIZE, axis=0), dtype=np.float32) # Repeat across the batch dimension
print(f"Batched image shape: {input_batch.shape}")
# plt.imshow(input_batch[0].astype(np.float32))

Original image shape: (720, 1280, 3), dtype: uint8
Resized image shape: (360, 360, 3), dtype: float64
Expanded image shape: (1, 360, 360, 3), dtype: float32
Batched image shape: (1, 360, 360, 3)


In [84]:
cap.release()

In [85]:
def preprocess_image(img):
    norm = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    result = norm(torch.from_numpy(img).transpose(0,2).transpose(1,2))
    result = np.array(result, dtype=np.float16)
    return result

In [86]:
# need to set input and output precisions to FP16 to fully enable it
output = np.empty([BATCH_SIZE, 1000], dtype = target_dtype) 

# allocate device memory
d_input = cuda.mem_alloc(1 * np.array(input_batch, dtype=np.float16).nbytes)
d_output = cuda.mem_alloc(1 * output.nbytes)

bindings = [int(d_input), int(d_output)]

stream = cuda.Stream()

In [87]:
def predict(batch): # result gets copied into output
    # transfer input data to device
    cuda.memcpy_htod_async(d_input, batch, stream)
    # execute model
    context.execute_async_v2(bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # syncronize threads
    stream.synchronize()
    
    return output

In [89]:
# Open webcam and start inference
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, CAPTURE_WIDTH)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, CAPTURE_HEIGHT)
font = cv2.FONT_HERSHEY_SIMPLEX
fontScale = 0.5
fontColor = (10,10,10)
lineThickness= 1
lineType = cv2.LINE_AA
pos = 30
do_preprocess = True

while True:
    if 5 >= BATCH_SIZE:
        ret, frame = cap.read()
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        if resize:
            img = skt.resize(img, (RESIZE_WIDTH, RESIZE_HEIGHT))
        else:
            img = skt.resize(img, (CAPTURE_WIDTH, CAPTURE_HEIGHT))
        img = np.expand_dims(np.array(img, dtype=np.float32), axis=0)
    t0 = time.time()
    ret, frame = cap.read()
    t_frame = time.time()
    pos = 30
    cv2.putText(frame, f"Image resolution: {frame.shape}", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20
    cv2.putText(frame, f"Open frame time: {((t_frame - t0)*1000):.2f} ms", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20
    if not ret:
        continue

    # Preprocess image
    t1 = time.time()
    img = frame.copy()
    t_copy = time.time()
    cv2.putText(frame, f"Copy frame time: {((t_copy - t1)*1000):.2f} ms", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    t_rgb = time.time()
    cv2.putText(frame, f"RGB conversion time: {((t_rgb - t_copy)*1000):.2f} ms", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20
    if resize:
        img = skt.resize(img, (RESIZE_HEIGHT, RESIZE_HEIGHT))
    else:
        img = skt.resize(img, (CAPTURE_WIDTH, CAPTURE_HEIGHT))
    t_resize = time.time()
    cv2.putText(frame, f"Resize time: {((t_resize - t_rgb)*1000):.2f} ms", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20
    img = np.expand_dims(np.array(img, dtype=np.float32), axis=0) # Expand image to have a batch dimension
    t_expand = time.time()
    cv2.putText(frame, f"Expand time: {((t_expand - t_resize)*1000):.2f} ms", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20
    if BATCH_SIZE > 1:
        input_batch = np.array(np.repeat(img, BATCH_SIZE, axis=0), dtype=np.float32) # Repeat across the batch dimension
        t_repeat = time.time()
        cv2.putText(frame, f"Repeat time: {((t_repeat - t_expand)*1000):.2f} ms", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
        pos += 20
    else:
        input_batch = img.copy()
    if do_preprocess:
        t_input = time.time()
        preprocessed_image = np.array([preprocess_image(image) for image in input_batch])
        t_preprocess = time.time()
        cv2.putText(frame, f"Preprocess time: {((t_preprocess - t_input)*1000):.2f} ms", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
        pos += 20
        cv2.putText(frame, f"preprocessed_image shape: {preprocessed_image.shape}", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
        pos += 20
        cv2.putText(frame, f"input_batch shape: {input_batch.shape}", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
        pos += 20
    else:
        preprocessed_image = input_batch[0].astype(np.float16)
    t_total_preprocess = time.time()
    cv2.putText(frame, f"Total preprocess time: {((t_total_preprocess - t1)*1000):.2f} ms", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20

    # Inference
    start = time.time()
    outputs = predict(preprocessed_image)
    end = time.time()
    cv2.putText(frame, f"Inference time: {((end - start)*1000):.2f} ms", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20

    # Postprocess
    t2 = time.time()
    idx = outputs[0].argmax()
    t_postprocess = time.time()
    cv2.putText(frame, f"Predicted: {idx}-{labels[idx]}", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20
    cv2.putText(frame, f"Postprocess time: {((t_postprocess - t2)*1000):.2f} ms", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20

    # FPS
    T = time.time() - t0
    cv2.putText(frame, f"Total time: {(T*1000):.2f} ms", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20
    cv2.putText(frame, f"FPS: {1/T:.2f}", (10, pos), font, fontScale, fontColor, lineThickness, lineType)
    pos += 20

    # Display
    cv2.imshow("frame", frame)
    if cv2.waitKey(1) == ord('q'):
        break


cap.release()
cv2.destroyAllWindows()

In [51]:
cap.release()
cv2.destroyAllWindows()