### Mask-RCNN inference with TensorFlow, ONNX Runtime and a TensorRT engine. Balloon dataset

In [None]:
import os

# Step one directory up from the notebook's location; every relative path
# used in the cells below is resolved against this new working directory.
os.chdir(os.pardir)

In [None]:
import subprocess
import cv2
import numpy as np
import tf2onnx
import onnx
import onnxruntime as ort
import onnx_graphsurgeon as gs
import matplotlib.pyplot as plt

from layers import losses
from training import get_optimizer
from model import mask_rcnn_functional
from common import inference_utils
from common.inference_utils import process_input
from common import utils
from common.config import CONFIG

from common.inference_optimize import maskrcnn_to_onnx, modify_onnx_model

import tensorflow as tf
# NOTE(review): presumably caps the GPU memory TensorFlow may allocate at
# 1500 (MB?) so that onnxruntime / TensorRT can share the same device later
# in this notebook - confirm the unit against common.utils.
utils.tf_limit_gpu_memory(tf, 1500)

In [None]:
# Record the environment this notebook ran in: load the `watermark` IPython
# extension, print the general run/system information, then the versions of
# the packages imported above.
%load_ext watermark
%watermark
%watermark --iversions

#### Prepare model for inference

In [None]:
# File name of the trained checkpoint shipped with the test samples
checkpoint = 'maskrcnn_seresnet34_14735ea1954396a749b4de160c9ce5c8_cp-0050.ckpt'
# Relative location of that checkpoint: <parent dir>/tests/samples/balloon/<checkpoint>
weights_path = os.path.join(os.pardir, 'tests', 'samples', 'balloon', checkpoint)
# Bare expression so the notebook displays the resolved path
weights_path

In [None]:
# Dataset- and backbone-specific overrides for the shared CONFIG

balloon_overrides = {
    'class_dict': {'balloon': 1, 'background': 0},
    'num_classes': 2,
    'backbone': 'seresnet34',
}
CONFIG.update(balloon_overrides)

# Length of the image-meta vector: five fixed-size groups plus one entry per class.
# NOTE(review): the 1 + 3 + 3 + 4 + 1 split presumably mirrors the usual Mask-RCNN
# meta layout (id, original shape, shape, window, scale) - confirm in the model code.
CONFIG.update({'meta_shape': (1 + 3 + 3 + 4 + 1 + CONFIG['num_classes'])})

# Base name for exported artifacts: maskrcnn_<backbone>_<image shape joined by "_">
shape_tag = '_'.join(str(dim) for dim in CONFIG['image_shape'])
model_name = f"maskrcnn_{CONFIG['backbone']}_{shape_tag}"

In [None]:
# NOTE: `inference_config` is the same object as CONFIG (no copy is made),
# so switching 'training' off here is visible through CONFIG as well.
inference_config = CONFIG
inference_config.update({'training': False})

# Build the inference graph, then load the checkpoint weights into it
inference_model = inference_utils.load_mrcnn_weights(
    model=mask_rcnn_functional(config=inference_config),
    weights_path=weights_path,
    verbose=True,
)

---

#### Run several tests with tensorflow

In [None]:
# Folder with the sample images used by every inference loop below;
# the listing is displayed as the cell output.
test_images_path = '../tests/images/balloon'
os.listdir(test_images_path)

In [None]:
# Run the TensorFlow model on every file of the test folder and plot the results
for img_name in os.listdir(test_images_path):
    # cv2 loads BGR; an RGB copy is kept for plotting only
    img = cv2.imread(os.path.join(test_images_path, img_name))
    img_show = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_processed, image_meta, window = process_input(img, CONFIG)

    # Both model inputs get a leading batch axis
    batch_inputs = [np.expand_dims(arr, 0) for arr in (img_processed, image_meta)]
    output = inference_model(batch_inputs)

    (detections, mrcnn_probs, mrcnn_bbox, mrcnn_mask,
     rpn_rois, rpn_class, rpn_bbox) = output

    print(img_name, '\nOutput shapes:')
    for out in output:
        print(out.shape)

    # Extract bboxes, class_ids, scores and full-size masks
    # (first batch element only, converted from tensors to numpy arrays)
    boxes, class_ids, scores, full_masks = utils.reformat_detections(
        detections=detections[0].numpy(),
        mrcnn_mask=mrcnn_mask[0].numpy(),
        original_image_shape=img.shape,
        image_shape=img_processed.shape,
        window=window,
    )

    fig = plt.figure(figsize=(10, 10))
    plt.title('Input data')
    plt.imshow(img_show)

    # One figure per detection: the last axis of full_masks is moved to the
    # front so it can be iterated together with class ids and scores
    masks_first = np.moveaxis(full_masks, -1, 0)
    for c, s, fm in zip(class_ids, scores, masks_first):
        fig = plt.figure(figsize=(5, 5))
        plt.title(f'Mask. class_id: {c} score: {s}')
        plt.imshow(fm)

#### Convert model to .onnx with tf2onnx

In [None]:
# Static input signatures used for the ONNX export: one spec for the image
# batch and one for the per-image meta vector. The names given here are the
# keys used to feed the onnxruntime session later in the notebook.
input_spec = (
    tf.TensorSpec(shape=(CONFIG['batch_size'], *CONFIG['image_shape']),
                  dtype=tf.float32,
                  name="input_image"),
    tf.TensorSpec(shape=(CONFIG['batch_size'], CONFIG['meta_shape']),
                  dtype=tf.float32,
                  name="input_image_meta"),
)

In [None]:
# Export the inference model to ONNX; the extra kwargs request opset 11
maskrcnn_to_onnx(
    model=inference_model,
    model_name=model_name,
    input_spec=input_spec,
    kwargs=dict(opset=11),
)

#### Load onnx model and check it 

In [None]:
# Load the ONNX model.
# The file name is derived from `model_name` (the same name that is used when
# the exported file is passed to modify_onnx_model below) instead of a
# hard-coded "512_512_3" suffix, so it keeps working when
# CONFIG['image_shape'] changes.
model = onnx.load(f'../weights/{model_name}.onnx')
# Check that the IR is well formed
onnx.checker.check_model(model)
# Print a human readable representation of the graph
print(onnx.helper.printable_graph(model.graph))

#### Run several tests with onnxruntime

In [None]:
# Open an onnxruntime session on the exported model.
# The path follows `model_name` rather than a hard-coded "512_512_3" suffix so
# it stays in sync with CONFIG['image_shape'].
sess = ort.InferenceSession(f'../weights/{model_name}.onnx')
print(f'Inputs: {[x.name for x in sess.get_inputs()]}\nOutputs:{[x.name for x in sess.get_outputs()]}')

In [None]:
# Run the onnxruntime session on every file of the test folder and plot the results.

# The session's output names do not change between images: query them once
ort_output_names = [x.name for x in sess.get_outputs()]

for img_name in os.listdir(test_images_path):
    # cv2 loads BGR; an RGB copy is kept for plotting only
    img = cv2.imread(os.path.join(test_images_path, img_name))
    img_show = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_processed, image_meta, window = process_input(img, CONFIG)

    try:
        # Inputs are fed by the names given in the export signature,
        # with a leading batch axis and an explicit float32 cast
        output = sess.run(output_names=ort_output_names,
                          input_feed={'input_image': np.expand_dims(img_processed, 0).astype('float32'),
                                      'input_image_meta': np.expand_dims(image_meta, 0).astype('float32'),
                                     }
                         )

        detections, mrcnn_probs, mrcnn_bbox, mrcnn_mask, rpn_rois, rpn_class, rpn_bbox = output

        print(img_name, '\nOutput shapes:')
        for out in output:
            print(out.shape)

        # Extract bboxes, class_ids, scores and full-size masks
        boxes, class_ids, scores, full_masks = \
        utils.reformat_detections(detections=detections[0],
                                  mrcnn_mask=mrcnn_mask[0],
                                  original_image_shape=img.shape,
                                  image_shape=img_processed.shape,
                                  window=window
                                 )

        fig = plt.figure(figsize=(10, 10))
        plt.title('Input data')
        plt.imshow(img_show)

        for c, s, fm in zip(class_ids, scores, np.moveaxis(full_masks, -1, 0)):

            fig = plt.figure(figsize=(5, 5))
            plt.title(f'Mask. class_id: {c} score: {s}')
            plt.imshow(fm)
    except Exception as err:
        # Best effort: keep processing the remaining images, but report the
        # failure instead of silently hiding it. `Exception` (not a bare
        # except) lets KeyboardInterrupt still stop the cell.
        print(f'{img_name}: onnxruntime inference failed: {err!r}')

#### Configure model for TensorRT

In [None]:
# Adapt the exported ONNX graph for TensorRT.
# NOTE(review): the cells below read '<model_name>_trt_mod.onnx', which is
# presumably the file written by this call - confirm in common.inference_optimize.
modify_onnx_model(model_path=f'../weights/{model_name}.onnx',
                  config=CONFIG,
                  verbose=True
                 )

#### TensorRT optimization

__With trtexec:__ 

In [None]:
%%time

# trtexec is run from the weights folder, so model files are given by bare name
os.chdir('../weights')

# Construct appropriate command
fp16_mode = False

# The precision tag of the engine file name is chosen up front. The previous
# `command[2].replace('32', '16')` had no effect (str.replace returns a new
# string, which was discarded), and a blind '32' -> '16' replace could also
# have altered the model name itself.
precision = 'fp16' if fp16_mode else 'fp32'

command = [os.environ['TRTEXEC'],
           f'--onnx={model_name}_trt_mod.onnx',
           f'--saveEngine={model_name}_trt_mod_{precision}.engine',
           '--workspace=2048',
           '--explicitBatch',
           '--verbose',
          ]

# fp16 param
if fp16_mode:
    command.append('--fp16')

# tacticSources param
# Not needed on Jetson with aarch64 architecture for now.
arch = os.uname().machine
if arch == 'x86_64':
    command.append('--tacticSources=-cublasLt,+cublas')

print(f'\nArch: {arch}\ntrtexec command list: {command}')

# check=True raises CalledProcessError when trtexec exits with a non-zero code
result = subprocess.run(command, capture_output=True, check=True)
# Print stdout inference result (only the tail of the verbose log)
print(result.stdout.decode('utf8')[-2495:])

__With python TensorRT API:__


In [None]:
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

In [None]:
# Parameters of the TensorRT build done with the python API

# Scratch memory the builder may use, in Mb
wspace_size = 2048
# Build with the FP16 precision flag enabled
fp16_mode = True
# Largest batch the engine is built for
max_batch_size = 1

In [None]:
%%time

# Build a TensorRT engine from the TensorRT-prepared ONNX model with the
# python API and serialize it to the weights folder.

# Init TensorRT Logger
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
# Init TensorRT plugins
trt.init_libnvinfer_plugins(TRT_LOGGER, "")
# Set tensorrt-prepared onnx model
onnx_model_path = f'../weights/{model_name}_trt_mod.onnx'

# Use explicit batch
explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_builder_config() as builder_config, \
        builder.create_network(explicit_batch) as network, \
        trt.OnnxParser(network, TRT_LOGGER) as parser:

    with open(onnx_model_path, 'rb') as model:
        # parse() reports failure through its return value; without this
        # check a broken model only shows up later as an empty network
        if not parser.parse(model.read()):
            for err_idx in range(parser.num_errors):
                print(parser.get_error(err_idx))
            raise RuntimeError(f'Failed to parse ONNX model: {onnx_model_path}')

    print('Num of detected layers: ', network.num_layers)
    print('Detected inputs: ', network.num_inputs)
    print('Detected outputs: ', network.num_outputs)

    # Workspace size
    # 1e6 bytes == 1Mb
    builder_config.max_workspace_size = int(1e6 * wspace_size)

    # Precision mode
    if fp16_mode:
        builder_config.set_flag(trt.BuilderFlag.FP16)

    # Max batch size
    builder.max_batch_size = max_batch_size

    # Set the list of tactic sources: cuBLAS bit on, cuBLASLt bit off
    # Not needed for Jetson with aarch64 architecture for now
    arch = os.uname().machine
    if arch == 'x86_64':
        tactic_source = 1 << int(trt.TacticSource.CUBLAS) | 0 << int(trt.TacticSource.CUBLAS_LT)
        builder_config.set_tactic_sources(tactic_source)

    # Make TensorRT engine
    engine = builder.build_engine(network, builder_config)
    # A failed build yields None; fail here with a clear message instead of
    # an AttributeError on engine.serialize() below
    if engine is None:
        raise RuntimeError('TensorRT engine build failed, see the TensorRT log above')

    # Save TensorRT engine
    # NOTE(review): the fp32 name carries an extra "_trt" suffix, presumably to
    # keep it apart from the fp32 engine written by trtexec - confirm.
    if fp16_mode:
        trt_model_name = f'../weights/{model_name}_trt_mod_fp16.engine'
    else:
        trt_model_name = f'../weights/{model_name}_trt_mod_fp32_trt.engine'

    with open(trt_model_name, "wb") as f:
        f.write(engine.serialize())

#### Run TensorRT inference

In [None]:
def trt_mrcnn_inference(model, image):
    """
    Run one forward pass of a loaded Mask-RCNN TensorRT engine.

    Args:
        model: dict holding the TensorRT/pycuda objects read below:
               'engine', 'stream', 'context', 'device_input',
               'device_output1', 'device_output2', 'host_output1', 'host_output2'
        image: prepared image for inference; it is cast to a C-ordered float32
               array before being copied to the device

    Returns: trt_mrcnn_detection,
             trt_mrcnn_mask
             float32 arrays reshaped to the engine binding shapes
             'mrcnn_detection' and 'mrcnn_mask'

    """

    # Unpack the trt-variables from the dict for transparency
    engine, stream, context = model['engine'], model['stream'], model['context']
    device_input = model['device_input']
    device_output1, device_output2 = model['device_output1'], model['device_output2']
    host_output1, host_output2 = model['host_output1'], model['host_output2']

    # Upload the input on the stream
    host_input = image.astype(dtype=np.float32, order='C')
    cuda.memcpy_htod_async(device_input, host_input, stream)

    # Enqueue the engine execution; bindings are raw device addresses
    # ordered as input, detection output, mask output
    bindings = [int(buf) for buf in (device_input, device_output1, device_output2)]
    context.execute_async(bindings=bindings, stream_handle=stream.handle)

    # Queue the copies of both outputs back to the host, then wait for the stream
    for host_buf, device_buf in ((host_output1, device_output1),
                                 (host_output2, device_output2)):
        cuda.memcpy_dtoh_async(host_buf, device_buf, stream)
    stream.synchronize()

    # Reshape the flat host buffers to the binding shapes reported by the engine
    detection_shape = engine.get_binding_shape('mrcnn_detection')
    mask_shape = engine.get_binding_shape('mrcnn_mask')
    trt_mrcnn_detection = host_output1.reshape(detection_shape).astype(dtype=np.float32)
    trt_mrcnn_mask = host_output2.reshape(mask_shape).astype(dtype=np.float32)

    return trt_mrcnn_detection, trt_mrcnn_mask

In [None]:
def _alloc_trt_output(engine, binding_name):
    # Allocate a flat float32 host buffer and an equally sized device buffer for one output binding.
    binding_shape = engine.get_binding_shape(binding_name)
    host_buffer = cuda.pagelocked_empty(trt.volume(binding_shape) * engine.max_batch_size,
                                        dtype=np.float32)
    device_buffer = cuda.mem_alloc(host_buffer.nbytes)
    return host_buffer, device_buffer


def set_mrcnn_trt_engine(model_path):

    """
    Load TensorRT engine via pycuda
    Args:
        model_path: model path to TensorRT-engine

    Returns: python dict of attributes for pycuda model inference with keys
             'engine', 'stream', 'context', 'device_input',
             'device_output1', 'device_output2' (device buffers),
             'host_output1', 'host_output2' (host buffers);
             output 1 is the 'mrcnn_detection' binding, output 2 is 'mrcnn_mask'

    Raises: RuntimeError if the engine file cannot be deserialized

    """

    trt_logger = trt.Logger(trt.Logger.VERBOSE)
    trt.init_libnvinfer_plugins(trt_logger, "")

    with open(model_path, "rb") as f, trt.Runtime(trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # A failed deserialization yields None; report it here rather than as an
    # AttributeError on the next line
    if engine is None:
        raise RuntimeError(f'Failed to deserialize TensorRT engine: {model_path}')
    context = engine.create_execution_context()

    # Input: device buffer sized for a float32 tensor of the binding shape
    input_shape = engine.get_binding_shape('input_image')
    input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize
    device_input = cuda.mem_alloc(input_size)

    # Outputs: one host/device buffer pair per binding
    host_output1, device_output1 = _alloc_trt_output(engine, 'mrcnn_detection')
    host_output2, device_output2 = _alloc_trt_output(engine, 'mrcnn_mask')

    # Setting a cuda stream
    stream = cuda.Stream()

    return {'engine': engine,
            'stream': stream,
            'context': context,
            'device_input': device_input,
            'device_output1': device_output1,
            'device_output2': device_output2,
            'host_output1': host_output1,
            'host_output2': host_output2
           }

In [None]:
# Load the fp32 engine written by the trtexec cell. The path follows
# `model_name` (as that cell does) instead of a hard-coded "512_512_3"
# suffix, so it stays in sync with CONFIG['image_shape'].
trt_model = set_mrcnn_trt_engine(f'../weights/{model_name}_trt_mod_fp32.engine')

In [None]:
# Run the TensorRT engine on every file of the test folder and plot the results
for img_name in os.listdir(test_images_path):
    # cv2 loads BGR; an RGB copy is kept for plotting only
    img = cv2.imread(os.path.join(test_images_path, img_name))
    img_show = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_processed, image_meta, window = process_input(img, CONFIG)

    # Only the image (with a leading batch axis) is passed to the engine here
    img_batch = np.expand_dims(img_processed, 0)
    trt_mrcnn_detection, trt_mrcnn_mask = trt_mrcnn_inference(trt_model, img_batch)

    # Extract bboxes, class_ids, scores and full-size masks
    boxes, class_ids, scores, full_masks = utils.reformat_detections(
        detections=trt_mrcnn_detection[0],
        mrcnn_mask=trt_mrcnn_mask[0],
        original_image_shape=img.shape,
        image_shape=img_processed.shape,
        window=window,
    )

    fig = plt.figure(figsize=(10, 10))
    plt.title('Input data')
    plt.imshow(img_show)

    # One figure per detection: the last axis of full_masks is moved to the
    # front so it can be iterated together with class ids and scores
    masks_first = np.moveaxis(full_masks, -1, 0)
    for c, s, fm in zip(class_ids, scores, masks_first):
        fig = plt.figure(figsize=(5, 5))
        plt.title(f'Mask. class_id: {c} score: {s}')
        plt.imshow(fm)