### Mask-RCNN inference with TensorFlow, ONNX Runtime and a TensorRT engine. Plates dataset

In [None]:
import os
%cd /src/src

In [None]:
import subprocess
import cv2
import numpy as np

import matplotlib.pyplot as plt

from layers import losses
from training import get_optimizer
from model import mask_rcnn_functional
from common import inference_utils
from common.inference_utils import process_input
from common import utils
from common.config import CONFIG

import tensorflow as tf
# Configure the TensorFlow GPU memory limit right after import, before any model is built.
# NOTE(review): the unit of 1500 is presumably megabytes — confirm against utils.tf_limit_gpu_memory.
utils.tf_limit_gpu_memory(tf, 1500)

In [None]:
%load_ext watermark
%watermark
%watermark --iversions

#### Prepare model for inference

In [None]:
# Checkpoint whose weights are loaded into the inference model; the ONNX export
# is later written next to it (same directory).
weights_path = '/src/result_models/resnet18_256x256_500/maskrcnn_resnet18_16a5e7ed4b511704027fb29c476f9928_cp-0012.ckpt'
# Alternative checkpoints kept for quick switching (the CONFIG overrides must be changed to match):
# weights_path = '/src/result_models/256x256/maskrcnn_mobilenet_1e3046627e7e8bc073e8b9e50b354411_cp-0002.ckpt'
# weights_path = '/src/result_models/512x512/maskrcnn_mobilenet_c1f61e61570ae80cd3c574c008cbf226_cp-0010.ckpt'
# Echo the selected path as the cell output.
weights_path

In [None]:
# Build the inference graph and restore the trained weights into it.
from samples.plates import plates

# Start from the dataset defaults shipped with the plates sample ...
CONFIG.update(plates.COCO_CONFIG)

# ... then apply the settings specific to this notebook's checkpoint.
inference_overrides = {
    'image_shape': (256, 256, 3),
    'backbone': 'resnet18',
    'image_resize_mode': 'square',
    'img_size': 256,
    # 'image_min_dim': 200,
    'image_min_scale': 0,
    'image_max_dim': 256,
    'batch_size': 1,
    'images_per_gpu': 1,
}
CONFIG.update(inference_overrides)

# inference_config is an alias of CONFIG, not a copy: the 'training' flag set
# here is therefore also visible through CONFIG in every later cell.
inference_config = CONFIG
inference_config.update({'training': False})

inference_model = mask_rcnn_functional(config=inference_config)
inference_model = inference_utils.load_mrcnn_weights(
    model=inference_model,
    weights_path=weights_path,
    verbose=True,
)

---

#### Run several tests with tensorflow

In [None]:
!ls /data/cx-ir/patentes_500

In [None]:
from preprocess import augmentation as aug
from preprocess import preprocess


# Every subset is read from the same root directory.
base_dir = r'/data/cx-ir/patentes_500'
train_dir = val_dir = base_dir

# Validation-time augmentation, configured from the shared CONFIG.
validation_augmentation = aug.get_validation_augmentation(
    image_size=CONFIG['img_size'],
    normalize=CONFIG['normalization'],
)

# The remaining CONFIG entries are forwarded as keyword arguments
# (the original note says the parent SegmentationDataset needs them).
eval_dataset = plates.PlateDataset(
    dataset_dir=base_dir,
    subset='test',
    augmentation=validation_augmentation,
    **CONFIG
)

eval_dataloader = preprocess.DataLoader(
    eval_dataset,
    shuffle=True,
    cast_output=False,
    return_original=True,
    **CONFIG
)

In [None]:
'''
Layout of one eval_dataset item (entries marked "?" are unverified guesses):
0 padded image
1 padded mask
2 labels ?
3 boxes
4 masks ? (RL)
5 original image
6 original mask
7 label ?
8 original boxes (probably)
'''



In [None]:
import time

In [None]:
# Number of items in the evaluation dataset (built with subset='test').
len(eval_dataset)

In [None]:
# %%time
# Run TensorFlow inference on every evaluation item and overlay the
# highest-scoring confident mask on the original image.
for data in eval_dataset:

    # Index 5 of a dataset item is noted above as the original image.
    img = data[5]
    img_processed, image_meta, window = process_input(img, CONFIG)

    # The model is called on batches, so both inputs get a leading batch axis.
    output = inference_model([np.expand_dims(img_processed, 0),
                              np.expand_dims(image_meta, 0)])

    # Unpacking all seven outputs doubles as a check on the model's output count.
    detections, mrcnn_probs, mrcnn_bbox, mrcnn_mask, rpn_rois, rpn_class, rpn_bbox = output

    # Extract bboxes, class_ids, scores and full-size masks
    boxes, class_ids, scores, full_masks = utils.reformat_detections(
        detections=detections[0].numpy(),
        mrcnn_mask=mrcnn_mask[0].numpy(),
        original_image_shape=img.shape,
        image_shape=img_processed.shape,
        window=window,
    )

    fig = plt.figure(figsize=(10, 10))
    plt.title('Input data')
    plt.imshow(img, 'gray', interpolation='none')

    # Keep detections with score >= 0.9; masks are moved to the first axis so
    # they can be zipped per detection.
    confident = [
        det
        for det in zip(class_ids, scores, boxes, np.moveaxis(full_masks, -1, 0))
        if det[1] >= .9
    ]
    if confident:
        # max() returns the first best-scoring detection, exactly what sorting
        # in descending order and taking element 0 produced.
        c, s, _, fm = max(confident, key=lambda det: det[1])
        plt.imshow(fm, 'jet', interpolation='none', alpha=0.3)
        plt.title(f'Mask. class_id: {c} score: {s}')
    plt.show()

#### Convert model to .onnx with tf2onnx

In [None]:
import tf2onnx
import onnx
import onnxruntime as ort
# import onnx_graphsurgeon as gs
from common.inference_optimize import maskrcnn_to_onnx, modify_onnx_model

In [None]:
# Input signature for the ONNX export: one image batch and one image-meta batch.
input_spec = (
    tf.TensorSpec(
        (CONFIG['batch_size'], *CONFIG['image_shape']),
        tf.float32,
        name="input_image",
    ),
    tf.TensorSpec(
        (CONFIG['batch_size'], CONFIG['meta_shape']),
        tf.float32,
        name="input_image_meta",
    ),
)

# The .onnx file is written next to the checkpoint and named after the
# backbone and the input shape, e.g. maskrcnn_resnet18_256_256_3.onnx.
base_folder = os.path.dirname(weights_path)
shape_tag = '_'.join(str(dim) for dim in CONFIG['image_shape'])
output_path = os.path.join(base_folder, f"maskrcnn_{CONFIG['backbone']}_{shape_tag}.onnx")


In [None]:
# Export the inference model to output_path using the input signature from input_spec.
# NOTE(review): the opset is passed inside a dict bound to a parameter literally
# named `kwargs`; confirm maskrcnn_to_onnx expects that form rather than **kwargs.
maskrcnn_to_onnx(model=inference_model, 
                 output_path = output_path,
                 input_spec=input_spec,
                 kwargs={'opset': 11}
                )

#### Load onnx model and check it 

In [None]:
# Load the ONNX model that was just exported
# (note: `model` is the ONNX model here, not the TensorFlow inference_model)
model = onnx.load(output_path)
# Check that the IR is well formed; the checker raises if the graph is invalid
onnx.checker.check_model(model)
# Print a human readable representation of the graph
print(onnx.helper.printable_graph(model.graph))

#### Run several tests with onnxruntime

In [None]:
# Open the exported model with onnxruntime and list its input/output tensor names.
sess = ort.InferenceSession(output_path)
ort_input_names = [node.name for node in sess.get_inputs()]
ort_output_names = [node.name for node in sess.get_outputs()]
print(f'Inputs: {ort_input_names}\nOutputs:{ort_output_names}')

In [None]:
# Run onnxruntime inference on every evaluation item and overlay the
# highest-scoring confident mask on the original image.

# The output names do not change between runs, so query them once.
ort_output_names = [x.name for x in sess.get_outputs()]

for data in eval_dataset:

    # Index 5 of a dataset item is noted above as the original image.
    img = data[5]
    img_processed, image_meta, window = process_input(img, CONFIG)

    # Both inputs get a leading batch axis and are cast to float32, the dtype
    # declared in the export signature.
    output = sess.run(
        output_names=ort_output_names,
        input_feed={
            'input_image': np.expand_dims(img_processed, 0).astype('float32'),
            'input_image_meta': np.expand_dims(image_meta, 0).astype('float32'),
        },
    )

    # Unpacking all seven outputs doubles as a check on the model's output count.
    detections, mrcnn_probs, mrcnn_bbox, mrcnn_mask, rpn_rois, rpn_class, rpn_bbox = output

    # Extract bboxes, class_ids, scores and full-size masks
    boxes, class_ids, scores, full_masks = utils.reformat_detections(
        detections=detections[0],
        mrcnn_mask=mrcnn_mask[0],
        original_image_shape=img.shape,
        image_shape=img_processed.shape,
        window=window,
    )

    fig = plt.figure(figsize=(10, 10))
    plt.title('Input data')
    plt.imshow(img, 'gray', interpolation='none')

    # Keep detections with score >= 0.9; masks are moved to the first axis so
    # they can be zipped per detection.
    confident = [
        det
        for det in zip(class_ids, scores, boxes, np.moveaxis(full_masks, -1, 0))
        if det[1] >= .9
    ]
    if confident:
        # max() returns the first best-scoring detection, exactly what sorting
        # in descending order and taking element 0 produced.
        c, s, _, fm = max(confident, key=lambda det: det[1])
        plt.imshow(fm, 'jet', interpolation='none', alpha=0.3)
        plt.title(f'Mask. class_id: {c} score: {s}')
    plt.show()

#### Configure model for TensorRT

In [None]:
# Adapt the exported ONNX model for TensorRT.
# NOTE(review): presumably this writes the '<name>_trt_mod.onnx' file that the
# TensorRT cells read via onnx_mod_path — confirm in common.inference_optimize.
modify_onnx_model(model_path=output_path,
                  config=CONFIG,
                  verbose=True
                 )

#### TensorRT optimization


In [51]:
# File names derived from the exported ONNX path:
#   <name>_trt_mod.onnx         TensorRT-ready ONNX model
#   <name>_trt_mod_fp32.engine  engine built without the FP16 flag
#   <name>_trt_mod_fp16.engine  engine built with the FP16 flag
onnx_mod_path = output_path.replace('.onnx', '_trt_mod.onnx')
trt_path_32 = onnx_mod_path.replace('.onnx', '_fp32.engine')
trt_path_16 = onnx_mod_path.replace('.onnx', '_fp16.engine')
# trt_path was referenced here (and by the trtexec cell) without ever being
# assigned, which raises NameError on a fresh kernel. Default to the FP32 engine.
trt_path = trt_path_32
onnx_mod_path

'/src/result_models/resnet18_256x256_500/maskrcnn_resnet18_256_256_3_trt_mod.onnx'

In [53]:
trt_path_32

'/src/result_models/resnet18_256x256_500/maskrcnn_resnet18_256_256_3_trt_mod_fp32.engine'

__With trtexec:__ 

In [None]:
%%time

# Build a TensorRT engine from the modified ONNX model with the trtexec CLI.
# The executable path is read from the TRTEXEC environment variable.
os.chdir('../weights')

# Construct appropriate command
fp16_mode = False
# Pick the engine file matching the requested precision. The previous
# `command[2].replace('32', '16')` discarded its result (str.replace returns a
# new string), so FP16 builds were never written to the fp16 path.
engine_path = trt_path_16 if fp16_mode else trt_path_32
command = [os.environ['TRTEXEC'],
           f'--onnx={onnx_mod_path}',
           f'--saveEngine={engine_path}',
           '--workspace=2048',
           '--explicitBatch',
           '--verbose',
           ]

# fp16 param
if fp16_mode:
    command.append('--fp16')

# tacticSources param
# Not needed on Jetson (aarch64) for now.
arch = os.uname().machine
if arch == 'x86_64':
    command.append('--tacticSources=-cublasLt,+cublas')

print(f'\nArch: {arch}\ntrtexec command list: {command}')

try:
    result = subprocess.run(command, capture_output=True, check=True)
except subprocess.CalledProcessError as err:
    # Output is captured, so without this the reason for a failed build would
    # be invisible. Show the tail of both streams, then re-raise unchanged.
    print(err.stdout.decode('utf8', errors='replace')[-2495:])
    print(err.stderr.decode('utf8', errors='replace')[-2495:])
    raise
# Print the tail of stdout, which holds the trtexec summary
print(result.stdout.decode('utf8')[-2495:])

__With python TensorRT API:__


In [49]:
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

In [50]:
# Maximum batch size assigned to the TensorRT builder
max_batch_size = 1
# Precision mode: True sets the FP16 builder flag and saves to the fp16 engine path,
# False saves to the fp32 engine path
fp16_mode = False
# Workspace size in MB (the build cell multiplies it by 1e6 to get bytes)
wspace_size = 1024

In [52]:
%%time

# Build and serialize a TensorRT engine from the modified ONNX model with the
# TensorRT Python API.

# Init TensorRT Logger
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
# Init TensorRT plugins
trt.init_libnvinfer_plugins(TRT_LOGGER, "")
# Set tensorrt-prepared onnx model
onnx_model_path = onnx_mod_path
# Use explicit batch
explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_builder_config() as builder_config, \
        builder.create_network(explicit_batch) as network, \
        trt.OnnxParser(network, TRT_LOGGER) as parser:

    with open(onnx_model_path, 'rb') as onnx_file:
        parsed_ok = parser.parse(onnx_file.read())

    # parse() reports failure through its return value. Stop here with the
    # parser's own messages instead of building from a partial network.
    if not parsed_ok:
        parser_errors = '\n'.join(
            str(parser.get_error(idx)) for idx in range(parser.num_errors)
        )
        raise RuntimeError(
            f'Failed to parse ONNX model {onnx_model_path}:\n{parser_errors}'
        )

    print('Num of detected layers: ', network.num_layers)
    print('Detected inputs: ', network.num_inputs)
    print('Detected outputs: ', network.num_outputs)

    # Workspace size: wspace_size is in MB, 1e6 bytes == 1 MB
    builder_config.max_workspace_size = int(1e6 * wspace_size)

    # Precision mode
    if fp16_mode:
        builder_config.set_flag(trt.BuilderFlag.FP16)

    # Max batch size
    builder.max_batch_size = max_batch_size

    # Set the list of tactic sources: enable cuBLAS, leave cuBLASLt disabled.
    # Not needed on Jetson (aarch64) for now.
    arch = os.uname().machine
    if arch == 'x86_64':
        tactic_source = 1 << int(trt.TacticSource.CUBLAS) | 0 << int(trt.TacticSource.CUBLAS_LT)
        builder_config.set_tactic_sources(tactic_source)

    # Make TensorRT engine
    engine = builder.build_engine(network, builder_config)
    # A failed build yields None. Fail with a clear message rather than an
    # AttributeError on engine.serialize() below.
    if engine is None:
        raise RuntimeError(
            f'TensorRT failed to build an engine from {onnx_model_path}; '
            'see the TensorRT log for details'
        )

    # Save TensorRT engine under the path matching the precision mode
    trt_model_name = trt_path_16 if fp16_mode else trt_path_32

    with open(trt_model_name, "wb") as f:
        f.write(engine.serialize())

Num of detected layers:  235
Detected inputs:  2
Detected outputs:  2
CPU times: user 24.1 s, sys: 2.53 s, total: 26.6 s
Wall time: 33 s


#### Run TensorRT inference

In [54]:
def trt_mrcnn_inference(model, image):
    """
    Run one forward pass of a deserialized TensorRT Mask-RCNN engine.

    Args:
        model: dict with the keys 'engine', 'stream', 'context',
               'device_input', 'device_output1', 'device_output2',
               'host_output1' and 'host_output2'
               (the structure returned by set_mrcnn_trt_engine).
        image: array with the prepared input; it is converted to a
               C-ordered float32 array before being uploaded.

    Returns: tuple (trt_mrcnn_detection, trt_mrcnn_mask) of float32 arrays,
             reshaped to the engine binding shapes 'mrcnn_detection' and
             'mrcnn_mask' respectively.

    """
    engine, stream, context = model['engine'], model['stream'], model['context']
    device_input = model['device_input']

    # (host buffer, device buffer, engine binding name) for each output
    outputs = (
        (model['host_output1'], model['device_output1'], 'mrcnn_detection'),
        (model['host_output2'], model['device_output2'], 'mrcnn_mask'),
    )

    # Upload the input, launch the engine and download both outputs on one stream.
    host_input = image.astype(dtype=np.float32, order='C')
    cuda.memcpy_htod_async(device_input, host_input, stream)

    # NOTE(review): bindings are passed positionally as
    # [input, detection output, mask output]; this assumes the engine's binding
    # order matches — confirm against the engine.
    bindings = [int(device_input)] + [int(dev_buf) for _, dev_buf, _ in outputs]
    context.execute_async(bindings=bindings, stream_handle=stream.handle)

    for host_buf, dev_buf, _ in outputs:
        cuda.memcpy_dtoh_async(host_buf, dev_buf, stream)
    stream.synchronize()

    # reshape() only views the reusable host buffer; astype() copies, so the
    # returned arrays are not overwritten by the next inference call.
    trt_mrcnn_detection, trt_mrcnn_mask = (
        host_buf.reshape(engine.get_binding_shape(binding)).astype(dtype=np.float32)
        for host_buf, _, binding in outputs
    )

    return trt_mrcnn_detection, trt_mrcnn_mask

In [55]:
def set_mrcnn_trt_engine(model_path):
    """
    Load a serialized TensorRT engine and allocate its pycuda I/O buffers.

    Args:
        model_path: path to the serialized TensorRT engine file

    Returns: python dict with the keys 'engine', 'stream', 'context',
             'device_input', 'device_output1', 'device_output2',
             'host_output1', 'host_output2'. Output 1 belongs to the
             'mrcnn_detection' binding, output 2 to 'mrcnn_mask'.

    Raises:
        RuntimeError: if the file cannot be deserialized into an engine.

    """
    trt_logger = trt.Logger(trt.Logger.VERBOSE)
    trt.init_libnvinfer_plugins(trt_logger, "")

    with open(model_path, "rb") as f, trt.Runtime(trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # Deserialization signals failure by returning None. Report it here
    # instead of failing later with an AttributeError.
    if engine is None:
        raise RuntimeError(
            f'Could not deserialize a TensorRT engine from {model_path}'
        )
    context = engine.create_execution_context()

    # Input: device buffer sized for a float32 tensor of the input binding shape
    input_shape = engine.get_binding_shape('input_image')
    input_size = (trt.volume(input_shape)
                  * engine.max_batch_size
                  * np.dtype(np.float32).itemsize)
    device_input = cuda.mem_alloc(input_size)

    # Outputs: one host/device buffer pair per output binding
    host_output1, device_output1 = _alloc_trt_output(engine, 'mrcnn_detection')
    host_output2, device_output2 = _alloc_trt_output(engine, 'mrcnn_mask')

    # Setting a cuda stream
    stream = cuda.Stream()

    return {'engine': engine,
            'stream': stream,
            'context': context,
            'device_input': device_input,
            'device_output1': device_output1,
            'device_output2': device_output2,
            'host_output1': host_output1,
            'host_output2': host_output2,
            }


def _alloc_trt_output(engine, binding_name):
    """Allocate a page-locked float32 host buffer and a same-sized device buffer for one output binding."""
    output_shape = engine.get_binding_shape(binding_name)
    host_buffer = cuda.pagelocked_empty(
        trt.volume(output_shape) * engine.max_batch_size,
        dtype=np.float32,
    )
    return host_buffer, cuda.mem_alloc(host_buffer.nbytes)

In [None]:
# Load the engine that the build cells saved for the current CONFIG and
# precision mode. The previous hard-coded '../weights/..._512_512_3_...' name
# matched neither the configured 256x256 input nor the save location.
trt_model = set_mrcnn_trt_engine(trt_path_16 if fp16_mode else trt_path_32)

In [None]:
# Run TensorRT inference on every evaluation item and plot each predicted mask.
# The loop iterates eval_dataset like the TensorFlow and onnxruntime cells;
# the former `test_images_path` was never defined in this notebook.
for data in eval_dataset:

    # Index 5 of a dataset item is noted above as the original image.
    img = data[5]
    img_processed, image_meta, window = process_input(img, CONFIG)

    # Only the image (with a leading batch axis) is fed to the engine.
    trt_mrcnn_detection, trt_mrcnn_mask = trt_mrcnn_inference(
        trt_model, np.expand_dims(img_processed, 0)
    )

    # Extract bboxes, class_ids, scores and full-size masks
    boxes, class_ids, scores, full_masks = utils.reformat_detections(
        detections=trt_mrcnn_detection[0],
        mrcnn_mask=trt_mrcnn_mask[0],
        original_image_shape=img.shape,
        image_shape=img_processed.shape,
        window=window,
    )

    fig = plt.figure(figsize=(10, 10))
    plt.title('Input data')
    plt.imshow(img)

    # One extra figure per detection; masks are moved to the first axis so
    # they can be zipped with class ids and scores.
    for c, s, fm in zip(class_ids, scores, np.moveaxis(full_masks, -1, 0)):

        fig = plt.figure(figsize=(5, 5))
        plt.title(f'Mask. class_id: {c} score: {s}')
        plt.imshow(fm)

    # Render this item's figures before the next iteration so they do not
    # pile up until the end of the cell.
    plt.show()