### Mask R-CNN: TensorFlow to ONNX conversion and TensorRT optimization (balloon dataset)

In [1]:
import os
# Make the sibling `src` directory the working directory, presumably so the
# project-local imports in the next cells (`samples`, `model`, `common`) resolve.
# NOTE(review): the path is relative to the current working directory, so
# re-running this cell without restarting the kernel applies the move again.
os.chdir('../src')

In [2]:
import tensorflow as tf
import onnx
import subprocess
from samples.balloon import balloon
from model import mask_rcnn_functional
from common.utils import tf_limit_gpu_memory
from common import inference_utils
from common.inference_optimize import maskrcnn_to_onnx, modify_onnx_model
# Restrict TensorFlow's GPU memory usage via the project helper; the helper
# reports "Memory limit: 1000" (units presumably MB — TODO confirm).
tf_limit_gpu_memory(tf, 1000)

1 Physical GPUs, 1 Logical GPUs Memory limit: 1000
Physical GPU-devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
from common.config import CONFIG

In [4]:
# Record the execution environment for reproducibility: timestamp and
# interpreter/OS details first, then the versions of the imported packages.
%load_ext watermark
%watermark
%watermark --iversions

Last updated: 2021-09-25T15:55:26.538673+03:00

Python implementation: CPython
Python version       : 3.7.7
IPython version      : 7.16.1

Compiler    : GCC 7.3.0
OS          : Linux
Release     : 5.4.0-65-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 12
Architecture: 64bit

onnx      : 1.8.1
tensorflow: 2.2.0



In [5]:
# Overlay the balloon-sample settings onto the shared CONFIG object.
# The update happens in place, so every later cell sees the merged values.
CONFIG.update(balloon.BALLOON_CONFIG)
# Display the merged configuration.
CONFIG

{'image_shape': (512, 512, 3),
 'img_size': 512,
 'backbone': 'mobilenet',
 'meta_shape': 14,
 'num_classes': 2,
 'class_dict': {'balloon': 1, 'background': 0},
 'normalization': {'mean': [0.485, 0.456, 0.406],
  'std': [0.229, 0.224, 0.225]},
 'image_min_dim': 300,
 'image_min_scale': 0,
 'image_max_dim': 512,
 'image_resize_mode': 'square',
 'use_mini_masks': False,
 'mini_mask_shape': (32, 32),
 'mask_shape': (28, 28),
 'epochs': 100,
 'gpu_num': 1,
 'batch_size': 1,
 'images_per_gpu': 1,
 'training': True,
 'log_per_steps': 5,
 'use_multiprocessing': True,
 'workers': 6,
 'callback': {'checkpoints_dir': '../logs/scalars',
  'reduce_lr_on_plateau': 0.98,
  'reduce_lr_on_plateau_patience': 10,
  'save_weights_only': True,
  'save_best_only': True,
  'histogram_freq': 0,
  'profile_batch': '1,2'},
 'backbone_strides': [4, 8, 16, 32, 64],
 'top_down_pyramid_size': 256,
 'rpn_anchor_scales': (32, 64, 128, 256, 512),
 'rpn_anchor_ratios': [0.5, 1, 2],
 'rpn_anchor_stride': 1,
 'rpn_train

#### Prepare inference graph

In [6]:
# Train your test model and place its checkpoint to ./tests/samples/balloon
checkpoint_dir = os.path.join('..', 'tests', 'samples', 'balloon')
checkpoint_prefix = 'maskrcnn_%s' % CONFIG['backbone']

# os.listdir() returns entries in arbitrary order, so sort the matches to make
# the selected checkpoint deterministic when several files are present.
checkpoint_files = sorted(
    x for x in os.listdir(checkpoint_dir) if checkpoint_prefix in x
)
if not checkpoint_files:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No checkpoint containing '{checkpoint_prefix}' found in {checkpoint_dir}. "
        "Train the test model and place its checkpoint there first."
    )

# A checkpoint is stored as several files sharing the '<name>.ckpt' stem
# (the original code strips whatever follows '.ckpt'); keep only that stem.
checkpoint = checkpoint_files[0].split('.ckpt')[0] + '.ckpt'
checkpoint

'maskrcnn_mobilenet_246a706912c5d63d633bb39a112cf22c_cp-0045.ckpt'

In [7]:
# The exported model is named after the backbone and the input image shape:
# maskrcnn_<backbone>_<dim>_<dim>_<dim>.
shape_tag = '_'.join(str(dim) for dim in CONFIG['image_shape'])
model_name = 'maskrcnn_{}_{}'.format(CONFIG['backbone'], shape_tag)

# Location of the trained checkpoint selected in the previous cell.
weights_path = os.path.join('..', 'tests', 'samples', 'balloon', checkpoint)

print(f'Weights path:\n{weights_path}\n\nModel name:\n{model_name}')

Weights path:
../tests/samples/balloon/maskrcnn_mobilenet_246a706912c5d63d633bb39a112cf22c_cp-0045.ckpt

Model name:
maskrcnn_mobilenet_512_512_3


In [8]:
# Build the inference graph and import the trained weights into it.
# NOTE: `inference_config` is an alias of CONFIG, not a copy, so switching
# 'training' off here also changes the shared CONFIG used by later cells.
inference_config = CONFIG
inference_config['training'] = False

inference_model = mask_rcnn_functional(config=inference_config)
inference_model = inference_utils.load_mrcnn_weights(
    model=inference_model,
    weights_path=weights_path,
    verbose=True,
)

[MaskRCNN] Inference mode


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[MaskRCNN] Backbone architecture: mobilenet




[MaskRCNN] Total params: 24,073,932
[MaskRCNN] Trainable params: 23,755,980

Weights for inference graph will be transferred from training graph

[MaskRCNN] Training mode


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[MaskRCNN] Backbone architecture: mobilenet
[MaskRCNN] Total params: 24,073,932
[MaskRCNN] Trainable params: 23,755,980
MaskRCNN Losses:
rpn_class_loss: <layers.losses.RPNClassLoss object at 0x7f00905ec790>
rpn_bbox_loss: <layers.losses.RPNBboxLoss object at 0x7f0090053f50>
mrcnn_class_loss: <layers.losses.MRCNNClassLoss object at 0x7f0090053a90>
mrcnn_bbox_loss: <layers.losses.MRCNNBboxLoss object at 0x7f0090053b50>
mrcnn_mask_loss: <layers.losses.MRCNNMaskLoss object at 0x7f0090053b10>
l2_regularizer: <layers.losses.L2Re

#### Convert model to .onnx with tf2onnx

In [9]:
# Fixed-shape float32 signatures for the two graph inputs: the image batch
# and the per-image meta vector.
image_spec = tf.TensorSpec(
    shape=(CONFIG['batch_size'],) + tuple(CONFIG['image_shape']),
    dtype=tf.float32,
    name="input_image",
)
meta_spec = tf.TensorSpec(
    shape=(CONFIG['batch_size'], CONFIG['meta_shape']),
    dtype=tf.float32,
    name="input_image_meta",
)
input_spec = (image_spec, meta_spec)

# Export the inference model to ONNX with opset 11 via the project helper.
maskrcnn_to_onnx(
    model=inference_model,
    model_name=model_name,
    input_spec=input_spec,
    kwargs={'opset': 11},
)

Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
Successfully converted from tensorflow to .onnx: ../weights/maskrcnn_mobilenet_512_512_3.onnx


#### Load the ONNX model and check it

In [10]:
# Read the exported model back from disk.
onnx_path = f'../weights/{model_name}.onnx'
model = onnx.load(onnx_path)

# Validate that the model's IR is well formed.
onnx.checker.check_model(model)

# Render the graph as text and show it for inspection.
graph_text = onnx.helper.printable_graph(model.graph)
print(graph_text)

graph tf2onnx (
  %input_image[FLOAT, 1x512x512x3]
  %input_image_meta[FLOAT, 1x14]
) initializers (
  %slice_axes__1160[INT32, 2]
  %roi__316[FLOAT, 0]
  %new_shape__1573[INT64, 4]
  %mask_rcnn_inference/rpn_model_2/rpn_class_raw/Conv2D/ReadVariableOp:0[FLOAT, 6x512x1x1]
  %mask_rcnn_inference/rpn_model_1/rpn_bbox_pred/Conv2D/ReadVariableOp:0[FLOAT, 12x512x1x1]
  %mask_rcnn_inference/rpn_model/rpn_conv_shared/Conv2D/ReadVariableOp:0[FLOAT, 512x256x3x3]
  %mask_rcnn_inference/roi_align_mask/truediv/x:0[FLOAT, scalar]
  %mask_rcnn_inference/roi_align_mask/strided_slice_24/stack_2:0[INT32, 1]
  %mask_rcnn_inference/roi_align_mask/range/delta:0[INT32, scalar]
  %mask_rcnn_inference/roi_align_mask/add/x:0[INT32, scalar]
  %mask_rcnn_inference/roi_align_mask/PadV2/constant_values:0[INT32, scalar]
  %mask_rcnn_inference/roi_align_classifier/range/start:0[INT32, scalar]
  %mask_rcnn_inference/roi_align_classifier/mul_4/y:0[INT32, scalar]
  %mask_rcnn_inference/roi/sub_4/x:0[INT32, scalar]
  %

#### Configure model for TensorRT

In [11]:
# Run the project helper that prepares the exported ONNX graph for TensorRT.
# NOTE(review): the later cells read `{model_name}_trt_mod.onnx`, so this call
# presumably writes that file next to the input model — confirm in
# common.inference_optimize.
modify_onnx_model(model_path=f'../weights/{model_name}.onnx',
                  config=CONFIG,
                  verbose=True
                 )


Initial graph inputs: [Variable (input_image): (shape=[1, 512, 512, 3], dtype=float32), Variable (input_image_meta): (shape=[1, 14], dtype=float32)]

Initial graph outputs: [Variable (mrcnn_detection): (shape=[1, 100, 6], dtype=float32), Variable (fpnclf_mrcnn_class): (shape=['unk__2849', 1000, 2], dtype=float32), Variable (fpnclf_mrcnn_bbox_reshape): (shape=['unk__2850', 1000, 2, 4], dtype=float32), Variable (mrcnn_mask): (shape=[1, 100, 28, 28, 2], dtype=float32), Variable (roi): (shape=[1, 'unk__2851', 'unk__2852'], dtype=float32), Variable (concat_rpn_class): (shape=[1, 65472, 2], dtype=float32), Variable (concat_rpn_bbox): (shape=[1, 65472, 4], dtype=float32)]
Already cleared: mask_rcnn_inference/mrcnn_detection/Unique:1
Already cleared: mask_rcnn_inference/mrcnn_detection/Unique:2
Already cleared: mask_rcnn_inference/mrcnn_detection/TopKV2:0
Already cleared: mask_rcnn_inference/roi/top_anchors:0
Already cleared: mask_rcnn_inference/roi_align_classifier/Unique:1
Already cleared: 

#### TensorRT optimization

__With trtexec:__ 

In [12]:
# Show the path to the trtexec binary taken from the TRTEXEC environment
# variable; this raises KeyError if the variable is not set.
os.environ['TRTEXEC']

'/home/alexander/TensorRT-7.2.3.4/bin/trtexec'

In [13]:
%%time

# trtexec is given bare file names, so run it from the weights directory.
os.chdir('../weights')

# Precision of the engine to build; it also selects the engine file suffix so
# that an fp16 engine is never written under the fp32 name.
fp16_mode = False
precision = 'fp16' if fp16_mode else 'fp32'

# Construct appropriate command
command = [os.environ['TRTEXEC'],
           f'--onnx={model_name}_trt_mod.onnx',
           f'--saveEngine={model_name}_trt_mod_{precision}.engine',
           '--workspace=2048',
           '--explicitBatch',
           '--verbose',
          ]

# fp16 param
if fp16_mode:
    command.append('--fp16')

# tacticSources param
# Not needed on Jetson with aarch64 architecture for now.
arch = os.uname().machine
if arch == 'x86_64':
    command.append('--tacticSources=-cublasLt,+cublas')

print(f'\nArch: {arch}\ntrtexec command list: {command}')

try:
    result = subprocess.run(command, capture_output=True, check=True)
except subprocess.CalledProcessError as err:
    # The output is captured, so without this the trtexec log would be lost
    # when the build fails; show its tail before re-raising.
    print(err.stdout.decode('utf8')[-2495:])
    print(err.stderr.decode('utf8')[-2495:])
    raise

# Print the tail of stdout, which holds the inference timing summary
print(result.stdout.decode('utf8')[-2495:])


Arch: x86_64
trtexec command list: ['/home/alexander/TensorRT-7.2.3.4/bin/trtexec', '--onnx=maskrcnn_mobilenet_512_512_3_trt_mod.onnx', '--saveEngine=maskrcnn_mobilenet_512_512_3_trt_mod_fp32.engine', '--workspace=2048', '--explicitBatch', '--verbose', '--tacticSources=-cublasLt,+cublas']
[09/25/2021-16:00:30] [V] [TRT] Allocated activation device memory of size 329411584
[09/25/2021-16:00:30] [V] [TRT] Assigning persistent memory blocks for various profiles
[09/25/2021-16:00:30] [I] Starting inference
[09/25/2021-16:00:33] [I] Warmup completed 0 queries over 200 ms
[09/25/2021-16:00:33] [I] Timing trace has 0 queries over 3.11259 s
[09/25/2021-16:00:33] [I] Trace averages of 10 runs:
[09/25/2021-16:00:33] [I] Average on 10 runs - GPU latency: 50.6385 ms - Host latency: 51.0674 ms (end to end 101.425 ms, enqueue 0.90798 ms)
[09/25/2021-16:00:33] [I] Average on 10 runs - GPU latency: 52.1923 ms - Host latency: 52.6286 ms (end to end 104.218 ms, enqueue 0.926916 ms)
[09/25/2021-16:00:33

__With python TensorRT API:__


In [14]:
import tensorrt as trt

In [15]:
# Build parameters for the TensorRT Python API section.
wspace_size = 2048    # workspace size in Mb
fp16_mode = False     # precision mode
max_batch_size = 1    # max batch size

In [16]:
%%time

# Init TensorRT Logger
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
# Init TensorRT plugins
trt.init_libnvinfer_plugins(TRT_LOGGER, "")
# Set tensorrt-prepared onnx model
onnx_model_path = f'../weights/{model_name}_trt_mod.onnx'
# Use explicit batch
explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_builder_config() as builder_config, \
        builder.create_network(explicit_batch) as network, \
        trt.OnnxParser(network, TRT_LOGGER) as parser:

    # Use a dedicated handle name so the notebook-level `model` variable
    # (the loaded ONNX model) is not overwritten by a file object.
    with open(onnx_model_path, 'rb') as onnx_file:
        parsed = parser.parse(onnx_file.read())

    # parse() signals failure through its return value; stop here with the
    # parser's own diagnostics instead of building from a broken network.
    if not parsed:
        parser_errors = '\n'.join(
            str(parser.get_error(idx)) for idx in range(parser.num_errors)
        )
        raise RuntimeError(
            f'Failed to parse ONNX model {onnx_model_path}:\n{parser_errors}'
        )

    print('Num of detected layers: ', network.num_layers)
    print('Detected inputs: ', network.num_inputs)
    print('Detected outputs: ', network.num_outputs)

    # Workspace size
    # 1e6 bytes == 1Mb
    builder_config.max_workspace_size = int(1e6 * wspace_size)

    # Precision mode
    if fp16_mode:
        builder_config.set_flag(trt.BuilderFlag.FP16)

    # Max batch size
    builder.max_batch_size = max_batch_size

    # Set the list of tactic sources
    # Do not need for Jetson with aarch64 architecture for now
    arch = os.uname().machine
    if arch == 'x86_64':
        # Bit mask with only the CUBLAS bit set; the CUBLAS_LT bit stays 0.
        tactic_source = 1 << int(trt.TacticSource.CUBLAS)
        builder_config.set_tactic_sources(tactic_source)

    # Make TensorRT engine
    engine = builder.build_engine(network, builder_config)
    # build_engine() returns None when the build fails; report that
    # explicitly instead of failing later on `None.serialize()`.
    if engine is None:
        raise RuntimeError(
            'TensorRT engine build failed; see the TensorRT log output for details.'
        )

    # Save TensorRT engine under a name that reflects the precision mode
    precision = 'fp16' if fp16_mode else 'fp32'
    trt_model_name = f'../weights/{model_name}_trt_mod_{precision}.engine'

    with open(trt_model_name, "wb") as f:
        f.write(engine.serialize())

Num of detected layers:  233
Detected inputs:  1
Detected outputs:  2
CPU times: user 16.9 s, sys: 2.31 s, total: 19.2 s
Wall time: 39.5 s
