In [2]:
from mmdet3d.apis import init_model, inference_detector, show_result_meshlab

config_file = '../configs/centerpoint/centerpoint_03pillar_kitti_lum.py'
checkpoint_file = '/home/mark/checkpoints/centrpoint/centerpoint_lum_iris.pth'

# build the model from a config file and a checkpoint file
model = init_model(config_file, checkpoint_file, device='cuda:0')
print(model)


load checkpoint from local path: /home/mark/checkpoints/centrpoint/centerpoint_lum_iris.pth
CenterPoint(
  (pts_voxel_layer): Voxelization(voxel_size=[0.2, 0.2, 16], point_cloud_range=[0, -55.2, -15, 128, 55.2, 15], max_num_points=20, max_voxels=(30000, 40000), deterministic=True)
  (pts_voxel_encoder): PillarFeatureNet(
    (pfn_layers): ModuleList(
      (0): PFNLayer(
        (norm): BatchNorm1d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (linear): Linear(in_features=10, out_features=64, bias=False)
      )
    )
  )
  (pts_middle_encoder): PointPillarsScatter()
  (pts_backbone): SECOND(
    (blocks): ModuleList(
      (0): Sequential(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (4):

In [13]:
# test a single sample
pcd = '/home/mark/KITTI/testing/velodyne/000008.bin'
result, data = inference_detector(model, pcd)

['flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'pcd_trans', 'pcd_scale_factor', 'pcd_rotation', 'pcd_rotation_angle', 'pts_filename', 'transformation_3d_flow'] torch.Size([61241, 4])


True

In [13]:
import torch

pc = torch.randn([10000, 4]).cuda()
voxels, num_points, coors = model.voxelize([pc])
print('pts_voxel_layer: ', voxels.shape, num_points.shape, coors.shape,)

voxel_features = model.pts_voxel_encoder(voxels, num_points, coors)
print('pts_voxel_encoder: ', voxel_features.shape)

x = model.pts_middle_encoder(voxel_features, coors)
print('pts_middle_encoder: ', x.shape)

x = model.pts_backbone(x)
print('pts_backbone: ', x[0].shape, x[1].shape, x[2].shape)

x = model.pts_neck(x)
print('pts_neck: ', x[0].shape)

out = model.pts_bbox_head(x)
print('pts_bbox_head:')

class_names=['Car', 'Pedestrian', 'Cyclist', 'Large_vehicle']
for i, val in enumerate(out):
    print(f'{class_names[i]}:')
    for k, v in out[i][0].items():
        print(f'   {k}: {v.shape}')


pts_voxel_layer:  torch.Size([601, 20, 4]) torch.Size([601]) torch.Size([601, 4])
pts_voxel_encoder:  torch.Size([601, 64])
pts_middle_encoder:  torch.Size([1, 64, 552, 640])
pts_backbone:  torch.Size([1, 64, 276, 320]) torch.Size([1, 128, 138, 160]) torch.Size([1, 256, 69, 80])
pts_neck:  torch.Size([1, 384, 138, 160])
pts_bbox_head:
Car:
   reg: torch.Size([1, 2, 138, 160])
   height: torch.Size([1, 1, 138, 160])
   dim: torch.Size([1, 3, 138, 160])
   rot: torch.Size([1, 2, 138, 160])
   heatmap: torch.Size([1, 1, 138, 160])
Pedestrian:
   reg: torch.Size([1, 2, 138, 160])
   height: torch.Size([1, 1, 138, 160])
   dim: torch.Size([1, 3, 138, 160])
   rot: torch.Size([1, 2, 138, 160])
   heatmap: torch.Size([1, 1, 138, 160])
Cyclist:
   reg: torch.Size([1, 2, 138, 160])
   height: torch.Size([1, 1, 138, 160])
   dim: torch.Size([1, 3, 138, 160])
   rot: torch.Size([1, 2, 138, 160])
   heatmap: torch.Size([1, 1, 138, 160])
Large_vehicle:
   reg: torch.Size([1, 2, 138, 160])
   height

In [4]:
sub_model = model.get_sub_model_for_conversion()
out = sub_model(voxels, num_points, coors)
for i, val in enumerate(out):
    print(f'{class_names[i]}:')
    for k, v in out[i][0].items():
        print(f'   {k}: {v.shape}')
print(sub_model)

SubModel(
  (pts_voxel_encoder): PillarFeatureNet(
    (pfn_layers): ModuleList(
      (0): PFNLayer(
        (norm): BatchNorm1d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (linear): Linear(in_features=10, out_features=64, bias=False)
      )
    )
  )
  (pts_middle_encoder): PointPillarsScatter()
  (pts_backbone): SECOND(
    (blocks): ModuleList(
      (0): Sequential(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (4): BatchNorm2d(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (5): ReLU(inplace=True)
        (6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (7): BatchNorm2d(64, eps=0.001, momentum=0.01, affine=True,

In [4]:
## Export to ONNX
import torch

#torch.onnx.export(model.pts_voxel_layer, pc, 'centrpoint.onnx', verbose=True)
torch.onnx.export(sub_model, (voxels, num_points, coors), 'centrpoint.onnx', 
                  opset_version=11, verbose=True)
#torch.onnx.export(sub_model, (voxel_features, coors), 'centrpoint.onnx', 
#                  opset_version=10, verbose=True)

#data_in = torch.randn([1, 64, 128, 128]).cuda()

#torch.onnx.export(model, data_in, 'centrpoint.onnx', verbose=True)

graph(%0 : Float(241, 20, 4, strides=[80, 4, 1], requires_grad=0, device=cuda:0),
      %num_points : Int(241, strides=[1], requires_grad=0, device=cuda:0),
      %coors : Int(241, 4, strides=[4, 1], requires_grad=0, device=cuda:0),
      %pts_voxel_encoder.pfn_layers.0.norm.weight : Float(64, strides=[1], requires_grad=1, device=cuda:0),
      %pts_voxel_encoder.pfn_layers.0.norm.bias : Float(64, strides=[1], requires_grad=1, device=cuda:0),
      %pts_voxel_encoder.pfn_layers.0.norm.running_mean : Float(64, strides=[1], requires_grad=0, device=cuda:0),
      %pts_voxel_encoder.pfn_layers.0.norm.running_var : Float(64, strides=[1], requires_grad=0, device=cuda:0),
      %pts_neck.deblocks.2.0.weight : Float(256, 128, 2, 2, strides=[512, 4, 2, 1], requires_grad=1, device=cuda:0),
      %pts_neck.deblocks.2.1.weight : Float(128, strides=[1], requires_grad=1, device=cuda:0),
      %pts_neck.deblocks.2.1.bias : Float(128, strides=[1], requires_grad=1, device=cuda:0),
      %pts_neck.deblo

In [1]:
import tensorrt as trt
import numpy as np
    
print('TensorRT version: ', trt.__version__)
onnx_export_pathname = 'centrpoint.onnx'

trt_logger = trt.Logger(trt.Logger.INFO)

with trt.Builder(trt_logger) as builder:
    builder.max_batch_size = 1
    
    builder_config = builder.create_builder_config()
    builder_config.reset()
    builder_config.max_workspace_size = 500 * 1024 * 1024
    builder_config.default_device_type = trt.DeviceType.GPU
    builder_config.engine_capability = trt.EngineCapability.STANDARD

    # Build TensorRT engine
    EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with builder.create_network(EXPLICIT_BATCH) as network:
        with trt.OnnxParser(network, trt_logger) as parser:
            if not parser.parse_from_file(onnx_export_pathname):
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                raise ValueError(f'Failed to parse ONNX to TensorRT network', flush=True)

            # build engine
            network.name = 'centerpoint'
            cuda_engine = builder.build_engine(network, builder_config)

            if cuda_engine is not None:                                
                with open('centerpoint.trt', 'wb') as f:
                    f.write(cuda_engine.serialize())
                del cuda_engine                
            else:
                raise ValueError(f'Failed to build TensorRT engine')



TensorRT version:  8.2.3.0




In [2]:
import tensorrt as trt
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

cuda_engine_pathname = 'centerpoint.trt'


with open(cuda_engine_pathname, 'rb') as f:
    with trt.Runtime(trt.Logger(trt.Logger.INFO)) as runtime:
        cuda_engine =  runtime.deserialize_cuda_engine(f.read())
        
        
with cuda_engine.create_execution_context() as context:
    # Allocate GPU memory
    bindings_mem = []
    for idx in range(cuda_engine.num_bindings):
        binding_shape = context.get_binding_shape(idx)
        binding_dtype = cuda_engine.get_binding_dtype(idx)
        is_input = cuda_engine.binding_is_input(idx)
        print(f'binding {idx}: {binding_shape}, {binding_dtype}, {"input" if is_input else "output"}')
        if is_input:
            gpu_mem = cuda.to_device(np.ndarray(binding_shape, dtype= np.float32))        
        else:
            gpu_mem = cuda.mem_alloc(trt.volume(binding_shape) * binding_dtype.itemsize)        
        bindings_mem.append(gpu_mem)

        
    # Execute engine
    rc = context.execute_v2(bindings=[int(binding_mem) for binding_mem in bindings_mem])
    print('execute_v2() rc:', rc)



binding 0: (241, 20, 4), DataType.FLOAT, input
binding 1: (241,), DataType.INT32, input
binding 2: (241, 4), DataType.INT32, input
binding 3: (1, 2, 138, 160), DataType.FLOAT, output
binding 4: (1, 1, 138, 160), DataType.FLOAT, output
binding 5: (1, 3, 138, 160), DataType.FLOAT, output
binding 6: (1, 2, 138, 160), DataType.FLOAT, output
binding 7: (1, 1, 138, 160), DataType.FLOAT, output
binding 8: (1, 2, 138, 160), DataType.FLOAT, output
binding 9: (1, 1, 138, 160), DataType.FLOAT, output
binding 10: (1, 3, 138, 160), DataType.FLOAT, output
binding 11: (1, 2, 138, 160), DataType.FLOAT, output
binding 12: (1, 1, 138, 160), DataType.FLOAT, output
binding 13: (1, 2, 138, 160), DataType.FLOAT, output
binding 14: (1, 1, 138, 160), DataType.FLOAT, output
binding 15: (1, 3, 138, 160), DataType.FLOAT, output
binding 16: (1, 2, 138, 160), DataType.FLOAT, output
binding 17: (1, 1, 138, 160), DataType.FLOAT, output
binding 18: (1, 2, 138, 160), DataType.FLOAT, output
binding 19: (1, 1, 138, 160)

In [3]:
## Trace Centralpoint model
import torch


script = torch.jit.trace(model.pts_voxel_layer, pc)
script = torch.jit.trace(model, pc)


data_in = torch.randn([1, 384, 138, 160]).cuda()

script = torch.jit.trace_module(model, {'pts_bbox_head':[[data_in]]})
#script = torch.jit.script(model)


  torch.tensor(voxel_size, dtype=torch.float),
  torch.tensor(coors_range, dtype=torch.float),


torch.Size([1000, 4])


  print(return_loss.shape)
  if return_loss.item():


ValueError: only one element tensors can be converted to Python scalars