In [1]:
import math
import numpy as np
import torch
from kitti.kitti_dataset import get_dataloader
import yaml
from easydict import EasyDict
from pathlib import Path
from basic.utils.vis_utils import VisualWindow
# %matplotlib inline
torch.cuda.empty_cache()
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# DataLoader

In [2]:
dataset_cfg_path = Path("../kitti/cfg/kitti_dataset.yaml")
batch_size = 4
dataloader = get_dataloader(data_cfg_path=dataset_cfg_path, class_name_list=['Car'], batch_size=batch_size)  # 'Pedestrian','Cyclist'

  cfg = EasyDict(yaml.load(f))


In [3]:
from basic.utils.common_utils import put_data_to_gpu
# 取一批数据用于模块测试
for data in dataloader:
    test_data = data
    break
# 单独把某些数据放在GPU中，注意frame_id这种还是为np.ndarray
test_data = put_data_to_gpu(test_data)
# print(f"input voxels shape:", test_data['voxels'].shape)
test_data

{'frame_id': array(['006974', '007388', '006697', '007113'], dtype='<U6'),
 'gt_boxes': tensor([[[ 3.4287e+01, -3.6757e+01, -5.7138e-01,  3.3906e+00,  1.3138e+00,
            1.4297e+00, -3.6820e-01,  1.0000e+00],
          [ 2.6158e+01, -2.0168e+01, -7.6718e-01,  4.3470e+00,  1.5746e+00,
            1.5649e+00, -6.6820e-01,  1.0000e+00],
          [ 1.1218e+01, -5.0065e+00, -8.7279e-01,  4.2407e+00,  1.6229e+00,
            1.3910e+00,  2.4450e+00,  1.0000e+00],
          [ 7.4499e+00, -2.3022e+01, -5.1212e-01,  3.7481e+00,  1.5070e+00,
            1.4393e+00, -6.8820e-01,  1.0000e+00],
          [ 2.6972e+01, -2.6909e+01,  9.2320e-02,  4.0379e+00,  1.7968e+00,
            1.4297e+00, -2.3082e+00,  1.0000e+00],
          [ 2.5747e+01, -2.3670e+01, -7.6032e-01,  3.6611e+00,  1.5359e+00,
            1.4490e+00, -6.2820e-01,  1.0000e+00],
          [ 2.3785e+01, -1.1817e+01, -8.9798e-01,  4.3470e+00,  1.6519e+00,
            1.4973e+00,  2.4950e+00,  1.0000e+00],
          [ 3.4093e+01, 

# Model cfg
最终的目的是想用字符文档生成模型。但是下面只是单一测试每一个模块
- model cfg 中包含各个模块的配置：module cfg
- 在模型全局中使用model_info_dict记录一些必要的模型信息


In [4]:
from pprint import pprint
from basic.utils.config_utils import cfg_from_yaml_file
top_cfg = cfg_from_yaml_file('../basic/model/model_cfg/second.yaml')
model_cfg = top_cfg.MODEL
model_info_dict = {
    'module_list': [],
    'training': True,
}
data_infos = dataloader.dataset.get_data_infos()
model_info_dict.update(data_infos)
pprint(top_cfg)

{'DATASET_CONFIG': {'CONFIG_PATH': '/home/ph/Desktop/PointCloud/utils_my/kitti/cfg/kitti_dataset.yaml',
                    'DATASET': 'KittiDataset',
                    'DATA_AUGMENTOR': {'AUG_CONFIG_LIST': [{'DATABASE_WITH_FAKELIDAR': False,
                                                            'DB_INFO_PATH': ['db_infos_train.pkl'],
                                                            'LIMIT_WHOLE_SCENE': True,
                                                            'NAME': 'gt_sampling',
                                                            'NUM_POINT_FEATURES': 4,
                                                            'PREPARE': {'filter_by_difficulty': [-1],
                                                                        'filter_by_min_points': ['Car:5',
                                                                                                 'Pedestrian:5',
                                                                               

In [5]:
pprint(model_cfg)

{'BACKBONE2D': {'MULTI_FEAT_LAYER_CONFIG': {'in_channels': 256,
                                            'layer_nums': [5, 5],
                                            'layer_strides': [1, 2],
                                            'out_channels': [128, 256]},
                'NAME': 'SECONDFPN',
                'in_channels': [128, 256],
                'out_channels': [256, 256],
                'upsample_strides': [1, 2]},
 'BACKBONE3D': {'NAME': 'VoxelBackBone8x', 'in_channels': 4},
 'DENSE_HEAD': {'ANCHOR_GENERATOR_CONFIG': {'CLASS_CONFIG': [{'anchor_dims': 3,
                                                              'boxes_size': [[3.9,
                                                                              1.6,
                                                                              1.56]],
                                                              'center_aligned': True,
                                                              'class_name': 'Ca

初始模型信息，注意经过每一个模块处理后,更新以下信息。
- 更新module_list记录的模块
- 当前特征图中每个点的特征维度
- 后面模块可能会使用到的当前模块的一些信息

In [6]:
for key, value in model_info_dict.items():
    print(f"{key}:{value}")

module_list:[]
training:True
raw_point_feature_dims:4
cur_point_feature_dims:4
point_cloud_range:[  0.  -40.   -3.   70.4  40.    1. ]
voxel_size:None
grid_size:None
class_names:['Car']


# Voxelize Layer

In [7]:
voxelize_cfg = model_cfg.VOXELIZE_LAYER
pprint(voxelize_cfg)

{'NAME': 'VoxelLayer',
 'full_mean': False,
 'max_points_pre_voxel': 5,
 'max_voxels': {'test': 40000, 'train': 16000},
 'point_cloud_range': [0, -40, -3, 70.4, 40, 1],
 'use_lead_xyz': True,
 'voxel_size': [0.05, 0.05, 0.1]}


In [8]:
from basic.module.voxelize import VoxelLayer
voxelize_cfg = model_cfg.VOXELIZE_LAYER
voxelize_module = VoxelLayer(model_info_dict=model_info_dict, **voxelize_cfg).to(device)
output = voxelize_module(test_data, keep_points=True)
model_info_dict['module_list'].append(voxelize_module)
output

{'frame_id': array(['006974', '007388', '006697', '007113'], dtype='<U6'),
 'gt_boxes': tensor([[[ 3.4287e+01, -3.6757e+01, -5.7138e-01,  3.3906e+00,  1.3138e+00,
            1.4297e+00, -3.6820e-01,  1.0000e+00],
          [ 2.6158e+01, -2.0168e+01, -7.6718e-01,  4.3470e+00,  1.5746e+00,
            1.5649e+00, -6.6820e-01,  1.0000e+00],
          [ 1.1218e+01, -5.0065e+00, -8.7279e-01,  4.2407e+00,  1.6229e+00,
            1.3910e+00,  2.4450e+00,  1.0000e+00],
          [ 7.4499e+00, -2.3022e+01, -5.1212e-01,  3.7481e+00,  1.5070e+00,
            1.4393e+00, -6.8820e-01,  1.0000e+00],
          [ 2.6972e+01, -2.6909e+01,  9.2320e-02,  4.0379e+00,  1.7968e+00,
            1.4297e+00, -2.3082e+00,  1.0000e+00],
          [ 2.5747e+01, -2.3670e+01, -7.6032e-01,  3.6611e+00,  1.5359e+00,
            1.4490e+00, -6.2820e-01,  1.0000e+00],
          [ 2.3785e+01, -1.1817e+01, -8.9798e-01,  4.3470e+00,  1.6519e+00,
            1.4973e+00,  2.4950e+00,  1.0000e+00],
          [ 3.4093e+01, 

In [9]:
print(output['voxels'].shape)
print(output['voxel_coords'].shape)
print(output['voxel_num_points'].shape)

torch.Size([58812, 5, 4])
torch.Size([58812, 4])
torch.Size([58812])


In [10]:
print(voxelize_module.grid_size)
print(voxelize_module.voxel_size)

[1408 1600   40]
[0.05, 0.05, 0.1]


# Feature Extractor/Encoding Module
点云特征提取模块目的是：从无序的原始点云数据中提取出有序的初步特征,或者说找到一种方式编码原始点云，
令其有序。其实就是把原始点云转换为有序的张量矩阵
常见PointNet的方式，就是为了提取有序的初步特征；而体素的方式，是为了用体素这种格式编码原始点云，令其有序
为什么要这样做？我的理解是，现有CNN只能处理有序的张量！！！不管是3d卷积还是2d卷积



Voxel Feature Extractor(VFE)
- 提取体素级别的特征
输入：体素，以及体素相关的信息
输出：提取的体素特征
- Mean VFE：取每个体素内所有点的平均值作为输出特征
- MLP VFE:对每个体素内的点集，做类似PointNet的操作。即用MLP + Max pooling 提取点集的特征

In [11]:
fe_cfg = model_cfg.FEATURE_EXTRACTOR
pprint(fe_cfg)

{'NAME': 'MeanVFE', 'is_normalize': True}


In [12]:
#mean vfe
from basic.module.feature_extractor import MeanVFE
mean_vfe_module = MeanVFE(model_info_dict, **fe_cfg).cuda()
output = mean_vfe_module(output)
model_info_dict['module_list'].append(mean_vfe_module)
print(f"Mean VFE： voxel_features shape:", output['voxel_features'].shape)
output

Mean VFE： voxel_features shape: torch.Size([58812, 4])


{'frame_id': array(['006974', '007388', '006697', '007113'], dtype='<U6'),
 'gt_boxes': tensor([[[ 3.4287e+01, -3.6757e+01, -5.7138e-01,  3.3906e+00,  1.3138e+00,
            1.4297e+00, -3.6820e-01,  1.0000e+00],
          [ 2.6158e+01, -2.0168e+01, -7.6718e-01,  4.3470e+00,  1.5746e+00,
            1.5649e+00, -6.6820e-01,  1.0000e+00],
          [ 1.1218e+01, -5.0065e+00, -8.7279e-01,  4.2407e+00,  1.6229e+00,
            1.3910e+00,  2.4450e+00,  1.0000e+00],
          [ 7.4499e+00, -2.3022e+01, -5.1212e-01,  3.7481e+00,  1.5070e+00,
            1.4393e+00, -6.8820e-01,  1.0000e+00],
          [ 2.6972e+01, -2.6909e+01,  9.2320e-02,  4.0379e+00,  1.7968e+00,
            1.4297e+00, -2.3082e+00,  1.0000e+00],
          [ 2.5747e+01, -2.3670e+01, -7.6032e-01,  3.6611e+00,  1.5359e+00,
            1.4490e+00, -6.2820e-01,  1.0000e+00],
          [ 2.3785e+01, -1.1817e+01, -8.9798e-01,  4.3470e+00,  1.6519e+00,
            1.4973e+00,  2.4950e+00,  1.0000e+00],
          [ 3.4093e+01, 

In [13]:
print(f"current model infos:")
for key, value in model_info_dict.items():
    print(f"{key}:{value}")

current model infos:
module_list:[VoxelLayer(), MeanVFE()]
training:True
raw_point_feature_dims:4
cur_point_feature_dims:4
point_cloud_range:[  0.  -40.   -3.   70.4  40.    1. ]
voxel_size:[0.05, 0.05, 0.1]
grid_size:[1408 1600   40]
class_names:['Car']


In [14]:
# mlp vfe just test
from basic.module.feature_extractor import MlpVFE
cfg = {'mlp_dims': [32, 64, 64, 128, 128],
       'input_channels': 4}
mlp_vfe_module = MlpVFE(cfg).cuda()
t = mlp_vfe_module(test_data)
print(f"Mlp VFE： voxel_features shape:", t.shape)

Mlp VFE： voxel_features shape: torch.Size([58812, 128])


Point Feature Extractor(PFE)
- 直接提取原始点云的特征
- 代表方法PointNet++的SetAbstract layer

In [15]:
#todo

# Backbone3D
- 经过原始点云的特征提取/编码后，一般会得到B,C,VH,VW,VD的体素张量特征矩阵。或B,C,H,W的点云特征张量矩阵。
根据特征张量维度选择用3D卷积还是2D卷积网络来进一步提取特征。
- 因为体素张量特征矩阵非常稀疏，多使用稀疏卷积。使用spconv库来进行稀疏3D卷积

In [16]:
back3d_cfg = model_cfg.BACKBONE3D
pprint(back3d_cfg)

{'NAME': 'VoxelBackBone8x', 'in_channels': 4}


In [17]:
from basic.module.backbone3d import VoxelBackBone8x
backbone3d_module = VoxelBackBone8x(model_info_dict, **back3d_cfg).cuda()
output = backbone3d_module(output)
model_info_dict['module_list'].append(backbone3d_module)
print(f"spconv_tensor_shape:", output['sp_feat3d'].dense().shape)
output

spconv_tensor_shape: torch.Size([4, 128, 2, 200, 176])


{'frame_id': array(['006974', '007388', '006697', '007113'], dtype='<U6'),
 'gt_boxes': tensor([[[ 3.4287e+01, -3.6757e+01, -5.7138e-01,  3.3906e+00,  1.3138e+00,
            1.4297e+00, -3.6820e-01,  1.0000e+00],
          [ 2.6158e+01, -2.0168e+01, -7.6718e-01,  4.3470e+00,  1.5746e+00,
            1.5649e+00, -6.6820e-01,  1.0000e+00],
          [ 1.1218e+01, -5.0065e+00, -8.7279e-01,  4.2407e+00,  1.6229e+00,
            1.3910e+00,  2.4450e+00,  1.0000e+00],
          [ 7.4499e+00, -2.3022e+01, -5.1212e-01,  3.7481e+00,  1.5070e+00,
            1.4393e+00, -6.8820e-01,  1.0000e+00],
          [ 2.6972e+01, -2.6909e+01,  9.2320e-02,  4.0379e+00,  1.7968e+00,
            1.4297e+00, -2.3082e+00,  1.0000e+00],
          [ 2.5747e+01, -2.3670e+01, -7.6032e-01,  3.6611e+00,  1.5359e+00,
            1.4490e+00, -6.2820e-01,  1.0000e+00],
          [ 2.3785e+01, -1.1817e+01, -8.9798e-01,  4.3470e+00,  1.6519e+00,
            1.4973e+00,  2.4950e+00,  1.0000e+00],
          [ 3.4093e+01, 

In [18]:
print(f"current model infos:")
for key, value in model_info_dict.items():
    print(f"{key}:{value}")

current model infos:
module_list:[VoxelLayer(), MeanVFE(), VoxelBackBone8x(
  (conv_input): SparseSequential(
    (0): SubMConv3d()
    (1): BatchNorm1d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (conv_layers): ModuleList(
    (0): SparseSequential(
      (0): SubMConv3d()
      (1): BatchNorm1d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (1): SparseSequential(
      (0): SparseConv3d()
      (1): BatchNorm1d(32, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (2): SparseSequential(
      (0): SubMConv3d()
      (1): BatchNorm1d(32, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (3): SparseSequential(
      (0): SubMConv3d()
      (1): BatchNorm1d(32, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (4): SparseSequential(
      (0): SparseConv3d()
      

# NECK
BackBone3D提取的特征向量依然处于3维空间内。目前一般不会在3维空间内提取ROIs。因为3DNMS，3DIOU等都很麻烦....。
因此直接在前视图FOV或在鸟瞰图BEV上提取ROIs。为此需要将3d特征转换为2d特征。
- 常用的Neck：
直接压缩：比如将B,C,D,H,W的特征压缩为B，C*H，D,W,此时的特征图可以认为是BEV视角下的二维特征图

In [19]:
neck_cfg = model_cfg.NECK
pprint(neck_cfg)

{'NAME': 'DimCompression', 'dim': 2}


In [20]:
from basic.module.neck import DimCompression
neck_module = DimCompression(model_info_dict=model_info_dict, **neck_cfg)
output = neck_module(output)
model_info_dict['module_list'].append(neck_module)
print("直接压缩", output['dense_feat2d'].shape)
output

直接压缩 torch.Size([4, 256, 200, 176])


{'frame_id': array(['006974', '007388', '006697', '007113'], dtype='<U6'),
 'gt_boxes': tensor([[[ 3.4287e+01, -3.6757e+01, -5.7138e-01,  3.3906e+00,  1.3138e+00,
            1.4297e+00, -3.6820e-01,  1.0000e+00],
          [ 2.6158e+01, -2.0168e+01, -7.6718e-01,  4.3470e+00,  1.5746e+00,
            1.5649e+00, -6.6820e-01,  1.0000e+00],
          [ 1.1218e+01, -5.0065e+00, -8.7279e-01,  4.2407e+00,  1.6229e+00,
            1.3910e+00,  2.4450e+00,  1.0000e+00],
          [ 7.4499e+00, -2.3022e+01, -5.1212e-01,  3.7481e+00,  1.5070e+00,
            1.4393e+00, -6.8820e-01,  1.0000e+00],
          [ 2.6972e+01, -2.6909e+01,  9.2320e-02,  4.0379e+00,  1.7968e+00,
            1.4297e+00, -2.3082e+00,  1.0000e+00],
          [ 2.5747e+01, -2.3670e+01, -7.6032e-01,  3.6611e+00,  1.5359e+00,
            1.4490e+00, -6.2820e-01,  1.0000e+00],
          [ 2.3785e+01, -1.1817e+01, -8.9798e-01,  4.3470e+00,  1.6519e+00,
            1.4973e+00,  2.4950e+00,  1.0000e+00],
          [ 3.4093e+01, 

In [21]:
print(f"current model infos:")
for key, value in model_info_dict.items():
    print(f"{key}:{value}")

current model infos:
module_list:[VoxelLayer(), MeanVFE(), VoxelBackBone8x(
  (conv_input): SparseSequential(
    (0): SubMConv3d()
    (1): BatchNorm1d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (conv_layers): ModuleList(
    (0): SparseSequential(
      (0): SubMConv3d()
      (1): BatchNorm1d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (1): SparseSequential(
      (0): SparseConv3d()
      (1): BatchNorm1d(32, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (2): SparseSequential(
      (0): SubMConv3d()
      (1): BatchNorm1d(32, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (3): SparseSequential(
      (0): SubMConv3d()
      (1): BatchNorm1d(32, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (4): SparseSequential(
      (0): SparseConv3d()
      

# Backbone2D
与BackBone3D一样，根据输入的张量维度。选择用2D卷积网络提取特征。通常如果使用NECK 模块将3维空间内的特征压缩为2维空间的特征后
也会再次使用2D的卷积网络再次提取特征。

In [22]:
backbone2d_cfg = model_cfg.BACKBONE2D
pprint(backbone2d_cfg)

{'MULTI_FEAT_LAYER_CONFIG': {'in_channels': 256,
                             'layer_nums': [5, 5],
                             'layer_strides': [1, 2],
                             'out_channels': [128, 256]},
 'NAME': 'SECONDFPN',
 'in_channels': [128, 256],
 'out_channels': [256, 256],
 'upsample_strides': [1, 2]}


In [23]:
from basic.module.backbone2d import SECONDFPN
backbone2d = SECONDFPN(model_info_dict, **backbone2d_cfg).cuda()
output = backbone2d(output)
model_info_dict['module_list'].append(backbone2d)
output



{'frame_id': array(['006974', '007388', '006697', '007113'], dtype='<U6'),
 'gt_boxes': tensor([[[ 3.4287e+01, -3.6757e+01, -5.7138e-01,  3.3906e+00,  1.3138e+00,
            1.4297e+00, -3.6820e-01,  1.0000e+00],
          [ 2.6158e+01, -2.0168e+01, -7.6718e-01,  4.3470e+00,  1.5746e+00,
            1.5649e+00, -6.6820e-01,  1.0000e+00],
          [ 1.1218e+01, -5.0065e+00, -8.7279e-01,  4.2407e+00,  1.6229e+00,
            1.3910e+00,  2.4450e+00,  1.0000e+00],
          [ 7.4499e+00, -2.3022e+01, -5.1212e-01,  3.7481e+00,  1.5070e+00,
            1.4393e+00, -6.8820e-01,  1.0000e+00],
          [ 2.6972e+01, -2.6909e+01,  9.2320e-02,  4.0379e+00,  1.7968e+00,
            1.4297e+00, -2.3082e+00,  1.0000e+00],
          [ 2.5747e+01, -2.3670e+01, -7.6032e-01,  3.6611e+00,  1.5359e+00,
            1.4490e+00, -6.2820e-01,  1.0000e+00],
          [ 2.3785e+01, -1.1817e+01, -8.9798e-01,  4.3470e+00,  1.6519e+00,
            1.4973e+00,  2.4950e+00,  1.0000e+00],
          [ 3.4093e+01, 

In [24]:
output['dense_feat2d'].shape

torch.Size([4, 512, 200, 176])

In [25]:
print(f"current model infos:")
for key, value in model_info_dict.items():
    print(f"{key}:{value}")

current model infos:
module_list:[VoxelLayer(), MeanVFE(), VoxelBackBone8x(
  (conv_input): SparseSequential(
    (0): SubMConv3d()
    (1): BatchNorm1d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (conv_layers): ModuleList(
    (0): SparseSequential(
      (0): SubMConv3d()
      (1): BatchNorm1d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (1): SparseSequential(
      (0): SparseConv3d()
      (1): BatchNorm1d(32, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (2): SparseSequential(
      (0): SubMConv3d()
      (1): BatchNorm1d(32, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (3): SparseSequential(
      (0): SubMConv3d()
      (1): BatchNorm1d(32, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (4): SparseSequential(
      (0): SparseConv3d()
      

In [26]:
# from basic.module.backbone2d.base_bev_backbone import BaseBEVBackbone
# backbone2d_cfg = model_cfg.MODEL.BACKBONE2D
# backbone2d = BaseBEVBackbone(backbone2d_cfg, model_info_dict).cuda()
# output = backbone2d(output)
# output['spatial_features_2d'].shape

截止目前为止：输入点云的shape变化为
- 原始点云->体素：183149, 5, 4
- VFE：183149, 4
- BackBone3D：12， 128， 2， 200， 176
- neck：12，256，200，176
- BackBone2D：12，256，200，176，shape未变因为卷积过后，又转置卷积回了原始大小
经过上面的各个模块，从原始点云中获取了能代表该点云的二维特征图。接下来是3D目标识别中最重要的部分：Dense Head 与 ROI head。

# Dense Head
BackBone2D的输出为用于Bbox回归的，和Bbox分类的两个likelihood矩阵

anchor generator（一）
- 枚举7种anchor可能用到的特征，即x，y，z，h，w，l，r。然后通过mesh gird产生所有anchors。其中根据anchor中心坐标xyz的取法不同又分为Range和Stride两种方案
    - Range：在点云范围内，给定每个轴的取值范围。每个轴按照特征图中对应的维度平均划分这些轴。比如特征图对应X轴的维度大小为176，就在X轴范围内平均划分176个。
    - Stride：给定xyz坐标下的原点坐标，分别以x stride，y stride，z stride沿着各个轴的正方向按步长获得anchor中心坐标xyz。
    - 代码接口虽然可以自定义Range和Stride。但是为了将特征图上的每个特征点与原图上的每个anchor关联起来，一定要平均划分！！即Range取值为点云的范围，而Stride取值为
  点云采样范围 / 特征图大小。即\[z_stride, x_stride, y_stride\]=\[z_len, x_len, y_len\] / \[H, W, L\]。这样看按Range还是Stride的方案取得的结果应该差距不大。。。
    - 实际上就是把特征图上的每个特征点，映射回了原始数据上对应区域的中心？假如原始点云下采样了8倍得到特征图，则特征图中\[0,0,0\]点对应原点云（点云原点坐标为000）中以\[8,8,8\]为中心，边长为8的正方形区域？

In [27]:
# anchor generator config
anchor_gen_cfg = model_cfg.DENSE_HEAD.ANCHOR_GENERATOR_CONFIG
pprint(anchor_gen_cfg)

{'CLASS_CONFIG': [{'anchor_dims': 3,
                   'boxes_size': [[3.9, 1.6, 1.56]],
                   'center_aligned': True,
                   'class_name': 'Car',
                   'mode': 'Range',
                   'ratios': [1],
                   'road_plane_aligned': True,
                   'road_plane_height': -0.035,
                   'rotations': [0, 1.57]},
                  {'anchor_dims': 3,
                   'boxes_size': [[0.8, 0.6, 1.73]],
                   'center_aligned': True,
                   'class_name': 'Pedestrian',
                   'mode': 'Range',
                   'ratios': [1],
                   'road_plane_aligned': True,
                   'road_plane_height': -1.2,
                   'rotations': [0, 1.57]},
                  {'anchor_bottom_heights': [-0.6],
                   'anchor_dims': 3,
                   'boxes_size': [[1.76, 0.6, 1.73]],
                   'center_aligned': True,
                   'class_name': 'Cyclist',
 

In [28]:
from basic.module.dense_head.anchor_generator.anchor_gen_base import AnchorGenerator
anchor_generator = AnchorGenerator(anchor_gen_cfg, model_info_dict, class_type='Car', dtype=torch.float32)
anchors = anchor_generator.gen_anchors(flatten_output=False, feature_map_size=np.array([1, 200, 178]))
print("Range anchors shape:", anchors.shape)
print("Range stride:", anchors[1, 1] - anchors[0, 0])
print("begin:", anchors[0, 0])
print("end:", anchors[-1, -1])

Range anchors shape: torch.Size([178, 200, 1, 1, 2, 7])
Range stride: tensor([[[[0.3955, 0.4000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.3955, 0.4000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]]],
       device='cuda:0')
begin: tensor([[[[ 1.9775e-01, -3.9800e+01, -3.5000e-02,  3.9000e+00,  1.6000e+00,
            1.5600e+00,  0.0000e+00],
          [ 1.9775e-01, -3.9800e+01, -3.5000e-02,  3.9000e+00,  1.6000e+00,
            1.5600e+00,  1.5700e+00]]]], device='cuda:0')
end: tensor([[[[ 7.0202e+01,  3.9800e+01, -3.5000e-02,  3.9000e+00,  1.6000e+00,
            1.5600e+00,  0.0000e+00],
          [ 7.0202e+01,  3.9800e+01, -3.5000e-02,  3.9000e+00,  1.6000e+00,
            1.5600e+00,  1.5700e+00]]]], device='cuda:0')


In [29]:
anchor_generator.set_mode('Stride')
anchors = anchor_generator.gen_anchors(flatten_output=False, feature_map_size=np.array([1, 200, 178]))
print("stride:", anchor_generator.stride)
print("Stride anchors shape:", anchors.shape)
print("begin:", anchors[0, 0])
print("end:", anchors[-1, -1])
final_anchors = anchors.view(-1, 7)
print("output anchor shape:", final_anchors.shape)

stride: tensor([0.3955, 0.4000, 4.0000])
Stride anchors shape: torch.Size([178, 200, 1, 1, 2, 7])
begin: tensor([[[[ 1.9775e-01, -3.9800e+01, -3.5000e-02,  3.9000e+00,  1.6000e+00,
            1.5600e+00,  0.0000e+00],
          [ 1.9775e-01, -3.9800e+01, -3.5000e-02,  3.9000e+00,  1.6000e+00,
            1.5600e+00,  1.5700e+00]]]], device='cuda:0')
end: tensor([[[[ 7.0202e+01,  3.9800e+01, -3.5000e-02,  3.9000e+00,  1.6000e+00,
            1.5600e+00,  0.0000e+00],
          [ 7.0202e+01,  3.9800e+01, -3.5000e-02,  3.9000e+00,  1.6000e+00,
            1.5600e+00,  1.5700e+00]]]], device='cuda:0')
output anchor shape: torch.Size([71200, 7])


1.注意输出anchors的shape为176, 200, 1, 1, 2, 7。最后一个维度代表anchor的特征向量xyzlwhr，
其他维度分别与x y z size rot的可枚举数量一致.当然最后输出的shape为(176x200x1x1x2, 7)
2.在对齐体素中心的情况下，Range和Stride两种方案的结果都是一样的。假如点云的x轴范围为\[0, 70.4\]，
而x轴对应的维度在特征图上大小为176.则均分后相邻点的距离为70.4 / 176 = 0.4。Range和Stride
枚举X坐标的核心代码如下

In [30]:
ranges = torch.linspace(0, 70.4, 176)
range_align_center = torch.linspace(0 + 0.2, 70.4 - 0.2, 176)
stride = torch.arange(0, 176) * 0.4
stride_align_center = stride + 0.4 / 2

In [31]:
anchor_generator.set_mode('Range')

随机绘制100个anchor box看看
- 明显anchor 产生的全部BBox能覆盖整个点云cube范围

In [32]:
%matplotlib auto
anchors = anchor_generator.gen_anchors(flatten_output=True, feature_map_size=np.array([1, 200, 178]))
w = VisualWindow(mode='3d')
points = test_data['points']
test_pc = points[points[:, 0] == 0][:, 1:]
w.draw_point_cloud(pc=test_pc.cpu().numpy())
sample_ids = torch.randperm(anchors.size(0))[:100]
w.draw_boxes3d(boxes=anchors[sample_ids].cpu().numpy(), format='corner')

Using matplotlib backend: Qt5Agg


anchor generator（二）
- 在xyz坐标原点生成基本的anchors，然后通过stride。移动这些anchors

MultiClass Generator
在同一feature map上为每种类别生成对应的anchor。输出\[class_dim，xdim，ydim，zdim，size_dim,rot_dim,7\]

In [33]:
# from basic.module.dense_head.anchor_generator.anchor_gen_base import MultiClsAnchorGenerator
# mul_generator = MultiClsAnchorGenerator(anchor_gen_cfg, model_info_dict,
#                                         feature_map_size=np.array([1, 200, 178],
#                                         cls_list=['Car', 'Pedestrian', 'Cyclist'])
# all_anchors = mul_generator.gen_anchors(flatten_output=False)
# print("class_dim，xdim，ydim，zdim，size_dim,rot_dim,7:", all_anchors.shape)
# print("Car:", all_anchors[0, 0, 0])
# print("Pedestrian:", all_anchors[1, 0, 0])
# print("Cyclist", all_anchors[2, 0, 0])

Target assigner
目的：1.训练时，为每个anchor指定类别标签和Boxes偏移量标签；
输入：1.Anchors\[K,7\];2.Ground Truth Boxes\[B,N,8\],其中8=xyzhwlr+class_ind

In [34]:
print("gt_box_temp:", test_data['gt_boxes'][0, 0])
assigner_cfg = model_cfg.DENSE_HEAD.TARGET_ASSIGNER_CONFIG
pprint(assigner_cfg)

gt_box_temp: tensor([ 34.2874, -36.7568,  -0.5714,   3.3906,   1.3138,   1.4297,  -0.3682,
          1.0000], device='cuda:0')
{'BOX_ENCODER': {'NAME': 'ResidualCoder',
                 'code_size': 7,
                 'encode_angle_by_sincos': False},
 'CLASS_THRESHOLD': [{'class_name': 'Car',
                      'neg_threshold': 0.4,
                      'pos_threshold': 0.55},
                     {'class_name': 'Pedestrian',
                      'neg_threshold': 0.35,
                      'pos_threshold': 0.5},
                     {'class_name': 'Cyclist',
                      'neg_threshold': 0.35,
                      'pos_threshold': 0.5}],
 'DEVICE': 'cuda',
 'FORCE_MATCH': True,
 'IOU_CALCULATOR': {'NAME': 'Iou3DCalculator', 'use_bev_iou': True},
 'MATCH_HEIGHT': False,
 'NAME': 'MaxIouTargetAssigner',
 'NORM_BY_NUM_EXAMPLES': False,
 'POS_FRACTION': -1.0,
 'SAMPLER': {'NAME': 'MaxSizeSubSampler', 'sample_size': 512}}


In [35]:
for key, value in model_info_dict.items():
    if key != 'module_list':
        print(key, value)

training True
raw_point_feature_dims 4
cur_point_feature_dims 4
point_cloud_range [  0.  -40.   -3.   70.4  40.    1. ]
voxel_size [0.05, 0.05, 0.1]
grid_size [1408 1600   40]
class_names ['Car']


In [36]:
from basic.module.dense_head.target_assigner import MaxIouTargetAssigner
target_assigner = MaxIouTargetAssigner(assigner_cfg, model_info_dict)

In [37]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
print("labels:", test_data['gt_boxes'][:, :, -1])

labels: tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.]],
       device='cuda:0')


In [38]:
# all_anchors = mul_generator.gen_anchors(flatten_output=True).cuda()
all_anchors = anchor_generator.gen_anchors(flatten_output=True, feature_map_size=np.array([1, 200, 178])).to(device)
target_assigner.force_match = True
# target_dict, batch_bbox_id_dict = target_assigner.assign(gts=test_data['gt_boxes'][..., :-1], bboxes=all_anchors, gt_labels=test_data['gt_boxes'][:, :, -1])
assign_ret = target_assigner.assign(gts=test_data['gt_boxes'][..., :-1], bboxes=all_anchors, gt_labels=test_data['gt_boxes'][..., -1])

In [39]:
pos_tuples = assign_ret.pos_tuples
neg_tuples = assign_ret.neg_tuples
pos_tuples

tensor([[    0,     3,  6484],
        [    0,     3,  6884],
        [    0,     3,  7282],
        ...,
        [    3,     4, 60960],
        [    3,     4, 61358],
        [    3,     4, 61360]], device='cuda:0')

In [40]:
t1, t2 = torch.where(assign_ret.pos_tuples_dense >= 0)
print(t1)
print(t1 == pos_tuples[:, 0])
print(t2)
print(t2 == pos_tuples[:, 2])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [41]:
t = assign_ret.bbox_targets.nonzero()[:, :2].unique(dim=0)
print(t)
print(t[:,0] == pos_tuples[:, 0])
print(t[:,1] == pos_tuples[:, 2])

tensor([[    0,  6484],
        [    0,  6884],
        [    0,  7282],
        [    0,  7284],
        [    0,  7390],
        [    0,  7684],
        [    0,  7788],
        [    0,  7790],
        [    0,  8084],
        [    0,  8188],
        [    0,  8190],
        [    0,  8588],
        [    0,  8590],
        [    0, 10174],
        [    0, 10574],
        [    0, 10974],
        [    0, 11326],
        [    0, 11372],
        [    0, 11374],
        [    0, 11376],
        [    0, 11726],
        [    0, 11774],
        [    0, 12124],
        [    0, 12126],
        [    0, 12174],
        [    0, 12524],
        [    0, 12526],
        [    0, 12926],
        [    0, 18128],
        [    0, 18130],
        [    0, 18528],
        [    0, 18530],
        [    0, 18928],
        [    0, 18930],
        [    0, 19328],
        [    0, 19330],
        [    0, 21685],
        [    0, 21687],
        [    0, 21689],
        [    0, 22083],
        [    0, 22085],
        [    0, 

In [42]:
t = torch.where(assign_ret.bbox_weights == 1)[1]
print(t)
print(t == pos_tuples[:, -1])

tensor([ 6484,  6884,  7282,  7284,  7390,  7684,  7788,  7790,  8084,  8188,
         8190,  8588,  8590, 10174, 10574, 10974, 11326, 11372, 11374, 11376,
        11726, 11774, 12124, 12126, 12174, 12524, 12526, 12926, 18128, 18130,
        18528, 18530, 18928, 18930, 19328, 19330, 21685, 21687, 21689, 22083,
        22085, 22087, 22089, 22091, 22940, 23340, 23738, 23740, 24138, 24140,
        24540, 24940, 25280, 25298, 25350, 25680, 25682, 25698, 25748, 25750,
        26080, 26082, 26098, 26100, 26148, 26150, 26480, 26498, 26500, 26550,
        26863, 26865, 26867, 26898, 27261, 27263, 27265, 27267, 27269, 27298,
        28098, 28498, 28898, 28900, 29298, 29300, 29698, 30523, 30525, 30527,
        30921, 30923, 30925, 30927, 30929, 32092, 32194, 32492, 32592, 32594,
        32892, 32992, 32994, 33292, 33392, 33394, 33692, 33732, 33794, 34014,
        34016, 34130, 34132, 34414, 34416, 34530, 34532, 34814, 34816, 34930,
        34932, 35216,  2167,  2169,  2171,  2565,  2567,  2569, 

In [43]:
t = torch.where(assign_ret.cls_weights == 1)
print(t[1].sort()[0])
torch.cat([ pos_tuples[:, -1], neg_tuples[: ,-1]]).sort()[0]

tensor([   24,    77,   177,  ..., 71151, 71158, 71163], device='cuda:0')


tensor([   24,    77,   177,  ..., 71151, 71158, 71163], device='cuda:0')

可视化每个场景中，通过target assign匹配的anchor bbox

In [44]:
%matplotlib auto
batch_bbox = assign_ret.pos_tuples
batch_ids = batch_bbox[:, 0]
bbox_ids = batch_bbox[:, 2]
points = test_data['points']
for i in range(batch_size):
    mask = batch_ids == i
    frame_bbox_ids = bbox_ids[mask]
    if frame_bbox_ids.size(0) > 0:
        print(frame_bbox_ids)
        frame_pc = points[points[:, 0] == i][:, 1:]
        frame_bbox = all_anchors[frame_bbox_ids]
        frame_gt = test_data['gt_boxes'][i]
        w = VisualWindow(mode='3d')
        w.draw_point_cloud(frame_pc.cpu().numpy())
        w.draw_boxes3d(frame_gt[:,:7].cpu().numpy())
        w.draw_boxes3d(frame_bbox.cpu().numpy(), 'corner', c='r')
        # break

Using matplotlib backend: Qt5Agg
tensor([ 6484,  6884,  7282,  7284,  7390,  7684,  7788,  7790,  8084,  8188,
         8190,  8588,  8590, 10174, 10574, 10974, 11326, 11372, 11374, 11376,
        11726, 11774, 12124, 12126, 12174, 12524, 12526, 12926, 18128, 18130,
        18528, 18530, 18928, 18930, 19328, 19330, 21685, 21687, 21689, 22083,
        22085, 22087, 22089, 22091, 22940, 23340, 23738, 23740, 24138, 24140,
        24540, 24940, 25280, 25298, 25350, 25680, 25682, 25698, 25748, 25750,
        26080, 26082, 26098, 26100, 26148, 26150, 26480, 26498, 26500, 26550,
        26863, 26865, 26867, 26898, 27261, 27263, 27265, 27267, 27269, 27298,
        28098, 28498, 28898, 28900, 29298, 29300, 29698, 30523, 30525, 30527,
        30921, 30923, 30925, 30927, 30929, 32092, 32194, 32492, 32592, 32594,
        32892, 32992, 32994, 33292, 33392, 33394, 33692, 33732, 33794, 34014,
        34016, 34130, 34132, 34414, 34416, 34530, 34532, 34814, 34816, 34930,
        34932, 35216], device='

上面所有子模块组成基于anchor的Dense head：anchor head

In [45]:
dense_head_cfg = model_cfg.DENSE_HEAD
pprint(dense_head_cfg)

{'ANCHOR_GENERATOR_CONFIG': {'CLASS_CONFIG': [{'anchor_dims': 3,
                                               'boxes_size': [[3.9, 1.6, 1.56]],
                                               'center_aligned': True,
                                               'class_name': 'Car',
                                               'mode': 'Range',
                                               'ratios': [1],
                                               'road_plane_aligned': True,
                                               'road_plane_height': -0.035,
                                               'rotations': [0, 1.57]},
                                              {'anchor_dims': 3,
                                               'boxes_size': [[0.8, 0.6, 1.73]],
                                               'center_aligned': True,
                                               'class_name': 'Pedestrian',
                                               'mode': 'Range',
          

In [46]:
from basic.module.dense_head.anchor_head.anchor3d_head import Anchor3DHead
anchor_head = Anchor3DHead(top_cfg, model_info_dict).to(device)
output_dict = anchor_head(output)
output_dict

{'cls_pred': tensor([[[ 0.0472,  0.1896],
          [ 0.0864, -0.1015],
          [ 0.0270,  0.1919],
          ...,
          [-0.0729,  0.2113],
          [ 0.0480,  0.1261],
          [-0.0521,  0.0649]],
 
         [[ 0.0470,  0.1896],
          [ 0.0864, -0.1015],
          [ 0.0268,  0.1918],
          ...,
          [-0.0729,  0.2113],
          [ 0.0480,  0.1261],
          [-0.0521,  0.0649]],
 
         [[ 0.0470,  0.1896],
          [ 0.0864, -0.1015],
          [ 0.0268,  0.1918],
          ...,
          [-0.0729,  0.2113],
          [ 0.0480,  0.1261],
          [-0.0521,  0.0649]],
 
         [[ 0.0470,  0.1896],
          [ 0.0864, -0.1015],
          [ 0.0268,  0.1918],
          ...,
          [-0.0779,  0.2164],
          [ 0.0498,  0.1297],
          [-0.0545,  0.0742]]], device='cuda:0', grad_fn=<UnsafeViewBackward>),
 'reg_pred': tensor([[[-0.1089, -0.0311,  0.0013,  ...,  0.1299,  0.0174, -0.0485],
          [ 0.1816, -0.0382, -0.0508,  ..., -0.1204, -0.1244,  0.

最后直接用模型配置文档生成SECOND模型

In [47]:
from basic.model.second import SECOND
data_infos = dataloader.dataset.get_data_infos()
model = SECOND(top_cfg, data_infos).cuda()

In [48]:
loss = model(test_data)
loss['tol_loss']

tensor(11.0971, device='cuda:0', grad_fn=<AddBackward0>)

IOU

In [49]:
s = torch.randn(10, 1000, 4)
max_s,arg_s = s.max(dim=-1)
_,topk = max_s.topk(5, dim=1)

In [50]:
topk

tensor([[837, 470, 991,  24, 242],
        [916,  73, 536, 467, 827],
        [623, 915, 187, 905, 831],
        [826, 705, 793, 430, 374],
        [317, 165,  94, 425, 261],
        [435, 220, 640, 574, 226],
        [902, 884,  87, 125, 903],
        [207, 333, 718, 926, 448],
        [ 58, 518, 638, 139, 631],
        [795, 522, 660,   5, 312]])

In [51]:
torch.where(arg_s > 0)

(tensor([0, 0, 0,  ..., 9, 9, 9]),
 tensor([  1,   2,   3,  ..., 995, 997, 998]))

In [52]:
data = torch.arange(12).view(3,4)
torch.tensor([0,1,2,3]) in data

True

In [53]:
data == torch.tensor([0,1,2,3])

tensor([[ True,  True,  True,  True],
        [False, False, False, False],
        [False, False, False, False]])

In [54]:
for i in data:
    print(i)

tensor([0, 1, 2, 3])
tensor([4, 5, 6, 7])
tensor([ 8,  9, 10, 11])


In [55]:
data = torch.randint(0, 1000, (178, 200))

In [56]:
data.view(-1,1)[1234]

tensor([554])

In [57]:
data.permute(1,0).reshape(-1,1)[1234]

tensor([112])

In [58]:
torch.permute

<function _VariableFunctionsClass.permute>