From d1aac35d68a203955a32bca4635429f620fc08dd Mon Sep 17 00:00:00 2001 From: zhangwenwei Date: Tue, 14 Apr 2020 21:21:42 +0800 Subject: [PATCH] Initial commit --- .gitignore | 127 ++ .gitlab-ci.yml | 43 + .isort.cfg | 8 + .pre-commit-config.yaml | 27 + .style.yapf | 4 + .travis.yml | 43 + README.md | 58 + ...pn-fusion_adamw_2x8_80e_kitti-3d-3class.py | 283 ++++ ...intpillars_secfpn_6x8_160e_kitti-3d-car.py | 203 +++ ...d_secfpn_2x8_cosine_80e_kitti-3d-3class.py | 231 ++++ .../dv_second_secfpn_6x8_80e_kitti-3d-car.py | 199 +++ ...ffe_1x_kitti-2d-3class_coco-3x-pretrain.py | 194 +++ ...intpillars_secfpn_6x8_160e_kitti-3d-car.py | 204 +++ .../hv_second_secfpn_6x8_80e_kitti-3d-car.py | 197 +++ .../faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py | 187 +++ ...ntpillars_secfpn_sbn-all_4x8_20e_nus-3d.py | 236 ++++ ...pn_sbn-all_freeze_adamw_1x16_20e_nus-mm.py | 267 ++++ .../nus/retinanet_r50_fpn_caffe_2x8_1x_nus.py | 138 ++ docs/CHANGELOG.md | 209 +++ docs/CODE_OF_CONDUCT.md | 76 ++ docs/CONTRIBUTING.md | 35 + docs/GETTING_STARTED.md | 510 ++++++++ docs/INSTALL.md | 161 +++ docs/MODEL_ZOO.md | 532 ++++++++ docs/Makefile | 20 + docs/ROBUSTNESS_BENCHMARKING.md | 109 ++ docs/TECHNICAL_DETAILS.md | 226 ++++ docs/conf.py | 70 + docs/index.rst | 19 + docs/make.bat | 35 + docs/requirements.txt | 4 + mmdet3d/__init__.py | 3 + mmdet3d/apis/__init__.py | 5 + mmdet3d/apis/train.py | 199 +++ mmdet3d/core/__init__.py | 8 + mmdet3d/core/anchor/__init__.py | 19 + mmdet3d/core/anchor/anchor_generator.py | 288 +++++ mmdet3d/core/bbox/__init__.py | 49 + mmdet3d/core/bbox/assign_sampling.py | 43 + mmdet3d/core/bbox/assigners/__init__.py | 8 + .../bbox/assigners/approx_max_iou_assigner.py | 114 ++ mmdet3d/core/bbox/assigners/assign_result.py | 19 + mmdet3d/core/bbox/assigners/base_assigner.py | 8 + .../core/bbox/assigners/max_iou_assigner.py | 169 +++ mmdet3d/core/bbox/box_np_ops.py | 568 ++++++++ mmdet3d/core/bbox/box_torch_ops.py | 192 +++ mmdet3d/core/bbox/coders/__init__.py | 3 + mmdet3d/core/bbox/coders/box_coder.py | 116 ++ mmdet3d/core/bbox/geometry.py | 131 ++ mmdet3d/core/bbox/samplers/__init__.py | 14 + mmdet3d/core/bbox/samplers/base_sampler.py | 78 ++ .../core/bbox/samplers/combined_sampler.py | 16 + .../samplers/instance_balanced_pos_sampler.py | 41 + .../bbox/samplers/iou_balanced_neg_sampler.py | 133 ++ mmdet3d/core/bbox/samplers/ohem_sampler.py | 73 ++ mmdet3d/core/bbox/samplers/pseudo_sampler.py | 26 + mmdet3d/core/bbox/samplers/random_sampler.py | 53 + mmdet3d/core/bbox/samplers/sampling_result.py | 24 + mmdet3d/core/bbox/transforms.py | 269 ++++ mmdet3d/core/evaluation/__init__.py | 14 + mmdet3d/core/evaluation/bbox_overlaps.py | 47 + mmdet3d/core/evaluation/class_names.py | 127 ++ mmdet3d/core/evaluation/coco_utils.py | 251 ++++ mmdet3d/core/evaluation/eval_hooks.py | 204 +++ .../core/evaluation/kitti_utils/__init__.py | 3 + mmdet3d/core/evaluation/kitti_utils/eval.py | 814 ++++++++++++ .../core/evaluation/kitti_utils/rotate_iou.py | 341 +++++ mmdet3d/core/evaluation/mean_ap.py | 385 ++++++ mmdet3d/core/evaluation/recall.py | 185 +++ mmdet3d/core/optimizer/__init__.py | 5 + mmdet3d/core/optimizer/builder.py | 135 ++ mmdet3d/core/optimizer/mix_optimizer.py | 99 ++ mmdet3d/core/optimizer/registry.py | 23 + mmdet3d/core/post_processing/__init__.py | 8 + mmdet3d/core/post_processing/bbox_nms.py | 68 + mmdet3d/core/post_processing/merge_augs.py | 101 ++ mmdet3d/core/utils/__init__.py | 11 + mmdet3d/core/utils/contextmanagers.py | 121 ++ mmdet3d/core/utils/dist_utils.py | 58 + mmdet3d/core/utils/kitti_utils.py 
| 69 + mmdet3d/core/utils/misc.py | 65 + mmdet3d/core/voxel/__init__.py | 4 + mmdet3d/core/voxel/builder.py | 14 + mmdet3d/core/voxel/voxel_generator.py | 207 +++ mmdet3d/datasets/__init__.py | 16 + mmdet3d/datasets/builder.py | 45 + mmdet3d/datasets/dataset_wrappers.py | 103 ++ mmdet3d/datasets/kitti2d_dataset.py | 143 +++ mmdet3d/datasets/kitti_dataset.py | 579 +++++++++ mmdet3d/datasets/loader/__init__.py | 4 + mmdet3d/datasets/loader/build_loader.py | 57 + mmdet3d/datasets/loader/sampler.py | 164 +++ mmdet3d/datasets/nuscenes2d_dataset.py | 38 + mmdet3d/datasets/nuscenes_dataset.py | 495 +++++++ mmdet3d/datasets/pipelines/__init__.py | 13 + .../datasets/pipelines/data_augment_utils.py | 326 +++++ mmdet3d/datasets/pipelines/dbsampler.py | 509 ++++++++ mmdet3d/datasets/pipelines/formating.py | 165 +++ mmdet3d/datasets/pipelines/loading.py | 143 +++ mmdet3d/datasets/pipelines/train_aug.py | 326 +++++ mmdet3d/datasets/registry.py | 3 + mmdet3d/datasets/utils.py | 37 + mmdet3d/models/__init__.py | 21 + mmdet3d/models/anchor_heads/__init__.py | 4 + mmdet3d/models/anchor_heads/boxvelo_head.py | 224 ++++ mmdet3d/models/anchor_heads/second_head.py | 405 ++++++ mmdet3d/models/anchor_heads/train_mixins.py | 245 ++++ mmdet3d/models/backbones/__init__.py | 4 + mmdet3d/models/backbones/second.py | 84 ++ mmdet3d/models/bbox_heads/__init__.py | 8 + mmdet3d/models/builder.py | 56 + mmdet3d/models/detectors/__init__.py | 14 + mmdet3d/models/detectors/base.py | 110 ++ mmdet3d/models/detectors/mvx_faster_rcnn.py | 103 ++ mmdet3d/models/detectors/mvx_single_stage.py | 330 +++++ mmdet3d/models/detectors/mvx_two_stage.py | 376 ++++++ mmdet3d/models/detectors/single_stage.py | 89 ++ mmdet3d/models/detectors/test_mixins.py | 266 ++++ mmdet3d/models/detectors/two_stage.py | 314 +++++ mmdet3d/models/detectors/voxelnet.py | 140 ++ mmdet3d/models/fusion_layers/__init__.py | 3 + mmdet3d/models/fusion_layers/point_fusion.py | 287 +++++ mmdet3d/models/losses/__init__.py | 3 + mmdet3d/models/middle_encoders/__init__.py | 4 + .../models/middle_encoders/pillar_scatter.py | 85 ++ .../models/middle_encoders/sparse_encoder.py | 215 ++++ mmdet3d/models/necks/__init__.py | 4 + mmdet3d/models/necks/second_fpn.py | 147 +++ mmdet3d/models/registry.py | 5 + mmdet3d/models/roi_extractors/__init__.py | 3 + mmdet3d/models/utils/__init__.py | 3 + mmdet3d/models/utils/weight_init.py | 46 + mmdet3d/models/voxel_encoders/__init__.py | 8 + .../models/voxel_encoders/pillar_encoder.py | 378 ++++++ mmdet3d/models/voxel_encoders/utils.py | 148 +++ .../models/voxel_encoders/voxel_encoder.py | 478 +++++++ mmdet3d/ops/__init__.py | 11 + mmdet3d/ops/iou3d/__init__.py | 4 + mmdet3d/ops/iou3d/iou3d_utils.py | 113 ++ mmdet3d/ops/iou3d/setup.py | 18 + mmdet3d/ops/iou3d/src/iou3d.cpp | 179 +++ mmdet3d/ops/iou3d/src/iou3d_kernel.cu | 381 ++++++ mmdet3d/ops/norm.py | 10 + mmdet3d/ops/spconv/__init__.py | 37 + mmdet3d/ops/spconv/conv.py | 446 +++++++ mmdet3d/ops/spconv/functional.py | 98 ++ mmdet3d/ops/spconv/include/paramsgrid.h | 62 + mmdet3d/ops/spconv/include/prettyprint.h | 445 +++++++ mmdet3d/ops/spconv/include/pybind11_utils.h | 61 + mmdet3d/ops/spconv/include/spconv/box_iou.h | 157 +++ .../spconv/include/spconv/fused_spconv_ops.h | 127 ++ mmdet3d/ops/spconv/include/spconv/geometry.h | 301 +++++ mmdet3d/ops/spconv/include/spconv/indice.cu.h | 243 ++++ mmdet3d/ops/spconv/include/spconv/indice.h | 79 ++ mmdet3d/ops/spconv/include/spconv/maxpool.h | 44 + mmdet3d/ops/spconv/include/spconv/mp_helper.h | 47 + mmdet3d/ops/spconv/include/spconv/nms.h | 
201 +++ .../ops/spconv/include/spconv/nms_functor.h | 42 + mmdet3d/ops/spconv/include/spconv/nms_gpu.h | 18 + mmdet3d/ops/spconv/include/spconv/nms_ops.h | 75 ++ .../ops/spconv/include/spconv/point2voxel.h | 414 ++++++ mmdet3d/ops/spconv/include/spconv/pool_ops.h | 97 ++ .../ops/spconv/include/spconv/reordering.cu.h | 161 +++ .../ops/spconv/include/spconv/reordering.h | 40 + .../ops/spconv/include/spconv/spconv_ops.h | 561 ++++++++ .../include/tensorview/helper_kernel.cu.h | 81 ++ .../spconv/include/tensorview/helper_launch.h | 21 + .../spconv/include/tensorview/tensorview.h | 1144 +++++++++++++++++ mmdet3d/ops/spconv/include/torch_utils.h | 70 + mmdet3d/ops/spconv/include/utility/timer.h | 54 + mmdet3d/ops/spconv/modules.py | 205 +++ mmdet3d/ops/spconv/ops.py | 183 +++ mmdet3d/ops/spconv/pool.py | 85 ++ mmdet3d/ops/spconv/src/all.cc | 51 + mmdet3d/ops/spconv/src/indice.cc | 89 ++ mmdet3d/ops/spconv/src/indice_cuda.cu | 158 +++ mmdet3d/ops/spconv/src/maxpool.cc | 82 ++ mmdet3d/ops/spconv/src/maxpool_cuda.cu | 471 +++++++ mmdet3d/ops/spconv/src/reordering.cc | 69 + mmdet3d/ops/spconv/src/reordering_cuda.cu | 155 +++ mmdet3d/ops/spconv/structure.py | 69 + mmdet3d/ops/spconv/test_utils.py | 193 +++ mmdet3d/ops/sync_bn.py | 110 ++ mmdet3d/ops/voxel/__init__.py | 4 + mmdet3d/ops/voxel/scatter_points.py | 129 ++ mmdet3d/ops/voxel/src/scatter_points_cpu.cpp | 131 ++ mmdet3d/ops/voxel/src/scatter_points_cuda.cu | 284 ++++ mmdet3d/ops/voxel/src/voxelization.cpp | 13 + mmdet3d/ops/voxel/src/voxelization.h | 113 ++ mmdet3d/ops/voxel/src/voxelization_cpu.cpp | 208 +++ mmdet3d/ops/voxel/src/voxelization_cuda.cu | 373 ++++++ mmdet3d/ops/voxel/voxelize.py | 122 ++ mmdet3d/utils/__init__.py | 8 + mmdet3d/utils/collect_env.py | 65 + requirements.txt | 4 + requirements/build.txt | 3 + requirements/optional.txt | 2 + requirements/runtime.txt | 9 + requirements/tests.txt | 12 + setup.py | 271 ++++ tests/test_config.py | 293 +++++ tools/create_data.py | 106 ++ tools/create_data.sh | 25 + tools/data_converter/__init__.py | 0 tools/data_converter/create_gt_database.py | 263 ++++ tools/data_converter/kitti_converter.py | 204 +++ tools/data_converter/kitti_data_utils.py | 355 +++++ tools/data_converter/nuscenes_converter.py | 503 ++++++++ tools/dist_train.sh | 9 + tools/publish_model.py | 35 + tools/slurm_test.sh | 22 + tools/slurm_train.sh | 23 + tools/test.py | 170 +++ tools/train.py | 149 +++ 214 files changed, 30129 insertions(+) create mode 100644 .gitignore create mode 100644 .gitlab-ci.yml create mode 100644 .isort.cfg create mode 100644 .pre-commit-config.yaml create mode 100644 .style.yapf create mode 100644 .travis.yml create mode 100644 README.md create mode 100644 configs/kitti/dv_mvx-v2_second_secfpn_fpn-fusion_adamw_2x8_80e_kitti-3d-3class.py create mode 100644 configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py create mode 100644 configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py create mode 100644 configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py create mode 100644 configs/kitti/faster_rcnn_r50_fpn_caffe_1x_kitti-2d-3class_coco-3x-pretrain.py create mode 100644 configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py create mode 100644 configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py create mode 100644 configs/nus/faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py create mode 100644 configs/nus/hv_pointpillars_secfpn_sbn-all_4x8_20e_nus-3d.py create mode 100644 configs/nus/hv_pointpillars_secfpn_sbn-all_freeze_adamw_1x16_20e_nus-mm.py create mode 100644 
configs/nus/retinanet_r50_fpn_caffe_2x8_1x_nus.py create mode 100644 docs/CHANGELOG.md create mode 100644 docs/CODE_OF_CONDUCT.md create mode 100644 docs/CONTRIBUTING.md create mode 100644 docs/GETTING_STARTED.md create mode 100644 docs/INSTALL.md create mode 100644 docs/MODEL_ZOO.md create mode 100644 docs/Makefile create mode 100644 docs/ROBUSTNESS_BENCHMARKING.md create mode 100644 docs/TECHNICAL_DETAILS.md create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 docs/requirements.txt create mode 100644 mmdet3d/__init__.py create mode 100644 mmdet3d/apis/__init__.py create mode 100644 mmdet3d/apis/train.py create mode 100644 mmdet3d/core/__init__.py create mode 100644 mmdet3d/core/anchor/__init__.py create mode 100644 mmdet3d/core/anchor/anchor_generator.py create mode 100644 mmdet3d/core/bbox/__init__.py create mode 100644 mmdet3d/core/bbox/assign_sampling.py create mode 100644 mmdet3d/core/bbox/assigners/__init__.py create mode 100644 mmdet3d/core/bbox/assigners/approx_max_iou_assigner.py create mode 100644 mmdet3d/core/bbox/assigners/assign_result.py create mode 100644 mmdet3d/core/bbox/assigners/base_assigner.py create mode 100644 mmdet3d/core/bbox/assigners/max_iou_assigner.py create mode 100644 mmdet3d/core/bbox/box_np_ops.py create mode 100644 mmdet3d/core/bbox/box_torch_ops.py create mode 100644 mmdet3d/core/bbox/coders/__init__.py create mode 100644 mmdet3d/core/bbox/coders/box_coder.py create mode 100644 mmdet3d/core/bbox/geometry.py create mode 100644 mmdet3d/core/bbox/samplers/__init__.py create mode 100644 mmdet3d/core/bbox/samplers/base_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/combined_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/instance_balanced_pos_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/iou_balanced_neg_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/ohem_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/pseudo_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/random_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/sampling_result.py create mode 100644 mmdet3d/core/bbox/transforms.py create mode 100644 mmdet3d/core/evaluation/__init__.py create mode 100644 mmdet3d/core/evaluation/bbox_overlaps.py create mode 100644 mmdet3d/core/evaluation/class_names.py create mode 100644 mmdet3d/core/evaluation/coco_utils.py create mode 100644 mmdet3d/core/evaluation/eval_hooks.py create mode 100644 mmdet3d/core/evaluation/kitti_utils/__init__.py create mode 100644 mmdet3d/core/evaluation/kitti_utils/eval.py create mode 100644 mmdet3d/core/evaluation/kitti_utils/rotate_iou.py create mode 100644 mmdet3d/core/evaluation/mean_ap.py create mode 100644 mmdet3d/core/evaluation/recall.py create mode 100644 mmdet3d/core/optimizer/__init__.py create mode 100644 mmdet3d/core/optimizer/builder.py create mode 100644 mmdet3d/core/optimizer/mix_optimizer.py create mode 100644 mmdet3d/core/optimizer/registry.py create mode 100644 mmdet3d/core/post_processing/__init__.py create mode 100644 mmdet3d/core/post_processing/bbox_nms.py create mode 100644 mmdet3d/core/post_processing/merge_augs.py create mode 100644 mmdet3d/core/utils/__init__.py create mode 100644 mmdet3d/core/utils/contextmanagers.py create mode 100644 mmdet3d/core/utils/dist_utils.py create mode 100644 mmdet3d/core/utils/kitti_utils.py create mode 100644 mmdet3d/core/utils/misc.py create mode 100644 mmdet3d/core/voxel/__init__.py create mode 100644 mmdet3d/core/voxel/builder.py create mode 100644 
mmdet3d/core/voxel/voxel_generator.py create mode 100644 mmdet3d/datasets/__init__.py create mode 100644 mmdet3d/datasets/builder.py create mode 100644 mmdet3d/datasets/dataset_wrappers.py create mode 100644 mmdet3d/datasets/kitti2d_dataset.py create mode 100644 mmdet3d/datasets/kitti_dataset.py create mode 100644 mmdet3d/datasets/loader/__init__.py create mode 100644 mmdet3d/datasets/loader/build_loader.py create mode 100644 mmdet3d/datasets/loader/sampler.py create mode 100644 mmdet3d/datasets/nuscenes2d_dataset.py create mode 100644 mmdet3d/datasets/nuscenes_dataset.py create mode 100644 mmdet3d/datasets/pipelines/__init__.py create mode 100644 mmdet3d/datasets/pipelines/data_augment_utils.py create mode 100644 mmdet3d/datasets/pipelines/dbsampler.py create mode 100644 mmdet3d/datasets/pipelines/formating.py create mode 100644 mmdet3d/datasets/pipelines/loading.py create mode 100644 mmdet3d/datasets/pipelines/train_aug.py create mode 100644 mmdet3d/datasets/registry.py create mode 100644 mmdet3d/datasets/utils.py create mode 100644 mmdet3d/models/__init__.py create mode 100644 mmdet3d/models/anchor_heads/__init__.py create mode 100644 mmdet3d/models/anchor_heads/boxvelo_head.py create mode 100644 mmdet3d/models/anchor_heads/second_head.py create mode 100644 mmdet3d/models/anchor_heads/train_mixins.py create mode 100644 mmdet3d/models/backbones/__init__.py create mode 100644 mmdet3d/models/backbones/second.py create mode 100644 mmdet3d/models/bbox_heads/__init__.py create mode 100644 mmdet3d/models/builder.py create mode 100644 mmdet3d/models/detectors/__init__.py create mode 100644 mmdet3d/models/detectors/base.py create mode 100644 mmdet3d/models/detectors/mvx_faster_rcnn.py create mode 100644 mmdet3d/models/detectors/mvx_single_stage.py create mode 100644 mmdet3d/models/detectors/mvx_two_stage.py create mode 100644 mmdet3d/models/detectors/single_stage.py create mode 100644 mmdet3d/models/detectors/test_mixins.py create mode 100644 mmdet3d/models/detectors/two_stage.py create mode 100644 mmdet3d/models/detectors/voxelnet.py create mode 100644 mmdet3d/models/fusion_layers/__init__.py create mode 100644 mmdet3d/models/fusion_layers/point_fusion.py create mode 100644 mmdet3d/models/losses/__init__.py create mode 100644 mmdet3d/models/middle_encoders/__init__.py create mode 100644 mmdet3d/models/middle_encoders/pillar_scatter.py create mode 100644 mmdet3d/models/middle_encoders/sparse_encoder.py create mode 100644 mmdet3d/models/necks/__init__.py create mode 100644 mmdet3d/models/necks/second_fpn.py create mode 100644 mmdet3d/models/registry.py create mode 100644 mmdet3d/models/roi_extractors/__init__.py create mode 100644 mmdet3d/models/utils/__init__.py create mode 100644 mmdet3d/models/utils/weight_init.py create mode 100644 mmdet3d/models/voxel_encoders/__init__.py create mode 100644 mmdet3d/models/voxel_encoders/pillar_encoder.py create mode 100644 mmdet3d/models/voxel_encoders/utils.py create mode 100644 mmdet3d/models/voxel_encoders/voxel_encoder.py create mode 100644 mmdet3d/ops/__init__.py create mode 100644 mmdet3d/ops/iou3d/__init__.py create mode 100644 mmdet3d/ops/iou3d/iou3d_utils.py create mode 100644 mmdet3d/ops/iou3d/setup.py create mode 100644 mmdet3d/ops/iou3d/src/iou3d.cpp create mode 100644 mmdet3d/ops/iou3d/src/iou3d_kernel.cu create mode 100644 mmdet3d/ops/norm.py create mode 100644 mmdet3d/ops/spconv/__init__.py create mode 100644 mmdet3d/ops/spconv/conv.py create mode 100644 mmdet3d/ops/spconv/functional.py create mode 100644 
mmdet3d/ops/spconv/include/paramsgrid.h create mode 100644 mmdet3d/ops/spconv/include/prettyprint.h create mode 100644 mmdet3d/ops/spconv/include/pybind11_utils.h create mode 100644 mmdet3d/ops/spconv/include/spconv/box_iou.h create mode 100644 mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h create mode 100644 mmdet3d/ops/spconv/include/spconv/geometry.h create mode 100644 mmdet3d/ops/spconv/include/spconv/indice.cu.h create mode 100644 mmdet3d/ops/spconv/include/spconv/indice.h create mode 100644 mmdet3d/ops/spconv/include/spconv/maxpool.h create mode 100644 mmdet3d/ops/spconv/include/spconv/mp_helper.h create mode 100644 mmdet3d/ops/spconv/include/spconv/nms.h create mode 100644 mmdet3d/ops/spconv/include/spconv/nms_functor.h create mode 100644 mmdet3d/ops/spconv/include/spconv/nms_gpu.h create mode 100644 mmdet3d/ops/spconv/include/spconv/nms_ops.h create mode 100644 mmdet3d/ops/spconv/include/spconv/point2voxel.h create mode 100644 mmdet3d/ops/spconv/include/spconv/pool_ops.h create mode 100644 mmdet3d/ops/spconv/include/spconv/reordering.cu.h create mode 100644 mmdet3d/ops/spconv/include/spconv/reordering.h create mode 100644 mmdet3d/ops/spconv/include/spconv/spconv_ops.h create mode 100644 mmdet3d/ops/spconv/include/tensorview/helper_kernel.cu.h create mode 100644 mmdet3d/ops/spconv/include/tensorview/helper_launch.h create mode 100644 mmdet3d/ops/spconv/include/tensorview/tensorview.h create mode 100644 mmdet3d/ops/spconv/include/torch_utils.h create mode 100644 mmdet3d/ops/spconv/include/utility/timer.h create mode 100644 mmdet3d/ops/spconv/modules.py create mode 100644 mmdet3d/ops/spconv/ops.py create mode 100644 mmdet3d/ops/spconv/pool.py create mode 100644 mmdet3d/ops/spconv/src/all.cc create mode 100644 mmdet3d/ops/spconv/src/indice.cc create mode 100644 mmdet3d/ops/spconv/src/indice_cuda.cu create mode 100644 mmdet3d/ops/spconv/src/maxpool.cc create mode 100644 mmdet3d/ops/spconv/src/maxpool_cuda.cu create mode 100644 mmdet3d/ops/spconv/src/reordering.cc create mode 100644 mmdet3d/ops/spconv/src/reordering_cuda.cu create mode 100644 mmdet3d/ops/spconv/structure.py create mode 100644 mmdet3d/ops/spconv/test_utils.py create mode 100644 mmdet3d/ops/sync_bn.py create mode 100644 mmdet3d/ops/voxel/__init__.py create mode 100644 mmdet3d/ops/voxel/scatter_points.py create mode 100644 mmdet3d/ops/voxel/src/scatter_points_cpu.cpp create mode 100644 mmdet3d/ops/voxel/src/scatter_points_cuda.cu create mode 100644 mmdet3d/ops/voxel/src/voxelization.cpp create mode 100644 mmdet3d/ops/voxel/src/voxelization.h create mode 100644 mmdet3d/ops/voxel/src/voxelization_cpu.cpp create mode 100644 mmdet3d/ops/voxel/src/voxelization_cuda.cu create mode 100644 mmdet3d/ops/voxel/voxelize.py create mode 100644 mmdet3d/utils/__init__.py create mode 100644 mmdet3d/utils/collect_env.py create mode 100644 requirements.txt create mode 100644 requirements/build.txt create mode 100644 requirements/optional.txt create mode 100644 requirements/runtime.txt create mode 100644 requirements/tests.txt create mode 100644 setup.py create mode 100644 tests/test_config.py create mode 100644 tools/create_data.py create mode 100644 tools/create_data.sh create mode 100644 tools/data_converter/__init__.py create mode 100644 tools/data_converter/create_gt_database.py create mode 100644 tools/data_converter/kitti_converter.py create mode 100644 tools/data_converter/kitti_data_utils.py create mode 100644 tools/data_converter/nuscenes_converter.py create mode 100644 tools/dist_train.sh create mode 100644 
tools/publish_model.py create mode 100755 tools/slurm_test.sh create mode 100755 tools/slurm_train.sh create mode 100644 tools/test.py create mode 100644 tools/train.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..4de053d75f --- /dev/null +++ b/.gitignore @@ -0,0 +1,127 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +*.ipynb + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# cython generated cpp +mmdet3d/ops/nms/src/soft_nms_cpu.cpp +mmdet3d/version.py +data +.vscode +.idea + +# custom +*.pkl +*.pkl.json +*.log.json +work_dirs/ +exps/ +*~ + +# Pytorch +*.pth + +# demo +*.jpg +*.png diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000..6595452b13 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,43 @@ +variables: + PYTORCH_IMAGE: registry.sensetime.com/eig-research/pytorch:pytorch1.3.1-cuda10.1-devel + +stages: + - linting + - test + +before_script: + - echo $PATH + - gcc --version + - nvcc --version + - python --version + - pip --version + - python -c "import torch; print(torch.__version__)" + +.linting_template: &linting_template_def + stage: linting + script: + - pip install flake8 yapf isort + - flake8 . + - isort -rc --check-only --diff mmdet3d/ tools/ tests/ + - yapf -r -d mmdet3d/ tools/ tests/ configs/ + +.test_template: &test_template_def + stage: test + script: + - echo "Start building..." + - conda install av -c conda-forge -y + - pip install git+https://github.com/open-mmlab/mmdetection.git@v2.0 + - python -c "import mmdet; print(mmdet.__version__)" + - pip install -v -e .[all] + - python -c "import mmdet3d; print(mmdet3d.__version__)" + - echo "Start testing..." 
+ - coverage run --branch --source mmdet3d -m pytest tests/ + - coverage report -m + +linting:pytorch1.3-cuda10: + image: $PYTORCH_IMAGE + <<: *linting_template_def + +test:pytorch1.3-cuda10: + image: $PYTORCH_IMAGE + <<: *test_template_def diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000000..09a0e57266 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,8 @@ +[isort] +line_length = 79 +multi_line_output = 0 +known_standard_library = setuptools +known_first_party = mmdet,mmdet3d +known_third_party = Cython,cv2,mmcv,numba,numpy,nuscenes,pycocotools,pyquaternion,scipy,shapely,six,skimage,terminaltables,torch,torchvision +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..8362bc545d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +repos: + - repo: https://gitlab.com/pycqa/flake8.git + rev: 3.7.9 + hooks: + - id: flake8 + - repo: https://github.com/asottile/seed-isort-config + rev: v2.1.0 + hooks: + - id: seed-isort-config + - repo: https://github.com/timothycrosley/isort + rev: 4.3.21 + hooks: + - id: isort + - repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.29.0 + hooks: + - id: yapf + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.5.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: fix-encoding-pragma + args: ["--remove"] diff --git a/.style.yapf b/.style.yapf new file mode 100644 index 0000000000..286a3f1d7a --- /dev/null +++ b/.style.yapf @@ -0,0 +1,4 @@ +[style] +BASED_ON_STYLE = pep8 +BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true +SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000..68f49ccc07 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,43 @@ +dist: bionic # ubuntu 18.04 +language: python + +python: + - "3.5" + - "3.6" + - "3.7" + +env: CUDA=10.1.105-1 CUDA_SHORT=10.1 UBUNTU_VERSION=ubuntu1804 FORCE_CUDA=1 +cache: pip + +# Ref to CUDA installation in Travis: https://github.com/jeremad/cuda-travis +before_install: + - INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb + - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER} + - sudo dpkg -i ${INSTALLER} + - wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub + - sudo apt-key add 7fa2af80.pub + - sudo apt update -qq + - sudo apt install -y cuda-${CUDA_SHORT/./-} cuda-cufft-dev-${CUDA_SHORT/./-} + - sudo apt clean + - CUDA_HOME=/usr/local/cuda-${CUDA_SHORT} + - LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${CUDA_HOME}/include:${LD_LIBRARY_PATH} + - PATH=${CUDA_HOME}/bin:${PATH} + +install: + - pip install Pillow==6.2.2 # remove this line when torchvision>=0.5 + - pip install torch==1.2 torchvision==0.4.0 # TODO: fix CI for pytorch>1.2 + - pip install "git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI" + - pip install -r requirements.txt + +before_script: + - flake8 . 
+  - isort -rc --check-only --diff mmdet3d/ tools/ tests/
+  - yapf -r -d --style .style.yapf mmdet3d/ tools/ tests/ configs/
+
+script:
+  - python setup.py check -m -s
+  - python setup.py build_ext --inplace
+  - coverage run --source mmdet3d -m py.test -v --xdoctest-modules tests mmdet3d
+
+after_success:
+  - coverage report
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000..40e35a9e7a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,58 @@
+
+# MMDetection3D
+
+
+## Introduction
+
+The master branch works with **PyTorch 1.1** or higher.
+
+MMDetection3D is an open source 3D object detection toolbox based on PyTorch. It is
+part of the open-mmlab project developed by [Multimedia Laboratory, CUHK](http://mmlab.ie.cuhk.edu.hk/).
+
+
+### Major features
+
+
+
+## License
+
+This project is released under the [Apache 2.0 license](LICENSE).
+
+## Updates
+
+
+v0.0.1 (07/08/2019)
+- The project is initiated.
+
+## Benchmark and model zoo
+
+Supported methods and backbones are shown in the table below.
+Results and models are available in the [Model zoo](MODEL_ZOO.md).
+
+
+## Installation
+
+Please refer to [INSTALL.md](INSTALL.md) for installation and dataset preparation.
+
+
+## Get Started
+
+Please see [GETTING_STARTED.md](GETTING_STARTED.md) for the basic usage of MMDetection3D.
+
+## Contributing
+
+We appreciate all contributions to improve MMDetection3D. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guidelines.
+
+## Acknowledgement
+
+MMDetection3D is an open source project contributed to by researchers and engineers from various colleges and companies. We appreciate all the contributors who implement their methods or add new features, as well as users who give valuable feedback.
+We hope that the toolbox and benchmark can serve the growing research community by providing a flexible toolkit to reimplement existing methods and to develop new detectors.
+
+
+## Citation
+
+
+
+## Contact
+
+This repo is currently maintained by Wenwei Zhang ([@ZwwWayne](http://github.com/ZwwWayne)).
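The configuration files added below are plain Python modules that are parsed with mmcv's `Config` utility. As a rough illustration of how such a config is consumed, here is a minimal sketch (assuming `mmcv` is installed and the command is run from the repository root; the printed fields and the override are illustrative only, not part of this commit):

```python
# Minimal sketch: load one of the configs added in this commit with mmcv.
# Assumes mmcv is installed and the working directory is the repository root;
# the override at the end is only an example of adjusting a field in code.
from mmcv import Config

cfg = Config.fromfile(
    'configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py')

# Config exposes the nested dicts of the file as attributes.
print(cfg.model.type)            # 'VoxelNet' for this config
print(cfg.data.samples_per_gpu)  # per-GPU batch size used by the data loaders
print(cfg.total_epochs)          # length of the training schedule

# Fields can be overridden before the model and datasets are built,
# e.g. to run a quick small-batch debug pass.
cfg.data.samples_per_gpu = 1
```

The `tools/train.py` and `tools/test.py` scripts added in this commit load the config file passed on the command line in the same way before building the model and datasets from its fields.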
diff --git a/configs/kitti/dv_mvx-v2_second_secfpn_fpn-fusion_adamw_2x8_80e_kitti-3d-3class.py b/configs/kitti/dv_mvx-v2_second_secfpn_fpn-fusion_adamw_2x8_80e_kitti-3d-3class.py new file mode 100644 index 0000000000..79e2d6f837 --- /dev/null +++ b/configs/kitti/dv_mvx-v2_second_secfpn_fpn-fusion_adamw_2x8_80e_kitti-3d-3class.py @@ -0,0 +1,283 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z + +model = dict( + type='DynamicMVXFasterRCNNV2', + pretrained=('./pretrain_detectron/' + 'ImageNetPretrained/MSRA/resnet50_msra.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + pts_voxel_layer=dict( + max_num_points=-1, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(-1, -1), # (training, testing) max_coxels + ), + pts_voxel_encoder=dict( + type='DynamicVFE', + num_input_features=4, + num_filters=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=point_cloud_range, + fusion_layer=dict( + type='PointFusion', + img_channels=256, + pts_channels=64, + mid_channels=128, + out_channels=128, + img_levels=[0, 1, 2, 3, 4], + align_corners=False, + activate_out=True, + fuse_out=False), + ), + pts_middle_encoder=dict( + type='SparseEncoder', + in_channels=128, + output_shape=[41, 1600, 1408], # checked from PointCloud3D + pre_act=False, + ), + pts_backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + num_filters=[128, 256], + ), + pts_neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + num_upsample_filters=[256, 256], + ), + pts_bbox_head=dict( + type='SECONDHead', + class_name=['Pedestrian', 'Cyclist', 'Car'], + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[ + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78], + ], + assigner_per_size=True, + anchor_strides=[2], + anchor_sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + assign_per_class=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + )) +# model training and testing settings +train_cfg = dict( + pts=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False)) +test_cfg = dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + 
post_center_limit_range=[0, -40, -3, 70.4, 40, 0.0], + ), ) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=True, +) +db_sampler = dict( + type='MMDataBaseSampler', + root_path=data_root, + info_path=data_root + 'kitti_mm_dbinfos_train.pkl', + rate=1.0, + object_rot_range=[0.0, 0.0], + blending_type=['box', 'gaussian', 'poisson'], + depth_consistent=True, + check_2D_collision=True, + collision_thr=[0, 0.3, 0.5, 0.7], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + Car=5, + Pedestrian=10, + Cyclist=10, + ), + ), + sample_groups=dict( + Car=12, + Pedestrian=6, + Cyclist=6, + ), +) +train_pipeline = [ + dict( + type='Resize', + img_scale=[(640, 192), (2560, 768)], + multiscale_mode='range', + keep_ratio=True), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05], + trans_normal_noise=[0.2, 0.2, 0.2]), + dict(type='RandomFlip3D', flip_ratio=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'points', 'img', 'gt_bboxes_3d', 'gt_bboxes', 'gt_labels', + 'gt_labels_3d' + ]), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='Resize', + img_scale=[ + (1280, 384), + ], + multiscale_mode='value', + keep_ratio=True), + dict( + type='GlobalRotScale', + rot_uniform_noise=[0, 0], + scaling_uniform_noise=[1, 1]), + dict(type='RandomFlip3D', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'img']), +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True, + test_mode=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True, + test_mode=True)) +# Training settings +optimizer = dict(type='AdamW', lr=0.003, betas=(0.95, 0.99), weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='cosine', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 10, + target_lr=1e-5, + as_ratio=True, +) +momentum_config = None +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable 
+evaluation = dict(interval=1) +# runtime settings +total_epochs = 80 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/sec_secfpn_80e' +load_from = './pretrain_mmdet/mvx_faster_rcnn_r50_fpn_detectron2-caffe_freezeBN_l1-loss_roialign-v2_1x_coco-3-class_44.7_20200205-b1c1533f.pth' # noqa +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py b/configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py new file mode 100644 index 0000000000..43e7b0bbd8 --- /dev/null +++ b/configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py @@ -0,0 +1,203 @@ +# model settings +voxel_size = [0.16, 0.16, 4] +point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1] + +model = dict( + type='DynamicVoxelNet', + voxel_layer=dict( + max_num_points=-1, # set -1 for dynamic voxel + point_cloud_range=point_cloud_range, # velodyne coordinates, x, y, z + voxel_size=voxel_size, + max_voxels=(-1, -1), # set -1 for dynamic voxel + ), + voxel_encoder=dict( + type='DynamicPillarFeatureNet', + num_input_features=4, + num_filters=[64], + with_distance=False, + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + ), + middle_encoder=dict( + type='PointPillarsScatter', + in_channels=64, + output_shape=[496, 432], + ), + backbone=dict( + type='SECOND', + in_channels=64, + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + num_filters=[64, 128, 256], + ), + neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[128, 128, 128], + ), + bbox_head=dict( + type='SECONDHead', + class_name=['Car'], + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + allowed_border=0, + pos_weight=-1, + debug=False) +test_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + post_center_limit_range=point_cloud_range, + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) +) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5), + ), + sample_groups=dict(Car=15), +) + +train_pipeline = [ + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + loc_noise_std=[0.25, 0.25, 0.25], + global_rot_range=[0.0, 0.0], + 
rot_uniform_noise=[-0.15707963267, 0.15707963267]), + dict(type='PointsRandomFlip', flip_ratio=0.5), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'gt_bboxes']), +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True)) +# optimizer +lr = 0.001 # max learning rate +optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='cyclic', + target_ratio=[10, 1e-4], + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=[0.85 / 0.95, 1], + cyclic_times=1, + step_ratio_up=0.4, +) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 160 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/pp_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py b/configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py new file mode 100644 index 0000000000..2ae9164c7b --- /dev/null +++ b/configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py @@ -0,0 +1,231 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z + +model = dict( + type='DynamicVoxelNet', + voxel_layer=dict( + max_num_points=-1, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(-1, -1), # (training, testing) max_coxels + ), + voxel_encoder=dict( + type='DynamicVFEV3', + num_input_features=4, + voxel_size=voxel_size, + point_cloud_range=point_cloud_range), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + output_shape=[41, 1600, 1408], + pre_act=False, + ), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + num_filters=[128, 256], + ), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + num_upsample_filters=[256, 256], + ), + bbox_head=dict( + type='SECONDHead', + 
class_name=['Pedestrian', 'Cyclist', 'Car'], + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[ + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78], + ], + anchor_strides=[2], + anchor_sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + assigner_per_size=True, + assign_per_class=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False) +test_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + post_center_limit_range=[0, -40, -3, 70.4, 40, 0.0], + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) +) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + Car=5, + Pedestrian=10, + Cyclist=10, + ), + ), + sample_groups=dict( + Car=12, + Pedestrian=6, + Cyclist=6, + ), +) +train_pipeline = [ + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + loc_noise_std=[0, 0, 0], + global_rot_range=[0.0, 0.0], + rot_uniform_noise=[-0.39269908, 0.39269908]), + dict(type='RandomFlip3D', flip_ratio=0.5), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05], + trans_normal_noise=[0.2, 0.2, 0.2]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d']), +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + 
pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True)) +# optimizer +lr = 0.003 # max learning rate +optimizer = dict( + type='AdamW', + lr=lr, + betas=(0.95, 0.99), # the momentum is change during training + weight_decay=0.001) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +lr_config = dict( + policy='cosine', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 10, + target_lr=1e-5, + as_ratio=True, +) +momentum_config = None +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 80 +dist_params = dict(backend='nccl', port=29502) +log_level = 'INFO' +work_dir = './work_dirs/sec_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py b/configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py new file mode 100644 index 0000000000..d0d8fed3a9 --- /dev/null +++ b/configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py @@ -0,0 +1,199 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z + +model = dict( + type='DynamicVoxelNet', + voxel_layer=dict( + max_num_points=-1, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(-1, -1), # (training, testing) max_coxels + ), + voxel_encoder=dict( + type='DynamicVFEV3', + num_input_features=4, + voxel_size=voxel_size, + point_cloud_range=point_cloud_range), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + output_shape=[41, 1600, 1408], # checked from PointCloud3D + pre_act=False, + ), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + num_filters=[128, 256], + ), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + num_upsample_filters=[256, 256], + ), + bbox_head=dict( + type='SECONDHead', + class_name=['Car'], + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[0, -40.0, -1.78, 70.4, 40.0, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + allowed_border=0, + pos_weight=-1, + debug=False) +test_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + post_center_limit_range=[0, 
-40, -3, 70.4, 40, 0.0], + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) +) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5), + ), + sample_groups=dict(Car=15), +) +train_pipeline = [ + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + loc_noise_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_uniform_noise=[-0.78539816, 0.78539816]), + dict(type='PointsRandomFlip', flip_ratio=0.5), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'gt_bboxes']), +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True)) +# optimizer +lr = 0.0018 # max learning rate +optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +lr_config = dict( + policy='cyclic', + target_ratio=[10, 1e-4], + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=[0.85 / 0.95, 1], + cyclic_times=1, + step_ratio_up=0.4, +) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 80 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/sec_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/faster_rcnn_r50_fpn_caffe_1x_kitti-2d-3class_coco-3x-pretrain.py b/configs/kitti/faster_rcnn_r50_fpn_caffe_1x_kitti-2d-3class_coco-3x-pretrain.py new file mode 100644 index 0000000000..fa09f66a11 --- /dev/null +++ b/configs/kitti/faster_rcnn_r50_fpn_caffe_1x_kitti-2d-3class_coco-3x-pretrain.py @@ -0,0 
+1,194 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='FasterRCNN', + pretrained=('./pretrain_detectron/' + 'ImageNetPretrained/MSRA/resnet50_msra.pth'), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[1 / 3, 0.5, 1.0, 2.0, 3.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.0, loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + # following the setting of detectron, + # which improves ~0.2 bbox mAP. + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) +) +# dataset settings +dataset_type = 'Kitti2DDataset' +data_root = 'data/kitti/' +class_names = ['Car', 'Pedestrian', 'Cyclist'] +# Values to be used for image normalization (BGR order) +# Default mean pixel value from ImageNet: [103.53, 116.28, 123.675] +# When using pre-trained models in Detectron1 or any MSRA models, +# std has been absorbed into its conv1 weights, so the std needs to be set 1. 
+img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=False), + dict( + type='Resize', + img_scale=[(640, 192), (2560, 768)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1280, 384), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + class_names=class_names, + ann_file='kitti_infos_train.pkl', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_root=data_root, + class_names=class_names, + ann_file='kitti_infos_val.pkl', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + data_root=data_root, + class_names=class_names, + ann_file='kitti_infos_val.pkl', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +evaluation = dict(interval=1) +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl', port=29501) +log_level = 'INFO' +work_dir = './work_dirs/faster_rcnn_r50_fpn_1x' +load_from = './pretrain_mmdet/faster_r50_fpn_detectron2-caffe_freezeBN_l1-loss_roialign-v2_3x-4767dd8e.pth' # noqa +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py b/configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py new file mode 100644 index 0000000000..946620b0a3 --- /dev/null +++ b/configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py @@ -0,0 +1,204 @@ +# model settings +point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1] +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=64, # max_points_per_voxel + point_cloud_range=point_cloud_range, # velodyne coordinates, x, y, z + voxel_size=[0.16, 0.16, 4], + max_voxels=(12000, 20000), # (training, testing) max_coxels + ), + voxel_encoder=dict( + type='PillarFeatureNet', + num_input_features=4, + num_filters=[64], + with_distance=False, + # these two arguments should be consistent with the voxel_generator + voxel_size=[0.16, 0.16, 4], + point_cloud_range=point_cloud_range, + ), + middle_encoder=dict( + type='PointPillarsScatter', + in_channels=64, + output_shape=[496, 432], + ), + backbone=dict( + type='SECOND', + in_channels=64, + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + num_filters=[64, 128, 256], + ), + neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[128, 128, 128], + ), + bbox_head=dict( + type='SECONDHead', 
+ class_name=['Car'], + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + allowed_border=0, + pos_weight=-1, + debug=False) +test_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + post_center_limit_range=point_cloud_range, +) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5), + ), + sample_groups=dict(Car=15), +) + +train_pipeline = [ + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + loc_noise_std=[0.25, 0.25, 0.25], + global_rot_range=[0.0, 0.0], + rot_uniform_noise=[-0.15707963267, 0.15707963267]), + dict(type='PointsRandomFlip', flip_ratio=0.5), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'gt_bboxes']), +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True)) +# optimizer +lr = 0.001 # max learning rate +optimizer = dict( + type='AdamW', + lr=lr, + betas=(0.95, 0.99), # the momentum is change during training + weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 
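+# Note on the cyclic schedules below (assuming mmcv's cyclic LR/momentum hooks): with
+# target_ratio=[10, 1e-4] the learning rate is expected to climb towards 10x the base lr
+# during the first 40% of iterations (step_ratio_up=0.4) and then anneal towards 1e-4x of
+# it, while the optimizer momentum is cycled in the opposite direction.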
+# learning policy +lr_config = dict( + policy='cyclic', + target_ratio=[10, 1e-4], + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=[0.85 / 0.95, 1], + cyclic_times=1, + step_ratio_up=0.4, +) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 160 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/pp_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py b/configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py new file mode 100644 index 0000000000..c616a86a98 --- /dev/null +++ b/configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py @@ -0,0 +1,197 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=5, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(16000, 40000), # (training, testing) max_coxels + ), + voxel_encoder=dict( + type='VoxelFeatureExtractorV3', + num_input_features=4, + num_filters=[4], + with_distance=False), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + output_shape=[41, 1600, 1408], # checked from PointCloud3D + pre_act=False, + ), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + num_filters=[128, 256], + ), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + num_upsample_filters=[256, 256], + ), + bbox_head=dict( + type='SECONDHead', + class_name=['Car'], + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[0, -40.0, -1.78, 70.4, 40.0, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + allowed_border=0, + pos_weight=-1, + debug=False) +test_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + post_center_limit_range=[0, -40, -3, 70.4, 40, 0.0], +) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5), + ), + sample_groups=dict(Car=15), +) +train_pipeline = [ + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + 
num_try=100, + loc_noise_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_uniform_noise=[-0.78539816, 0.78539816]), + dict(type='PointsRandomFlip', flip_ratio=0.5), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'gt_bboxes']), +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True)) +# optimizer +lr = 0.001 # max learning rate +optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +lr_config = dict( + policy='cyclic', + target_ratio=[10, 1e-4], + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=[0.85 / 0.95, 1], + cyclic_times=1, + step_ratio_up=0.4, +) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 80 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/sec_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/nus/faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py b/configs/nus/faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py new file mode 100644 index 0000000000..42757071a4 --- /dev/null +++ b/configs/nus/faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py @@ -0,0 +1,187 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='FasterRCNN', + pretrained=('./pretrain_detectron/' + 'ImageNetPretrained/MSRA/resnet50_msra.pth'), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.0, loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + 
type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + # following the setting of detectron, + # which improves ~0.2 bbox mAP. + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) +) +# dataset settings +dataset_type = 'NuScenes2DDataset' +data_root = 'data/nuscenes/' +# Values to be used for image normalization (BGR order) +# Default mean pixel values are from ImageNet: [103.53, 116.28, 123.675] +# When using pre-trained models in Detectron1 or any MSRA models, +# std has been absorbed into its conv1 weights, so the std needs to be set 1. 
+img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=False), + dict( + type='Resize', + img_scale=[(1200, 720), (1920, 1080)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1600, 900), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_train.coco.json', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.coco.json', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.coco.json', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +evaluation = dict(interval=1) +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl', port=29501) +log_level = 'INFO' +work_dir = './work_dirs/faster_rcnn_r50_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/nus/hv_pointpillars_secfpn_sbn-all_4x8_20e_nus-3d.py b/configs/nus/hv_pointpillars_secfpn_sbn-all_4x8_20e_nus-3d.py new file mode 100644 index 0000000000..34ccedb7eb --- /dev/null +++ b/configs/nus/hv_pointpillars_secfpn_sbn-all_4x8_20e_nus-3d.py @@ -0,0 +1,236 @@ +# model settings +voxel_size = [0.25, 0.25, 8] +point_cloud_range = [-50, -50, -5, 50, 50, 3] +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +model = dict( + type='MVXFasterRCNNV2', + pts_voxel_layer=dict( + max_num_points=64, # max_points_per_voxel + point_cloud_range=point_cloud_range, # velodyne coordinates, x, y, z + voxel_size=voxel_size, + max_voxels=(30000, 40000), # (training, testing) max_coxels + ), + pts_voxel_encoder=dict( + type='HardVFE', + num_input_features=4, + num_filters=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=point_cloud_range, + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', + in_channels=64, + output_shape=[400, 400], # checked from PointCloud3D + ), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + num_filters=[64, 128, 256], + ), + pts_neck=dict( + type='SECONDFPN', + 
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[128, 128, 128], + ), + pts_bbox_head=dict( + type='Anchor3DVeloHead', + class_names=class_names, + num_classes=10, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[ + [-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795], # car + [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365], # truck + [-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504], # trailer + [-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111], # bicycle + [-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072], # pedestrian + [-49.6, -49.6, -1.80984986, 49.6, 49.6, + -1.80984986], # traffic_cone + [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965], # barrier + ], + anchor_strides=[2], + anchor_sizes=[ + [1.95017717, 4.60718145, 1.72270761], # car + [2.4560939, 6.73778078, 2.73004906], # truck + [2.87427237, 12.01320693, 3.81509561], # trailer + [0.60058911, 1.68452161, 1.27192197], # bicycle + [0.66344886, 0.7256437, 1.75748069], # pedestrian + [0.39694519, 0.40359262, 1.06232151], # traffic_cone + [2.49008838, 0.48578221, 0.98297065], # barrier + ], + anchor_custom_values=[0, 0], + anchor_rotations=[0, 1.57], + assigner_per_size=False, + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + pts=dict( + assigner=dict( # for Car + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + pos_weight=-1, + debug=False)) +test_cfg = dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.2, + score_thr=0.05, + min_bbox_size=0, + max_per_img=500, + post_center_limit_range=point_cloud_range, + # TODO: check whether need to change this + # post_center_limit_range=[-59.6, -59.6, -6, 59.6, 59.6, 4], + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) + )) + +# dataset settings +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=True, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict(), + sample_groups=dict( + bus=4, + trailer=4, + truck=4, + ), +) + +train_pipeline = [ + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.3925, 0.3925], + scaling_uniform_noise=[0.95, 1.05], + trans_normal_noise=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + 
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='Resize', + img_scale=[ + (1280, 720), + ], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']), +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=False)) +# optimizer +optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[16, 19]) +momentum_config = None +checkpoint_config = dict(interval=1) +# yapf:disable +evaluation = dict(interval=20) +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 20 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/pp_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/nus/hv_pointpillars_secfpn_sbn-all_freeze_adamw_1x16_20e_nus-mm.py b/configs/nus/hv_pointpillars_secfpn_sbn-all_freeze_adamw_1x16_20e_nus-mm.py new file mode 100644 index 0000000000..5d26d2560f --- /dev/null +++ b/configs/nus/hv_pointpillars_secfpn_sbn-all_freeze_adamw_1x16_20e_nus-mm.py @@ -0,0 +1,267 @@ +# model settings +voxel_size = [0.25, 0.25, 8] +point_cloud_range = [-50, -50, -5, 50, 50, 3] +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +model = dict( + type='MVXFasterRCNNV2', + pretrained=('./pretrain_detectron/' + 'ImageNetPretrained/MSRA/resnet50_msra.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=4, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + pts_voxel_layer=dict( + max_num_points=64, # max_points_per_voxel + point_cloud_range=point_cloud_range, # velodyne coordinates, x, y, z + voxel_size=voxel_size, + max_voxels=(30000, 40000), # (training, testing) max_coxels + ), + pts_voxel_encoder=dict( + type='HardVFE', + num_input_features=4, + num_filters=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=point_cloud_range, + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01), + fusion_layer=dict( + type='MultiViewPointFusion', + img_channels=2048, + pts_channels=64, + mid_channels=128, + out_channels=128, + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, 
momentum=0.01), + img_levels=[3], + align_corners=False, + activate_out=True, + fuse_out=False), + ), + pts_middle_encoder=dict( + type='PointPillarsScatter', + in_channels=128, + output_shape=[400, 400], # checked from PointCloud3D + ), + pts_backbone=dict( + type='SECOND', + in_channels=128, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + num_filters=[64, 128, 256], + ), + pts_neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[128, 128, 128], + ), + pts_bbox_head=dict( + type='Anchor3DVeloHead', + class_names=class_names, + num_classes=10, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_generator=dict(type='AlignedAnchorGeneratorRange', ), + anchor_range=[ + [-50, -50, -1.80032795, 50, 50, -1.80032795], # car + [-50, -50, -1.74440365, 50, 50, -1.74440365], # truck + [-50, -50, -1.68526504, 50, 50, -1.68526504], # trailer + [-50, -50, -1.67339111, 50, 50, -1.67339111], # bicycle + [-50, -50, -1.61785072, 50, 50, -1.61785072], # pedestrian + [-50, -50, -1.80984986, 50, 50, -1.80984986], # traffic_cone + [-50, -50, -1.763965, 50, 50, -1.763965], # barrier + ], + anchor_strides=[2], + anchor_sizes=[ + [1.95017717, 4.60718145, 1.72270761], # car + [2.4560939, 6.73778078, 2.73004906], # truck + [2.87427237, 12.01320693, 3.81509561], # trailer + [0.60058911, 1.68452161, 1.27192197], # bicycle + [0.66344886, 0.7256437, 1.75748069], # pedestrian + [0.39694519, 0.40359262, 1.06232151], # traffic_cone + [2.49008838, 0.48578221, 0.98297065], # barrier + ], + anchor_custom_values=[0, 0], + anchor_rotations=[0, 1.57], + assigner_per_size=False, + assign_per_class=False, + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + pts=dict( + assigner=dict( # for Car + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + pos_weight=-1, + debug=False)) +test_cfg = dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.2, + score_thr=0.05, + min_bbox_size=0, + max_per_img=500, + post_center_limit_range=point_cloud_range, + # TODO: check whether need to change this + # post_center_limit_range=[-59.6, -59.6, -6, 59.6, 59.6, 4], + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) + )) + +# dataset settings +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +input_modality = dict( + use_lidar=True, + use_radar=False, + use_map=False, + use_external=False, + use_camera=True, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict(), + sample_groups=dict( + bus=4, + trailer=4, + truck=4, + ), +) + 
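+# Note: db_sampler above describes a ground-truth database augmentation (as consumed by
+# the ObjectSample transform in the KITTI configs); the train_pipeline below does not
+# currently include such a step, so this block presumably has no effect unless one is added.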
+train_pipeline = [ + dict( + type='Resize', + img_scale=(1280, 720), + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.3925, 0.3925], + scaling_uniform_noise=[0.95, 1.05], + trans_normal_noise=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='Resize', + img_scale=[ + (1280, 720), + ], + multiscale_mode='value', + keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'img']), +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=2, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_test.pkl', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=False)) +# optimizer +optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[16, 19]) +momentum_config = None +checkpoint_config = dict(interval=1) +# yapf:disable +evaluation = dict(interval=20) +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 20 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/pp_secfpn_80e' +load_from = './pretrain_mmdet/mvx_faster_rcnn_r50_fpn_detectron2-caffe_freezeBN_l1-loss_roialign-v2_nus_1x_coco-3x-pre_ap-28.8-4e72d8c7.pth' # noqa +resume_from = None +workflow = [('train', 1)] diff --git a/configs/nus/retinanet_r50_fpn_caffe_2x8_1x_nus.py b/configs/nus/retinanet_r50_fpn_caffe_2x8_1x_nus.py new file mode 100644 index 0000000000..f93e120313 --- /dev/null +++ b/configs/nus/retinanet_r50_fpn_caffe_2x8_1x_nus.py @@ -0,0 +1,138 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='RetinaNet', + pretrained=('./pretrain_detectron/' + 'ImageNetPretrained/MSRA/resnet50_msra.pth'), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5), + bbox_head=dict( + type='RetinaHead', + num_classes=10, + in_channels=256, + stacked_convs=4, + feat_channels=256, + 
octave_base_scale=4, + scales_per_octave=3, + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[8, 16, 32, 64, 128], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.0, loss_weight=1.0))) +# training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'NuScenes2DDataset' +data_root = 'data/nuscenes/' +# Values to be used for image normalization (BGR order) +# Default mean pixel value are from ImageNet: [103.53, 116.28, 123.675] +# When using pre-trained models in Detectron1 or any MSRA models, +# std has been absorbed into its conv1 weights, so the std needs to be set 1. +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=(1600, 900), + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1600, 900), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_train.coco.json', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.coco.json', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.coco.json', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/retinanet_r50_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md new file mode 100644 index 0000000000..632b97d77f --- /dev/null +++ b/docs/CHANGELOG.md @@ -0,0 +1,209 @@ +## Changelog + +### v1.1.0 (24/2/2020) + +**Highlights** +- Dataset evaluation is rewritten with a unified api, which is used by both evaluation hooks and test scripts. +- Support new methods: [CARAFE](https://arxiv.org/abs/1905.02188). 
+ +**Breaking Changes** +- The new MMDDP inherits from the official DDP, thus the `__init__` api is changed to be the same as official DDP. +- The `mask_head` field in HTC config files is modified. +- The evaluation and testing script is updated. +- In all transforms, instance masks are stored as a numpy array shaped (n, h, w) instead of a list of (h, w) arrays, where n is the number of instances. + +**Bug Fixes** +- Fix IOU assigners when ignore_iof_thr > 0 and there is no pred boxes. (#2135) +- Fix mAP evaluation when there are no ignored boxes. (#2116) +- Fix the empty RoI input for Deformable RoI Pooling. (#2099) +- Fix the dataset settings for multiple workflows. (#2103) +- Fix the warning related to `torch.uint8` in PyTorch 1.4. (#2105) +- Fix the inference demo on devices other than gpu:0. (#2098) +- Fix Dockerfile. (#2097) +- Fix the bug that `pad_val` is unused in Pad transform. (#2093) +- Fix the albumentation transform when there is no ground truth bbox. (#2032) + +**Improvements** +- Use torch instead of numpy for random sampling. (#2094) +- Migrate to the new MMDDP implementation in MMCV v0.3. (#2090) +- Add meta information in logs. (#2086) +- Rewrite Soft NMS with pytorch extension and remove cython as a dependency. (#2056) +- Rewrite dataset evaluation. (#2042, #2087, #2114, #2128) +- Use numpy array for masks in transforms. (#2030) + +**New Features** +- Implement "CARAFE: Content-Aware ReAssembly of FEatures". (#1583) +- Add `worker_init_fn()` in data_loader when seed is set. (#2066, #2111) +- Add logging utils. (#2035) + +### v1.0.0 (30/1/2020) + +This release mainly improves the code quality and add more docstrings. + +**Highlights** +- Documentation is online now: https://mmdetection.readthedocs.io. +- Support new models: [ATSS](https://arxiv.org/abs/1912.02424). +- DCN is now available with the api `build_conv_layer` and `ConvModule` like the normal conv layer. +- A tool to collect environment information is available for trouble shooting. + +**Bug Fixes** +- Fix the incompatibility of the latest numpy and pycocotools. (#2024) +- Fix the case when distributed package is unavailable, e.g., on Windows. (#1985) +- Fix the dimension issue for `refine_bboxes()`. (#1962) +- Fix the typo when `seg_prefix` is a list. (#1906) +- Add segmentation map cropping to RandomCrop. (#1880) +- Fix the return value of `ga_shape_target_single()`. (#1853) +- Fix the loaded shape of empty proposals. (#1819) +- Fix the mask data type when using albumentation. (#1818) + +**Improvements** +- Enhance AssignResult and SamplingResult. (#1995) +- Add ability to overwrite existing module in Registry. (#1982) +- Reorganize requirements and make albumentations and imagecorruptions optional. (#1969) +- Check NaN in `SSDHead`. (#1935) +- Encapsulate the DCN in ResNe(X)t into a ConvModule & Conv_layers. (#1894) +- Refactoring for mAP evaluation and support multiprocessing and logging. (#1889) +- Init the root logger before constructing Runner to log more information. (#1865) +- Split `SegResizeFlipPadRescale` into different existing transforms. (#1852) +- Move `init_dist()` to MMCV. (#1851) +- Documentation and docstring improvements. (#1971, #1938, #1869, #1838) +- Fix the color of the same class for mask visualization. (#1834) +- Remove the option `keep_all_stages` in HTC and Cascade R-CNN. (#1806) + +**New Features** +- Add two test-time options `crop_mask` and `rle_mask_encode` for mask heads. (#2013) +- Support loading grayscale images as single channel. 
(#1975) +- Implement "Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection". (#1872) +- Add sphinx generated docs. (#1859, #1864) +- Add GN support for flops computation. (#1850) +- Collect env info for trouble shooting. (#1812) + + +### v1.0rc1 (13/12/2019) + +The RC1 release mainly focuses on improving the user experience, and fixing bugs. + +**Highlights** +- Support new models: [FoveaBox](https://arxiv.org/abs/1904.03797), [RepPoints](https://arxiv.org/abs/1904.11490) and [FreeAnchor](https://arxiv.org/abs/1909.02466). +- Add a Dockerfile. +- Add a jupyter notebook demo and a webcam demo. +- Setup the code style and CI. +- Add lots of docstrings and unit tests. +- Fix lots of bugs. + +**Breaking Changes** +- There was a bug for computing COCO-style mAP w.r.t different scales (AP_s, AP_m, AP_l), introduced by #621. (#1679) + +**Bug Fixes** +- Fix a sampling interval bug in Libra R-CNN. (#1800) +- Fix the learning rate in SSD300 WIDER FACE. (#1781) +- Fix the scaling issue when `keep_ratio=False`. (#1730) +- Fix typos. (#1721, #1492, #1242, #1108, #1107) +- Fix the shuffle argument in `build_dataloader`. (#1693) +- Clip the proposal when computing mask targets. (#1688) +- Fix the "index out of range" bug for samplers in some corner cases. (#1610, #1404) +- Fix the NMS issue on devices other than GPU:0. (#1603) +- Fix SSD Head and GHM Loss on CPU. (#1578) +- Fix the OOM error when there are too many gt bboxes. (#1575) +- Fix the wrong keyword argument `nms_cfg` in HTC. (#1573) +- Process masks and semantic segmentation in Expand and MinIoUCrop transforms. (#1550, #1361) +- Fix a scale bug in the Non Local op. (#1528) +- Fix a bug in transforms when `gt_bboxes_ignore` is None. (#1498) +- Fix a bug when `img_prefix` is None. (#1497) +- Pass the device argument to `grid_anchors` and `valid_flags`. (#1478) +- Fix the data pipeline for test_robustness. (#1476) +- Fix the argument type of deformable pooling. (#1390) +- Fix the coco_eval when there are only two classes. (#1376) +- Fix a bug in Modulated DeformableConv when deformable_group>1. (#1359) +- Fix the mask cropping in RandomCrop. (#1333) +- Fix zero outputs in DeformConv when not running on cuda:0. (#1326) +- Fix the type issue in Expand. (#1288) +- Fix the inference API. (#1255) +- Fix the inplace operation in Expand. (#1249) +- Fix the from-scratch training config. (#1196) +- Fix inplace add in RoIExtractor which cause an error in PyTorch 1.2. (#1160) +- Fix FCOS when input images has no positive sample. (#1136) +- Fix recursive imports. (#1099) + +**Improvements** +- Print the config file and mmdet version in the log. (#1721) +- Lint the code before compiling in travis CI. (#1715) +- Add a probability argument for the `Expand` transform. (#1651) +- Update the PyTorch and CUDA version in the docker file. (#1615) +- Raise a warning when specifying `--validate` in non-distributed training. (#1624, #1651) +- Beautify the mAP printing. (#1614) +- Add pre-commit hook. (#1536) +- Add the argument `in_channels` to backbones. (#1475) +- Add lots of docstrings and unit tests, thanks to [@Erotemic](https://github.com/Erotemic). (#1603, #1517, #1506, #1505, #1491, #1479, #1477, #1475, #1474) +- Add support for multi-node distributed test when there is no shared storage. (#1399) +- Optimize Dockerfile to reduce the image size. (#1306) +- Update new results of HRNet. (#1284, #1182) +- Add an argument `no_norm_on_lateral` in FPN. (#1240) +- Test the compiling in CI. 
(#1235) +- Move docs to a separate folder. (#1233) +- Add a jupyter notebook demo. (#1158) +- Support different type of dataset for training. (#1133) +- Use int64_t instead of long in cuda kernels. (#1131) +- Support unsquare RoIs for bbox and mask heads. (#1128) +- Manually add type promotion to make compatible to PyTorch 1.2. (#1114) +- Allowing validation dataset for computing validation loss. (#1093) +- Use `.scalar_type()` instead of `.type()` to suppress some warnings. (#1070) + +**New Features** +- Add an option `--with_ap` to compute the AP for each class. (#1549) +- Implement "FreeAnchor: Learning to Match Anchors for Visual Object Detection". (#1391) +- Support [Albumentations](https://github.com/albumentations-team/albumentations) for augmentations in the data pipeline. (#1354) +- Implement "FoveaBox: Beyond Anchor-based Object Detector". (#1339) +- Support horizontal and vertical flipping. (#1273, #1115) +- Implement "RepPoints: Point Set Representation for Object Detection". (#1265) +- Add test-time augmentation to HTC and Cascade R-CNN. (#1251) +- Add a COCO result analysis tool. (#1228) +- Add Dockerfile. (#1168) +- Add a webcam demo. (#1155, #1150) +- Add FLOPs counter. (#1127) +- Allow arbitrary layer order for ConvModule. (#1078) + + +### v1.0rc0 (27/07/2019) +- Implement lots of new methods and components (Mixed Precision Training, HTC, Libra R-CNN, Guided Anchoring, Empirical Attention, Mask Scoring R-CNN, Grid R-CNN (Plus), GHM, GCNet, FCOS, HRNet, Weight Standardization, etc.). Thank all collaborators! +- Support two additional datasets: WIDER FACE and Cityscapes. +- Refactoring for loss APIs and make it more flexible to adopt different losses and related hyper-parameters. +- Speed up multi-gpu testing. +- Integrate all compiling and installing in a single script. + +### v0.6.0 (14/04/2019) +- Up to 30% speedup compared to the model zoo. +- Support both PyTorch stable and nightly version. +- Replace NMS and SigmoidFocalLoss with Pytorch CUDA extensions. + +### v0.6rc0(06/02/2019) +- Migrate to PyTorch 1.0. + +### v0.5.7 (06/02/2019) +- Add support for Deformable ConvNet v2. (Many thanks to the authors and [@chengdazhi](https://github.com/chengdazhi)) +- This is the last release based on PyTorch 0.4.1. + +### v0.5.6 (17/01/2019) +- Add support for Group Normalization. +- Unify RPNHead and single stage heads (RetinaHead, SSDHead) with AnchorHead. + +### v0.5.5 (22/12/2018) +- Add SSD for COCO and PASCAL VOC. +- Add ResNeXt backbones and detection models. +- Refactoring for Samplers/Assigners and add OHEM. +- Add VOC dataset and evaluation scripts. + +### v0.5.4 (27/11/2018) +- Add SingleStageDetector and RetinaNet. + +### v0.5.3 (26/11/2018) +- Add Cascade R-CNN and Cascade Mask R-CNN. +- Add support for Soft-NMS in config files. + +### v0.5.2 (21/10/2018) +- Add support for custom datasets. +- Add a script to convert PASCAL VOC annotations to the expected format. + +### v0.5.1 (20/10/2018) +- Add BBoxAssigner and BBoxSampler, the `train_cfg` field in config files are restructured. +- `ConvFCRoIHead` / `SharedFCRoIHead` are renamed to `ConvFCBBoxHead` / `SharedFCBBoxHead` for consistency. 
diff --git a/docs/CODE_OF_CONDUCT.md b/docs/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..efd4305798 --- /dev/null +++ b/docs/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at chenkaidev@gmail.com. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md new file mode 100644 index 0000000000..7a24fb56ca --- /dev/null +++ b/docs/CONTRIBUTING.md @@ -0,0 +1,35 @@ +# Contributing to mmdetection + +All kinds of contributions are welcome, including but not limited to the following. + +- Fixes (typo, bugs) +- New features and components + +## Workflow + +1. fork and pull the latest mmdetection +2. checkout a new branch (do not use master branch for PRs) +3. commit your changes +4. create a PR + +Note +- If you plan to add some new features that involve large changes, it is encouraged to open an issue for discussion first. +- If you are the author of some papers and would like to include your method to mmdetection, +please contact Wenwei Zhang (zwwdev[at]gmail[dot]com). We will much appreciate your contribution. + +## Code style + +### Python +We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. + +We use the following tools for linting and formatting: +- [flake8](http://flake8.pycqa.org/en/latest/): linter +- [yapf](https://github.com/google/yapf): formatter +- [isort](https://github.com/timothycrosley/isort): sort imports + +Style configurations of yapf and isort can be found in [.style.yapf](.style.yapf) and [.isort.cfg](.isort.cfg). + +>Before you create a PR, make sure that your code lints and is formatted by yapf. + +### C++ and CUDA +We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md new file mode 100644 index 0000000000..077def3e39 --- /dev/null +++ b/docs/GETTING_STARTED.md @@ -0,0 +1,510 @@ +# Getting Started + +This page provides basic tutorials about the usage of MMDetection. +For installation instructions, please see [INSTALL.md](INSTALL.md). + +## Inference with pretrained models + +We provide testing scripts to evaluate a whole dataset (COCO, PASCAL VOC, Cityscapes, etc.), +and also some high-level apis for easier integration to other projects. + +### Test a dataset + +- [x] single GPU testing +- [x] multiple GPU testing +- [x] visualize detection results + +You can use the following commands to test a dataset. + +```shell +# single-gpu testing +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show] + +# multi-gpu testing +./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] +``` + +Optional arguments: +- `RESULT_FILE`: Filename of the output results in pickle format. If not specified, the results will not be saved to a file. +- `EVAL_METRICS`: Items to be evaluated on the results. Allowed values depend on the dataset, e.g., `proposal_fast`, `proposal`, `bbox`, `segm` are available for COCO, `mAP`, `recall` for PASCAL VOC. Cityscapes could be evaluated by `cityscapes` as well as all COCO metrics. +- `--show`: If specified, detection results will be plotted on the images and shown in a new window. It is only applicable to single GPU testing and used for debugging and visualization. 
Please make sure that GUI is available in your environment, otherwise you may encounter an error like `cannot connect to X server`.
+
+If you would like to evaluate the dataset, do not specify `--show` at the same time.
+
+Examples:
+
+Assume that you have already downloaded the checkpoints to the directory `checkpoints/`.
+
+1. Test Faster R-CNN and visualize the results. Press any key for the next image.
+
+```shell
+python tools/test.py configs/faster_rcnn_r50_fpn_1x.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth \
+    --show
+```
+
+2. Test Faster R-CNN on PASCAL VOC (without saving the test results) and evaluate the mAP.
+
+```shell
+python tools/test.py configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc.py \
+    checkpoints/SOME_CHECKPOINT.pth \
+    --eval mAP
+```
+
+3. Test Mask R-CNN with 8 GPUs, and evaluate the bbox and mask AP.
+
+```shell
+./tools/dist_test.sh configs/mask_rcnn_r50_fpn_1x.py \
+    checkpoints/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth \
+    8 --out results.pkl --eval bbox segm
+```
+
+4. Test Mask R-CNN on COCO test-dev with 8 GPUs, and generate the json file to be submitted to the official evaluation server.
+
+```shell
+./tools/dist_test.sh configs/mask_rcnn_r50_fpn_1x.py \
+    checkpoints/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth \
+    8 --format-only --options "jsonfile_prefix=./mask_rcnn_test-dev_results"
+```
+
+You will get two json files `mask_rcnn_test-dev_results.bbox.json` and `mask_rcnn_test-dev_results.segm.json`.
+
+5. Test Mask R-CNN on Cityscapes test with 8 GPUs, and generate the txt and png files to be submitted to the official evaluation server.
+
+```shell
+./tools/dist_test.sh configs/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py \
+    checkpoints/mask_rcnn_r50_fpn_1x_cityscapes_20200227-afe51d5a.pth \
+    8 --format_only --options "outfile_prefix=./mask_rcnn_cityscapes_test_results"
+```
+
+The generated png and txt files will be under the `./mask_rcnn_cityscapes_test_results` directory.
+
+### Webcam demo
+
+We provide a webcam demo to illustrate the results.
+
+```shell
+python demo/webcam_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--camera-id ${CAMERA-ID}] [--score-thr ${SCORE_THR}]
+```
+
+Examples:
+
+```shell
+python demo/webcam_demo.py configs/faster_rcnn_r50_fpn_1x.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth
+```
+
+### High-level APIs for testing images
+
+#### Synchronous interface
+Here is an example of building the model and testing given images.
+
+```python
+from mmdet.apis import init_detector, inference_detector, show_result
+import mmcv
+
+config_file = 'configs/faster_rcnn_r50_fpn_1x.py'
+checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth'
+
+# build the model from a config file and a checkpoint file
+model = init_detector(config_file, checkpoint_file, device='cuda:0')
+
+# test a single image and show the results
+img = 'test.jpg'  # or img = mmcv.imread(img), which will only load it once
+result = inference_detector(model, img)
+# visualize the results in a new window
+show_result(img, result, model.CLASSES)
+# or save the visualization results to image files
+show_result(img, result, model.CLASSES, out_file='result.jpg')
+
+# test a video and show the results
+video = mmcv.VideoReader('video.mp4')
+for frame in video:
+    result = inference_detector(model, frame)
+    show_result(frame, result, model.CLASSES, wait_time=1)
+```
+
+A notebook demo can be found in [demo/inference_demo.ipynb](https://github.com/open-mmlab/mmdetection/blob/master/demo/inference_demo.ipynb).
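+
+As a small extension of the synchronous interface, the sketch below (a hypothetical helper, not part of the released API) loops the same `init_detector`/`inference_detector` calls over a folder of images and saves the visualizations to disk:
+
+```python
+import os
+
+import mmcv
+from mmdet.apis import inference_detector, init_detector, show_result
+
+
+def detect_folder(config_file, checkpoint_file, img_dir, out_dir):
+    """Run a detector on every jpg/png in img_dir and save the drawn results."""
+    model = init_detector(config_file, checkpoint_file, device='cuda:0')
+    mmcv.mkdir_or_exist(out_dir)
+    for name in sorted(os.listdir(img_dir)):
+        if not name.lower().endswith(('.jpg', '.png')):
+            continue
+        img = os.path.join(img_dir, name)
+        result = inference_detector(model, img)
+        # save the visualization to a file, as in the example above
+        show_result(img, result, model.CLASSES,
+                    out_file=os.path.join(out_dir, name))
+```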
+
+#### Asynchronous interface - supported for Python 3.7+
+
+The async interface allows inference code not to block the CPU on GPU-bound operations, enabling better CPU/GPU utilization for single-threaded applications. Inference can be done concurrently either between different input data samples or between different models of an inference pipeline.
+
+See `tests/async_benchmark.py` to compare the speed of the synchronous and asynchronous interfaces.
+
+```python
+import asyncio
+import torch
+from mmdet.apis import init_detector, async_inference_detector, show_result
+from mmdet.utils.contextmanagers import concurrent
+
+async def main():
+    config_file = 'configs/faster_rcnn_r50_fpn_1x.py'
+    checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth'
+    device = 'cuda:0'
+    model = init_detector(config_file, checkpoint=checkpoint_file, device=device)
+
+    # queue is used for concurrent inference of multiple images
+    streamqueue = asyncio.Queue()
+    # queue size defines concurrency level
+    streamqueue_size = 3
+
+    for _ in range(streamqueue_size):
+        streamqueue.put_nowait(torch.cuda.Stream(device=device))
+
+    # test a single image and show the results
+    img = 'test.jpg'  # or img = mmcv.imread(img), which will only load it once
+
+    async with concurrent(streamqueue):
+        result = await async_inference_detector(model, img)
+
+    # visualize the results in a new window
+    show_result(img, result, model.CLASSES)
+    # or save the visualization results to image files
+    show_result(img, result, model.CLASSES, out_file='result.jpg')
+
+
+asyncio.run(main())
+```
+
+## Train a model
+
+MMDetection implements distributed training and non-distributed training,
+which use `MMDistributedDataParallel` and `MMDataParallel` respectively.
+
+All outputs (log files and checkpoints) will be saved to the working directory,
+which is specified by `work_dir` in the config file.
+
+By default we evaluate the model on the validation set after each epoch; you can change the evaluation interval by adding the `interval` argument to the training config.
+```python
+evaluation = dict(interval=12)  # This evaluates the model every 12 epochs.
+```
+
+**\*Important\***: The default learning rate in config files is for 8 GPUs and 2 img/gpu (batch size = 8*2 = 16).
+According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you need to set the learning rate proportional to the batch size if you use different GPUs or images per GPU, e.g., lr=0.01 for 4 GPUs * 2 img/gpu and lr=0.08 for 16 GPUs * 4 img/gpu.
+
+### Train with a single GPU
+
+```shell
+python tools/train.py ${CONFIG_FILE}
+```
+
+If you want to specify the working directory in the command, you can add an argument `--work_dir ${YOUR_WORK_DIR}`.
+
+### Train with multiple GPUs
+
+```shell
+./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
+```
+
+Optional arguments are:
+
+- `--validate` (**strongly recommended**): Perform evaluation every k (default value is 1, which can be modified like [this](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask_rcnn_r50_fpn_1x.py#L174)) epochs during training.
+- `--work_dir ${WORK_DIR}`: Override the working directory specified in the config file.
+- `--resume_from ${CHECKPOINT_FILE}`: Resume from a previous checkpoint file.
+
+Difference between `resume_from` and `load_from`:
+`resume_from` loads both the model weights and the optimizer status, and the epoch is also inherited from the specified checkpoint.
It is usually used for resuming a training process that was interrupted accidentally.
+`load_from` only loads the model weights and the training epoch starts from 0. It is usually used for finetuning.
+
+### Train with multiple machines
+
+If you run MMDetection on a cluster managed with [slurm](https://slurm.schedmd.com/), you can use the script `slurm_train.sh`. (This script also supports single machine training.)
+
+```shell
+./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR} [${GPUS}]
+```
+
+Here is an example of using 16 GPUs to train Mask R-CNN on the dev partition.
+
+```shell
+./tools/slurm_train.sh dev mask_r50_1x configs/mask_rcnn_r50_fpn_1x.py /nfs/xxxx/mask_rcnn_r50_fpn_1x 16
+```
+
+You can check [slurm_train.sh](https://github.com/open-mmlab/mmdetection/blob/master/tools/slurm_train.sh) for full arguments and environment variables.
+
+If you have multiple machines connected only with ethernet, you can refer to the
+pytorch [launch utility](https://pytorch.org/docs/stable/distributed_deprecated.html#launch-utility).
+Usually it is slow if you do not have high speed networking like infiniband.
+
+### Launch multiple jobs on a single machine
+
+If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs,
+you need to specify different ports (29500 by default) for each job to avoid communication conflicts.
+
+If you use `dist_train.sh` to launch training jobs, you can set the ports in the commands.
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4
+```
+
+If you launch training jobs with slurm, you need to modify the config files (usually the 6th line from the bottom in config files) to set different communication ports.
+
+In `config1.py`,
+```python
+dist_params = dict(backend='nccl', port=29500)
+```
+
+In `config2.py`,
+```python
+dist_params = dict(backend='nccl', port=29501)
+```
+
+Then you can launch two jobs with `config1.py` and `config2.py`.
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR} 4
+```
+
+## Useful tools
+
+We provide lots of useful tools under the `tools/` directory.
+
+### Analyze logs
+
+You can plot loss/mAP curves given a training log file. Run `pip install seaborn` first to install the dependency.
+
+![loss curve image](../demo/loss_curve.png)
+
+```shell
+python tools/analyze_logs.py plot_curve [--keys ${KEYS}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}]
+```
+
+Examples:
+
+- Plot the classification loss of some run.
+
+```shell
+python tools/analyze_logs.py plot_curve log.json --keys loss_cls --legend loss_cls
+```
+
+- Plot the classification and regression loss of some run, and save the figure to a pdf.
+
+```shell
+python tools/analyze_logs.py plot_curve log.json --keys loss_cls loss_reg --out losses.pdf
+```
+
+- Compare the bbox mAP of two runs in the same figure.
+
+```shell
+python tools/analyze_logs.py plot_curve log1.json log2.json --keys bbox_mAP --legend run1 run2
+```
+
+You can also compute the average training speed.
+
+```shell
+python tools/analyze_logs.py cal_train_time ${CONFIG_FILE} [--include-outliers]
+```
+
+The output is expected to be like the following.
+ +``` +-----Analyze train time of work_dirs/some_exp/20190611_192040.log.json----- +slowest epoch 11, average time is 1.2024 +fastest epoch 1, average time is 1.1909 +time std over epochs is 0.0028 +average iter time: 1.1959 s/iter + +``` + +### Get the FLOPs and params (experimental) + +We provide a script adapted from [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) to compute the FLOPs and params of a given model. + +```shell +python tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] +``` + +You will get the result like this. + +``` +============================== +Input shape: (3, 1280, 800) +Flops: 239.32 GMac +Params: 37.74 M +============================== +``` + +**Note**: This tool is still experimental and we do not guarantee that the number is correct. You may well use the result for simple comparisons, but double check it before you adopt it in technical reports or papers. + +(1) FLOPs are related to the input shape while parameters are not. The default input shape is (1, 3, 1280, 800). +(2) Some operators are not counted into FLOPs like GN and custom operators. +You can add support for new operators by modifying [`mmdet/utils/flops_counter.py`](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/utils/flops_counter.py). +(3) The FLOPs of two-stage detectors is dependent on the number of proposals. + +### Publish a model + +Before you upload a model to AWS, you may want to +(1) convert model weights to CPU tensors, (2) delete the optimizer states and +(3) compute the hash of the checkpoint file and append the hash id to the filename. + +```shell +python tools/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME} +``` + +E.g., + +```shell +python tools/publish_model.py work_dirs/faster_rcnn/latest.pth faster_rcnn_r50_fpn_1x_20190801.pth +``` + +The final output filename will be `faster_rcnn_r50_fpn_1x_20190801-{hash id}.pth`. + +### Test the robustness of detectors + +Please refer to [ROBUSTNESS_BENCHMARKING.md](ROBUSTNESS_BENCHMARKING.md). + + +## How-to + +### Use my own datasets + +The simplest way is to convert your dataset to existing dataset formats (COCO or PASCAL VOC). + +Here we show an example of adding a custom dataset of 5 classes, assuming it is also in COCO format. + +In `mmdet/datasets/my_dataset.py`: + +```python +from .coco import CocoDataset +from .registry import DATASETS + + +@DATASETS.register_module +class MyDataset(CocoDataset): + + CLASSES = ('a', 'b', 'c', 'd', 'e') +``` + +In `mmdet/datasets/__init__.py`: + +```python +from .my_dataset import MyDataset +``` + +Then you can use `MyDataset` in config files, with the same API as CocoDataset. + + +It is also fine if you do not want to convert the annotation format to COCO or PASCAL format. +Actually, we define a simple annotation format and all existing datasets are +processed to be compatible with it, either online or offline. + +The annotation of a dataset is a list of dict, each dict corresponds to an image. +There are 3 field `filename` (relative path), `width`, `height` for testing, +and an additional field `ann` for training. `ann` is also a dict containing at least 2 fields: +`bboxes` and `labels`, both of which are numpy arrays. Some datasets may provide +annotations like crowd/difficult/ignored bboxes, we use `bboxes_ignore` and `labels_ignore` +to cover them. + +Here is an example. 
+``` +[ + { + 'filename': 'a.jpg', + 'width': 1280, + 'height': 720, + 'ann': { + 'bboxes': (n, 4), + 'labels': (n, ), + 'bboxes_ignore': (k, 4), + 'labels_ignore': (k, ) (optional field) + } + }, + ... +] +``` + +There are two ways to work with custom datasets. + +- online conversion + + You can write a new Dataset class inherited from `CustomDataset`, and overwrite two methods + `load_annotations(self, ann_file)` and `get_ann_info(self, idx)`, + like [CocoDataset](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py) and [VOCDataset](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/voc.py). + +- offline conversion + + You can convert the annotation format to the expected format above and save it to + a pickle or json file, like [pascal_voc.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/convert_datasets/pascal_voc.py). + Then you can simply use `CustomDataset`. + +### Customize optimizer + +An example of customized optimizer `CopyOfSGD` is defined in `mmdet/core/optimizer/copy_of_sgd.py`. +More generally, a customized optimizer could be defined as following. + +In `mmdet/core/optimizer/my_optimizer.py`: + +```python +from .registry import OPTIMIZERS +from torch.optim import Optimizer + + +@OPTIMIZERS.register_module +class MyOptimizer(Optimizer): + +``` + +In `mmdet/core/optimizer/__init__.py`: + +```python +from .my_optimizer import MyOptimizer +``` + +Then you can use `MyOptimizer` in `optimizer` field of config files. + +### Develop new components + +We basically categorize model components into 4 types. + +- backbone: usually an FCN network to extract feature maps, e.g., ResNet, MobileNet. +- neck: the component between backbones and heads, e.g., FPN, PAFPN. +- head: the component for specific tasks, e.g., bbox prediction and mask prediction. +- roi extractor: the part for extracting RoI features from feature maps, e.g., RoI Align. + +Here we show how to develop new components with an example of MobileNet. + +1. Create a new file `mmdet/models/backbones/mobilenet.py`. + +```python +import torch.nn as nn + +from ..registry import BACKBONES + + +@BACKBONES.register_module +class MobileNet(nn.Module): + + def __init__(self, arg1, arg2): + pass + + def forward(self, x): # should return a tuple + pass + + def init_weights(self, pretrained=None): + pass +``` + +2. Import the module in `mmdet/models/backbones/__init__.py`. + +```python +from .mobilenet import MobileNet +``` + +3. Use it in your config file. + +```python +model = dict( + ... + backbone=dict( + type='MobileNet', + arg1=xxx, + arg2=xxx), + ... +``` + +For more information on how it works, you can refer to [TECHNICAL_DETAILS.md](TECHNICAL_DETAILS.md) (TODO). diff --git a/docs/INSTALL.md b/docs/INSTALL.md new file mode 100644 index 0000000000..b22b970b97 --- /dev/null +++ b/docs/INSTALL.md @@ -0,0 +1,161 @@ +## Installation + +### Requirements + +- Linux (Windows is not officially supported) +- Python 3.5+ +- PyTorch 1.1 or higher +- CUDA 9.0 or higher +- NCCL 2 +- GCC 4.9 or higher +- [mmcv](https://github.com/open-mmlab/mmcv) + +We have tested the following versions of OS and softwares: + +- OS: Ubuntu 16.04/18.04 and CentOS 7.2 +- CUDA: 9.0/9.2/10.0/10.1 +- NCCL: 2.1.15/2.2.13/2.3.7/2.4.2 +- GCC(G++): 4.9/5.3/5.4/7.3 + +### Install mmdetection + +a. Create a conda virtual environment and activate it. + +```shell +conda create -n open-mmlab python=3.7 numba=0.45.1 -y +conda activate open-mmlab +``` + +b. 
Install PyTorch and torchvision following the [official instructions](https://pytorch.org/), e.g., + +```shell +conda install pytorch torchvision -c pytorch +``` + +c. Clone the mmdetection repository. + +```shell +git clone https://github.com/open-mmlab/mmdetection.git +cd mmdetection +``` + +d. Install build requirements and then install mmdetection. +(We install pycocotools via the github repo instead of pypi because the pypi version is old and not compatible with the latest numpy.) + +```shell +pip install -r requirements/build.txt +pip install "git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI" +pip install -v -e . # or "python setup.py develop" +``` + +e. Clone the MMDetection3D repository. + +```shell +git clone https://github.com/open-mmlab/mmdetection3d.git +cd mmdetection3d +``` + +f. Install build requirements and then install MMDetection3D. + +```shell +pip install -r requirements/build.txt +pip install -v -e . # or "python setup.py develop" +``` + +Note: + +1. The git commit id will be written to the version number with step d, e.g. 0.6.0+2e7045c. The version will also be saved in trained models. +It is recommended that you run step d each time you pull some updates from github. If C++/CUDA codes are modified, then this step is compulsory. + +2. Following the above instructions, mmdetection is installed on `dev` mode, any local modifications made to the code will take effect without the need to reinstall it (unless you submit some commits and want to update the version number). + +3. If you would like to use `opencv-python-headless` instead of `opencv-python`, +you can install it before installing MMCV. + +4. Some dependencies are optional. Simply running `pip install -v -e .` will only install the minimum runtime requirements. To use optional dependencies like `albumentations` and `imagecorruptions` either install them manually with `pip install -r requirements/optional.txt` or specify desired extras when calling `pip` (e.g. `pip install -v -e .[optional]`). Valid keys for the extras field are: `all`, `tests`, `build`, and `optional`. + +### Another option: Docker Image + +We provide a [Dockerfile](https://github.com/open-mmlab/mmdetection/blob/master/docker/Dockerfile) to build an image. + +```shell +# build an image with PyTorch 1.1, CUDA 10.0 and CUDNN 7.5 +docker build -t mmdetection docker/ +``` + +### Prepare datasets + +It is recommended to symlink the dataset root to `$MMDETECTION/data`. +If your folder structure is different, you may need to change the corresponding paths in config files. + +``` +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ ├── cityscapes +│ │ ├── annotations +│ │ ├── leftImg8bit +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── gtFine +│ │ │ ├── train +│ │ │ ├── val +│ ├── VOCdevkit +│ │ ├── VOC2007 +│ │ ├── VOC2012 + +``` +The cityscapes annotations have to be converted into the coco format using `tools/convert_datasets/cityscapes.py`: +```shell +pip install cityscapesscripts +python tools/convert_datasets/cityscapes.py ./data/cityscapes --nproc 8 --out_dir ./data/cityscapes/annotations +``` +Current the config files in `cityscapes` use COCO pre-trained weights to initialize. +You could download the pre-trained models in advance if network is unavailable or slow, otherwise it would cause errors at the beginning of training. 
+ +### A from-scratch setup script + +Here is a full script for setting up mmdetection with conda and link the dataset path (supposing that your COCO dataset path is $COCO_ROOT). + +```shell +conda create -n open-mmlab python=3.7 numba=0.45.1 -y +conda activate open-mmlab + +conda install -c pytorch pytorch torchvision -y +git clone https://github.com/open-mmlab/mmdetection.git +cd mmdetection +pip install -r requirements/build.txt +pip install "git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI" +pip install -v -e . + +git clone https://github.com/open-mmlab/mmdetection3d.git +cd mmdetection3d +pip install -r requirements/build.txt +pip install -v -e . + +mkdir data +ln -s $COCO_ROOT data +``` + +### Using multiple MMDetection3D versions + +If there are more than one mmdetection on your machine, and you want to use them alternatively, the recommended way is to create multiple conda environments and use different environments for different versions. + +Another way is to insert the following code to the main scripts (`train.py`, `test.py` or any other scripts you run) +```python +import os.path as osp +import sys +sys.path.insert(0, osp.join(osp.dirname(osp.abspath(__file__)), '../')) +``` + +Or run the following command in the terminal of corresponding folder to temporally use the current one. +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` diff --git a/docs/MODEL_ZOO.md b/docs/MODEL_ZOO.md new file mode 100644 index 0000000000..c15a00b6fa --- /dev/null +++ b/docs/MODEL_ZOO.md @@ -0,0 +1,532 @@ +# Benchmark and Model Zoo + +## Environment + +### Hardware + +- 8 NVIDIA Tesla V100 GPUs +- Intel Xeon 4114 CPU @ 2.20GHz + +### Software environment + +- Python 3.6 / 3.7 +- PyTorch 1.1 +- CUDA 9.0.176 +- CUDNN 7.0.4 +- NCCL 2.1.15 + +## Mirror sites + +We use AWS as the main site to host our model zoo, and maintain a mirror on aliyun. +You can replace `https://s3.ap-northeast-2.amazonaws.com/open-mmlab` with `https://open-mmlab.oss-cn-beijing.aliyuncs.com` in model urls. + +## Common settings + +- All FPN baselines and RPN-C4 baselines were trained using 8 GPU with a batch size of 16 (2 images per GPU). Other C4 baselines were trained using 8 GPU with a batch size of 8 (1 image per GPU). +- All models were trained on `coco_2017_train`, and tested on the `coco_2017_val`. +- We use distributed training and BN layer stats are fixed. +- We adopt the same training schedules as Detectron. 1x indicates 12 epochs and 2x indicates 24 epochs, which corresponds to slightly less iterations than Detectron and the difference can be ignored. +- All pytorch-style pretrained backbones on ImageNet are from PyTorch model zoo. +- For fair comparison with other codebases, we report the GPU memory as the maximum value of `torch.cuda.max_memory_allocated()` for all 8 GPUs. Note that this value is usually less than what `nvidia-smi` shows. +- We report the inference time as the overall time including data loading, network forwarding and post processing. + + +## Baselines + +More models with different backbones will be added to the model zoo. 
+ +### RPN + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | AR1000 | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | 1x | - | - | 20.5 | 51.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_caffe_c4_1x-ea7d3428.pth) | +| R-50-C4 | caffe | 2x | 2.2 | 0.17 | 20.3 | 52.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_caffe_c4_2x-c6d5b958.pth) | +| R-50-C4 | pytorch | 1x | - | - | 20.1 | 50.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_c4_1x-eb38972b.pth) | +| R-50-C4 | pytorch | 2x | - | - | 20.0 | 51.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_c4_2x-3d4c1e14.pth) | +| R-50-FPN | caffe | 1x | 3.3 | 0.253 | 16.9 | 58.2 | - | +| R-50-FPN | pytorch | 1x | 3.5 | 0.276 | 17.7 | 57.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_fpn_1x_20181010-4a9c0712.pth) | +| R-50-FPN | pytorch | 2x | - | - | - | 57.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_fpn_2x_20181010-88a4a471.pth) | +| R-101-FPN | caffe | 1x | 5.2 | 0.379 | 13.9 | 59.4 | - | +| R-101-FPN | pytorch | 1x | 5.4 | 0.396 | 14.4 | 58.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r101_fpn_1x_20181129-f50da4bd.pth) | +| R-101-FPN | pytorch | 2x | - | - | - | 59.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r101_fpn_2x_20181129-e42c6c9a.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 6.6 | 0.589 | 11.8 | 59.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_32x4d_fpn_1x_20181218-7e379d26.pth) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | - | 59.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_32x4d_fpn_2x_20181218-0510af40.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 9.5 | 0.955 | 8.3 | 59.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_64x4d_fpn_1x_20181218-c1a24f1f.pth) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | - | 60.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_64x4d_fpn_2x_20181218-c22bdd70.pth) | + +### Faster R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :--------------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | 1x | - | - | 9.5 | 34.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_caffe_c4_1x-75ecfdfa.pth) | +| R-50-C4 | caffe | 2x | 4.0 | 0.39 | 9.3 | 36.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_caffe_c4_2x-71c67f27.pth) | +| R-50-C4 | pytorch | 1x | - | - | 9.3 | 33.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_c4_1x-642cf91f.pth) | +| R-50-C4 | pytorch | 2x | - | - | 9.4 | 35.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_c4_2x-6e4fdf4f.pth) 
| +| R-50-FPN | caffe | 1x | 3.6 | 0.333 | 13.5 | 36.6 | - | +| R-50-FPN | pytorch | 1x | 3.8 | 0.353 | 13.6 | 36.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth) | +| R-50-FPN | pytorch | 2x | - | - | - | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_fpn_2x_20181010-443129e1.pth) | +| R-101-FPN | caffe | 1x | 5.5 | 0.465 | 11.5 | 38.8 | - | +| R-101-FPN | pytorch | 1x | 5.7 | 0.474 | 11.9 | 38.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r101_fpn_1x_20181129-d1468807.pth) | +| R-101-FPN | pytorch | 2x | - | - | - | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r101_fpn_2x_20181129-73e7ade7.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 6.9 | 0.672 | 10.3 | 40.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_32x4d_fpn_1x_20181218-ad81c133.pth) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | - | 40.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_32x4d_fpn_2x_20181218-0ed58946.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 9.8 | 1.040 | 7.3 | 41.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_64x4d_fpn_1x_20181218-c9c69c8f.pth) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | - | 40.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_64x4d_fpn_2x_20181218-fe94f9b8.pth) | +| HRNetV2p-W18 | pytorch | 1x | - | - | - | 36.1 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w18_1x_20190522-e368c387.pth) | +| HRNetV2p-W18 | pytorch | 2x | - | - | - | 38.3 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w18_2x_20190810-9c8615d5.pth) | +| HRNetV2p-W32 | pytorch | 1x | - | - | - | 39.5 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w32_1x_20190522-d22f1fef.pth) | +| HRNetV2p-W32 | pytorch | 2x | - | - | - | 40.6 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w32_2x_20190810-24e8912a.pth) | +| HRNetV2p-W48 | pytorch | 1x | - | - | - | 40.9 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w48_1x_20190820-5c6d0903.pth) | +| HRNetV2p-W48 | pytorch | 2x | - | - | - | 41.5 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w48_2x_20190820-79fb8bfc.pth) | + + +### Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :------------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | 1x | - | - | 8.1 | 35.9 | 31.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_caffe_c4_1x-02a4ad3b.pth) | +| R-50-C4 | caffe | 2x | 4.2 | 0.43 | 8.1 | 37.9 | 32.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_caffe_c4_2x-d150973a.pth) | +| R-50-C4 | pytorch | 1x | - | - | 7.9 | 35.1 | 31.2 | 
[model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_c4_1x-a83bdd40.pth) | +| R-50-C4 | pytorch | 2x | - | - | 8.0 | 37.2 | 32.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_c4_2x-3cf169a9.pth) | +| R-50-FPN | caffe | 1x | 3.8 | 0.430 | 10.2 | 37.4 | 34.3 | - | +| R-50-FPN | pytorch | 1x | 3.9 | 0.453 | 10.6 | 37.3 | 34.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth) | +| R-50-FPN | pytorch | 2x | - | - | - | 38.5 | 35.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_2x_20181010-41d35c05.pth) | +| R-101-FPN | caffe | 1x | 5.7 | 0.534 | 9.4 | 39.9 | 36.1 | - | +| R-101-FPN | pytorch | 1x | 5.8 | 0.571 | 9.5 | 39.4 | 35.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r101_fpn_1x_20181129-34ad1961.pth) | +| R-101-FPN | pytorch | 2x | - | - | - | 40.3 | 36.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r101_fpn_2x_20181129-a254bdfc.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 7.1 | 0.759 | 8.3 | 41.1 | 37.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_32x4d_fpn_1x_20181218-44e635cc.pth) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | - | 41.4 | 37.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_32x4d_fpn_2x_20181218-f023dffa.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 10.0 | 1.102 | 6.5 | 42.1 | 38.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_64x4d_fpn_1x_20181218-cb159987.pth) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | - | 42.0 | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_64x4d_fpn_2x_20181218-ea936e44.pth) | +| HRNetV2p-W18 | pytorch | 1x | - | - | - | 37.3 | 34.2 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w18_1x_20190522-c8ad459f.pth) | +| HRNetV2p-W18 | pytorch | 2x | - | - | - | 39.2 | 35.7 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w18_2x_20190810-1e4747eb.pth) | +| HRNetV2p-W32 | pytorch | 1x | - | - | - | 40.7 | 36.8 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w32_1x_20190522-374aaa00.pth) | +| HRNetV2p-W32 | pytorch | 2x | - | - | - | 41.7 | 37.5 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w32_2x_20190810-773eca75.pth) | +| HRNetV2p-W48 | pytorch | 1x | - | - | - | 42.4 | 38.1 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w48_1x_20190820-0923d1ad.pth) | +| HRNetV2p-W48 | pytorch | 2x | - | - | - | 42.9 | 38.3 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w48_2x_20190820-70df51b2.pth) | + +### Fast R-CNN (with pre-computed proposals) + +| Backbone | Style | Type | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | +| :-------: | :-----: | :----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | Faster | 1x | - 
| - | 6.7 | 35.0 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_caffe_c4_1x-0ef9a60b.pth) | +| R-50-C4 | caffe | Faster | 2x | 3.8 | 0.34 | 6.6 | 36.4 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_c4_2x-657a9fc6.pth) | +| R-50-C4 | pytorch | Faster | 1x | - | - | 6.3 | 34.2 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_c4_1x-2bc00ca9.pth) | +| R-50-C4 | pytorch | Faster | 2x | - | - | 6.1 | 35.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_caffe_c4_2x-9171d0fc.pth) | +| R-50-FPN | caffe | Faster | 1x | 3.3 | 0.242 | 18.4 | 36.6 | - | - | +| R-50-FPN | pytorch | Faster | 1x | 3.5 | 0.250 | 16.5 | 35.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_fpn_1x_20181010-08160859.pth) | +| R-50-C4 | caffe | Mask | 1x | - | - | 8.1 | 35.9 | 31.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_caffe_c4_1x-b43f7f3c.pth) | +| R-50-C4 | caffe | Mask | 2x | 4.2 | 0.43 | 8.1 | 37.9 | 32.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_caffe_c4_2x-e3580184.pth) | +| R-50-C4 | pytorch | Mask | 1x | - | - | 7.9 | 35.1 | 31.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_c4_1x-bc7fa8c8.pth) | +| R-50-C4 | pytorch | Mask | 2x | - | - | 8.0 | 37.2 | 32.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_fpn_2x_20181010-5048cb03.pth) | +| R-50-FPN | pytorch | Faster | 2x | - | - | - | 37.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_fpn_2x_20181010-d263ada5.pth) | +| R-101-FPN | caffe | Faster | 1x | 5.2 | 0.355 | 14.4 | 38.6 | - | - | +| R-101-FPN | pytorch | Faster | 1x | 5.4 | 0.388 | 13.2 | 38.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r101_fpn_1x_20181129-ffaa2eb0.pth) | +| R-101-FPN | pytorch | Faster | 2x | - | - | - | 38.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r101_fpn_2x_20181129-9dba92ce.pth) | +| R-50-FPN | caffe | Mask | 1x | 3.4 | 0.328 | 12.8 | 37.3 | 34.5 | - | +| R-50-FPN | pytorch | Mask | 1x | 3.5 | 0.346 | 12.7 | 36.8 | 34.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_fpn_1x_20181010-e030a38f.pth) | +| R-50-FPN | pytorch | Mask | 2x | - | - | - | 37.9 | 34.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_fpn_2x_20181010-5048cb03.pth) | +| R-101-FPN | caffe | Mask | 1x | 5.2 | 0.429 | 11.2 | 39.4 | 36.1 | - | +| R-101-FPN | pytorch | Mask | 1x | 5.4 | 0.462 | 10.9 | 38.9 | 35.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r101_fpn_1x_20181129-2273fa9b.pth) | +| R-101-FPN | pytorch | Mask | 2x | - | - | - | 39.9 | 36.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r101_fpn_2x_20181129-bf63ec5e.pth) | + +### RetinaNet + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | 
:------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | caffe | 1x | 3.4 | 0.285 | 12.5 | 35.8 | - | +| R-50-FPN | pytorch | 1x | 3.6 | 0.308 | 12.1 | 35.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r50_fpn_1x_20181125-7b0c2548.pth) | +| R-50-FPN | pytorch | 2x | - | - | - | 36.4 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/retinanet_r50_fpn_2x_20190616-75574209.pth) | +| R-101-FPN | caffe | 1x | 5.3 | 0.410 | 10.4 | 37.8 | - | +| R-101-FPN | pytorch | 1x | 5.5 | 0.429 | 10.9 | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r101_fpn_1x_20181129-f016f384.pth) | +| R-101-FPN | pytorch | 2x | - | - | - | 38.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r101_fpn_2x_20181129-72c14526.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 6.7 | 0.632 | 9.3 | 39.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_32x4d_fpn_1x_20190501-967812ba.pth) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | - | 39.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_32x4d_fpn_2x_20181218-8596452d.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 9.6 | 0.993 | 7.0 | 40.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_64x4d_fpn_1x_20181218-a0a22662.pth) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | - | 39.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_64x4d_fpn_2x_20181218-5e88d045.pth) | + +### Cascade R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | 1x | 8.7 | 0.92 | 5.0 | 38.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r50_caffe_c4_1x-7c85c62b.pth) | +| R-50-FPN | caffe | 1x | 3.9 | 0.464 | 10.9 | 40.5 | - | +| R-50-FPN | pytorch | 1x | 4.1 | 0.455 | 11.9 | 40.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r50_fpn_1x_20190501-3b6211ab.pth) | +| R-50-FPN | pytorch | 20e | - | - | - | 41.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r50_fpn_20e_20181123-db483a09.pth) | +| R-101-FPN | caffe | 1x | 5.8 | 0.569 | 9.6 | 42.4 | - | +| R-101-FPN | pytorch | 1x | 6.0 | 0.584 | 10.3 | 42.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r101_fpn_1x_20181129-d64ebac7.pth) | +| R-101-FPN | pytorch | 20e | - | - | - | 42.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r101_fpn_20e_20181129-b46dcede.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 7.2 | 0.770 | 8.9 | 43.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_32x4d_fpn_1x_20190501-af628be5.pth) | +| X-101-32x4d-FPN | pytorch | 20e | - | - | - | 44.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_32x4d_fpn_2x_20181218-28f73c4c.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 10.0 | 1.133 | 6.7 | 44.5 | 
[model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_64x4d_fpn_1x_20181218-e2dc376a.pth) | +| X-101-64x4d-FPN | pytorch | 20e | - | - | - | 44.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_64x4d_fpn_2x_20181218-5add321e.pth) | +| HRNetV2p-W18 | pytorch | 20e | - | - | - | 41.2 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_rcnn_hrnetv2p_w18_20e_20190810-132012d0.pth) | +| HRNetV2p-W32 | pytorch | 20e | - | - | - | 43.7 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_rcnn_hrnetv2p_w32_20e_20190522-55bec4ee.pth)| +| HRNetV2p-W48 | pytorch | 20e | - | - | - | 44.6 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_rcnn_hrnetv2p_w48_20e_20190810-f40ed8e1.pth) | + +### Cascade Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | 1x | 9.1 | 0.99 | 4.5 | 39.3 | 32.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_caffe_c4_1x-f72cc254.pth) | +| R-50-FPN | caffe | 1x | 5.1 | 0.692 | 7.6 | 40.9 | 35.5 | - | +| R-50-FPN | pytorch | 1x | 5.3 | 0.683 | 7.4 | 41.2 | 35.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_fpn_1x_20181123-88b170c9.pth) | +| R-50-FPN | pytorch | 20e | - | - | - | 42.3 | 36.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_fpn_20e_20181123-6e0c9713.pth) | +| R-101-FPN | caffe | 1x | 7.0 | 0.803 | 7.2 | 43.1 | 37.2 | - | +| R-101-FPN | pytorch | 1x | 7.2 | 0.807 | 6.8 | 42.6 | 37.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r101_fpn_1x_20181129-64f00602.pth) | +| R-101-FPN | pytorch | 20e | - | - | - | 43.3 | 37.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r101_fpn_20e_20181129-cb85151d.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 8.4 | 0.976 | 6.6 | 44.4 | 38.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_32x4d_fpn_1x_20181218-1d944c89.pth) | +| X-101-32x4d-FPN | pytorch | 20e | - | - | - | 44.7 | 38.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_32x4d_fpn_20e_20181218-761a3473.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 11.4 | 1.33 | 5.3 | 45.4 | 39.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_64x4d_fpn_1x_20190501-827e0a70.pth) | +| X-101-64x4d-FPN | pytorch | 20e | - | - | - | 45.7 | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_64x4d_fpn_20e_20181218-630773a7.pth) | +| HRNetV2p-W18 | pytorch | 20e | - | - | - | 41.9 | 36.4 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_20190810-054fb7bf.pth) | +| HRNetV2p-W32 | pytorch | 20e | - | - | - | 44.5 | 38.5 | 
[model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_20190810-76f61cd0.pth) | +| HRNetV2p-W48 | pytorch | 20e | - | - | - | 46.0 | 39.5 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_mask_rcnn_hrnetv2p_w48_20e_20190810-d04a1415.pth) | + +**Notes:** + +- The `20e` schedule in Cascade (Mask) R-CNN indicates decreasing the lr at 16 and 19 epochs, with a total of 20 epochs. + +### Hybrid Task Cascade (HTC) + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 1x | 7.4 | 0.936 | 4.1 | 42.1 | 37.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r50_fpn_1x_20190408-878c1712.pth) | +| R-50-FPN | pytorch | 20e | - | - | - | 43.2 | 38.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r50_fpn_20e_20190408-c03b7015.pth) | +| R-101-FPN | pytorch | 20e | 9.3 | 1.051 | 4.0 | 44.9 | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r101_fpn_20e_20190408-a2e586db.pth) | +| X-101-32x4d-FPN | pytorch | 20e | 5.8 | 0.769 | 3.8 | 46.1 | 40.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_x101_32x4d_fpn_20e_20190408-9eae4d0b.pth) | +| X-101-64x4d-FPN | pytorch | 20e | 7.5 | 1.120 | 3.5 | 46.9 | 40.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_x101_64x4d_fpn_20e_20190408-497f2561.pth) | +| HRNetV2p-W18 | pytorch | 20e | - | - | - | 43.1 | 37.9 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/htc_hrnetv2p_w18_20e_20190810-d70072af.pth) | +| HRNetV2p-W32 | pytorch | 20e | - | - | - | 45.3 | 39.6 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/htc_hrnetv2p_w32_20e_20190810-82f9ef5a.pth) | +| HRNetV2p-W48 | pytorch | 20e | - | - | - | 46.8 | 40.7 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/htc_hrnetv2p_w48_20e_20190810-f6d2c3fd.pth) | +| HRNetV2p-W48 | pytorch | 28e | - | - | - | 47.0 | 41.0 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/htc_hrnetv2p_w48_28e_20190810-a4274b38.pth) | + +**Notes:** + +- Please refer to [Hybrid Task Cascade](https://github.com/open-mmlab/mmdetection/blob/master/configs/htc) for details and more a powerful model (50.7/43.9). 
+ +### SSD + +| Backbone | Size | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | +| :------: | :---: | :---: | :-----: | :------: | :-----------------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------------------: | +| VGG16 | 300 | caffe | 120e | 3.5 | 0.256 | 25.9 / 34.6 | 25.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd300_coco_vgg16_caffe_120e_20181221-84d7110b.pth) | +| VGG16 | 512 | caffe | 120e | 7.6 | 0.412 | 20.7 / 25.4 | 29.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd512_coco_vgg16_caffe_120e_20181221-d48b0be8.pth) | + +**Notes:** + +- `cudnn.benchmark` is set as `True` for SSD training and testing. +- Inference time is reported for batch size = 1 and batch size = 8. +- The speed on COCO and VOC are different due to model parameters and nms. + +### Group Normalization (GN) + +Please refer to [Group Normalization](https://github.com/open-mmlab/mmdetection/blob/master/configs/gn) for details. + +### Weight Standardization + +Please refer to [Weight Standardization](https://github.com/open-mmlab/mmdetection/blob/master/configs/gn+ws) for details. + +### Deformable Convolution v2 + +Please refer to [Deformable Convolutional Networks](https://github.com/open-mmlab/mmdetection/blob/master/configs/dcn) for details. + +### CARAFE: Content-Aware ReAssembly of FEatures +Please refer to [CARAFE](https://github.com/open-mmlab/mmdetection/blob/master/configs/carafe) for details. + +### Instaboost + +Please refer to [Instaboost](https://github.com/open-mmlab/mmdetection/blob/master/configs/instaboost) for details. + +### Libra R-CNN + +Please refer to [Libra R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/libra_rcnn) for details. + +### Guided Anchoring + +Please refer to [Guided Anchoring](https://github.com/open-mmlab/mmdetection/blob/master/configs/guided_anchoring) for details. + +### FCOS + +Please refer to [FCOS](https://github.com/open-mmlab/mmdetection/blob/master/configs/fcos) for details. + +### FoveaBox + +Please refer to [FoveaBox](https://github.com/open-mmlab/mmdetection/blob/master/configs/foveabox) for details. + +### RepPoints + +Please refer to [RepPoints](https://github.com/open-mmlab/mmdetection/blob/master/configs/reppoints) for details. + +### FreeAnchor + +Please refer to [FreeAnchor](https://github.com/open-mmlab/mmdetection/blob/master/configs/free_anchor) for details. + +### Grid R-CNN (plus) + +Please refer to [Grid R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/grid_rcnn) for details. + +### GHM + +Please refer to [GHM](https://github.com/open-mmlab/mmdetection/blob/master/configs/ghm) for details. + +### GCNet + +Please refer to [GCNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/gcnet) for details. + +### HRNet +Please refer to [HRNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/hrnet) for details. + +### Mask Scoring R-CNN + +Please refer to [Mask Scoring R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/ms_rcnn) for details. + +### Train from Scratch + +Please refer to [Rethinking ImageNet Pre-training](https://github.com/open-mmlab/mmdetection/blob/master/configs/scratch) for details. + +### NAS-FPN +Please refer to [NAS-FPN](https://github.com/open-mmlab/mmdetection/blob/master/configs/nas_fpn) for details. 
+ +### ATSS +Please refer to [ATSS](https://github.com/open-mmlab/mmdetection/blob/master/configs/atss) for details. + +### Other datasets + +We also benchmark some methods on [PASCAL VOC](https://github.com/open-mmlab/mmdetection/blob/master/configs/pascal_voc), [Cityscapes](https://github.com/open-mmlab/mmdetection/blob/master/configs/cityscapes) and [WIDER FACE](https://github.com/open-mmlab/mmdetection/blob/master/configs/wider_face). + + +## Comparison with Detectron and maskrcnn-benchmark + +We compare mmdetection with [Detectron](https://github.com/facebookresearch/Detectron) +and [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark). The backbone used is R-50-FPN. + +In general, mmdetection has 3 advantages over Detectron. + +- **Higher performance** (especially in terms of mask AP) +- **Faster training speed** +- **Memory efficient** + +### Performance + +Detectron and maskrcnn-benchmark use caffe-style ResNet as the backbone. +We report results using both caffe-style (weights converted from +[here](https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md#imagenet-pretrained-models)) +and pytorch-style (weights from the official model zoo) ResNet backbone, +indicated as *pytorch-style results* / *caffe-style results*. + +We find that pytorch-style ResNet usually converges slower than caffe-style ResNet, +thus leading to slightly lower results in 1x schedule, but the final results +of 2x schedule is higher. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TypeLr schdDetectronmaskrcnn-benchmarkmmdetection
RPN1x57.2-57.1 / 58.2
2x--57.6 / -
Faster R-CNN1x36.736.836.4 / 36.6
2x37.9-37.7 / -
Mask R-CNN1x37.7 & 33.937.8 & 34.237.3 & 34.2 / 37.4 & 34.3
2x38.6 & 34.5-38.5 & 35.1 / -
Fast R-CNN1x36.4-35.8 / 36.6
2x36.8-37.1 / -
Fast R-CNN (w/mask)1x37.3 & 33.7-36.8 & 34.1 / 37.3 & 34.5
2x37.7 & 34.0-37.9 & 34.8 / -
+ +### Training Speed + +The training speed is measure with s/iter. The lower, the better. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDetectron (P1001)maskrcnn-benchmark (V100)mmdetection (V1002)
RPN0.416-0.253
Faster R-CNN0.5440.3530.333
Mask R-CNN0.8890.4540.430
Fast R-CNN0.285-0.242
Fast R-CNN (w/mask)0.377-0.328
+ +\*1. Facebook's Big Basin servers (P100/V100) is slightly faster than the servers we use. mmdetection can also run slightly faster on FB's servers. + +\*2. For fair comparison, we list the caffe-style results here. + + +### Inference Speed + +The inference speed is measured with fps (img/s) on a single GPU. The higher, the better. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDetectron (P100)maskrcnn-benchmark (V100)mmdetection (V100)
RPN12.5-16.9
Faster R-CNN10.37.913.5
Mask R-CNN8.57.710.2
Fast R-CNN12.5-18.4
Fast R-CNN (w/mask)9.9-12.8
+ +### Training memory + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDetectronmaskrcnn-benchmarkmmdetection
RPN6.4-3.3
Faster R-CNN7.24.43.6
Mask R-CNN8.65.23.8
Fast R-CNN6.0-3.3
Fast R-CNN (w/mask)7.9-3.4
+ +There is no doubt that maskrcnn-benchmark and mmdetection is more memory efficient than Detectron, +and the main advantage is PyTorch itself. We also perform some memory optimizations to push it forward. + +Note that Caffe2 and PyTorch have different apis to obtain memory usage with different implementations. +For all codebases, `nvidia-smi` shows a larger memory usage than the reported number in the above table. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000000..d4bb2cbb9e --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/ROBUSTNESS_BENCHMARKING.md b/docs/ROBUSTNESS_BENCHMARKING.md new file mode 100644 index 0000000000..1ed441ab5a --- /dev/null +++ b/docs/ROBUSTNESS_BENCHMARKING.md @@ -0,0 +1,109 @@ +# Corruption Benchmarking + +## Introduction + +We provide tools to test object detection and instance segmentation models on the image corruption benchmark defined in [Benchmarking Robustness in Object Detection: Autonomous Driving when Winter is Coming](https://arxiv.org/abs/1907.07484). +This page provides basic tutorials how to use the benchmark. + +``` +@article{michaelis2019winter, + title={Benchmarking Robustness in Object Detection: + Autonomous Driving when Winter is Coming}, + author={Michaelis, Claudio and Mitzkus, Benjamin and + Geirhos, Robert and Rusak, Evgenia and + Bringmann, Oliver and Ecker, Alexander S. and + Bethge, Matthias and Brendel, Wieland}, + journal={arXiv:1907.07484}, + year={2019} +} +``` + +![image corruption example](../demo/corruptions_sev_3.png) + +## About the benchmark + +To submit results to the benchmark please visit the [benchmark homepage](https://github.com/bethgelab/robust-detection-benchmark) + +The benchmark is modelled after the [imagenet-c benchmark](https://github.com/hendrycks/robustness) which was originally +published in [Benchmarking Neural Network Robustness to Common Corruptions and Perturbations](https://arxiv.org/abs/1903.12261) (ICLR 2019) by Dan Hendrycks and Thomas Dietterich. + +The image corruption functions are included in this library but can be installed separately using: + +```shell +pip install imagecorruptions +``` + +Compared to imagenet-c a few changes had to be made to handle images of arbitrary size and greyscale images. +We also modfied the 'motion blur' and 'snow' corruptions to remove dependency from a linux specific library, +which would have to be installed separately otherwise. For details please refer to the [imagecorruptions repository](https://github.com/bethgelab/imagecorruptions). + +## Inference with pretrained models + +We provide a testing script to evaluate a models performance on any combination of the corruptions provided in the benchmark. 
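+
+For reference, the corruptions themselves come from the `imagecorruptions` package mentioned above, which exposes a `corrupt` function operating on HxWxC uint8 images. The following is a minimal sketch of applying a single corruption outside the testing script; the image path and the chosen corruption/severity are arbitrary examples.
+
+```python
+import mmcv
+from imagecorruptions import corrupt
+
+# load an image as an HxWxC uint8 array and apply one corruption to it
+img = mmcv.imread('demo/demo.jpg')
+corrupted = corrupt(img, corruption_name='gaussian_noise', severity=3)
+mmcv.imwrite(corrupted, 'demo_gaussian_noise_sev3.jpg')
+```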
+
+### Test a dataset
+
+- [x] single GPU testing
+- [ ] multiple GPU testing
+- [ ] visualize detection results
+
+You can use the following commands to test a model's performance under the 15 corruptions used in the benchmark.
+
+```shell
+# single-gpu testing
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}]
+```
+
+Alternatively, different groups of corruptions can be selected.
+
+```shell
+# noise
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions noise
+
+# blur
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions blur
+
+# weather
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions weather
+
+# digital
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions digital
+```
+
+Or a custom set of corruptions, e.g.:
+```shell
+# gaussian noise, zoom blur and snow
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions gaussian_noise zoom_blur snow
+```
+
+Finally, the corruption severities to evaluate can be chosen.
+Severity 0 corresponds to clean data and the effect increases from 1 to 5.
+
+```shell
+# severity 1
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 1
+
+# severities 0,2,4
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 0 2 4
+```
+
+## Results for modelzoo models
+
+The results on COCO 2017val are shown in the table below.
+
+Model | Backbone | Style | Lr schd | box AP clean | box AP corr. | box % | mask AP clean | mask AP corr. | mask % |
+:-----:|:---------:|:-------:|:-------:|:------------:|:------------:|:-----:|:-------------:|:-------------:|:------:|
+Faster R-CNN | R-50-FPN | pytorch | 1x | 36.3 | 18.2 | 50.2 | - | - | - |
+Faster R-CNN | R-101-FPN | pytorch | 1x | 38.5 | 20.9 | 54.2 | - | - | - |
+Faster R-CNN | X-101-32x4d-FPN | pytorch | 1x | 40.1 | 22.3 | 55.5 | - | - | - |
+Faster R-CNN | X-101-64x4d-FPN | pytorch | 1x | 41.3 | 23.4 | 56.6 | - | - | - |
+Faster R-CNN | R-50-FPN-DCN | pytorch | 1x | 40.0 | 22.4 | 56.1 | - | - | - |
+Faster R-CNN | X-101-32x4d-FPN-DCN | pytorch | 1x | 43.4 | 26.7 | 61.6 | - | - | - |
+Mask R-CNN | R-50-FPN | pytorch | 1x | 37.3 | 18.7 | 50.1 | 34.2 | 16.8 | 49.1 |
+Mask R-CNN | R-50-FPN-DCN | pytorch | 1x | 41.1 | 23.3 | 56.7 | 37.2 | 20.7 | 55.7 |
+Cascade R-CNN | R-50-FPN | pytorch | 1x | 40.4 | 20.1 | 49.7 | - | - | - |
+Cascade Mask R-CNN | R-50-FPN | pytorch | 1x | 41.2 | 20.7 | 50.2 | 35.7 | 17.6 | 49.3 |
+RetinaNet | R-50-FPN | pytorch | 1x | 35.6 | 17.8 | 50.1 | - | - | - |
+Hybrid Task Cascade | X-101-64x4d-FPN-DCN | pytorch | 1x | 50.6 | 32.7 | 64.7 | 43.8 | 28.1 | 64.0 |
+
+Results may vary slightly due to the stochastic application of the corruptions.
diff --git a/docs/TECHNICAL_DETAILS.md b/docs/TECHNICAL_DETAILS.md
new file mode 100644
index 0000000000..91b0cfb941
--- /dev/null
+++ b/docs/TECHNICAL_DETAILS.md
@@ -0,0 +1,226 @@
+# Technical Details
+
+In this section, we will introduce the main units of training a detector:
+data pipeline, model and iteration pipeline.
+
+## Data pipeline
+
+Following typical conventions, we use `Dataset` and `DataLoader` for data loading
+with multiple workers. `Dataset` returns a dict of data items corresponding to
+the arguments of the model's forward method.
+Since the data in object detection may not be the same size (image size, gt bbox size, etc.),
+we introduce a new `DataContainer` type in MMCV to help collect and distribute
+data of different sizes.
+See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details.
+
+The data preparation pipeline and the dataset are decoupled. Usually a dataset
+defines how to process the annotations and a data pipeline defines all the steps to prepare a data dict.
+A pipeline consists of a sequence of operations. Each operation takes a dict as input and also outputs a dict for the next transform.
+
+We present a classical pipeline in the following figure. The blue blocks are pipeline operations. As the pipeline proceeds, each operator can add new keys (marked as green) to the result dict or update the existing keys (marked as orange).
+![pipeline figure](../demo/data_pipeline.png)
+
+The operations are categorized into data loading, pre-processing, formatting and test-time augmentation.
+
+Here is a pipeline example for Faster R-CNN.
+```python
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+```
+
+For each operation, we list the related dict fields that are added/updated/removed.
+
+### Data loading
+
+`LoadImageFromFile`
+- add: img, img_shape, ori_shape
+
+`LoadAnnotations`
+- add: gt_bboxes, gt_bboxes_ignore, gt_labels, gt_masks, gt_semantic_seg, bbox_fields, mask_fields
+
+`LoadProposals`
+- add: proposals
+
+### Pre-processing
+
+`Resize`
+- add: scale, scale_idx, pad_shape, scale_factor, keep_ratio
+- update: img, img_shape, *bbox_fields, *mask_fields, *seg_fields
+
+`RandomFlip`
+- add: flip
+- update: img, *bbox_fields, *mask_fields, *seg_fields
+
+`Pad`
+- add: pad_fixed_size, pad_size_divisor
+- update: img, pad_shape, *mask_fields, *seg_fields
+
+`RandomCrop`
+- update: img, pad_shape, gt_bboxes, gt_labels, gt_masks, *bbox_fields
+
+`Normalize`
+- add: img_norm_cfg
+- update: img
+
+`SegRescale`
+- update: gt_semantic_seg
+
+`PhotoMetricDistortion`
+- update: img
+
+`Expand`
+- update: img, gt_bboxes
+
+`MinIoURandomCrop`
+- update: img, gt_bboxes, gt_labels
+
+`Corrupt`
+- update: img
+
+### Formatting
+
+`ToTensor`
+- update: specified by `keys`.
+
+`ImageToTensor`
+- update: specified by `keys`.
+
+`Transpose`
+- update: specified by `keys`.
+
+`ToDataContainer`
+- update: specified by `fields`.
+
+`DefaultFormatBundle`
+- update: img, proposals, gt_bboxes, gt_bboxes_ignore, gt_labels, gt_masks, gt_semantic_seg
+
+`Collect`
+- add: img_meta (the keys of img_meta are specified by `meta_keys`)
+- remove: all other keys except for those specified by `keys`
+
+### Test time augmentation
+
+`MultiScaleFlipAug`
+
+## Model
+
+In MMDetection, model components are basically categorized as 4 types.
+
+- backbone: usually an FCN to extract feature maps, e.g., ResNet.
+- neck: the part between backbones and heads, e.g., FPN, ASPP.
+- head: the part for specific tasks, e.g., bbox prediction and mask prediction.
+- roi extractor: the part for extracting features from feature maps, e.g., RoI Align.
+
+We also implement some general detection pipelines with the above components,
+such as `SingleStageDetector` and `TwoStageDetector`.
+
+### Build a model with basic components
+
+Following some basic pipelines (e.g., two-stage detectors), the model structure
+can be customized through config files with minimal effort.
+
+If we want to implement some new components, e.g., the path aggregation
+FPN structure in [Path Aggregation Network for Instance Segmentation](https://arxiv.org/abs/1803.01534), there are three things to do.
+
+1. Create a new file in `mmdet/models/necks/pafpn.py`.
+
+    ```python
+    import torch.nn as nn
+
+    from ..registry import NECKS
+
+    @NECKS.register_module
+    class PAFPN(nn.Module):
+
+        def __init__(self,
+                     in_channels,
+                     out_channels,
+                     num_outs,
+                     start_level=0,
+                     end_level=-1,
+                     add_extra_convs=False):
+            pass
+
+        def forward(self, inputs):
+            # implementation is ignored
+            pass
+    ```
+
+2. Import the module in `mmdet/models/necks/__init__.py`.
+
+    ```python
+    from .pafpn import PAFPN
+    ```
+
+3. Modify the config file from
+
+    ```python
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5)
+    ```
+
+    to
+
+    ```python
+    neck=dict(
+        type='PAFPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5)
+    ```
+
+We will release more components (backbones, necks, heads) for research purposes.
+
+### Write a new model
+
+To write a new detection pipeline, you need to inherit from `BaseDetector`,
+which defines the following abstract methods.
+
+- `extract_feat()`: given an image batch of shape (n, c, h, w), extract the feature map(s).
+- `forward_train()`: forward method of the training mode
+- `simple_test()`: single scale testing without augmentation
+- `aug_test()`: testing with augmentation (multi-scale, flip, etc.)
+
+[TwoStageDetector](https://github.com/hellock/mmdetection/blob/master/mmdet/models/detectors/two_stage.py)
+is a good example which shows how to do that.
+
+## Iteration pipeline
+
+We adopt distributed training for both single machine and multiple machines.
+Supposing that the server has 8 GPUs, 8 processes will be started and each process runs on a single GPU.
+
+Each process keeps an isolated model, data loader, and optimizer.
+Model parameters are only synchronized once at the beginning.
+After a forward and backward pass, gradients will be allreduced among all GPUs,
+and the optimizer will update the model parameters.
+Since the gradients are allreduced, the model parameters stay the same for all processes after the iteration.
+
+## Other information
+
+For more information, please refer to our [technical report](https://arxiv.org/abs/1906.07155).
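+
+As a complement to the data pipeline section above, the following is a minimal sketch of what a custom dict-in/dict-out pipeline operation could look like.
+The class name and the added key are illustrative assumptions rather than part of this codebase; in practice such a class would also need to be registered and referenced from a config like the built-in operations.
+
+```python
+import numpy as np
+
+
+class HypotheticalChannelSwap(object):
+    """Illustrative pipeline operation: takes a result dict and returns a dict."""
+
+    def __call__(self, results):
+        img = results['img']
+        # update an existing key: reverse the channel order (e.g. BGR -> RGB)
+        results['img'] = np.ascontiguousarray(img[..., ::-1])
+        # add a new key so that later operations (e.g. Collect) could use it
+        results['channel_swapped'] = True
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__
+```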
diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000000..aad51b6ae3 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,70 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +# -- Project information ----------------------------------------------------- + +project = 'MMDetection' +copyright = '2018-2020, OpenMMLab' +author = 'OpenMMLab' + +# The full version, including alpha/beta/rc tags +release = '1.0.0' + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'recommonmark', + 'sphinx_markdown_tables', +] + +autodoc_mock_imports = ['torch', 'torchvision', 'mmcv'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +# The master toctree document. +master_doc = 'index' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000000..6e56b1432e --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,19 @@ +Welcome to MMDetection's documentation! +======================================= + +.. toctree:: + :maxdepth: 2 + + INSTALL.md + GETTING_STARTED.md + MODEL_ZOO.md + TECHNICAL_DETAILS.md + CHANGELOG.md + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000000..2119f51099 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000000..89fbf86c01 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,4 @@ +recommonmark +sphinx +sphinx_markdown_tables +sphinx_rtd_theme diff --git a/mmdet3d/__init__.py b/mmdet3d/__init__.py new file mode 100644 index 0000000000..1c4f7e8fcc --- /dev/null +++ b/mmdet3d/__init__.py @@ -0,0 +1,3 @@ +from .version import __version__, short_version + +__all__ = ['__version__', 'short_version'] diff --git a/mmdet3d/apis/__init__.py b/mmdet3d/apis/__init__.py new file mode 100644 index 0000000000..4833e520f4 --- /dev/null +++ b/mmdet3d/apis/__init__.py @@ -0,0 +1,5 @@ +from .train import train_detector + +__all__ = [ + 'train_detector', +] diff --git a/mmdet3d/apis/train.py b/mmdet3d/apis/train.py new file mode 100644 index 0000000000..d85f6c1784 --- /dev/null +++ b/mmdet3d/apis/train.py @@ -0,0 +1,199 @@ +import torch +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import DistSamplerSeedHook, Runner + +from mmdet3d.core import build_optimizer +from mmdet3d.datasets import build_dataloader, build_dataset +from mmdet.apis.train import parse_losses +from mmdet.core import (DistEvalHook, DistOptimizerHook, EvalHook, + Fp16OptimizerHook) +from mmdet.utils import get_root_logger + + +def batch_processor(model, data, train_mode): + """Process a data batch. + + This method is required as an argument of Runner, which defines how to + process a data batch and obtain proper outputs. The first 3 arguments of + batch_processor are fixed. + + Args: + model (nn.Module): A PyTorch model. + data (dict): The data batch in a dict. + train_mode (bool): Training mode or not. It may be useless for some + models. + + Returns: + dict: A dict containing losses and log vars. 
+ """ + losses = model(**data) + loss, log_vars = parse_losses(losses) + + if 'img_meta' in data: + num_samples = len(data['img_meta'].data) + else: + num_samples = len(data['img'].data) + outputs = dict(loss=loss, log_vars=log_vars, num_samples=num_samples) + + return outputs + + +def train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # start training + if distributed: + _dist_train( + model, + dataset, + cfg, + validate=validate, + logger=logger, + timestamp=timestamp, + meta=meta) + else: + _non_dist_train( + model, + dataset, + cfg, + validate=validate, + logger=logger, + timestamp=timestamp, + meta=meta) + + +def _dist_train(model, + dataset, + cfg, + validate=False, + logger=None, + timestamp=None, + meta=None): + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + dist=True, + seed=cfg.seed) for ds in dataset + ] + # put model on gpus + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + runner = Runner( + model, + batch_processor, + optimizer, + cfg.work_dir, + logger=logger, + meta=meta) + # an ugly walkaround to make the .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, + **fp16_cfg) + else: + optimizer_config = DistOptimizerHook(**cfg.optimizer_config) + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config) + runner.register_hook(DistSamplerSeedHook()) + # register eval hooks + if validate: + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=True, + shuffle=False) + eval_cfg = cfg.get('evaluation', {}) + runner.register_hook(DistEvalHook(val_dataloader, **eval_cfg)) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow, cfg.total_epochs) + + +def _non_dist_train(model, + dataset, + cfg, + validate=False, + logger=None, + timestamp=None, + meta=None): + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + cfg.gpus, + dist=False, + seed=cfg.seed) for ds in dataset + ] + # put model on gpus + model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + runner = Runner( + model, + batch_processor, + optimizer, + cfg.work_dir, + logger=logger, + meta=meta) + # an ugly walkaround to make the .log and .log.json filenames the same + runner.timestamp = timestamp + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + 
**cfg.optimizer_config, **fp16_cfg, distributed=False) + else: + optimizer_config = cfg.optimizer_config + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config) + + # register eval hooks + if validate: + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=False, + shuffle=False) + eval_cfg = cfg.get('evaluation', {}) + runner.register_hook(EvalHook(val_dataloader, **eval_cfg)) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow, cfg.total_epochs) diff --git a/mmdet3d/core/__init__.py b/mmdet3d/core/__init__.py new file mode 100644 index 0000000000..1d7bf519c1 --- /dev/null +++ b/mmdet3d/core/__init__.py @@ -0,0 +1,8 @@ +from .anchor import * # noqa: F401, F403 +from .bbox import * # noqa: F401, F403 +from .evaluation import * # noqa: F401, F403 +from .optimizer import * # noqa: F401, F403 +from .post_processing import * # noqa: F401, F403 +from .utils import * # noqa: F401, F403 + +# from .voxel import * # noqa: F401, F403 diff --git a/mmdet3d/core/anchor/__init__.py b/mmdet3d/core/anchor/__init__.py new file mode 100644 index 0000000000..4693853eb8 --- /dev/null +++ b/mmdet3d/core/anchor/__init__.py @@ -0,0 +1,19 @@ +from .anchor_generator import (AlignedAnchorGeneratorRange, AnchorGenerator, + AnchorGeneratorRange) + +__all__ = [ + 'AnchorGenerator', 'anchor_inside_flags', 'images_to_levels', 'unmap', + 'AlignedAnchorGeneratorRange', 'AnchorGeneratorRange', + 'build_anchor_generator' +] + + +def build_anchor_generator(cfg, **kwargs): + from . import anchor_generator + import mmcv + if isinstance(cfg, dict): + return mmcv.runner.obj_from_dict( + cfg, anchor_generator, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) diff --git a/mmdet3d/core/anchor/anchor_generator.py b/mmdet3d/core/anchor/anchor_generator.py new file mode 100644 index 0000000000..21af0f92dc --- /dev/null +++ b/mmdet3d/core/anchor/anchor_generator.py @@ -0,0 +1,288 @@ +import torch + + +class AnchorGenerator(object): + """ + Examples: + >>> from mmdet.core import AnchorGenerator + >>> self = AnchorGenerator(9, [1.], [1.]) + >>> all_anchors = self.grid_anchors((2, 2), device='cpu') + >>> print(all_anchors) + tensor([[ 0., 0., 8., 8.], + [16., 0., 24., 8.], + [ 0., 16., 8., 24.], + [16., 16., 24., 24.]]) + """ + + def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None): + self.base_size = base_size + self.scales = torch.Tensor(scales) + self.ratios = torch.Tensor(ratios) + self.scale_major = scale_major + self.ctr = ctr + self.base_anchors = self.gen_base_anchors() + + @property + def num_base_anchors(self): + return self.base_anchors.size(0) + + def gen_base_anchors(self): + w = self.base_size + h = self.base_size + + h_ratios = torch.sqrt(self.ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * self.scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * self.scales[None, :]).view(-1) + else: + ws = (w * self.scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * self.scales[:, None] * h_ratios[None, :]).view(-1) + + # yapf: disable + base_anchors = torch.stack( + [ + -0.5 * ws, -0.5 * hs, + 0.5 * ws, 0.5 * hs + ], + dim=-1) + # yapf: enable + return base_anchors + + def _meshgrid(self, x, y, row_major=True): + xx 
= x.repeat(len(y)) + yy = y.view(-1, 1).repeat(1, len(x)).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_anchors(self, featmap_size, stride=16, device='cuda'): + base_anchors = self.base_anchors.to(device) + + feat_h, feat_w = featmap_size + shift_x = torch.arange(0, feat_w, device=device) * stride + shift_y = torch.arange(0, feat_h, device=device) * stride + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + shifts = shifts.type_as(base_anchors) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + return all_anchors + + def valid_flags(self, featmap_size, valid_size, device='cuda'): + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.uint8, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.uint8, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + valid = valid[:, None].expand( + valid.size(0), self.num_base_anchors).contiguous().view(-1).bool() + return valid + + +class AnchorGeneratorRange(object): + + def __init__(self, + anchor_ranges, + sizes=((1.6, 3.9, 1.56), ), + stride=2, + rotations=(0, 3.1415926 / 2), + custom_values=(), + cache_anchor=False): + self.sizes = sizes + self.stride = stride + self.anchor_ranges = anchor_ranges + if len(anchor_ranges) != len(sizes): + self.anchor_ranges = anchor_ranges * len(sizes) + self.rotations = rotations + self.custom_values = custom_values + self.cache_anchor = cache_anchor + self.cached_anchors = None + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'anchor_range={}, '.format(self.anchor_ranges) + s += 'stride={}, '.format(self.stride) + s += 'sizes={}, '.format(self.sizes) + s += 'rotations={})'.format(self.rotations) + return s + + @property + def num_base_anchors(self): + num_rot = len(self.rotations) + num_size = torch.tensor(self.sizes).reshape(-1, 3).size(0) + return num_rot * num_size + + def grid_anchors(self, feature_map_size, device='cuda'): + # We reimplement the anchor generator using torch in cuda + # torch: 0.6975 s for 1000 times + # numpy: 4.3345 s for 1000 times + # which is ~5 times faster than numpy implementation + if (self.cache_anchor and self.cached_anchors): + return self.cached_anchors + if not isinstance(self.anchor_ranges[0], list): + return self.anchors_single_range( + feature_map_size, + self.anchor_ranges, + self.sizes, + self.rotations, + device=device) + assert len(self.sizes) == len(self.anchor_ranges) + mr_anchors = [] + for anchor_range, anchor_size in zip(self.anchor_ranges, self.sizes): + mr_anchors.append( + self.anchors_single_range( + feature_map_size, + anchor_range, + anchor_size, + self.rotations, + device=device)) + mr_anchors = torch.cat(mr_anchors, dim=-3) + if self.cache_anchor and not self.cached_anchors: + self.cached_anchors = mr_anchors + return mr_anchors + + def anchors_single_range(self, + feature_size, + anchor_range, + sizes=((1.6, 3.9, 1.56), ), + rotations=(0, 3.1415927 / 2), + device='cuda'): + """Generate anchors in a single range + 
Args: + feature_size: list [D, H, W](zyx) + sizes: [N, 3] list of list or array, size of anchors, xyz + + Returns: + anchors: [*feature_size, num_sizes, num_rots, 7] tensor. + """ + if len(feature_size) == 2: + feature_size = [1, feature_size[0], feature_size[1]] + anchor_range = torch.tensor(anchor_range, device=device) + z_centers = torch.linspace( + anchor_range[2], anchor_range[5], feature_size[0], device=device) + y_centers = torch.linspace( + anchor_range[1], anchor_range[4], feature_size[1], device=device) + x_centers = torch.linspace( + anchor_range[0], anchor_range[3], feature_size[2], device=device) + sizes = torch.tensor(sizes, device=device).reshape(-1, 3) + rotations = torch.tensor(rotations, device=device) + + # torch.meshgrid default behavior is 'id', np's default is 'xy' + rets = torch.meshgrid(x_centers, y_centers, z_centers, rotations) + # torch.meshgrid returns a tuple rather than list + rets = list(rets) + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) + + sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = sizes.repeat(tile_size_shape) + rets.insert(3, sizes) + + ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) + # [1, 200, 176, N, 2, 7] for kitti after permute + # ret = ret.reshape(-1, 7) + + if len(self.custom_values) > 0: + custom_ndim = len(self.custom_values) + custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) + # custom[:] = self.custom_values + ret = torch.cat([ret, custom], dim=-1) + # [1, 200, 176, N, 2, 9] for nus dataset after permute + return ret + + +class AlignedAnchorGeneratorRange(AnchorGeneratorRange): + + def __init__(self, shift_center=True, **kwargs): + super(AlignedAnchorGeneratorRange, self).__init__(**kwargs) + self.shift_center = shift_center + + def anchors_single_range(self, + feature_size, + anchor_range, + sizes=((1.6, 3.9, 1.56), ), + rotations=(0, 3.1415927 / 2), + device='cuda'): + """Generate anchors in a single range + Args: + feature_size: list [D, H, W](zyx) + sizes: [N, 3] list of list or array, size of anchors, xyz + + Returns: + anchors: [*feature_size, num_sizes, num_rots, 7] tensor. 
+ """ + if len(feature_size) == 2: + feature_size = [1, feature_size[0], feature_size[1]] + anchor_range = torch.tensor(anchor_range, device=device) + z_centers = torch.linspace( + anchor_range[2], + anchor_range[5], + feature_size[0] + 1, + device=device) + y_centers = torch.linspace( + anchor_range[1], + anchor_range[4], + feature_size[1] + 1, + device=device) + x_centers = torch.linspace( + anchor_range[0], + anchor_range[3], + feature_size[2] + 1, + device=device) + sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * self.stride + rotations = torch.tensor(rotations, device=device) + + # shift the anchor center + if self.shift_center: + z_shift = (z_centers[1] - z_centers[0]) / 2 + y_shift = (y_centers[1] - y_centers[0]) / 2 + x_shift = (x_centers[1] - x_centers[0]) / 2 + z_centers += z_shift + y_centers += y_shift + x_centers += x_shift + + # torch.meshgrid default behavior is 'id', np's default is 'xy' + rets = torch.meshgrid(x_centers[:feature_size[2]], + y_centers[:feature_size[1]], + z_centers[:feature_size[0]], rotations) + + # torch.meshgrid returns a tuple rather than list + rets = list(rets) + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) + + sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = sizes.repeat(tile_size_shape) + rets.insert(3, sizes) + + ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) + # [1, 200, 176, N, 2, 7] for kitti after permute + # ret = ret.reshape(-1, 7) + + if len(self.custom_values) > 0: + custom_ndim = len(self.custom_values) + custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) + # custom[:] = self.custom_values + ret = torch.cat([ret, custom], dim=-1) + # [1, 200, 176, N, 2, 9] for nus dataset after permute + return ret diff --git a/mmdet3d/core/bbox/__init__.py b/mmdet3d/core/bbox/__init__.py new file mode 100644 index 0000000000..95efe671fe --- /dev/null +++ b/mmdet3d/core/bbox/__init__.py @@ -0,0 +1,49 @@ +from . 
import box_torch_ops +from .assigners import AssignResult, BaseAssigner, MaxIoUAssigner +from .coders import ResidualCoder +# from .bbox_target import bbox_target +from .geometry import (bbox_overlaps_2d, bbox_overlaps_3d, + bbox_overlaps_nearest_3d) +from .samplers import (BaseSampler, CombinedSampler, + InstanceBalancedPosSampler, IoUBalancedNegSampler, + PseudoSampler, RandomSampler, SamplingResult) +from .transforms import delta2bbox # bbox2result_kitti, +from .transforms import (bbox2delta, bbox2result_coco, bbox2roi, bbox_flip, + bbox_mapping, bbox_mapping_back, + boxes3d_to_bev_torch_lidar, distance2bbox, roi2bbox) + +from .assign_sampling import ( # isort:skip, avoid recursive imports + build_bbox_coder, # temporally settings + assign_and_sample, build_assigner, build_sampler) + +__all__ = [ + 'BaseAssigner', + 'MaxIoUAssigner', + 'AssignResult', + 'BaseSampler', + 'PseudoSampler', + 'RandomSampler', + 'InstanceBalancedPosSampler', + 'IoUBalancedNegSampler', + 'CombinedSampler', + 'SamplingResult', + 'bbox2delta', + 'delta2bbox', + 'bbox_flip', + 'bbox_mapping', + 'bbox_mapping_back', + 'bbox2roi', + 'roi2bbox', + 'bbox2result_coco', + 'distance2bbox', # 'bbox2result_kitti', + 'build_assigner', + 'build_sampler', + 'assign_and_sample', + 'bbox_overlaps_2d', + 'bbox_overlaps_3d', + 'bbox_overlaps_nearest_3d', + 'box_torch_ops', + 'build_bbox_coder', + 'ResidualCoder', + 'boxes3d_to_bev_torch_lidar' +] diff --git a/mmdet3d/core/bbox/assign_sampling.py b/mmdet3d/core/bbox/assign_sampling.py new file mode 100644 index 0000000000..ed7632984c --- /dev/null +++ b/mmdet3d/core/bbox/assign_sampling.py @@ -0,0 +1,43 @@ +import mmcv + +from . import assigners, coders, samplers + + +def build_assigner(cfg, **kwargs): + if isinstance(cfg, assigners.BaseAssigner): + return cfg + elif isinstance(cfg, dict): + return mmcv.runner.obj_from_dict(cfg, assigners, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) + + +def build_bbox_coder(cfg, **kwargs): + if isinstance(cfg, coders.ResidualCoder): + return cfg + elif isinstance(cfg, dict): + return mmcv.runner.obj_from_dict(cfg, coders, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) + + +def build_sampler(cfg, **kwargs): + if isinstance(cfg, samplers.BaseSampler): + return cfg + elif isinstance(cfg, dict): + return mmcv.runner.obj_from_dict(cfg, samplers, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) + + +def assign_and_sample(bboxes, gt_bboxes, gt_bboxes_ignore, gt_labels, cfg): + bbox_assigner = build_assigner(cfg.assigner) + bbox_sampler = build_sampler(cfg.sampler) + assign_result = bbox_assigner.assign(bboxes, gt_bboxes, gt_bboxes_ignore, + gt_labels) + sampling_result = bbox_sampler.sample(assign_result, bboxes, gt_bboxes, + gt_labels) + return assign_result, sampling_result diff --git a/mmdet3d/core/bbox/assigners/__init__.py b/mmdet3d/core/bbox/assigners/__init__.py new file mode 100644 index 0000000000..594e8406b5 --- /dev/null +++ b/mmdet3d/core/bbox/assigners/__init__.py @@ -0,0 +1,8 @@ +from .approx_max_iou_assigner import ApproxMaxIoUAssigner +from .assign_result import AssignResult +from .base_assigner import BaseAssigner +from .max_iou_assigner import MaxIoUAssigner + +__all__ = [ + 'BaseAssigner', 'MaxIoUAssigner', 'ApproxMaxIoUAssigner', 'AssignResult' +] diff --git a/mmdet3d/core/bbox/assigners/approx_max_iou_assigner.py 
b/mmdet3d/core/bbox/assigners/approx_max_iou_assigner.py new file mode 100644 index 0000000000..e308a1b1c2 --- /dev/null +++ b/mmdet3d/core/bbox/assigners/approx_max_iou_assigner.py @@ -0,0 +1,114 @@ +import torch + +from ..geometry import bbox_overlaps_2d +from .max_iou_assigner import MaxIoUAssigner + + +class ApproxMaxIoUAssigner(MaxIoUAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with `-1`, `0`, or a positive integer + indicating the ground truth index. + + - -1: don't care + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple): IoU threshold for negative bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. + ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. + ignore_wrt_candidates (bool): Whether to compute the iof between + `bboxes` and `gt_bboxes_ignore`, or the contrary. + """ + + def __init__(self, + pos_iou_thr, + neg_iou_thr, + min_pos_iou=.0, + gt_max_assign_all=True, + ignore_iof_thr=-1, + ignore_wrt_candidates=True): + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + + def assign(self, + approxs, + squares, + approxs_per_octave, + gt_bboxes, + gt_bboxes_ignore=None, + gt_labels=None): + """Assign gt to approxs. + + This method assign a gt bbox to each group of approxs (bboxes), + each group of approxs is represent by a base approx (bbox) and + will be assigned with -1, 0, or a positive number. + -1 means don't care, 0 means negative sample, + positive number is the index (1-based) of assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every bbox to -1 + 2. use the max IoU of each group of approxs to assign + 2. assign proposals whose iou with all gts < neg_iou_thr to 0 + 3. for each bbox, if the iou with its nearest gt >= pos_iou_thr, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals (may be more than + one) to itself + + Args: + approxs (Tensor): Bounding boxes to be assigned, + shape(approxs_per_octave*n, 4). + squares (Tensor): Base Bounding boxes to be assigned, + shape(n, 4). + approxs_per_octave (int): number of approxs per octave + gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`, e.g., crowd boxes in COCO. + gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result. 
+ """ + + if squares.shape[0] == 0 or gt_bboxes.shape[0] == 0: + raise ValueError('No gt or approxs') + num_squares = squares.size(0) + num_gts = gt_bboxes.size(0) + # re-organize anchors by approxs_per_octave x num_squares + approxs = torch.transpose( + approxs.view(num_squares, approxs_per_octave, 4), 0, + 1).contiguous().view(-1, 4) + all_overlaps = bbox_overlaps_2d(approxs, gt_bboxes) + + overlaps, _ = all_overlaps.view(approxs_per_octave, num_squares, + num_gts).max(dim=0) + overlaps = torch.transpose(overlaps, 0, 1) + + bboxes = squares[:, :4] + + if (self.ignore_iof_thr > 0) and (gt_bboxes_ignore is not None) and ( + gt_bboxes_ignore.numel() > 0): + if self.ignore_wrt_candidates: + ignore_overlaps = bbox_overlaps_2d( + bboxes, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + else: + ignore_overlaps = bbox_overlaps_2d( + gt_bboxes_ignore, bboxes, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) + overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 + + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + return assign_result diff --git a/mmdet3d/core/bbox/assigners/assign_result.py b/mmdet3d/core/bbox/assigners/assign_result.py new file mode 100644 index 0000000000..33c761dde2 --- /dev/null +++ b/mmdet3d/core/bbox/assigners/assign_result.py @@ -0,0 +1,19 @@ +import torch + + +class AssignResult(object): + + def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): + self.num_gts = num_gts + self.gt_inds = gt_inds + self.max_overlaps = max_overlaps + self.labels = labels + + def add_gt_(self, gt_labels): + self_inds = torch.arange( + 1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device) + self.gt_inds = torch.cat([self_inds, self.gt_inds]) + self.max_overlaps = torch.cat( + [self.max_overlaps.new_ones(self.num_gts), self.max_overlaps]) + if self.labels is not None: + self.labels = torch.cat([gt_labels, self.labels]) diff --git a/mmdet3d/core/bbox/assigners/base_assigner.py b/mmdet3d/core/bbox/assigners/base_assigner.py new file mode 100644 index 0000000000..7bd02dce14 --- /dev/null +++ b/mmdet3d/core/bbox/assigners/base_assigner.py @@ -0,0 +1,8 @@ +from abc import ABCMeta, abstractmethod + + +class BaseAssigner(metaclass=ABCMeta): + + @abstractmethod + def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): + pass diff --git a/mmdet3d/core/bbox/assigners/max_iou_assigner.py b/mmdet3d/core/bbox/assigners/max_iou_assigner.py new file mode 100644 index 0000000000..53e3df1307 --- /dev/null +++ b/mmdet3d/core/bbox/assigners/max_iou_assigner.py @@ -0,0 +1,169 @@ +import torch + +from .. import geometry +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +class MaxIoUAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with `-1`, `0`, or a positive integer + indicating the ground truth index. + + - -1: don't care + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple): IoU threshold for negative bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. 
+ ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. + ignore_wrt_candidates (bool): Whether to compute the iof between + `bboxes` and `gt_bboxes_ignore`, or the contrary. + """ + + def __init__(self, + pos_iou_thr, + neg_iou_thr, + min_pos_iou=.0, + gt_max_assign_all=True, + ignore_iof_thr=-1, + iou_type='2d', + ignore_wrt_candidates=True): + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + # iou_type could be 2d, 3d, nearest_3d + self.iou_type = iou_type + self.bbox_overlaps = getattr(geometry, + 'bbox_overlaps_{}'.format(iou_type)) + + def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): + """Assign gt to bboxes. + + This method assign a gt bbox to every bbox (proposal/anchor), each bbox + will be assigned with -1, 0, or a positive number. -1 means don't care, + 0 means negative sample, positive number is the index (1-based) of + assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every bbox to -1 + 2. assign proposals whose iou with all gts < neg_iou_thr to 0 + 3. for each bbox, if the iou with its nearest gt >= pos_iou_thr, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals (may be more than + one) to itself + + Args: + bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4). + gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`, e.g., crowd boxes in COCO. + gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result. + """ + if self.iou_type == '2d': + bboxes = bboxes[:, :4] + overlaps = self.bbox_overlaps(gt_bboxes, bboxes) + if (self.ignore_iof_thr > 0) and (gt_bboxes_ignore is not None) and ( + gt_bboxes_ignore.numel() > 0): + if self.ignore_wrt_candidates: + ignore_overlaps = self.bbox_overlaps( + bboxes, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + else: + ignore_overlaps = self.bbox_overlaps( + gt_bboxes_ignore, bboxes, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) + overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 + + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + return assign_result + + def assign_wrt_overlaps(self, overlaps, gt_labels=None): + """Assign w.r.t. the overlaps of bboxes with gts. + + Args: + overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes, + shape(k, n). + gt_labels (Tensor, optional): Labels of k gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result. + """ + num_gts, num_bboxes = overlaps.size(0), overlaps.size(1) + + # 1. 
assign -1 by default + assigned_gt_inds = overlaps.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = overlaps.new_zeros((num_bboxes, )) + if num_gts == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + if gt_labels is None: + assigned_labels = None + else: + assigned_labels = overlaps.new_zeros((num_bboxes, ), + dtype=torch.long) + return AssignResult( + num_gts, + assigned_gt_inds, + max_overlaps, + labels=assigned_labels) + + # for each anchor, which gt best overlaps with it + # for each anchor, the max iou of all gts + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + # for each gt, which anchor best overlaps with it + # for each gt, the max iou of all proposals + gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1) + # 2. assign negative: below + if isinstance(self.neg_iou_thr, float): + assigned_gt_inds[(max_overlaps >= 0) + & (max_overlaps < self.neg_iou_thr)] = 0 + elif isinstance(self.neg_iou_thr, tuple): + assert len(self.neg_iou_thr) == 2 + assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0]) + & (max_overlaps < self.neg_iou_thr[1])] = 0 + + # 3. assign positive: above positive IoU threshold + pos_inds = max_overlaps >= self.pos_iou_thr + assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1 + + # 4. assign fg: for each gt, proposals with highest IoU + for i in range(num_gts): + if gt_max_overlaps[i] >= self.min_pos_iou: + if self.gt_max_assign_all: + max_iou_inds = overlaps[i, :] == gt_max_overlaps[i] + assigned_gt_inds[max_iou_inds] = i + 1 + else: + assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1 + + if gt_labels is not None: + assigned_labels = assigned_gt_inds.new_zeros((num_bboxes, )) + pos_inds = torch.nonzero(assigned_gt_inds > 0).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[ + assigned_gt_inds[pos_inds] - 1] + else: + assigned_labels = None + + return AssignResult( + num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels) diff --git a/mmdet3d/core/bbox/box_np_ops.py b/mmdet3d/core/bbox/box_np_ops.py new file mode 100644 index 0000000000..966905e19c --- /dev/null +++ b/mmdet3d/core/bbox/box_np_ops.py @@ -0,0 +1,568 @@ +import numba +import numpy as np + + +def camera_to_lidar(points, r_rect, velo2cam): + points_shape = list(points.shape[0:-1]) + if points.shape[-1] == 3: + points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) + lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T) + return lidar_points[..., :3] + + +def box_camera_to_lidar(data, r_rect, velo2cam): + xyz = data[:, 0:3] + l, h, w = data[:, 3:4], data[:, 4:5], data[:, 5:6] + r = data[:, 6:7] + xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam) + return np.concatenate([xyz_lidar, w, l, h, r], axis=1) + + +def corners_nd(dims, origin=0.5): + """generate relative box corners based on length per dim and + origin point. + + Args: + dims (float array, shape=[N, ndim]): array of length per dim + origin (list or array or float): origin point relate to smallest point. + + Returns: + float array, shape=[N, 2 ** ndim, ndim]: returned corners. 
+ point layout example: (2d) x0y0, x0y1, x1y0, x1y1; + (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + where x0 < x1, y0 < y1, z0 < z1 + """ + ndim = int(dims.shape[1]) + corners_norm = np.stack( + np.unravel_index(np.arange(2**ndim), [2] * ndim), + axis=1).astype(dims.dtype) + # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 + # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + # so need to convert to a format which is convenient to do other computing. + # for 2d boxes, format is clockwise start with minimum point + # for 3d boxes, please draw lines by your hand. + if ndim == 2: + # generate clockwise box corners + corners_norm = corners_norm[[0, 1, 3, 2]] + elif ndim == 3: + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) + corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape( + [1, 2**ndim, ndim]) + return corners + + +def rotation_2d(points, angles): + """rotation 2d points based on origin point clockwise when angle positive. + + Args: + points (float array, shape=[N, point_size, 2]): points to be rotated. + angles (float array, shape=[N]): rotation angle. + + Returns: + float array: same shape as points + """ + rot_sin = np.sin(angles) + rot_cos = np.cos(angles) + rot_mat_T = np.stack([[rot_cos, -rot_sin], [rot_sin, rot_cos]]) + return np.einsum('aij,jka->aik', points, rot_mat_T) + + +def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): + """convert kitti locations, dimensions and angles to corners. + format: center(xy), dims(xy), angles(clockwise when positive) + + Args: + centers (float array, shape=[N, 2]): locations in kitti label file. + dims (float array, shape=[N, 2]): dimensions in kitti label file. + angles (float array, shape=[N]): rotation_y in kitti label file. + + Returns: + [type]: [description] + """ + # 'length' in kitti format is in x axis. + # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. 
+ corners = corners_nd(dims, origin=origin) + # corners: [N, 4, 2] + if angles is not None: + corners = rotation_2d(corners, angles) + corners += centers.reshape([-1, 1, 2]) + return corners + + +@numba.jit(nopython=True) +def depth_to_points(depth, trunc_pixel): + num_pts = np.sum(depth[trunc_pixel:, ] > 0.1) + points = np.zeros((num_pts, 3), dtype=depth.dtype) + x = np.array([0, 0, 1], dtype=depth.dtype) + k = 0 + for i in range(trunc_pixel, depth.shape[0]): + for j in range(depth.shape[1]): + if depth[i, j] > 0.1: + x = np.array([j, i, 1], dtype=depth.dtype) + points[k] = x * depth[i, j] + k += 1 + return points + + +def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam): + pts = depth_to_points(depth, trunc_pixel) + points_shape = list(pts.shape[0:-1]) + points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1) + points = points @ np.linalg.inv(P2.T) + lidar_points = camera_to_lidar(points, r_rect, velo2cam) + return lidar_points + + +def rotation_3d_in_axis(points, angles, axis=0): + # points: [N, point_size, 3] + rot_sin = np.sin(angles) + rot_cos = np.cos(angles) + ones = np.ones_like(rot_cos) + zeros = np.zeros_like(rot_cos) + if axis == 1: + rot_mat_T = np.stack([[rot_cos, zeros, -rot_sin], [zeros, ones, zeros], + [rot_sin, zeros, rot_cos]]) + elif axis == 2 or axis == -1: + rot_mat_T = np.stack([[rot_cos, -rot_sin, zeros], + [rot_sin, rot_cos, zeros], [zeros, zeros, ones]]) + elif axis == 0: + rot_mat_T = np.stack([[zeros, rot_cos, -rot_sin], + [zeros, rot_sin, rot_cos], [ones, zeros, zeros]]) + else: + raise ValueError('axis should in range') + + return np.einsum('aij,jka->aik', points, rot_mat_T) + + +def center_to_corner_box3d(centers, + dims, + angles=None, + origin=(0.5, 1.0, 0.5), + axis=1): + """convert kitti locations, dimensions and angles to corners + + Args: + centers (float array, shape=[N, 3]): locations in kitti label file. + dims (float array, shape=[N, 3]): dimensions in kitti label file. + angles (float array, shape=[N]): rotation_y in kitti label file. + origin (list or array or float): origin point relate to smallest point. + use [0.5, 1.0, 0.5] in camera and [0.5, 0.5, 0] in lidar. + axis (int): rotation axis. 1 for camera and 2 for lidar. + Returns: + [type]: [description] + """ + # 'length' in kitti format is in x axis. + # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. 
+ corners = corners_nd(dims, origin=origin) + # corners: [N, 8, 3] + if angles is not None: + corners = rotation_3d_in_axis(corners, angles, axis=axis) + corners += centers.reshape([-1, 1, 3]) + return corners + + +@numba.jit(nopython=True) +def box2d_to_corner_jit(boxes): + num_box = boxes.shape[0] + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape( + 1, 4, 2) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) + for i in range(num_box): + rot_sin = np.sin(boxes[i, -1]) + rot_cos = np.cos(boxes[i, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] + return box_corners + + +@numba.njit +def corner_to_standup_nd_jit(boxes_corner): + num_boxes = boxes_corner.shape[0] + ndim = boxes_corner.shape[-1] + result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) + for i in range(num_boxes): + for j in range(ndim): + result[i, j] = np.min(boxes_corner[i, :, j]) + for j in range(ndim): + result[i, j + ndim] = np.max(boxes_corner[i, :, j]) + return result + + +@numba.jit(nopython=True) +def corner_to_surfaces_3d_jit(corners): + """convert 3d box corners from corner function above + to surfaces that normal vectors all direct to internal. + + Args: + corners (float array, [N, 8, 3]): 3d box corners. + Returns: + surfaces (float array, [N, 6, 4, 3]): + """ + # box_corners: [N, 8, 3], must from corner functions in this module + num_boxes = corners.shape[0] + surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) + corner_idxes = np.array([ + 0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7 + ]).reshape(6, 4) + for i in range(num_boxes): + for j in range(6): + for k in range(4): + surfaces[i, j, k] = corners[i, corner_idxes[j, k]] + return surfaces + + +def rotation_points_single_angle(points, angle, axis=0): + # points: [N, 3] + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + if axis == 1: + rot_mat_T = np.array( + [[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]], + dtype=points.dtype) + elif axis == 2 or axis == -1: + rot_mat_T = np.array( + [[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]], + dtype=points.dtype) + elif axis == 0: + rot_mat_T = np.array( + [[1, 0, 0], [0, rot_cos, -rot_sin], [0, rot_sin, rot_cos]], + dtype=points.dtype) + else: + raise ValueError('axis should in range') + + return points @ rot_mat_T, rot_mat_T + + +def project_to_image(points_3d, proj_mat): + points_shape = list(points_3d.shape) + points_shape[-1] = 1 + points_4 = np.concatenate([points_3d, np.zeros(points_shape)], axis=-1) + point_2d = points_4 @ proj_mat.T + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] + return point_2d_res + + +def box3d_to_bbox(box3d, rect, Trv2c, P2): + box_corners = center_to_corner_box3d( + box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1) + box_corners_in_image = project_to_image(box_corners, P2) + # box_corners_in_image: [N, 8, 2] + minxy = np.min(box_corners_in_image, axis=1) + maxxy = np.max(box_corners_in_image, axis=1) + bbox = np.concatenate([minxy, maxxy], axis=1) + return bbox + + +def corner_to_surfaces_3d(corners): + """convert 3d box corners from corner function above + to surfaces that normal vectors all 
direct to internal. + + Args: + corners (float array, [N, 8, 3]): 3d box corners. + Returns: + surfaces (float array, [N, 6, 4, 3]): + """ + # box_corners: [N, 8, 3], must from corner functions in this module + surfaces = np.array([ + [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]], + [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]], + [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]], + [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]], + [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]], + [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]], + ]).transpose([2, 0, 1, 3]) + return surfaces + + +def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)): + # TODO: this function is different from PointCloud3D, be careful + # when start to use nuscene, check the input + rbbox_corners = center_to_corner_box3d( + rbbox[:, :3], rbbox[:, 3:6], rbbox[:, 6], origin=origin, axis=z_axis) + surfaces = corner_to_surfaces_3d(rbbox_corners) + indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + return indices + + +def minmax_to_corner_2d(minmax_box): + ndim = minmax_box.shape[-1] // 2 + center = minmax_box[..., :ndim] + dims = minmax_box[..., ndim:] - center + return center_to_corner_box2d(center, dims, origin=0.0) + + +def limit_period(val, offset=0.5, period=np.pi): + return val - np.floor(val / period + offset) * period + + +def create_anchors_3d_range(feature_size, + anchor_range, + sizes=((1.6, 3.9, 1.56), ), + rotations=(0, np.pi / 2), + dtype=np.float32): + """ + Args: + feature_size: list [D, H, W](zyx) + sizes: [N, 3] list of list or array, size of anchors, xyz + + Returns: + anchors: [*feature_size, num_sizes, num_rots, 7] tensor. + """ + anchor_range = np.array(anchor_range, dtype) + z_centers = np.linspace( + anchor_range[2], anchor_range[5], feature_size[0], dtype=dtype) + y_centers = np.linspace( + anchor_range[1], anchor_range[4], feature_size[1], dtype=dtype) + x_centers = np.linspace( + anchor_range[0], anchor_range[3], feature_size[2], dtype=dtype) + sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3]) + rotations = np.array(rotations, dtype=dtype) + rets = np.meshgrid( + x_centers, y_centers, z_centers, rotations, indexing='ij') + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape) + rets[i] = rets[i][..., np.newaxis] # for concat + sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = np.tile(sizes, tile_size_shape) + rets.insert(3, sizes) + ret = np.concatenate(rets, axis=-1) + return np.transpose(ret, [2, 1, 0, 3, 4, 5]) + + +def center_to_minmax_2d_0_5(centers, dims): + return np.concatenate([centers - dims / 2, centers + dims / 2], axis=-1) + + +def center_to_minmax_2d(centers, dims, origin=0.5): + if origin == 0.5: + return center_to_minmax_2d_0_5(centers, dims) + corners = center_to_corner_box2d(centers, dims, origin=origin) + return corners[:, [0, 2]].reshape([-1, 4]) + + +def rbbox2d_to_near_bbox(rbboxes): + """convert rotated bbox to nearest 'standing' or 'lying' bbox. 
+ Args: + rbboxes: [N, 5(x, y, xdim, ydim, rad)] rotated bboxes + Returns: + bboxes: [N, 4(xmin, ymin, xmax, ymax)] bboxes + """ + rots = rbboxes[..., -1] + rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) + cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] + bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) + bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) + return bboxes + + +@numba.jit(nopython=True) +def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): + """calculate box iou. note that jit version runs ~10x faster than the + box_overlaps function in mmdet3d.core.evaluation + Parameters + ---------- + boxes: (N, 4) ndarray of float + query_boxes: (K, 4) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) * + (query_boxes[k, 3] - query_boxes[k, 1] + eps)) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + eps) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + eps) + if ih > 0: + if mode == 'iou': + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps) + box_area - + iw * ih) + else: + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps)) + overlaps[n, k] = iw * ih / ua + return overlaps + + +def change_box3d_center_(box3d, src, dst): + dst = np.array(dst, dtype=box3d.dtype) + src = np.array(src, dtype=box3d.dtype) + box3d[..., :3] += box3d[..., 3:6] * (dst - src) + + +def projection_matrix_to_CRT_kitti(proj): + # P = C @ [R|T] + # C is upper triangular matrix, so we need to inverse CR and use QR + # stable for all kitti camera projection matrix + CR = proj[0:3, 0:3] + CT = proj[0:3, 3] + RinvCinv = np.linalg.inv(CR) + Rinv, Cinv = np.linalg.qr(RinvCinv) + C = np.linalg.inv(Cinv) + R = np.linalg.inv(Rinv) + T = Cinv @ CT + return C, R, T + + +def remove_outside_points(points, rect, Trv2c, P2, image_shape): + # 5x faster than remove_outside_points_v1(2ms vs 10ms) + C, R, T = projection_matrix_to_CRT_kitti(P2) + image_bbox = [0, 0, image_shape[1], image_shape[0]] + frustum = get_frustum(image_bbox, C) + frustum -= T + frustum = np.linalg.inv(R) @ frustum.T + frustum = camera_to_lidar(frustum.T, rect, Trv2c) + frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...]) + indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces) + points = points[indices.reshape([-1])] + return points + + +def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100): + fku = C[0, 0] + fkv = -C[1, 1] + u0v0 = C[0:2, 2] + z_points = np.array( + [near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis] + b = bbox_image + box_corners = np.array( + [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]], + dtype=C.dtype) + near_box_corners = (box_corners - u0v0) / np.array( + [fku / near_clip, -fkv / near_clip], dtype=C.dtype) + far_box_corners = (box_corners - u0v0) / np.array( + [fku / far_clip, -fkv / far_clip], dtype=C.dtype) + ret_xy = np.concatenate([near_box_corners, far_box_corners], + axis=0) # [8, 2] + ret_xyz = np.concatenate([ret_xy, z_points], axis=1) + return ret_xyz + + +def surface_equ_3d(polygon_surfaces): + # return [a, b, c], d in ax+by+cz+d=0 + # polygon_surfaces: [num_polygon, num_surfaces, 
num_points_of_polygon, 3] + surface_vec = polygon_surfaces[:, :, :2, :] - polygon_surfaces[:, :, + 1:3, :] + # normal_vec: [..., 3] + normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :]) + # print(normal_vec.shape, points[..., 0, :].shape) + # d = -np.inner(normal_vec, points[..., 0, :]) + d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :]) + return normal_vec, -d + + +@numba.njit +def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, + num_surfaces): + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + ret = np.ones((num_points, num_polygons), dtype=np.bool_) + sign = 0.0 + for i in range(num_points): + for j in range(num_polygons): + for k in range(max_num_surfaces): + if k > num_surfaces[j]: + break + sign = ( + points[i, 0] * normal_vec[j, k, 0] + + points[i, 1] * normal_vec[j, k, 1] + + points[i, 2] * normal_vec[j, k, 2] + d[j, k]) + if sign >= 0: + ret[i, j] = False + break + return ret + + +def points_in_convex_polygon_3d_jit(points, + polygon_surfaces, + num_surfaces=None): + """check points is in 3d convex polygons. + Args: + points: [num_points, 3] array. + polygon_surfaces: [num_polygon, max_num_surfaces, + max_num_points_of_surface, 3] + array. all surfaces' normal vector must direct to internal. + max_num_points_of_surface must at least 3. + num_surfaces: [num_polygon] array. indicate how many surfaces + a polygon contain + Returns: + [num_points, num_polygon] bool array. + """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + # num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + if num_surfaces is None: + num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64) + normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :]) + # normal_vec: [num_polygon, max_num_surfaces, 3] + # d: [num_polygon, max_num_surfaces] + return _points_in_convex_polygon_3d_jit(points, polygon_surfaces, + normal_vec, d, num_surfaces) + + +@numba.jit +def points_in_convex_polygon_jit(points, polygon, clockwise=True): + """check points is in 2d convex polygons. True when point in polygon + Args: + points: [num_points, 2] array. + polygon: [num_polygon, num_points_of_polygon, 2] array. + clockwise: bool. indicate polygon is clockwise. + Returns: + [num_points, num_polygon] bool array. 
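+
+    Example (editor's sketch, not part of the original patch; assumes a unit
+    square given in clockwise order):
+        points = np.array([[0.5, 0.5], [2.0, 2.0]])
+        polygon = np.array([[[0., 0.], [0., 1.], [1., 1.], [1., 0.]]])
+        points_in_convex_polygon_jit(points, polygon)
+        # -> [[True], [False]]: only the first point lies inside the polygon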
+ """ + # first convert polygon to directed lines + num_points_of_polygon = polygon.shape[1] + num_points = points.shape[0] + num_polygons = polygon.shape[0] + # if clockwise: + # vec1 = polygon - polygon[:, [num_points_of_polygon - 1] + + # list(range(num_points_of_polygon - 1)), :] + # else: + # vec1 = polygon[:, [num_points_of_polygon - 1] + + # list(range(num_points_of_polygon - 1)), :] - polygon + # vec1: [num_polygon, num_points_of_polygon, 2] + vec1 = np.zeros((2), dtype=polygon.dtype) + ret = np.zeros((num_points, num_polygons), dtype=np.bool_) + success = True + cross = 0.0 + for i in range(num_points): + for j in range(num_polygons): + success = True + for k in range(num_points_of_polygon): + if clockwise: + vec1 = polygon[j, k] - polygon[j, k - 1] + else: + vec1 = polygon[j, k - 1] - polygon[j, k] + cross = vec1[1] * (polygon[j, k, 0] - points[i, 0]) + cross -= vec1[0] * (polygon[j, k, 1] - points[i, 1]) + if cross >= 0: + success = False + break + ret[i, j] = success + return ret diff --git a/mmdet3d/core/bbox/box_torch_ops.py b/mmdet3d/core/bbox/box_torch_ops.py new file mode 100644 index 0000000000..b0d197f4b1 --- /dev/null +++ b/mmdet3d/core/bbox/box_torch_ops.py @@ -0,0 +1,192 @@ +import numpy as np +import torch + + +def limit_period(val, offset=0.5, period=np.pi): + return val - torch.floor(val / period + offset) * period + + +def corners_nd(dims, origin=0.5): + """generate relative box corners based on length per dim and + origin point. + + Args: + dims (float array, shape=[N, ndim]): array of length per dim + origin (list or array or float): origin point relate to smallest point. + + Returns: + float array, shape=[N, 2 ** ndim, ndim]: returned corners. + point layout example: (2d) x0y0, x0y1, x1y0, x1y1; + (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + where x0 < x1, y0 < y1, z0 < z1 + """ + ndim = int(dims.shape[1]) + corners_norm = np.stack( + np.unravel_index(np.arange(2**ndim), [2] * ndim), + axis=1).astype(dims.dtype) + # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 + # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + # so need to convert to a format which is convenient to do other computing. + # for 2d boxes, format is clockwise start with minimum point + # for 3d boxes, please draw lines by your hand. 
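+    # Editor's note: for ndim == 2 the unravel_index order above is
+    # x0y0, x0y1, x1y0, x1y1; the reindexing below rearranges it into the
+    # clockwise order x0y0, x0y1, x1y1, x1y0, starting from the minimum point.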
+ if ndim == 2: + # generate clockwise box corners + corners_norm = corners_norm[[0, 1, 3, 2]] + elif ndim == 3: + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) + corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape( + [1, 2**ndim, ndim]) + return corners + + +def rotation_3d_in_axis(points, angles, axis=0): + # points: [N, point_size, 3] + # angles: [N] + rot_sin = torch.sin(angles) + rot_cos = torch.cos(angles) + ones = torch.ones_like(rot_cos) + zeros = torch.zeros_like(rot_cos) + if axis == 1: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, zeros, -rot_sin]), + torch.stack([zeros, ones, zeros]), + torch.stack([rot_sin, zeros, rot_cos]) + ]) + elif axis == 2 or axis == -1: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, -rot_sin, zeros]), + torch.stack([rot_sin, rot_cos, zeros]), + torch.stack([zeros, zeros, ones]) + ]) + elif axis == 0: + rot_mat_T = torch.stack([ + torch.stack([zeros, rot_cos, -rot_sin]), + torch.stack([zeros, rot_sin, rot_cos]), + torch.stack([ones, zeros, zeros]) + ]) + else: + raise ValueError('axis should in range') + + return torch.einsum('aij,jka->aik', (points, rot_mat_T)) + + +def center_to_corner_box3d(centers, + dims, + angles, + origin=[0.5, 1.0, 0.5], + axis=1): + """convert kitti locations, dimensions and angles to corners + + Args: + centers (float array, shape=[N, 3]): locations in kitti label file. + dims (float array, shape=[N, 3]): dimensions in kitti label file. + angles (float array, shape=[N]): rotation_y in kitti label file. + origin (list or array or float): origin point relate to smallest point. + use [0.5, 1.0, 0.5] in camera and [0.5, 0.5, 0] in lidar. + axis (int): rotation axis. 1 for camera and 2 for lidar. + Returns: + [type]: [description] + """ + # 'length' in kitti format is in x axis. + # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 8, 3] + corners = rotation_3d_in_axis(corners, angles, axis=axis) + corners += centers.view(-1, 1, 3) + return corners + + +def lidar_to_camera(points, r_rect, velo2cam): + num_points = points.shape[0] + points = torch.cat( + [points, torch.ones(num_points, 1).type_as(points)], dim=-1) + camera_points = points @ (r_rect @ velo2cam).t() + return camera_points[..., :3] + + +def box_lidar_to_camera(data, r_rect, velo2cam): + xyz_lidar = data[..., 0:3] + w, l, h = data[..., 3:4], data[..., 4:5], data[..., 5:6] + r = data[..., 6:7] + xyz = lidar_to_camera(xyz_lidar, r_rect, velo2cam) + return torch.cat([xyz, l, h, w, r], dim=-1) + + +def project_to_image(points_3d, proj_mat): + points_num = list(points_3d.shape)[:-1] + points_shape = np.concatenate([points_num, [1]], axis=0).tolist() + # previous implementation use new_zeros, new_one yeilds better results + points_4 = torch.cat( + [points_3d, points_3d.new_ones(*points_shape)], dim=-1) + # point_2d = points_4 @ tf.transpose(proj_mat, [1, 0]) + point_2d = torch.matmul(points_4, proj_mat.t()) + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] + return point_2d_res + + +def rbbox2d_to_near_bbox(rbboxes): + """convert rotated bbox to nearest 'standing' or 'lying' bbox. 
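+
+    Torch counterpart of ``rbbox2d_to_near_bbox`` in ``box_np_ops``: boxes
+    rotated by more than pi/4 have their x/y extents swapped before the
+    min/max conversion (editor's note, inferred from the implementation).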
+ + Args: + rbboxes: [N, 5(x, y, xdim, ydim, rad)] rotated bboxes + Returns: + bboxes: [N, 4(xmin, ymin, xmax, ymax)] bboxes + """ + rots = rbboxes[..., -1] + rots_0_pi_div_2 = torch.abs(limit_period(rots, 0.5, np.pi)) + cond = (rots_0_pi_div_2 > np.pi / 4)[..., None] + bboxes_center = torch.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) + bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) + return bboxes + + +def center_to_minmax_2d_0_5(centers, dims): + return torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) + + +def center_to_minmax_2d(centers, dims, origin=0.5): + if origin == 0.5: + return center_to_minmax_2d_0_5(centers, dims) + corners = center_to_corner_box2d(centers, dims, origin=origin) + return corners[:, [0, 2]].reshape([-1, 4]) + + +def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): + """convert kitti locations, dimensions and angles to corners. + format: center(xy), dims(xy), angles(clockwise when positive) + + Args: + centers (float array, shape=[N, 2]): locations in kitti label file. + dims (float array, shape=[N, 2]): dimensions in kitti label file. + angles (float array, shape=[N]): rotation_y in kitti label file. + + Returns: + [type]: [description] + """ + # 'length' in kitti format is in x axis. + # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 4, 2] + if angles is not None: + corners = rotation_2d(corners, angles) + corners += centers.reshape([-1, 1, 2]) + return corners + + +def rotation_2d(points, angles): + """rotation 2d points based on origin point clockwise when angle positive. + + Args: + points (float array, shape=[N, point_size, 2]): points to be rotated. + angles (float array, shape=[N]): rotation angle. 
+ + Returns: + float array: same shape as points + """ + rot_sin = torch.sin(angles) + rot_cos = torch.cos(angles) + rot_mat_T = torch.stack([[rot_cos, -rot_sin], [rot_sin, rot_cos]]) + return torch.einsum('aij,jka->aik', points, rot_mat_T) diff --git a/mmdet3d/core/bbox/coders/__init__.py b/mmdet3d/core/bbox/coders/__init__.py new file mode 100644 index 0000000000..700a4963f8 --- /dev/null +++ b/mmdet3d/core/bbox/coders/__init__.py @@ -0,0 +1,3 @@ +from .box_coder import ResidualCoder + +__all__ = ['ResidualCoder'] diff --git a/mmdet3d/core/bbox/coders/box_coder.py b/mmdet3d/core/bbox/coders/box_coder.py new file mode 100644 index 0000000000..d936a3f1e1 --- /dev/null +++ b/mmdet3d/core/bbox/coders/box_coder.py @@ -0,0 +1,116 @@ +import numpy as np +import torch + + +class ResidualCoder(object): + + def __init__(self, code_size=7, mean=None, std=None): + super().__init__() + self.code_size = code_size + self.mean = mean + self.std = std + + @staticmethod + def encode_np(boxes, anchors): + """ + :param boxes: (N, 7) x, y, z, w, l, h, r + :param anchors: (N, 7) + :return: + """ + # need to convert boxes to z-center format + xa, ya, za, wa, la, ha, ra = np.split(anchors, 7, axis=-1) + xg, yg, zg, wg, lg, hg, rg = np.split(boxes, 7, axis=-1) + zg = zg + hg / 2 + za = za + ha / 2 + diagonal = np.sqrt(la**2 + wa**2) # 4.3 + xt = (xg - xa) / diagonal + yt = (yg - ya) / diagonal + zt = (zg - za) / ha # 1.6 + lt = np.log(lg / la) + wt = np.log(wg / wa) + ht = np.log(hg / ha) + rt = rg - ra + return np.concatenate([xt, yt, zt, wt, lt, ht, rt], axis=-1) + + @staticmethod + def decode_np(box_encodings, anchors): + """ + :param box_encodings: (N, 7) x, y, z, w, l, h, r + :param anchors: (N, 7) + :return: + """ + # need to convert box_encodings to z-bottom format + xa, ya, za, wa, la, ha, ra = np.split(anchors, 7, axis=-1) + xt, yt, zt, wt, lt, ht, rt = np.split(box_encodings, 7, axis=-1) + + za = za + ha / 2 + diagonal = np.sqrt(la**2 + wa**2) + xg = xt * diagonal + xa + yg = yt * diagonal + ya + zg = zt * ha + za + + lg = np.exp(lt) * la + wg = np.exp(wt) * wa + hg = np.exp(ht) * ha + rg = rt + ra + zg = zg - hg / 2 + return np.concatenate([xg, yg, zg, wg, lg, hg, rg], axis=-1) + + @staticmethod + def encode_torch(anchors, boxes, means, stds): + """ + :param boxes: (N, 7+n) x, y, z, w, l, h, r, velo* + :param anchors: (N, 7+n) + :return: + """ + box_ndim = anchors.shape[-1] + cas, cgs, cts = [], [], [] + if box_ndim > 7: + xa, ya, za, wa, la, ha, ra, *cas = torch.split(anchors, 1, dim=-1) + xg, yg, zg, wg, lg, hg, rg, *cgs = torch.split(boxes, 1, dim=-1) + cts = [g - a for g, a in zip(cgs, cas)] + else: + xa, ya, za, wa, la, ha, ra = torch.split(anchors, 1, dim=-1) + xg, yg, zg, wg, lg, hg, rg = torch.split(boxes, 1, dim=-1) + za = za + ha / 2 + zg = zg + hg / 2 + diagonal = torch.sqrt(la**2 + wa**2) + xt = (xg - xa) / diagonal + yt = (yg - ya) / diagonal + zt = (zg - za) / ha + lt = torch.log(lg / la) + wt = torch.log(wg / wa) + ht = torch.log(hg / ha) + rt = rg - ra + return torch.cat([xt, yt, zt, wt, lt, ht, rt, *cts], dim=-1) + + @staticmethod + def decode_torch(anchors, box_encodings, means, stds): + """ + :param box_encodings: (N, 7 + n) x, y, z, w, l, h, r + :param anchors: (N, 7) + :return: + """ + cas, cts = [], [] + box_ndim = anchors.shape[-1] + if box_ndim > 7: + xa, ya, za, wa, la, ha, ra, *cas = torch.split(anchors, 1, dim=-1) + xt, yt, zt, wt, lt, ht, rt, *cts = torch.split( + box_encodings, 1, dim=-1) + else: + xa, ya, za, wa, la, ha, ra = torch.split(anchors, 1, dim=-1) + xt, yt, zt, 
wt, lt, ht, rt = torch.split(box_encodings, 1, dim=-1) + + za = za + ha / 2 + diagonal = torch.sqrt(la**2 + wa**2) + xg = xt * diagonal + xa + yg = yt * diagonal + ya + zg = zt * ha + za + + lg = torch.exp(lt) * la + wg = torch.exp(wt) * wa + hg = torch.exp(ht) * ha + rg = rt + ra + zg = zg - hg / 2 + cgs = [t + a for t, a in zip(cts, cas)] + return torch.cat([xg, yg, zg, wg, lg, hg, rg, *cgs], dim=-1) diff --git a/mmdet3d/core/bbox/geometry.py b/mmdet3d/core/bbox/geometry.py new file mode 100644 index 0000000000..9d2b95b0b1 --- /dev/null +++ b/mmdet3d/core/bbox/geometry.py @@ -0,0 +1,131 @@ +import torch + +from mmdet3d.ops.iou3d import boxes_iou3d_gpu +from . import box_torch_ops + + +def bbox_overlaps_2d(bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate overlap between two set of bboxes. + + If ``is_aligned`` is ``False``, then calculate the ious between each bbox + of bboxes1 and bboxes2, otherwise the ious between each aligned pair of + bboxes1 and bboxes2. + + Args: + bboxes1 (Tensor): shape (m, 4) in format. + bboxes2 (Tensor): shape (n, 4) in format. + If is_aligned is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or iof (intersection over + foreground). + + Returns: + ious(Tensor): shape (m, n) if is_aligned == False else shape (m, 1) + + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 10, 10], + >>> [10, 10, 20, 20], + >>> [32, 32, 38, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 10, 20], + >>> [0, 10, 10, 19], + >>> [10, 10, 20, 20], + >>> ]) + >>> bbox_overlaps(bboxes1, bboxes2) + tensor([[0.5238, 0.0500, 0.0041], + [0.0323, 0.0452, 1.0000], + [0.0000, 0.0000, 0.0000]]) + + Example: + >>> empty = torch.FloatTensor([]) + >>> nonempty = torch.FloatTensor([ + >>> [0, 0, 10, 9], + >>> ]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + assert mode in ['iou', 'iof'] + + rows = bboxes1.size(0) + cols = bboxes2.size(0) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + return bboxes1.new(rows, 1) if is_aligned else bboxes1.new(rows, cols) + + if is_aligned: + lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2] + rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2] + + wh = (rb - lt).clamp(min=0) # [rows, 2] + overlap = wh[:, 0] * wh[:, 1] + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * ( + bboxes1[:, 3] - bboxes1[:, 1]) + + if mode == 'iou': + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * ( + bboxes2[:, 3] - bboxes2[:, 1]) + ious = overlap / (area1 + area2 - overlap) + else: + ious = overlap / area1 + else: + lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2] + rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2] + + wh = (rb - lt).clamp(min=0) # [rows, cols, 2] + overlap = wh[:, :, 0] * wh[:, :, 1] + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * ( + bboxes1[:, 3] - bboxes1[:, 1]) + + if mode == 'iou': + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * ( + bboxes2[:, 3] - bboxes2[:, 1]) + ious = overlap / (area1[:, None] + area2 - overlap) + else: + ious = overlap / (area1[:, None]) + + return ious + + +def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou'): + ''' + + :param bboxes1: Tensor, shape (N, 7) [x, y, z, h, w, l, ry] + :param bboxes2: Tensor, shape (M, 7) [x, y, z, h, w, l, ry] + :param mode: mode (str): "iou" (intersection over union) or + iof (intersection over foreground). 
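+    Editor's note: unlike ``bbox_overlaps_nearest_3d`` below, which falls
+    back to axis-aligned bird's-eye-view boxes, this computes rotated 3D IoU
+    on the GPU via ``boxes_iou3d_gpu``.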
+ :return: iou: (M, N) not support aligned mode currently + ''' + # TODO: check the input dimension meanings, + # this is inconsistent with that in bbox_overlaps_nearest_3d + return boxes_iou3d_gpu(bboxes1, bboxes2, mode) + + +def bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode='iou', is_aligned=False): + ''' + :param bboxes1: Tensor, shape (N, 7) [x, y, z, h, w, l, ry]? + :param bboxes2: Tensor, shape (M, 7) [x, y, z, h, w, l, ry]? + :param mode: mode (str): "iou" (intersection over union) or iof + (intersection over foreground). + :return: iou: (M, N) not support aligned mode currently + rbboxes: [N, 5(x, y, xdim, ydim, rad)] rotated bboxes + ''' + # TODO: check the input dimension meanings, + # this is inconsistent with that in bbox_overlaps_3d + rbboxes1_np = bboxes1.index_select( + dim=-1, index=bboxes1.new_tensor([0, 1, 3, 4, 6]).long()) + rbboxes2_np = bboxes2.index_select( + dim=-1, index=bboxes1.new_tensor([0, 1, 3, 4, 6]).long()) + + # Change the bboxes to bev + # box conversion and iou calculation in torch version on CUDA + # is 10x faster than that in numpy version + bboxes1_bv = box_torch_ops.rbbox2d_to_near_bbox(rbboxes1_np) + bboxes2_bv = box_torch_ops.rbbox2d_to_near_bbox(rbboxes2_np) + ret = bbox_overlaps_2d( + bboxes1_bv, bboxes2_bv, mode=mode, is_aligned=is_aligned) + return ret diff --git a/mmdet3d/core/bbox/samplers/__init__.py b/mmdet3d/core/bbox/samplers/__init__.py new file mode 100644 index 0000000000..d709d8ecb2 --- /dev/null +++ b/mmdet3d/core/bbox/samplers/__init__.py @@ -0,0 +1,14 @@ +from .base_sampler import BaseSampler +from .combined_sampler import CombinedSampler +from .instance_balanced_pos_sampler import InstanceBalancedPosSampler +from .iou_balanced_neg_sampler import IoUBalancedNegSampler +from .ohem_sampler import OHEMSampler +from .pseudo_sampler import PseudoSampler +from .random_sampler import RandomSampler +from .sampling_result import SamplingResult + +__all__ = [ + 'BaseSampler', 'PseudoSampler', 'RandomSampler', + 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', + 'OHEMSampler', 'SamplingResult' +] diff --git a/mmdet3d/core/bbox/samplers/base_sampler.py b/mmdet3d/core/bbox/samplers/base_sampler.py new file mode 100644 index 0000000000..12df01306f --- /dev/null +++ b/mmdet3d/core/bbox/samplers/base_sampler.py @@ -0,0 +1,78 @@ +from abc import ABCMeta, abstractmethod + +import torch + +from .sampling_result import SamplingResult + + +class BaseSampler(metaclass=ABCMeta): + + def __init__(self, + num, + pos_fraction, + neg_pos_ub=-1, + add_gt_as_proposals=True, + **kwargs): + self.num = num + self.pos_fraction = pos_fraction + self.neg_pos_ub = neg_pos_ub + self.add_gt_as_proposals = add_gt_as_proposals + self.pos_sampler = self + self.neg_sampler = self + + @abstractmethod + def _sample_pos(self, assign_result, num_expected, **kwargs): + pass + + @abstractmethod + def _sample_neg(self, assign_result, num_expected, **kwargs): + pass + + def sample(self, + assign_result, + bboxes, + gt_bboxes, + gt_labels=None, + **kwargs): + """Sample positive and negative bboxes. + + This is a simple implementation of bbox sampling given candidates, + assigning results and ground truth bboxes. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + bboxes (Tensor): Boxes to be sampled from. + gt_bboxes (Tensor): Ground truth bboxes. + gt_labels (Tensor, optional): Class labels of ground truth bboxes. + + Returns: + :obj:`SamplingResult`: Sampling result. 
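+
+        Example (editor's sketch; assumes an ``assign_result`` produced by an
+        assigner such as ``MaxIoUAssigner``):
+            sampler = RandomSampler(num=512, pos_fraction=0.25)
+            result = sampler.sample(assign_result, proposals, gt_bboxes,
+                                    gt_labels)
+            # result.pos_inds / result.neg_inds index into the concatenation
+            # of gt_bboxes and proposals when add_gt_as_proposals=True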
+ """ + bboxes = bboxes[:, :4] + + gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8) + if self.add_gt_as_proposals: + bboxes = torch.cat([gt_bboxes, bboxes], dim=0) + assign_result.add_gt_(gt_labels) + gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8) + gt_flags = torch.cat([gt_ones, gt_flags]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos( + assign_result, num_expected_pos, bboxes=bboxes, **kwargs) + # We found that sampled indices have duplicated items occasionally. + # (may be a bug of PyTorch) + pos_inds = pos_inds.unique() + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds = self.neg_sampler._sample_neg( + assign_result, num_expected_neg, bboxes=bboxes, **kwargs) + neg_inds = neg_inds.unique() + + return SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags) diff --git a/mmdet3d/core/bbox/samplers/combined_sampler.py b/mmdet3d/core/bbox/samplers/combined_sampler.py new file mode 100644 index 0000000000..351a097f67 --- /dev/null +++ b/mmdet3d/core/bbox/samplers/combined_sampler.py @@ -0,0 +1,16 @@ +from ..assign_sampling import build_sampler +from .base_sampler import BaseSampler + + +class CombinedSampler(BaseSampler): + + def __init__(self, pos_sampler, neg_sampler, **kwargs): + super(CombinedSampler, self).__init__(**kwargs) + self.pos_sampler = build_sampler(pos_sampler, **kwargs) + self.neg_sampler = build_sampler(neg_sampler, **kwargs) + + def _sample_pos(self, **kwargs): + raise NotImplementedError + + def _sample_neg(self, **kwargs): + raise NotImplementedError diff --git a/mmdet3d/core/bbox/samplers/instance_balanced_pos_sampler.py b/mmdet3d/core/bbox/samplers/instance_balanced_pos_sampler.py new file mode 100644 index 0000000000..bc829a236c --- /dev/null +++ b/mmdet3d/core/bbox/samplers/instance_balanced_pos_sampler.py @@ -0,0 +1,41 @@ +import numpy as np +import torch + +from .random_sampler import RandomSampler + + +class InstanceBalancedPosSampler(RandomSampler): + + def _sample_pos(self, assign_result, num_expected, **kwargs): + pos_inds = torch.nonzero(assign_result.gt_inds > 0) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + unique_gt_inds = assign_result.gt_inds[pos_inds].unique() + num_gts = len(unique_gt_inds) + num_per_gt = int(round(num_expected / float(num_gts)) + 1) + sampled_inds = [] + for i in unique_gt_inds: + inds = torch.nonzero(assign_result.gt_inds == i.item()) + if inds.numel() != 0: + inds = inds.squeeze(1) + else: + continue + if len(inds) > num_per_gt: + inds = self.random_choice(inds, num_per_gt) + sampled_inds.append(inds) + sampled_inds = torch.cat(sampled_inds) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array( + list(set(pos_inds.cpu()) - set(sampled_inds.cpu()))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + extra_inds = torch.from_numpy(extra_inds).to( + assign_result.gt_inds.device).long() + sampled_inds = torch.cat([sampled_inds, extra_inds]) + elif len(sampled_inds) > num_expected: + sampled_inds = self.random_choice(sampled_inds, num_expected) + return sampled_inds diff --git 
a/mmdet3d/core/bbox/samplers/iou_balanced_neg_sampler.py b/mmdet3d/core/bbox/samplers/iou_balanced_neg_sampler.py new file mode 100644 index 0000000000..62431d6a07 --- /dev/null +++ b/mmdet3d/core/bbox/samplers/iou_balanced_neg_sampler.py @@ -0,0 +1,133 @@ +import numpy as np +import torch + +from .random_sampler import RandomSampler + + +class IoUBalancedNegSampler(RandomSampler): + """IoU Balanced Sampling + + arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019) + + Sampling proposals according to their IoU. `floor_fraction` of needed RoIs + are sampled from proposals whose IoU are lower than `floor_thr` randomly. + The others are sampled from proposals whose IoU are higher than + `floor_thr`. These proposals are sampled from some bins evenly, which are + split by `num_bins` via IoU evenly. + + Args: + num (int): number of proposals. + pos_fraction (float): fraction of positive proposals. + floor_thr (float): threshold (minimum) IoU for IoU balanced sampling, + set to -1 if all using IoU balanced sampling. + floor_fraction (float): sampling fraction of proposals under floor_thr. + num_bins (int): number of bins in IoU balanced sampling. + """ + + def __init__(self, + num, + pos_fraction, + floor_thr=-1, + floor_fraction=0, + num_bins=3, + **kwargs): + super(IoUBalancedNegSampler, self).__init__(num, pos_fraction, + **kwargs) + assert floor_thr >= 0 or floor_thr == -1 + assert 0 <= floor_fraction <= 1 + assert num_bins >= 1 + + self.floor_thr = floor_thr + self.floor_fraction = floor_fraction + self.num_bins = num_bins + + def sample_via_interval(self, max_overlaps, full_set, num_expected): + max_iou = max_overlaps.max() + iou_interval = (max_iou - self.floor_thr) / self.num_bins + per_num_expected = int(num_expected / self.num_bins) + + sampled_inds = [] + for i in range(self.num_bins): + start_iou = self.floor_thr + i * iou_interval + end_iou = self.floor_thr + (i + 1) * iou_interval + tmp_set = set( + np.where( + np.logical_and(max_overlaps >= start_iou, + max_overlaps < end_iou))[0]) + tmp_inds = list(tmp_set & full_set) + if len(tmp_inds) > per_num_expected: + tmp_sampled_set = self.random_choice(tmp_inds, + per_num_expected) + else: + tmp_sampled_set = np.array(tmp_inds, dtype=np.int) + sampled_inds.append(tmp_sampled_set) + + sampled_inds = np.concatenate(sampled_inds) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(full_set - set(sampled_inds))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + sampled_inds = np.concatenate([sampled_inds, extra_inds]) + + return sampled_inds + + def _sample_neg(self, assign_result, num_expected, **kwargs): + neg_inds = torch.nonzero(assign_result.gt_inds == 0) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + max_overlaps = assign_result.max_overlaps.cpu().numpy() + # balance sampling for negative samples + neg_set = set(neg_inds.cpu().numpy()) + + if self.floor_thr > 0: + floor_set = set( + np.where( + np.logical_and(max_overlaps >= 0, + max_overlaps < self.floor_thr))[0]) + iou_sampling_set = set( + np.where(max_overlaps >= self.floor_thr)[0]) + elif self.floor_thr == 0: + floor_set = set(np.where(max_overlaps == 0)[0]) + iou_sampling_set = set( + np.where(max_overlaps > self.floor_thr)[0]) + else: + floor_set = set() + iou_sampling_set = set( + np.where(max_overlaps > self.floor_thr)[0]) + + floor_neg_inds = list(floor_set & neg_set) + iou_sampling_neg_inds = 
list(iou_sampling_set & neg_set) + num_expected_iou_sampling = int(num_expected * + (1 - self.floor_fraction)) + if len(iou_sampling_neg_inds) > num_expected_iou_sampling: + if self.num_bins >= 2: + iou_sampled_inds = self.sample_via_interval( + max_overlaps, set(iou_sampling_neg_inds), + num_expected_iou_sampling) + else: + iou_sampled_inds = self.random_choice( + iou_sampling_neg_inds, num_expected_iou_sampling) + else: + iou_sampled_inds = np.array( + iou_sampling_neg_inds, dtype=np.int) + num_expected_floor = num_expected - len(iou_sampled_inds) + if len(floor_neg_inds) > num_expected_floor: + sampled_floor_inds = self.random_choice( + floor_neg_inds, num_expected_floor) + else: + sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int) + sampled_inds = np.concatenate( + (sampled_floor_inds, iou_sampled_inds)) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(neg_set - set(sampled_inds))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + sampled_inds = np.concatenate((sampled_inds, extra_inds)) + sampled_inds = torch.from_numpy(sampled_inds).long().to( + assign_result.gt_inds.device) + return sampled_inds diff --git a/mmdet3d/core/bbox/samplers/ohem_sampler.py b/mmdet3d/core/bbox/samplers/ohem_sampler.py new file mode 100644 index 0000000000..2500f3113c --- /dev/null +++ b/mmdet3d/core/bbox/samplers/ohem_sampler.py @@ -0,0 +1,73 @@ +import torch + +from ..transforms import bbox2roi +from .base_sampler import BaseSampler + + +class OHEMSampler(BaseSampler): + + def __init__(self, + num, + pos_fraction, + context, + neg_pos_ub=-1, + add_gt_as_proposals=True, + **kwargs): + super(OHEMSampler, self).__init__(num, pos_fraction, neg_pos_ub, + add_gt_as_proposals) + if not hasattr(context, 'num_stages'): + self.bbox_roi_extractor = context.bbox_roi_extractor + self.bbox_head = context.bbox_head + else: + self.bbox_roi_extractor = context.bbox_roi_extractor[ + context.current_stage] + self.bbox_head = context.bbox_head[context.current_stage] + + def hard_mining(self, inds, num_expected, bboxes, labels, feats): + with torch.no_grad(): + rois = bbox2roi([bboxes]) + bbox_feats = self.bbox_roi_extractor( + feats[:self.bbox_roi_extractor.num_inputs], rois) + cls_score, _ = self.bbox_head(bbox_feats) + loss = self.bbox_head.loss( + cls_score=cls_score, + bbox_pred=None, + labels=labels, + label_weights=cls_score.new_ones(cls_score.size(0)), + bbox_targets=None, + bbox_weights=None, + reduction_override='none')['loss_cls'] + _, topk_loss_inds = loss.topk(num_expected) + return inds[topk_loss_inds] + + def _sample_pos(self, + assign_result, + num_expected, + bboxes=None, + feats=None, + **kwargs): + # Sample some hard positive samples + pos_inds = torch.nonzero(assign_result.gt_inds > 0) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.hard_mining(pos_inds, num_expected, bboxes[pos_inds], + assign_result.labels[pos_inds], feats) + + def _sample_neg(self, + assign_result, + num_expected, + bboxes=None, + feats=None, + **kwargs): + # Sample some hard negative samples + neg_inds = torch.nonzero(assign_result.gt_inds == 0) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + return self.hard_mining(neg_inds, num_expected, bboxes[neg_inds], + assign_result.labels[neg_inds], feats) diff --git a/mmdet3d/core/bbox/samplers/pseudo_sampler.py 
b/mmdet3d/core/bbox/samplers/pseudo_sampler.py new file mode 100644 index 0000000000..b4c2ea09b0 --- /dev/null +++ b/mmdet3d/core/bbox/samplers/pseudo_sampler.py @@ -0,0 +1,26 @@ +import torch + +from .base_sampler import BaseSampler +from .sampling_result import SamplingResult + + +class PseudoSampler(BaseSampler): + + def __init__(self, **kwargs): + pass + + def _sample_pos(self, **kwargs): + raise NotImplementedError + + def _sample_neg(self, **kwargs): + raise NotImplementedError + + def sample(self, assign_result, bboxes, gt_bboxes, **kwargs): + pos_inds = torch.nonzero( + assign_result.gt_inds > 0).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0).squeeze(-1).unique() + gt_flags = bboxes.new_zeros(bboxes.shape[0], dtype=torch.uint8) + sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags) + return sampling_result diff --git a/mmdet3d/core/bbox/samplers/random_sampler.py b/mmdet3d/core/bbox/samplers/random_sampler.py new file mode 100644 index 0000000000..0d02b2747f --- /dev/null +++ b/mmdet3d/core/bbox/samplers/random_sampler.py @@ -0,0 +1,53 @@ +import numpy as np +import torch + +from .base_sampler import BaseSampler + + +class RandomSampler(BaseSampler): + + def __init__(self, + num, + pos_fraction, + neg_pos_ub=-1, + add_gt_as_proposals=True, + **kwargs): + super(RandomSampler, self).__init__(num, pos_fraction, neg_pos_ub, + add_gt_as_proposals) + + @staticmethod + def random_choice(gallery, num): + """Random select some elements from the gallery. + + It seems that Pytorch's implementation is slower than numpy so we use + numpy to randperm the indices. + """ + assert len(gallery) >= num + if isinstance(gallery, list): + gallery = np.array(gallery) + cands = np.arange(len(gallery)) + np.random.shuffle(cands) + rand_inds = cands[:num] + if not isinstance(gallery, np.ndarray): + rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device) + return gallery[rand_inds] + + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Randomly sample some positive samples.""" + pos_inds = torch.nonzero(assign_result.gt_inds > 0) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result, num_expected, **kwargs): + """Randomly sample some negative samples.""" + neg_inds = torch.nonzero(assign_result.gt_inds == 0) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + return self.random_choice(neg_inds, num_expected) diff --git a/mmdet3d/core/bbox/samplers/sampling_result.py b/mmdet3d/core/bbox/samplers/sampling_result.py new file mode 100644 index 0000000000..696e650971 --- /dev/null +++ b/mmdet3d/core/bbox/samplers/sampling_result.py @@ -0,0 +1,24 @@ +import torch + + +class SamplingResult(object): + + def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, + gt_flags): + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.pos_bboxes = bboxes[pos_inds] + self.neg_bboxes = bboxes[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_bboxes.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds, :] + if assign_result.labels is not None: + self.pos_gt_labels = assign_result.labels[pos_inds] + else: + self.pos_gt_labels = None + + @property + def bboxes(self): + 
return torch.cat([self.pos_bboxes, self.neg_bboxes]) diff --git a/mmdet3d/core/bbox/transforms.py b/mmdet3d/core/bbox/transforms.py new file mode 100644 index 0000000000..3a213ca016 --- /dev/null +++ b/mmdet3d/core/bbox/transforms.py @@ -0,0 +1,269 @@ +import mmcv +import numpy as np +import torch + + +def bbox2delta(proposals, gt, means=[0, 0, 0, 0], stds=[1, 1, 1, 1]): + assert proposals.size() == gt.size() + + proposals = proposals.float() + gt = gt.float() + px = (proposals[..., 0] + proposals[..., 2]) * 0.5 + py = (proposals[..., 1] + proposals[..., 3]) * 0.5 + pw = proposals[..., 2] - proposals[..., 0] + ph = proposals[..., 3] - proposals[..., 1] + + gx = (gt[..., 0] + gt[..., 2]) * 0.5 + gy = (gt[..., 1] + gt[..., 3]) * 0.5 + gw = gt[..., 2] - gt[..., 0] + gh = gt[..., 3] - gt[..., 1] + + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw / pw) + dh = torch.log(gh / ph) + deltas = torch.stack([dx, dy, dw, dh], dim=-1) + + means = deltas.new_tensor(means).unsqueeze(0) + stds = deltas.new_tensor(stds).unsqueeze(0) + deltas = deltas.sub_(means).div_(stds) + + return deltas + + +def delta2bbox(rois, + deltas, + means=[0, 0, 0, 0], + stds=[1, 1, 1, 1], + max_shape=None, + wh_ratio_clip=16 / 1000): + """ + Apply deltas to shift/scale base boxes. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + + Args: + rois (Tensor): boxes to be transformed. Has shape (N, 4) + deltas (Tensor): encoded offsets with respect to each roi. + Has shape (N, 4). Note N = num_anchors * W * H when rois is a grid + of anchors. Offset encoding follows [1]_. + means (list): denormalizing means for delta coordinates + stds (list): denormalizing standard deviation for delta coordinates + max_shape (tuple[int, int]): maximum bounds for boxes. specifies (H, W) + wh_ratio_clip (float): maximum aspect ratio for boxes. + + Returns: + Tensor: boxes with shape (N, 4), where columns represent + tl_x, tl_y, br_x, br_y. + + References: + .. 
[1] https://arxiv.org/abs/1311.2524 + + Example: + >>> rois = torch.Tensor([[ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 5., 5., 5., 5.]]) + >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], + >>> [ 1., 1., 1., 1.], + >>> [ 0., 0., 2., -1.], + >>> [ 0.7, -1.9, -0.5, 0.3]]) + >>> delta2bbox(rois, deltas, max_shape=(32, 32)) + tensor([[0.0000, 0.0000, 1.0000, 1.0000], + [0.2817, 0.2817, 4.7183, 4.7183], + [0.0000, 0.6321, 7.3891, 0.3679], + [5.8967, 2.9251, 5.5033, 3.2749]]) + """ + means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4) + stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4) + denorm_deltas = deltas * stds + means + dx = denorm_deltas[:, 0::4] + dy = denorm_deltas[:, 1::4] + dw = denorm_deltas[:, 2::4] + dh = denorm_deltas[:, 3::4] + max_ratio = np.abs(np.log(wh_ratio_clip)) + dw = dw.clamp(min=-max_ratio, max=max_ratio) + dh = dh.clamp(min=-max_ratio, max=max_ratio) + # Compute center of each roi + px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx) + py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy) + # Compute width/height of each roi + pw = (rois[:, 2] - rois[:, 0]).unsqueeze(1).expand_as(dw) + ph = (rois[:, 3] - rois[:, 1]).unsqueeze(1).expand_as(dh) + # Use exp(network energy) to enlarge/shrink each roi + gw = pw * dw.exp() + gh = ph * dh.exp() + # Use network energy to shift the center of each roi + gx = torch.addcmul(px, 1, pw, dx) # gx = px + pw * dx + gy = torch.addcmul(py, 1, ph, dy) # gy = py + ph * dy + # Convert center-xy/width/height to top-left, bottom-right + x1 = gx - gw * 0.5 + y1 = gy - gh * 0.5 + x2 = gx + gw * 0.5 + y2 = gy + gh * 0.5 + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas) + return bboxes + + +def bbox_flip(bboxes, img_shape): + """Flip bboxes horizontally. + + Args: + bboxes(Tensor or ndarray): Shape (..., 4*k) + img_shape(tuple): Image shape. + + Returns: + Same type as `bboxes`: Flipped bboxes. + """ + if isinstance(bboxes, torch.Tensor): + assert bboxes.shape[-1] % 4 == 0 + flipped = bboxes.clone() + flipped[:, 0::4] = img_shape[1] - bboxes[:, 2::4] + flipped[:, 2::4] = img_shape[1] - bboxes[:, 0::4] + return flipped + elif isinstance(bboxes, np.ndarray): + return mmcv.bbox_flip(bboxes, img_shape) + + +def bbox_mapping(bboxes, img_shape, scale_factor, flip): + """Map bboxes from the original image scale to testing scale""" + new_bboxes = bboxes * scale_factor + if flip: + new_bboxes = bbox_flip(new_bboxes, img_shape) + return new_bboxes + + +def bbox_mapping_back(bboxes, img_shape, scale_factor, flip): + """Map bboxes from testing scale to original image scale""" + new_bboxes = bbox_flip(bboxes, img_shape) if flip else bboxes + new_bboxes = new_bboxes / scale_factor + return new_bboxes + + +def bbox2roi(bbox_list): + """Convert a list of bboxes to roi format. + + Args: + bbox_list (list[Tensor]): a list of bboxes corresponding to a batch + of images. 
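+
+    Note (editor): each image's batch index is prepended to its boxes, so
+    downstream RoI ops can tell which image a given RoI belongs to.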
+ + Returns: + Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2] + """ + rois_list = [] + for img_id, bboxes in enumerate(bbox_list): + if bboxes.size(0) > 0: + img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) + rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1) + else: + rois = bboxes.new_zeros((0, 5)) + rois_list.append(rois) + rois = torch.cat(rois_list, 0) + return rois + + +def roi2bbox(rois): + bbox_list = [] + img_ids = torch.unique(rois[:, 0].cpu(), sorted=True) + for img_id in img_ids: + inds = (rois[:, 0] == img_id.item()) + bbox = rois[inds, 1:] + bbox_list.append(bbox) + return bbox_list + + +def bbox2result_coco(bboxes, labels, num_classes): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (Tensor): shape (n, 5) + labels (Tensor): shape (n, ) + num_classes (int): class number, including background class + + Returns: + list(ndarray): bbox results of each class + """ + if bboxes.shape[0] == 0: + return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)] + else: + bboxes = bboxes.cpu().numpy() + labels = labels.cpu().numpy() + return [bboxes[labels == i, :] for i in range(num_classes)] + + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. + """ + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return torch.stack([x1, y1, x2, y2], -1) + + +def transform_lidar_to_cam(boxes_lidar): + """ + Only transform format, not exactly in camera coords + :param boxes_lidar: (N, 3 or 7) [x, y, z, w, l, h, ry] in LiDAR coords + :return: boxes_cam: (N, 3 or 7) [x, y, z, h, w, l, ry] in camera coords + """ + # boxes_cam = boxes_lidar.new_tensor(boxes_lidar.data) + boxes_cam = boxes_lidar.clone().detach() + boxes_cam[:, 0] = -boxes_lidar[:, 1] + boxes_cam[:, 1] = -boxes_lidar[:, 2] + boxes_cam[:, 2] = boxes_lidar[:, 0] + if boxes_cam.shape[1] > 3: + boxes_cam[:, [3, 4, 5]] = boxes_lidar[:, [5, 3, 4]] + return boxes_cam + + +def boxes3d_to_bev_torch(boxes3d): + """ + :param boxes3d: (N, 7) [x, y, z, h, w, l, ry] in camera coords + :return: + boxes_bev: (N, 5) [x1, y1, x2, y2, ry] + """ + boxes_bev = boxes3d.new(torch.Size((boxes3d.shape[0], 5))) + + cu, cv = boxes3d[:, 0], boxes3d[:, 2] + half_l, half_w = boxes3d[:, 5] / 2, boxes3d[:, 4] / 2 + boxes_bev[:, 0], boxes_bev[:, 1] = cu - half_l, cv - half_w + boxes_bev[:, 2], boxes_bev[:, 3] = cu + half_l, cv + half_w + boxes_bev[:, 4] = boxes3d[:, 6] + return boxes_bev + + +def boxes3d_to_bev_torch_lidar(boxes3d): + """ + :param boxes3d: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coords + :return: + boxes_bev: (N, 5) [x1, y1, x2, y2, ry] + """ + boxes_bev = boxes3d.new(torch.Size((boxes3d.shape[0], 5))) + + cu, cv = boxes3d[:, 0], boxes3d[:, 1] + half_l, half_w = boxes3d[:, 4] / 2, boxes3d[:, 3] / 2 + boxes_bev[:, 0], boxes_bev[:, 1] = cu - half_w, cv - half_l + boxes_bev[:, 2], boxes_bev[:, 3] = cu + half_w, cv + half_l + boxes_bev[:, 4] = boxes3d[:, 6] + return boxes_bev diff --git a/mmdet3d/core/evaluation/__init__.py 
b/mmdet3d/core/evaluation/__init__.py new file mode 100644 index 0000000000..6d424903ea --- /dev/null +++ b/mmdet3d/core/evaluation/__init__.py @@ -0,0 +1,14 @@ +from .class_names import (coco_classes, dataset_aliases, get_classes, + imagenet_det_classes, imagenet_vid_classes, + kitti_classes, voc_classes) +from .eval_hooks import (CocoDistEvalmAPHook, CocoDistEvalRecallHook, + DistEvalHook, DistEvalmAPHook, KittiDistEvalmAPHook) +from .kitti_utils import kitti_eval, kitti_eval_coco_style + +__all__ = [ + 'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes', + 'coco_classes', 'dataset_aliases', 'get_classes', 'kitti_classes', + 'kitti_eval_coco_style', 'kitti_eval', 'CocoDistEvalmAPHook', + 'KittiDistEvalmAPHook', 'CocoDistEvalRecallHook', 'DistEvalHook', + 'DistEvalmAPHook' +] diff --git a/mmdet3d/core/evaluation/bbox_overlaps.py b/mmdet3d/core/evaluation/bbox_overlaps.py new file mode 100644 index 0000000000..5507e88c00 --- /dev/null +++ b/mmdet3d/core/evaluation/bbox_overlaps.py @@ -0,0 +1,47 @@ +import numpy as np + + +def bbox_overlaps(bboxes1, bboxes2, mode='iou'): + """Calculate the ious between each bbox of bboxes1 and bboxes2. + + Args: + bboxes1(ndarray): shape (n, 4) + bboxes2(ndarray): shape (k, 4) + mode(str): iou (intersection over union) or iof (intersection + over foreground) + + Returns: + ious(ndarray): shape (n, k) + """ + + assert mode in ['iou', 'iof'] + + bboxes1 = bboxes1.astype(np.float32) + bboxes2 = bboxes2.astype(np.float32) + rows = bboxes1.shape[0] + cols = bboxes2.shape[0] + ious = np.zeros((rows, cols), dtype=np.float32) + if rows * cols == 0: + return ious + exchange = False + if bboxes1.shape[0] > bboxes2.shape[0]: + bboxes1, bboxes2 = bboxes2, bboxes1 + ious = np.zeros((cols, rows), dtype=np.float32) + exchange = True + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1]) + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1]) + for i in range(bboxes1.shape[0]): + x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) + y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) + x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) + y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) + overlap = np.maximum(x_end - x_start, 0) * np.maximum( + y_end - y_start, 0) + if mode == 'iou': + union = area1[i] + area2 - overlap + else: + union = area1[i] if not exchange else area2 + ious[i, :] = overlap / union + if exchange: + ious = ious.T + return ious diff --git a/mmdet3d/core/evaluation/class_names.py b/mmdet3d/core/evaluation/class_names.py new file mode 100644 index 0000000000..216c2f5691 --- /dev/null +++ b/mmdet3d/core/evaluation/class_names.py @@ -0,0 +1,127 @@ +import mmcv + + +def wider_face_classes(): + return ['face'] + + +def voc_classes(): + return [ + 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', + 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor' + ] + + +def imagenet_det_classes(): + return [ + 'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo', + 'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam', + 'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap', + 'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder', + 'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito', + 'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle', + 'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker', + 'coffee_maker', 
'computer_keyboard', 'computer_mouse', 'corkscrew', + 'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper', + 'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly', + 'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig', + 'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog', + 'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart', + 'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger', + 'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim', + 'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse', + 'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle', + 'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard', + 'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can', + 'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace', + 'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume', + 'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza', + 'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine', + 'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse', + 'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator', + 'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler', + 'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver', + 'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile', + 'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula', + 'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer', + 'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine', + 'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie', + 'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet', + 'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin', + 'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft', + 'whale', 'wine_bottle', 'zebra' + ] + + +def imagenet_vid_classes(): + return [ + 'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car', + 'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda', + 'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit', + 'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle', + 'watercraft', 'whale', 'zebra' + ] + + +def coco_classes(): + return [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign', + 'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard', + 'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush' + ] + + +def cityscapes_classes(): + return [ + 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle' + ] + + +def kitti_classes(): + return [ + 'Car', + 'Pedestrian', + 'Cyclist', + 'Van', + 'Person_sitting', + ] + + +dataset_aliases = { + 'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'], + 
'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'], + 'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'], + 'coco': ['coco', 'mscoco', 'ms_coco'], + 'wider_face': ['WIDERFaceDataset', 'wider_face', 'WDIERFace'], + 'cityscapes': ['cityscapes'], + 'kitti': ['KITTI', 'kitti'] +} + + +def get_classes(dataset): + """Get class names of a dataset.""" + alias2name = {} + for name, aliases in dataset_aliases.items(): + for alias in aliases: + alias2name[alias] = name + + if mmcv.is_str(dataset): + if dataset in alias2name: + labels = eval(alias2name[dataset] + '_classes()') + else: + raise ValueError('Unrecognized dataset: {}'.format(dataset)) + else: + raise TypeError('dataset must a str, but got {}'.format(type(dataset))) + return labels diff --git a/mmdet3d/core/evaluation/coco_utils.py b/mmdet3d/core/evaluation/coco_utils.py new file mode 100644 index 0000000000..7fbb6d2ac1 --- /dev/null +++ b/mmdet3d/core/evaluation/coco_utils.py @@ -0,0 +1,251 @@ +import itertools + +import mmcv +import numpy as np +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval +from terminaltables import AsciiTable + +from .recall import eval_recalls + + +def coco_eval(result_files, + result_types, + coco, + max_dets=(100, 300, 1000), + cat_ids=[], + classwise=False): + for res_type in result_types: + assert res_type in [ + 'proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints' + ] + + if mmcv.is_str(coco): + coco = COCO(coco) + assert isinstance(coco, COCO) + + if result_types == ['proposal_fast']: + ar = fast_eval_recall(result_files, coco, np.array(max_dets)) + for i, num in enumerate(max_dets): + print('AR@{}\t= {:.4f}'.format(num, ar[i])) + return + + for res_type in result_types: + if isinstance(result_files, str): + result_file = result_files + elif isinstance(result_files, dict): + result_file = result_files[res_type] + else: + assert TypeError('result_files must be a str or dict') + assert result_file.endswith('.json') + + coco_dets = coco.loadRes(result_file) + # it will load all images if cat_ids is [] + # img_ids = getImgIds(coco, catIds=cat_ids) + if len(cat_ids) < 80: + img_ids = getImgIds(coco, catIds=cat_ids) + else: + img_ids = coco.getImgIds() + iou_type = 'bbox' if res_type == 'proposal' else res_type + cocoEval = COCOeval(coco, coco_dets, iou_type) + if cat_ids: + # cat_ids is not None means it is set + cocoEval.params.catIds = cat_ids + cocoEval.params.imgIds = img_ids + if res_type == 'proposal': + cocoEval.params.useCats = 0 + cocoEval.params.maxDets = list(max_dets) + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + + if classwise: + # Compute per-category AP + # from https://github.com/facebookresearch/detectron2/blob/03064eb5bafe4a3e5750cc7a16672daf5afe8435/detectron2/evaluation/coco_evaluation.py#L259-L283 # noqa + precisions = cocoEval.eval['precision'] + catIds = cat_ids if cat_ids else coco.getCatIds() + # precision has dims (iou, recall, cls, area range, max dets) + assert len(catIds) == precisions.shape[2] + + results_per_category = [] + for idx, catId in enumerate(catIds): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + nm = coco.loadCats(catId)[0] + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + ap = np.mean(precision) if precision.size else float('nan') + results_per_category.append( + ('{}'.format(nm['name']), + '{:0.3f}'.format(float(ap * 100)))) + + N_COLS = min(6, len(results_per_category) * 2) + results_flatten = 
list(itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (N_COLS // 2) + results_2d = itertools.zip_longest( + *[results_flatten[i::N_COLS] for i in range(N_COLS)]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + print(table.table) + + +def fast_eval_recall(results, + coco, + max_dets, + iou_thrs=np.arange(0.5, 0.96, 0.05)): + if mmcv.is_str(results): + assert results.endswith('.pkl') + results = mmcv.load(results) + elif not isinstance(results, list): + raise TypeError( + 'results must be a list of numpy arrays or a filename, not {}'. + format(type(results))) + + gt_bboxes = [] + img_ids = coco.getImgIds() + for i in range(len(img_ids)): + ann_ids = coco.getAnnIds(imgIds=img_ids[i]) + ann_info = coco.loadAnns(ann_ids) + if len(ann_info) == 0: + gt_bboxes.append(np.zeros((0, 4))) + continue + bboxes = [] + for ann in ann_info: + if ann.get('ignore', False) or ann['iscrowd']: + continue + x1, y1, w, h = ann['bbox'] + bboxes.append([x1, y1, x1 + w, y1 + h]) + bboxes = np.array(bboxes, dtype=np.float32) + if bboxes.shape[0] == 0: + bboxes = np.zeros((0, 4)) + gt_bboxes.append(bboxes) + + recalls = eval_recalls( + gt_bboxes, results, max_dets, iou_thrs, print_summary=False) + ar = recalls.mean(axis=1) + return ar + + +def xyxy2xywh(bbox): + _bbox = bbox.tolist() + return [ + _bbox[0], + _bbox[1], + _bbox[2] - _bbox[0], + _bbox[3] - _bbox[1], + ] + + +def proposal2json(dataset, results): + json_results = [] + for idx in range(len(dataset)): + img_id = dataset.img_ids[idx] + bboxes = results[idx] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = 1 + json_results.append(data) + return json_results + + +def det2json(dataset, results): + json_results = [] + for idx in range(len(dataset)): + img_id = dataset.img_ids[idx] + result = results[idx] + for label in range(len(result)): + bboxes = result[label] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = dataset.cat_ids[label] + json_results.append(data) + return json_results + + +def segm2json(dataset, results): + bbox_json_results = [] + segm_json_results = [] + for idx in range(len(dataset)): + img_id = dataset.img_ids[idx] + det, seg = results[idx] + for label in range(len(det)): + # bbox results + bboxes = det[label] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = dataset.cat_ids[label] + bbox_json_results.append(data) + + # segm results + # some detectors use different score for det and segm + if isinstance(seg, tuple): + segms = seg[0][label] + mask_score = seg[1][label] + else: + segms = seg[label] + mask_score = [bbox[4] for bbox in bboxes] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = xyxy2xywh(bboxes[i]) + data['score'] = float(mask_score[i]) + data['category_id'] = dataset.cat_ids[label] + if isinstance(segms[i]['counts'], bytes): + segms[i]['counts'] = segms[i]['counts'].decode() + data['segmentation'] = segms[i] + segm_json_results.append(data) + return bbox_json_results, segm_json_results + + +def results2json(dataset, results, out_file): + result_files = dict() + if isinstance(results[0], list): + json_results = 
det2json(dataset, results) + result_files['bbox'] = '{}.{}.json'.format(out_file, 'bbox') + result_files['proposal'] = '{}.{}.json'.format(out_file, 'bbox') + mmcv.dump(json_results, result_files['bbox']) + elif isinstance(results[0], tuple): + json_results = segm2json(dataset, results) + result_files['bbox'] = '{}.{}.json'.format(out_file, 'bbox') + result_files['proposal'] = '{}.{}.json'.format(out_file, 'bbox') + result_files['segm'] = '{}.{}.json'.format(out_file, 'segm') + mmcv.dump(json_results[0], result_files['bbox']) + mmcv.dump(json_results[1], result_files['segm']) + elif isinstance(results[0], np.ndarray): + json_results = proposal2json(dataset, results) + result_files['proposal'] = '{}.{}.json'.format(out_file, 'proposal') + mmcv.dump(json_results, result_files['proposal']) + else: + raise TypeError('invalid type of results') + return result_files + + +def getImgIds(coco, imgIds=[], catIds=[]): + ''' + Get img ids that satisfy given filter conditions. + Different from the coco.getImgIds, this function returns the id if + the img contains one of the cat rather than all. + :param imgIds (int array) : get imgs for given ids + :param catIds (int array) : get imgs with all given cats + :return: ids (int array) : integer array of img ids + ''' + if len(imgIds) == len(catIds) == 0: + ids = coco.imgs.keys() + else: + ids = set(imgIds) + for i, catId in enumerate(catIds): + if i == 0 and len(ids) == 0: + ids = set(coco.catToImgs[catId]) + else: + ids |= set(coco.catToImgs[catId]) + return list(ids) diff --git a/mmdet3d/core/evaluation/eval_hooks.py b/mmdet3d/core/evaluation/eval_hooks.py new file mode 100644 index 0000000000..ccb3cbd73a --- /dev/null +++ b/mmdet3d/core/evaluation/eval_hooks.py @@ -0,0 +1,204 @@ +import os +import os.path as osp + +import mmcv +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import collate, scatter +from mmcv.runner import Hook +from pycocotools.cocoeval import COCOeval +from torch.utils.data import Dataset + +from mmdet3d import datasets +from .coco_utils import fast_eval_recall, results2json +from .mean_ap import eval_map + + +class DistEvalHook(Hook): + + def __init__(self, dataset, interval=1): + if isinstance(dataset, Dataset): + self.dataset = dataset + elif isinstance(dataset, dict): + self.dataset = datasets.build_dataset(dataset, {'test_mode': True}) + else: + raise TypeError( + 'dataset must be a Dataset object or a dict, not {}'.format( + type(dataset))) + self.interval = interval + + def after_train_epoch(self, runner): + if not self.every_n_epochs(runner, self.interval): + return + runner.model.eval() + results = [None for _ in range(len(self.dataset))] + if runner.rank == 0: + prog_bar = mmcv.ProgressBar(len(self.dataset)) + for idx in range(runner.rank, len(self.dataset), runner.world_size): + data = self.dataset[idx] + data_gpu = scatter( + collate([data], samples_per_gpu=1), + [torch.cuda.current_device()])[0] + + # compute output + with torch.no_grad(): + result = runner.model( + return_loss=False, rescale=True, **data_gpu) + results[idx] = result + + batch_size = runner.world_size + if runner.rank == 0: + for _ in range(batch_size): + prog_bar.update() + + if runner.rank == 0: + print('\n') + dist.barrier() + for i in range(1, runner.world_size): + tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i)) + tmp_results = mmcv.load(tmp_file) + for idx in range(i, len(results), runner.world_size): + results[idx] = tmp_results[idx] + os.remove(tmp_file) + self.evaluate(runner, results) + else: + 
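+            # Non-zero ranks dump their partial results to a temporary
+            # pickle in the work dir; rank 0 loads, merges and evaluates
+            # them in the branch above.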
tmp_file = osp.join(runner.work_dir, + 'temp_{}.pkl'.format(runner.rank)) + mmcv.dump(results, tmp_file) + dist.barrier() + dist.barrier() + + def evaluate(self): + raise NotImplementedError + + +class DistEvalmAPHook(DistEvalHook): + + def evaluate(self, runner, results): + gt_bboxes = [] + gt_labels = [] + gt_ignore = [] + for i in range(len(self.dataset)): + ann = self.dataset.get_ann_info(i) + bboxes = ann['bboxes'] + labels = ann['labels'] + if 'bboxes_ignore' in ann: + ignore = np.concatenate([ + np.zeros(bboxes.shape[0], dtype=np.bool), + np.ones(ann['bboxes_ignore'].shape[0], dtype=np.bool) + ]) + gt_ignore.append(ignore) + bboxes = np.vstack([bboxes, ann['bboxes_ignore']]) + labels = np.concatenate([labels, ann['labels_ignore']]) + gt_bboxes.append(bboxes) + gt_labels.append(labels) + if not gt_ignore: + gt_ignore = None + # If the dataset is VOC2007, then use 11 points mAP evaluation. + if hasattr(self.dataset, 'year') and self.dataset.year == 2007: + ds_name = 'voc07' + else: + ds_name = self.dataset.CLASSES + mean_ap, eval_results = eval_map( + results, + gt_bboxes, + gt_labels, + gt_ignore=gt_ignore, + scale_ranges=None, + iou_thr=0.5, + dataset=ds_name, + print_summary=True) + runner.log_buffer.output['mAP'] = mean_ap + runner.log_buffer.ready = True + + +class KittiDistEvalmAPHook(DistEvalHook): + + def evaluate(self, runner, results): + tmp_file = osp.join(runner.work_dir, 'temp_0') + if not isinstance(results[0], dict): + result_files = self.dataset.reformat_bbox(results, tmp_file) + paste_result, ret_dict = self.dataset.evaluate(result_files) + for ap_cls, ap_result in ret_dict.items(): + for ap_type, ap in ap_result.items(): + key = f'{ap_cls}_{ap_type}' + val = float('{:.4f}'.format(ap)) + runner.log_buffer.output[key] = val + else: + for name in results[0]: + print('\nEvaluating {}'.format(name)) + results_ = [out[name] for out in results] + tmp_file_ = osp.join(tmp_file, name) + result_files = self.dataset.reformat_bbox(results_, tmp_file_) + paste_result, ret_dict = self.dataset.evaluate( + result_files, name) + for ap_cls, ap_result in ret_dict.items(): + for ap_type, ap in ap_result.items(): + key = f'{name}/{ap_cls}_{ap_type}' + val = float('{:.4f}'.format(ap)) + runner.log_buffer.output[key] = val + runner.log_buffer.ready = True + + +class CocoDistEvalRecallHook(DistEvalHook): + + def __init__(self, + dataset, + interval=1, + proposal_nums=(100, 300, 1000), + iou_thrs=np.arange(0.5, 0.96, 0.05)): + super(CocoDistEvalRecallHook, self).__init__( + dataset, interval=interval) + self.proposal_nums = np.array(proposal_nums, dtype=np.int32) + self.iou_thrs = np.array(iou_thrs, dtype=np.float32) + + def evaluate(self, runner, results): + # the official coco evaluation is too slow, here we use our own + # implementation instead, which may get slightly different results + ar = fast_eval_recall(results, self.dataset.coco, self.proposal_nums, + self.iou_thrs) + for i, num in enumerate(self.proposal_nums): + runner.log_buffer.output['AR@{}'.format(num)] = ar[i] + runner.log_buffer.ready = True + + +class CocoDistEvalmAPHook(DistEvalHook): + + def evaluate(self, runner, results): + tmp_file = osp.join(runner.work_dir, 'temp_0') + result_files = results2json(self.dataset, results, tmp_file) + + res_types = ['bbox', 'segm' + ] if runner.model.module.with_mask else ['bbox'] + cocoGt = self.dataset.coco + # load image based on cat_ids + if len(self.dataset.cat_ids) < len(self.dataset.CLASSES): + from .coco_utils import getImgIds + imgIds = getImgIds(cocoGt, 
catIds=self.dataset.cat_ids) + else: + imgIds = cocoGt.getImgIds() + for res_type in res_types: + try: + cocoDt = cocoGt.loadRes(result_files[res_type]) + except IndexError: + print('No prediction found.') + break + iou_type = res_type + cocoEval = COCOeval(cocoGt, cocoDt, iou_type) + cocoEval.params.catIds = self.dataset.cat_ids + cocoEval.params.imgIds = imgIds + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + metrics = ['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'] + for i in range(len(metrics)): + key = '{}_{}'.format(res_type, metrics[i]) + val = float('{:.3f}'.format(cocoEval.stats[i])) + runner.log_buffer.output[key] = val + runner.log_buffer.output['{}_mAP_copypaste'.format(res_type)] = ( + '{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' + '{ap[4]:.3f} {ap[5]:.3f}').format(ap=cocoEval.stats[:6]) + runner.log_buffer.ready = True + for res_type in res_types: + os.remove(result_files[res_type]) diff --git a/mmdet3d/core/evaluation/kitti_utils/__init__.py b/mmdet3d/core/evaluation/kitti_utils/__init__.py new file mode 100644 index 0000000000..b1fc7bc3dc --- /dev/null +++ b/mmdet3d/core/evaluation/kitti_utils/__init__.py @@ -0,0 +1,3 @@ +from .eval import kitti_eval, kitti_eval_coco_style + +__all__ = ['kitti_eval', 'kitti_eval_coco_style'] diff --git a/mmdet3d/core/evaluation/kitti_utils/eval.py b/mmdet3d/core/evaluation/kitti_utils/eval.py new file mode 100644 index 0000000000..d9591892dd --- /dev/null +++ b/mmdet3d/core/evaluation/kitti_utils/eval.py @@ -0,0 +1,814 @@ +import gc +import io as sysio + +import numba +import numpy as np + + +@numba.jit +def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41): + scores.sort() + scores = scores[::-1] + current_recall = 0 + thresholds = [] + for i, score in enumerate(scores): + l_recall = (i + 1) / num_gt + if i < (len(scores) - 1): + r_recall = (i + 2) / num_gt + else: + r_recall = l_recall + if (((r_recall - current_recall) < (current_recall - l_recall)) + and (i < (len(scores) - 1))): + continue + # recall = l_recall + thresholds.append(score) + current_recall += 1 / (num_sample_pts - 1.0) + return thresholds + + +def clean_data(gt_anno, dt_anno, current_class, difficulty): + CLASS_NAMES = ['car', 'pedestrian', 'cyclist'] + MIN_HEIGHT = [40, 25, 25] + MAX_OCCLUSION = [0, 1, 2] + MAX_TRUNCATION = [0.15, 0.3, 0.5] + dc_bboxes, ignored_gt, ignored_dt = [], [], [] + current_cls_name = CLASS_NAMES[current_class].lower() + num_gt = len(gt_anno['name']) + num_dt = len(dt_anno['name']) + num_valid_gt = 0 + for i in range(num_gt): + bbox = gt_anno['bbox'][i] + gt_name = gt_anno['name'][i].lower() + height = bbox[3] - bbox[1] + valid_class = -1 + if (gt_name == current_cls_name): + valid_class = 1 + elif (current_cls_name == 'Pedestrian'.lower() + and 'Person_sitting'.lower() == gt_name): + valid_class = 0 + elif (current_cls_name == 'Car'.lower() and 'Van'.lower() == gt_name): + valid_class = 0 + else: + valid_class = -1 + ignore = False + if ((gt_anno['occluded'][i] > MAX_OCCLUSION[difficulty]) + or (gt_anno['truncated'][i] > MAX_TRUNCATION[difficulty]) + or (height <= MIN_HEIGHT[difficulty])): + ignore = True + if valid_class == 1 and not ignore: + ignored_gt.append(0) + num_valid_gt += 1 + elif (valid_class == 0 or (ignore and (valid_class == 1))): + ignored_gt.append(1) + else: + ignored_gt.append(-1) + # for i in range(num_gt): + if gt_anno['name'][i] == 'DontCare': + dc_bboxes.append(gt_anno['bbox'][i]) + for i in range(num_dt): + if (dt_anno['name'][i].lower() == current_cls_name): + valid_class = 
1 + else: + valid_class = -1 + height = abs(dt_anno['bbox'][i, 3] - dt_anno['bbox'][i, 1]) + if height < MIN_HEIGHT[difficulty]: + ignored_dt.append(1) + elif valid_class == 1: + ignored_dt.append(0) + else: + ignored_dt.append(-1) + + return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes + + +@numba.jit(nopython=True) +def image_box_overlap(boxes, query_boxes, criterion=-1): + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + qbox_area = ((query_boxes[k, 2] - query_boxes[k, 0]) * + (query_boxes[k, 3] - query_boxes[k, 1])) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0])) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1])) + if ih > 0: + if criterion == -1: + ua = ((boxes[n, 2] - boxes[n, 0]) * + (boxes[n, 3] - boxes[n, 1]) + qbox_area - + iw * ih) + elif criterion == 0: + ua = ((boxes[n, 2] - boxes[n, 0]) * + (boxes[n, 3] - boxes[n, 1])) + elif criterion == 1: + ua = qbox_area + else: + ua = 1.0 + overlaps[n, k] = iw * ih / ua + return overlaps + + +def bev_box_overlap(boxes, qboxes, criterion=-1): + from .rotate_iou import rotate_iou_gpu_eval + riou = rotate_iou_gpu_eval(boxes, qboxes, criterion) + return riou + + +@numba.jit(nopython=True, parallel=True) +def d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1): + # ONLY support overlap in CAMERA, not lidar. + # TODO: change to use prange for parallel mode, should check the difference + N, K = boxes.shape[0], qboxes.shape[0] + for i in numba.prange(N): + for j in numba.prange(K): + if rinc[i, j] > 0: + # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] + + # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1])) + iw = ( + min(boxes[i, 1], qboxes[j, 1]) - + max(boxes[i, 1] - boxes[i, 4], + qboxes[j, 1] - qboxes[j, 4])) + + if iw > 0: + area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5] + area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5] + inc = iw * rinc[i, j] + if criterion == -1: + ua = (area1 + area2 - inc) + elif criterion == 0: + ua = area1 + elif criterion == 1: + ua = area2 + else: + ua = inc + rinc[i, j] = inc / ua + else: + rinc[i, j] = 0.0 + + +def d3_box_overlap(boxes, qboxes, criterion=-1): + from .rotate_iou import rotate_iou_gpu_eval + rinc = rotate_iou_gpu_eval(boxes[:, [0, 2, 3, 5, 6]], + qboxes[:, [0, 2, 3, 5, 6]], 2) + d3_box_overlap_kernel(boxes, qboxes, rinc, criterion) + return rinc + + +@numba.jit(nopython=True) +def compute_statistics_jit(overlaps, + gt_datas, + dt_datas, + ignored_gt, + ignored_det, + dc_bboxes, + metric, + min_overlap, + thresh=0, + compute_fp=False, + compute_aos=False): + + det_size = dt_datas.shape[0] + gt_size = gt_datas.shape[0] + dt_scores = dt_datas[:, -1] + dt_alphas = dt_datas[:, 4] + gt_alphas = gt_datas[:, 4] + dt_bboxes = dt_datas[:, :4] + # gt_bboxes = gt_datas[:, :4] + + assigned_detection = [False] * det_size + ignored_threshold = [False] * det_size + if compute_fp: + for i in range(det_size): + if (dt_scores[i] < thresh): + ignored_threshold[i] = True + NO_DETECTION = -10000000 + tp, fp, fn, similarity = 0, 0, 0, 0 + # thresholds = [0.0] + # delta = [0.0] + thresholds = np.zeros((gt_size, )) + thresh_idx = 0 + delta = np.zeros((gt_size, )) + delta_idx = 0 + for i in range(gt_size): + if ignored_gt[i] == -1: + continue + det_idx = -1 + valid_detection = NO_DETECTION + max_overlap = 0 + assigned_ignored_det = False + + for j in range(det_size): + if (ignored_det[j] == -1): + continue + if 
(assigned_detection[j]): + continue + if (ignored_threshold[j]): + continue + overlap = overlaps[j, i] + dt_score = dt_scores[j] + if (not compute_fp and (overlap > min_overlap) + and dt_score > valid_detection): + det_idx = j + valid_detection = dt_score + elif (compute_fp and (overlap > min_overlap) + and (overlap > max_overlap or assigned_ignored_det) + and ignored_det[j] == 0): + max_overlap = overlap + det_idx = j + valid_detection = 1 + assigned_ignored_det = False + elif (compute_fp and (overlap > min_overlap) + and (valid_detection == NO_DETECTION) + and ignored_det[j] == 1): + det_idx = j + valid_detection = 1 + assigned_ignored_det = True + + if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: + fn += 1 + elif ((valid_detection != NO_DETECTION) + and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)): + assigned_detection[det_idx] = True + elif valid_detection != NO_DETECTION: + tp += 1 + # thresholds.append(dt_scores[det_idx]) + thresholds[thresh_idx] = dt_scores[det_idx] + thresh_idx += 1 + if compute_aos: + # delta.append(gt_alphas[i] - dt_alphas[det_idx]) + delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx] + delta_idx += 1 + + assigned_detection[det_idx] = True + if compute_fp: + for i in range(det_size): + if (not (assigned_detection[i] or ignored_det[i] == -1 + or ignored_det[i] == 1 or ignored_threshold[i])): + fp += 1 + nstuff = 0 + if metric == 0: + overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) + for i in range(dc_bboxes.shape[0]): + for j in range(det_size): + if (assigned_detection[j]): + continue + if (ignored_det[j] == -1 or ignored_det[j] == 1): + continue + if (ignored_threshold[j]): + continue + if overlaps_dt_dc[j, i] > min_overlap: + assigned_detection[j] = True + nstuff += 1 + fp -= nstuff + if compute_aos: + tmp = np.zeros((fp + delta_idx, )) + # tmp = [0] * fp + for i in range(delta_idx): + tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0 + # tmp.append((1.0 + np.cos(delta[i])) / 2.0) + # assert len(tmp) == fp + tp + # assert len(delta) == tp + if tp > 0 or fp > 0: + similarity = np.sum(tmp) + else: + similarity = -1 + return tp, fp, fn, similarity, thresholds[:thresh_idx] + + +def get_split_parts(num, num_part): + same_part = num // num_part + remain_num = num % num_part + if remain_num == 0: + return [same_part] * num_part + else: + return [same_part] * num_part + [remain_num] + + +@numba.jit(nopython=True) +def fused_compute_statistics(overlaps, + pr, + gt_nums, + dt_nums, + dc_nums, + gt_datas, + dt_datas, + dontcares, + ignored_gts, + ignored_dets, + metric, + min_overlap, + thresholds, + compute_aos=False): + gt_num = 0 + dt_num = 0 + dc_num = 0 + for i in range(gt_nums.shape[0]): + for t, thresh in enumerate(thresholds): + overlap = overlaps[dt_num:dt_num + dt_nums[i], + gt_num:gt_num + gt_nums[i]] + + gt_data = gt_datas[gt_num:gt_num + gt_nums[i]] + dt_data = dt_datas[dt_num:dt_num + dt_nums[i]] + ignored_gt = ignored_gts[gt_num:gt_num + gt_nums[i]] + ignored_det = ignored_dets[dt_num:dt_num + dt_nums[i]] + dontcare = dontcares[dc_num:dc_num + dc_nums[i]] + tp, fp, fn, similarity, _ = compute_statistics_jit( + overlap, + gt_data, + dt_data, + ignored_gt, + ignored_det, + dontcare, + metric, + min_overlap=min_overlap, + thresh=thresh, + compute_fp=True, + compute_aos=compute_aos) + pr[t, 0] += tp + pr[t, 1] += fp + pr[t, 2] += fn + if similarity != -1: + pr[t, 3] += similarity + gt_num += gt_nums[i] + dt_num += dt_nums[i] + dc_num += dc_nums[i] + + +def calculate_iou_partly(gt_annos, dt_annos, metric, num_parts=50): + """fast iou 
algorithm. this function can be used independently to + do result analysis. Must be used in CAMERA coordinate system. + Args: + gt_annos: dict, must from get_label_annos() in kitti_common.py + dt_annos: dict, must from get_label_annos() in kitti_common.py + metric: eval type. 0: bbox, 1: bev, 2: 3d + num_parts: int. a parameter for fast calculate algorithm + """ + assert len(gt_annos) == len(dt_annos) + total_dt_num = np.stack([len(a['name']) for a in dt_annos], 0) + total_gt_num = np.stack([len(a['name']) for a in gt_annos], 0) + num_examples = len(gt_annos) + split_parts = get_split_parts(num_examples, num_parts) + parted_overlaps = [] + example_idx = 0 + + for num_part in split_parts: + gt_annos_part = gt_annos[example_idx:example_idx + num_part] + dt_annos_part = dt_annos[example_idx:example_idx + num_part] + if metric == 0: + gt_boxes = np.concatenate([a['bbox'] for a in gt_annos_part], 0) + dt_boxes = np.concatenate([a['bbox'] for a in dt_annos_part], 0) + overlap_part = image_box_overlap(gt_boxes, dt_boxes) + elif metric == 1: + loc = np.concatenate( + [a['location'][:, [0, 2]] for a in gt_annos_part], 0) + dims = np.concatenate( + [a['dimensions'][:, [0, 2]] for a in gt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + loc = np.concatenate( + [a['location'][:, [0, 2]] for a in dt_annos_part], 0) + dims = np.concatenate( + [a['dimensions'][:, [0, 2]] for a in dt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + overlap_part = bev_box_overlap(gt_boxes, + dt_boxes).astype(np.float64) + elif metric == 2: + loc = np.concatenate([a['location'] for a in gt_annos_part], 0) + dims = np.concatenate([a['dimensions'] for a in gt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + loc = np.concatenate([a['location'] for a in dt_annos_part], 0) + dims = np.concatenate([a['dimensions'] for a in dt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + overlap_part = d3_box_overlap(gt_boxes, + dt_boxes).astype(np.float64) + else: + raise ValueError('unknown metric') + parted_overlaps.append(overlap_part) + example_idx += num_part + overlaps = [] + example_idx = 0 + for j, num_part in enumerate(split_parts): + gt_annos_part = gt_annos[example_idx:example_idx + num_part] + dt_annos_part = dt_annos[example_idx:example_idx + num_part] + gt_num_idx, dt_num_idx = 0, 0 + for i in range(num_part): + gt_box_num = total_gt_num[example_idx + i] + dt_box_num = total_dt_num[example_idx + i] + overlaps.append( + parted_overlaps[j][gt_num_idx:gt_num_idx + gt_box_num, + dt_num_idx:dt_num_idx + dt_box_num]) + gt_num_idx += gt_box_num + dt_num_idx += dt_box_num + example_idx += num_part + + return overlaps, parted_overlaps, total_gt_num, total_dt_num + + +def _prepare_data(gt_annos, dt_annos, current_class, difficulty): + gt_datas_list = [] + dt_datas_list = [] + total_dc_num = [] + ignored_gts, ignored_dets, dontcares = [], [], [] + total_num_valid_gt = 0 + for i in range(len(gt_annos)): + rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty) + num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets + ignored_gts.append(np.array(ignored_gt, dtype=np.int64)) + 
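+        # ignore flags come from clean_data(): 0 = evaluate, 1 = ignore,
+        # -1 = not the current class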
ignored_dets.append(np.array(ignored_det, dtype=np.int64)) + if len(dc_bboxes) == 0: + dc_bboxes = np.zeros((0, 4)).astype(np.float64) + else: + dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64) + total_dc_num.append(dc_bboxes.shape[0]) + dontcares.append(dc_bboxes) + total_num_valid_gt += num_valid_gt + gt_datas = np.concatenate( + [gt_annos[i]['bbox'], gt_annos[i]['alpha'][..., np.newaxis]], 1) + dt_datas = np.concatenate([ + dt_annos[i]['bbox'], dt_annos[i]['alpha'][..., np.newaxis], + dt_annos[i]['score'][..., np.newaxis] + ], 1) + gt_datas_list.append(gt_datas) + dt_datas_list.append(dt_datas) + total_dc_num = np.stack(total_dc_num, axis=0) + return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, + total_dc_num, total_num_valid_gt) + + +def eval_class(gt_annos, + dt_annos, + current_classes, + difficultys, + metric, + min_overlaps, + compute_aos=False, + num_parts=200): + """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. + Args: + gt_annos: dict, must from get_label_annos() in kitti_common.py + dt_annos: dict, must from get_label_annos() in kitti_common.py + current_classes: list of int, 0: car, 1: pedestrian, 2: cyclist + difficultys: list of int. eval difficulty, 0: easy, 1: normal, 2: hard + metric: eval type. 0: bbox, 1: bev, 2: 3d + min_overlaps: float, min overlap. format: [num_overlap, metric, class]. + num_parts: int. a parameter for fast calculate algorithm + + Returns: + dict of recall, precision and aos + """ + assert len(gt_annos) == len(dt_annos) + num_examples = len(gt_annos) + split_parts = get_split_parts(num_examples, num_parts) + + rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) + overlaps, parted_overlaps, total_dt_num, total_gt_num = rets + N_SAMPLE_PTS = 41 + num_minoverlap = len(min_overlaps) + num_class = len(current_classes) + num_difficulty = len(difficultys) + precision = np.zeros( + [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + recall = np.zeros( + [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + for m, current_class in enumerate(current_classes): + for l, difficulty in enumerate(difficultys): + rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) + (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, + dontcares, total_dc_num, total_num_valid_gt) = rets + for k, min_overlap in enumerate(min_overlaps[:, metric, m]): + thresholdss = [] + for i in range(len(gt_annos)): + rets = compute_statistics_jit( + overlaps[i], + gt_datas_list[i], + dt_datas_list[i], + ignored_gts[i], + ignored_dets[i], + dontcares[i], + metric, + min_overlap=min_overlap, + thresh=0.0, + compute_fp=False) + tp, fp, fn, similarity, thresholds = rets + thresholdss += thresholds.tolist() + thresholdss = np.array(thresholdss) + thresholds = get_thresholds(thresholdss, total_num_valid_gt) + thresholds = np.array(thresholds) + pr = np.zeros([len(thresholds), 4]) + idx = 0 + for j, num_part in enumerate(split_parts): + gt_datas_part = np.concatenate( + gt_datas_list[idx:idx + num_part], 0) + dt_datas_part = np.concatenate( + dt_datas_list[idx:idx + num_part], 0) + dc_datas_part = np.concatenate( + dontcares[idx:idx + num_part], 0) + ignored_dets_part = np.concatenate( + ignored_dets[idx:idx + num_part], 0) + ignored_gts_part = np.concatenate( + ignored_gts[idx:idx + num_part], 0) + fused_compute_statistics( + parted_overlaps[j], + pr, + total_gt_num[idx:idx + num_part], + total_dt_num[idx:idx + num_part], + 
total_dc_num[idx:idx + num_part], + gt_datas_part, + dt_datas_part, + dc_datas_part, + ignored_gts_part, + ignored_dets_part, + metric, + min_overlap=min_overlap, + thresholds=thresholds, + compute_aos=compute_aos) + idx += num_part + for i in range(len(thresholds)): + recall[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) + precision[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1]) + if compute_aos: + aos[m, l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1]) + for i in range(len(thresholds)): + precision[m, l, k, i] = np.max( + precision[m, l, k, i:], axis=-1) + recall[m, l, k, i] = np.max(recall[m, l, k, i:], axis=-1) + if compute_aos: + aos[m, l, k, i] = np.max(aos[m, l, k, i:], axis=-1) + ret_dict = { + 'recall': recall, + 'precision': precision, + 'orientation': aos, + } + + # clean temp variables + del overlaps + del parted_overlaps + + gc.collect() + return ret_dict + + +def get_mAP(prec): + sums = 0 + for i in range(0, prec.shape[-1], 4): + sums = sums + prec[..., i] + return sums / 11 * 100 + + +def print_str(value, *arg, sstream=None): + if sstream is None: + sstream = sysio.StringIO() + sstream.truncate(0) + sstream.seek(0) + print(value, *arg, file=sstream) + return sstream.getvalue() + + +def do_eval(gt_annos, + dt_annos, + current_classes, + min_overlaps, + eval_types=['bbox', 'bev', '3d']): + # min_overlaps: [num_minoverlap, metric, num_class] + difficultys = [0, 1, 2] + ret = eval_class( + gt_annos, + dt_annos, + current_classes, + difficultys, + 0, + min_overlaps, + compute_aos=('aos' in eval_types)) + # ret: [num_class, num_diff, num_minoverlap, num_sample_points] + mAP_bbox = get_mAP(ret['precision']) + mAP_aos = None + if 'aos' in eval_types: + mAP_aos = get_mAP(ret['orientation']) + + mAP_bev = None + if 'bev' in eval_types: + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 1, + min_overlaps) + mAP_bev = get_mAP(ret['precision']) + + mAP_3d = None + if '3d' in eval_types: + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, + min_overlaps) + mAP_3d = get_mAP(ret['precision']) + return mAP_bbox, mAP_bev, mAP_3d, mAP_aos + + +def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, + compute_aos): + # overlap_ranges: [range, metric, num_class] + min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]]) + for i in range(overlap_ranges.shape[1]): + for j in range(overlap_ranges.shape[2]): + min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j]) + mAP_bbox, mAP_bev, mAP_3d, mAP_aos = do_eval(gt_annos, dt_annos, + current_classes, min_overlaps, + compute_aos) + # ret: [num_class, num_diff, num_minoverlap] + mAP_bbox = mAP_bbox.mean(-1) + mAP_bev = mAP_bev.mean(-1) + mAP_3d = mAP_3d.mean(-1) + if mAP_aos is not None: + mAP_aos = mAP_aos.mean(-1) + return mAP_bbox, mAP_bev, mAP_3d, mAP_aos + + +def kitti_eval(gt_annos, + dt_annos, + current_classes, + eval_types=['bbox', 'bev', '3d']): + assert 'bbox' in eval_types, 'must evaluate bbox at least' + overlap_0_7 = np.array([[0.7, 0.5, 0.5, 0.7, + 0.5], [0.7, 0.5, 0.5, 0.7, 0.5], + [0.7, 0.5, 0.5, 0.7, 0.5]]) + overlap_0_5 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5], + [0.5, 0.25, 0.25, 0.5, 0.25], + [0.5, 0.25, 0.25, 0.5, 0.25]]) + min_overlaps = np.stack([overlap_0_7, overlap_0_5], axis=0) # [2, 3, 5] + class_to_name = { + 0: 'Car', + 1: 'Pedestrian', + 2: 'Cyclist', + 3: 'Van', + 4: 'Person_sitting', + } + name_to_class = {v: n for n, v in class_to_name.items()} + if not isinstance(current_classes, (list, tuple)): + current_classes = [current_classes] + current_classes_int = [] + 
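+    # current_classes may mix name strings (e.g. 'Car') and integer ids;
+    # normalize everything to the integer ids defined in class_to_name.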
for curcls in current_classes: + if isinstance(curcls, str): + current_classes_int.append(name_to_class[curcls]) + else: + current_classes_int.append(curcls) + current_classes = current_classes_int + min_overlaps = min_overlaps[:, :, current_classes] + result = '' + # check whether alpha is valid + compute_aos = False + for anno in dt_annos: + if anno['alpha'].shape[0] != 0: + if anno['alpha'][0] != -10: + compute_aos = True + eval_types.append('aos') + break + + mAPbbox, mAPbev, mAP3d, mAPaos = do_eval(gt_annos, dt_annos, + current_classes, min_overlaps, + eval_types) + + ret_dict = {} + difficulty = ['easy', 'moderate', 'hard'] + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + curcls_name = class_to_name[curcls] + ret_dict[curcls_name] = {} + for i in range(min_overlaps.shape[0]): + # prepare results for print + result += ('{} AP@{:.2f}, {:.2f}, {:.2f}:\n'.format( + curcls_name, *min_overlaps[i, :, j])) + if mAPbbox is not None: + result += ( + 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbbox[j, :, + i])) + if mAPbev is not None: + result += ( + 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbev[j, :, + i])) + if mAP3d is not None: + result += ( + '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP3d[j, :, i])) + + if compute_aos: + result += ( + 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(*mAPaos[j, :, + i])) + + # prepare results for logger + for idx in range(3): + postfix = '{}_{}'.format(difficulty[idx], min_overlaps[i, idx, + j]) + if mAP3d is not None: + ret_dict[curcls_name]['3D_{}'.format(postfix)] = mAP3d[j, + idx, + i] + if mAPbev is not None: + ret_dict[curcls_name]['BEV_{}'.format(postfix)] = mAPbev[ + j, idx, i] + if mAPbbox is not None: + ret_dict[curcls_name]['2D_{}'.format(postfix)] = mAPbbox[ + j, idx, i] + + # calculate mAP over all classes if there are multiple classes + if len(current_classes) > 1: + # prepare results for print + result += ('\nOverall AP@{}, {}, {}:\n'.format(*difficulty)) + if mAPbbox is not None: + mAPbbox = mAPbbox.mean(axis=0) + result += ('bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbbox[:, + 0])) + if mAPbev is not None: + mAPbev = mAPbev.mean(axis=0) + result += ('bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbev[:, + 0])) + if mAP3d is not None: + mAP3d = mAP3d.mean(axis=0) + result += ('3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP3d[:, 0])) + if compute_aos: + mAPaos = mAPaos.mean(axis=0) + result += ('aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(*mAPaos[:, + 0])) + + # prepare results for logger + ret_dict['Overall'] = dict() + for idx in range(3): + postfix = '{}'.format(difficulty[idx]) + if mAP3d is not None: + ret_dict['Overall']['3D_{}'.format(postfix)] = mAP3d[idx, 0] + if mAPbev is not None: + ret_dict['Overall']['BEV_{}'.format(postfix)] = mAPbev[idx, 0] + if mAPbbox is not None: + ret_dict['Overall']['2D_{}'.format(postfix)] = mAPbbox[idx, 0] + print(result) + return result, ret_dict + + +def kitti_eval_coco_style(gt_annos, dt_annos, current_classes): + class_to_name = { + 0: 'Car', + 1: 'Pedestrian', + 2: 'Cyclist', + 3: 'Van', + 4: 'Person_sitting', + } + class_to_range = { + 0: [0.5, 0.95, 10], + 1: [0.25, 0.7, 10], + 2: [0.25, 0.7, 10], + 3: [0.5, 0.95, 10], + 4: [0.25, 0.7, 10], + } + name_to_class = {v: n for n, v in class_to_name.items()} + if not isinstance(current_classes, (list, tuple)): + current_classes = [current_classes] + current_classes_int = [] + for curcls in current_classes: + if isinstance(curcls, str): + 
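+            # translate a class name string such as 'Car' to its integer id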
current_classes_int.append(name_to_class[curcls]) + else: + current_classes_int.append(curcls) + current_classes = current_classes_int + overlap_ranges = np.zeros([3, 3, len(current_classes)]) + for i, curcls in enumerate(current_classes): + overlap_ranges[:, :, i] = np.array(class_to_range[curcls])[:, + np.newaxis] + result = '' + # check whether alpha is valid + compute_aos = False + for anno in dt_annos: + if anno['alpha'].shape[0] != 0: + if anno['alpha'][0] != -10: + compute_aos = True + break + mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval( + gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos) + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + o_range = np.array(class_to_range[curcls])[[0, 2, 1]] + o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) + result += print_str((f'{class_to_name[curcls]} ' + 'coco AP@{:.2f}:{:.2f}:{:.2f}:'.format(*o_range))) + result += print_str((f'bbox AP:{mAPbbox[j, 0]:.2f}, ' + f'{mAPbbox[j, 1]:.2f}, ' + f'{mAPbbox[j, 2]:.2f}')) + result += print_str((f'bev AP:{mAPbev[j, 0]:.2f}, ' + f'{mAPbev[j, 1]:.2f}, ' + f'{mAPbev[j, 2]:.2f}')) + result += print_str((f'3d AP:{mAP3d[j, 0]:.2f}, ' + f'{mAP3d[j, 1]:.2f}, ' + f'{mAP3d[j, 2]:.2f}')) + if compute_aos: + result += print_str((f'aos AP:{mAPaos[j, 0]:.2f}, ' + f'{mAPaos[j, 1]:.2f}, ' + f'{mAPaos[j, 2]:.2f}')) + return result diff --git a/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py b/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py new file mode 100644 index 0000000000..735386943c --- /dev/null +++ b/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py @@ -0,0 +1,341 @@ +##################### +# Based on https://github.com/hongzhenwang/RRPN-revise +# Licensed under The MIT License +# Author: yanyan, scrin@foxmail.com +##################### +import math + +import numba +import numpy as np +from numba import cuda + + +@numba.jit(nopython=True) +def div_up(m, n): + return m // n + (m % n > 0) + + +@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True) +def trangle_area(a, b, c): + return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * + (b[0] - c[0])) / 2.0 + + +@cuda.jit('(float32[:], int32)', device=True, inline=True) +def area(int_pts, num_of_inter): + area_val = 0.0 + for i in range(num_of_inter - 2): + area_val += abs( + trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4], + int_pts[2 * i + 4:2 * i + 6])) + return area_val + + +@cuda.jit('(float32[:], int32)', device=True, inline=True) +def sort_vertex_in_convex_polygon(int_pts, num_of_inter): + if num_of_inter > 0: + center = cuda.local.array((2, ), dtype=numba.float32) + center[:] = 0.0 + for i in range(num_of_inter): + center[0] += int_pts[2 * i] + center[1] += int_pts[2 * i + 1] + center[0] /= num_of_inter + center[1] /= num_of_inter + v = cuda.local.array((2, ), dtype=numba.float32) + vs = cuda.local.array((16, ), dtype=numba.float32) + for i in range(num_of_inter): + v[0] = int_pts[2 * i] - center[0] + v[1] = int_pts[2 * i + 1] - center[1] + d = math.sqrt(v[0] * v[0] + v[1] * v[1]) + v[0] = v[0] / d + v[1] = v[1] / d + if v[1] < 0: + v[0] = -2 - v[0] + vs[i] = v[0] + j = 0 + temp = 0 + for i in range(1, num_of_inter): + if vs[i - 1] > vs[i]: + temp = vs[i] + tx = int_pts[2 * i] + ty = int_pts[2 * i + 1] + j = i + while j > 0 and vs[j - 1] > temp: + vs[j] = vs[j - 1] + int_pts[j * 2] = int_pts[j * 2 - 2] + int_pts[j * 2 + 1] = int_pts[j * 2 - 1] + j -= 1 + + vs[j] = temp + int_pts[j * 2] = 
tx + int_pts[j * 2 + 1] = ty + + +@cuda.jit( + '(float32[:], float32[:], int32, int32, float32[:])', + device=True, + inline=True) +def line_segment_intersection(pts1, pts2, i, j, temp_pts): + A = cuda.local.array((2, ), dtype=numba.float32) + B = cuda.local.array((2, ), dtype=numba.float32) + C = cuda.local.array((2, ), dtype=numba.float32) + D = cuda.local.array((2, ), dtype=numba.float32) + + A[0] = pts1[2 * i] + A[1] = pts1[2 * i + 1] + + B[0] = pts1[2 * ((i + 1) % 4)] + B[1] = pts1[2 * ((i + 1) % 4) + 1] + + C[0] = pts2[2 * j] + C[1] = pts2[2 * j + 1] + + D[0] = pts2[2 * ((j + 1) % 4)] + D[1] = pts2[2 * ((j + 1) % 4) + 1] + BA0 = B[0] - A[0] + BA1 = B[1] - A[1] + DA0 = D[0] - A[0] + CA0 = C[0] - A[0] + DA1 = D[1] - A[1] + CA1 = C[1] - A[1] + acd = DA1 * CA0 > CA1 * DA0 + bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0]) + if acd != bcd: + abc = CA1 * BA0 > BA1 * CA0 + abd = DA1 * BA0 > BA1 * DA0 + if abc != abd: + DC0 = D[0] - C[0] + DC1 = D[1] - C[1] + ABBA = A[0] * B[1] - B[0] * A[1] + CDDC = C[0] * D[1] - D[0] * C[1] + DH = BA1 * DC0 - BA0 * DC1 + Dx = ABBA * DC0 - BA0 * CDDC + Dy = ABBA * DC1 - BA1 * CDDC + temp_pts[0] = Dx / DH + temp_pts[1] = Dy / DH + return True + return False + + +@cuda.jit( + '(float32[:], float32[:], int32, int32, float32[:])', + device=True, + inline=True) +def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts): + a = cuda.local.array((2, ), dtype=numba.float32) + b = cuda.local.array((2, ), dtype=numba.float32) + c = cuda.local.array((2, ), dtype=numba.float32) + d = cuda.local.array((2, ), dtype=numba.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + + area_abc = trangle_area(a, b, c) + area_abd = trangle_area(a, b, d) + + if area_abc * area_abd >= 0: + return False + + area_cda = trangle_area(c, d, a) + area_cdb = area_cda + area_abc - area_abd + + if area_cda * area_cdb >= 0: + return False + t = area_cda / (area_abd - area_abc) + + dx = t * (b[0] - a[0]) + dy = t * (b[1] - a[1]) + temp_pts[0] = a[0] + dx + temp_pts[1] = a[1] + dy + return True + + +@cuda.jit('(float32, float32, float32[:])', device=True, inline=True) +def point_in_quadrilateral(pt_x, pt_y, corners): + ab0 = corners[2] - corners[0] + ab1 = corners[3] - corners[1] + + ad0 = corners[6] - corners[0] + ad1 = corners[7] - corners[1] + + ap0 = pt_x - corners[0] + ap1 = pt_y - corners[1] + + abab = ab0 * ab0 + ab1 * ab1 + abap = ab0 * ap0 + ab1 * ap1 + adad = ad0 * ad0 + ad1 * ad1 + adap = ad0 * ap0 + ad1 * ap1 + + return abab >= abap and abap >= 0 and adad >= adap and adap >= 0 + + +@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True) +def quadrilateral_intersection(pts1, pts2, int_pts): + num_of_inter = 0 + for i in range(4): + if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2): + int_pts[num_of_inter * 2] = pts1[2 * i] + int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1] + num_of_inter += 1 + if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1): + int_pts[num_of_inter * 2] = pts2[2 * i] + int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1] + num_of_inter += 1 + temp_pts = cuda.local.array((2, ), dtype=numba.float32) + for i in range(4): + for j in range(4): + has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts) + if has_pts: + int_pts[num_of_inter * 2] = temp_pts[0] + int_pts[num_of_inter * 2 + 1] = temp_pts[1] + 
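+                # the stored edge-edge crossing becomes one more vertex of
+                # the intersection polygon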
num_of_inter += 1 + + return num_of_inter + + +@cuda.jit('(float32[:], float32[:])', device=True, inline=True) +def rbbox_to_corners(corners, rbbox): + # generate clockwise corners and rotate it clockwise + angle = rbbox[4] + a_cos = math.cos(angle) + a_sin = math.sin(angle) + center_x = rbbox[0] + center_y = rbbox[1] + x_d = rbbox[2] + y_d = rbbox[3] + corners_x = cuda.local.array((4, ), dtype=numba.float32) + corners_y = cuda.local.array((4, ), dtype=numba.float32) + corners_x[0] = -x_d / 2 + corners_x[1] = -x_d / 2 + corners_x[2] = x_d / 2 + corners_x[3] = x_d / 2 + corners_y[0] = -y_d / 2 + corners_y[1] = y_d / 2 + corners_y[2] = y_d / 2 + corners_y[3] = -y_d / 2 + for i in range(4): + corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x + corners[2 * i + + 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y + + +@cuda.jit('(float32[:], float32[:])', device=True, inline=True) +def inter(rbbox1, rbbox2): + corners1 = cuda.local.array((8, ), dtype=numba.float32) + corners2 = cuda.local.array((8, ), dtype=numba.float32) + intersection_corners = cuda.local.array((16, ), dtype=numba.float32) + + rbbox_to_corners(corners1, rbbox1) + rbbox_to_corners(corners2, rbbox2) + + num_intersection = quadrilateral_intersection(corners1, corners2, + intersection_corners) + sort_vertex_in_convex_polygon(intersection_corners, num_intersection) + # print(intersection_corners.reshape([-1, 2])[:num_intersection]) + + return area(intersection_corners, num_intersection) + + +@cuda.jit('(float32[:], float32[:], int32)', device=True, inline=True) +def devRotateIoUEval(rbox1, rbox2, criterion=-1): + area1 = rbox1[2] * rbox1[3] + area2 = rbox2[2] * rbox2[3] + area_inter = inter(rbox1, rbox2) + if criterion == -1: + return area_inter / (area1 + area2 - area_inter) + elif criterion == 0: + return area_inter / area1 + elif criterion == 1: + return area_inter / area2 + else: + return area_inter + + +@cuda.jit( + '(int64, int64, float32[:], float32[:], float32[:], int32)', + fastmath=False) +def rotate_iou_kernel_eval(N, + K, + dev_boxes, + dev_query_boxes, + dev_iou, + criterion=-1): + threadsPerBlock = 8 * 8 + row_start = cuda.blockIdx.x + col_start = cuda.blockIdx.y + tx = cuda.threadIdx.x + row_size = min(N - row_start * threadsPerBlock, threadsPerBlock) + col_size = min(K - col_start * threadsPerBlock, threadsPerBlock) + block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) + block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) + + dev_query_box_idx = threadsPerBlock * col_start + tx + dev_box_idx = threadsPerBlock * row_start + tx + if (tx < col_size): + block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0] + block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1] + block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2] + block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3] + block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4] + if (tx < row_size): + block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0] + block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1] + block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2] + block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3] + block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4] + cuda.syncthreads() + if tx < row_size: + for i in range(col_size): + offset = ( + row_start * threadsPerBlock * K + col_start * threadsPerBlock + + tx * K + i) + dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5], + 
block_boxes[tx * 5:tx * 5 + 5], + criterion) + + +def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0): + """rotated box iou running in gpu. 500x faster than cpu version + (take 5ms in one example with numba.cuda code). + convert from [this project]( + https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation). + + Args: + boxes (float tensor: [N, 5]): rbboxes. format: centers, dims, + angles(clockwise when positive) + query_boxes (float tensor: [K, 5]): [description] + device_id (int, optional): Defaults to 0. [description] + + Returns: + [type]: [description] + """ + boxes = boxes.astype(np.float32) + query_boxes = query_boxes.astype(np.float32) + N = boxes.shape[0] + K = query_boxes.shape[0] + iou = np.zeros((N, K), dtype=np.float32) + if N == 0 or K == 0: + return iou + threadsPerBlock = 8 * 8 + cuda.select_device(device_id) + blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock)) + + stream = cuda.stream() + with stream.auto_synchronize(): + boxes_dev = cuda.to_device(boxes.reshape([-1]), stream) + query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream) + iou_dev = cuda.to_device(iou.reshape([-1]), stream) + rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, + stream](N, K, boxes_dev, query_boxes_dev, + iou_dev, criterion) + iou_dev.copy_to_host(iou.reshape([-1]), stream=stream) + return iou.astype(boxes.dtype) diff --git a/mmdet3d/core/evaluation/mean_ap.py b/mmdet3d/core/evaluation/mean_ap.py new file mode 100644 index 0000000000..f1d185a697 --- /dev/null +++ b/mmdet3d/core/evaluation/mean_ap.py @@ -0,0 +1,385 @@ +import mmcv +import numpy as np +from terminaltables import AsciiTable + +from .bbox_overlaps import bbox_overlaps +from .class_names import get_classes + + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). 
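+
+    For example, in 'area' mode recalls [0.5, 1.0] with precisions
+    [1.0, 0.5] give an AP of 0.75.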
+ + Args: + recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) + precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or ndarray: calculated average precision + """ + no_scale = False + if recalls.ndim == 1: + no_scale = True + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + assert recalls.shape == precisions.shape and recalls.ndim == 2 + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + if no_scale: + ap = ap[0] + return ap + + +def tpfp_imagenet(det_bboxes, + gt_bboxes, + gt_ignore, + default_iou_thr, + area_ranges=None): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): the detected bbox + gt_bboxes (ndarray): ground truth bboxes of this image + gt_ignore (ndarray): indicate if gts are ignored for evaluation or not + default_iou_thr (float): the iou thresholds for medium and large bboxes + area_ranges (list or None): gt bbox area ranges + + Returns: + tuple: two arrays (tp, fp) whose elements are 0 and 1 + """ + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp + # of a certain scale. + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] 
= 1 + else: + det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * ( + det_bboxes[:, 3] - det_bboxes[:, 1]) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + ious = bbox_overlaps(det_bboxes, gt_bboxes - 1) + gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)), + default_iou_thr) + # sort all detections by scores in descending order + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool) + else: + gt_areas = gt_w * gt_h + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + max_iou = -1 + matched_gt = -1 + # find best overlapped available gt + for j in range(num_gts): + # different from PASCAL VOC: allow finding other gts if the + # best overlaped ones are already matched by other det bboxes + if gt_covered[j]: + continue + elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou: + max_iou = ious[i, j] + matched_gt = j + # there are 4 cases for a det bbox: + # 1. it matches a gt, tp = 1, fp = 0 + # 2. it matches an ignored gt, tp = 0, fp = 0 + # 3. it matches no gt and within area range, tp = 0, fp = 1 + # 4. it matches no gt but is beyond area range, tp = 0, fp = 0 + if matched_gt >= 0: + gt_covered[matched_gt] = 1 + if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]): + tp[k, i] = 1 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def tpfp_default(det_bboxes, gt_bboxes, gt_ignore, iou_thr, area_ranges=None): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): the detected bbox + gt_bboxes (ndarray): ground truth bboxes of this image + gt_ignore (ndarray): indicate if gts are ignored for evaluation or not + iou_thr (float): the iou thresholds + + Returns: + tuple: (tp, fp), two arrays whose elements are 0 and 1 + """ + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of + # a certain scale + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] 
= 1 + else: + det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * ( + det_bboxes[:, 3] - det_bboxes[:, 1]) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + ious = bbox_overlaps(det_bboxes, gt_bboxes) + ious_max = ious.max(axis=1) + ious_argmax = ious.argmax(axis=1) + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool) + else: + gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + if ious_max[i] >= iou_thr: + matched_gt = ious_argmax[i] + if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]): + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[k, i] = 1 + else: + fp[k, i] = 1 + # otherwise ignore this detected bbox, tp = 0, fp = 0 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def get_cls_results(det_results, gt_bboxes, gt_labels, gt_ignore, class_id): + """Get det results and gt information of a certain class.""" + cls_dets = [det[class_id] + for det in det_results] # det bboxes of this class + cls_gts = [] # gt bboxes of this class + cls_gt_ignore = [] + for j in range(len(gt_bboxes)): + gt_bbox = gt_bboxes[j] + cls_inds = (gt_labels[j] == class_id) + cls_gt = gt_bbox[cls_inds, :] if gt_bbox.shape[0] > 0 else gt_bbox + cls_gts.append(cls_gt) + if gt_ignore is None: + cls_gt_ignore.append(np.zeros(cls_gt.shape[0], dtype=np.int32)) + else: + cls_gt_ignore.append(gt_ignore[j][cls_inds]) + return cls_dets, cls_gts, cls_gt_ignore + + +def eval_map(det_results, + gt_bboxes, + gt_labels, + gt_ignore=None, + scale_ranges=None, + iou_thr=0.5, + dataset=None, + print_summary=True): + """Evaluate mAP of a dataset. + + Args: + det_results (list): a list of list, [[cls1_det, cls2_det, ...], ...] + gt_bboxes (list): ground truth bboxes of each image, a list of K*4 + array. + gt_labels (list): ground truth labels of each image, a list of K array + gt_ignore (list): gt ignore indicators of each image, a list of K array + scale_ranges (list, optional): [(min1, max1), (min2, max2), ...] + iou_thr (float): IoU threshold + dataset (None or str or list): dataset name or dataset classes, there + are minor differences in metrics for different datsets, e.g. + "voc07", "imagenet_det", etc. 
+ print_summary (bool): whether to print the mAP summary + + Returns: + tuple: (mAP, [dict, dict, ...]) + """ + assert len(det_results) == len(gt_bboxes) == len(gt_labels) + if gt_ignore is not None: + assert len(gt_ignore) == len(gt_labels) + for i in range(len(gt_ignore)): + assert len(gt_labels[i]) == len(gt_ignore[i]) + area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges] + if scale_ranges is not None else None) + num_scales = len(scale_ranges) if scale_ranges is not None else 1 + eval_results = [] + num_classes = len(det_results[0]) # positive class num + gt_labels = [ + label if label.ndim == 1 else label[:, 0] for label in gt_labels + ] + for i in range(num_classes): + # get gt and det bboxes of this class + cls_dets, cls_gts, cls_gt_ignore = get_cls_results( + det_results, gt_bboxes, gt_labels, gt_ignore, i) + # calculate tp and fp for each image + tpfp_func = ( + tpfp_imagenet if dataset in ['det', 'vid'] else tpfp_default) + tpfp = [ + tpfp_func(cls_dets[j], cls_gts[j], cls_gt_ignore[j], iou_thr, + area_ranges) for j in range(len(cls_dets)) + ] + tp, fp = tuple(zip(*tpfp)) + # calculate gt number of each scale, gts ignored or beyond scale + # are not counted + num_gts = np.zeros(num_scales, dtype=int) + for j, bbox in enumerate(cls_gts): + if area_ranges is None: + num_gts[0] += np.sum(np.logical_not(cls_gt_ignore[j])) + else: + gt_areas = (bbox[:, 2] - bbox[:, 0]) * ( + bbox[:, 3] - bbox[:, 1]) + for k, (min_area, max_area) in enumerate(area_ranges): + num_gts[k] += np.sum( + np.logical_not(cls_gt_ignore[j]) + & (gt_areas >= min_area) & (gt_areas < max_area)) + # sort all det bboxes by score, also sort tp and fp + cls_dets = np.vstack(cls_dets) + num_dets = cls_dets.shape[0] + sort_inds = np.argsort(-cls_dets[:, -1]) + tp = np.hstack(tp)[:, sort_inds] + fp = np.hstack(fp)[:, sort_inds] + # calculate recall and precision with tp and fp + tp = np.cumsum(tp, axis=1) + fp = np.cumsum(fp, axis=1) + eps = np.finfo(np.float32).eps + recalls = tp / np.maximum(num_gts[:, np.newaxis], eps) + precisions = tp / np.maximum((tp + fp), eps) + # calculate AP + if scale_ranges is None: + recalls = recalls[0, :] + precisions = precisions[0, :] + num_gts = num_gts.item() + mode = 'area' if dataset != 'voc07' else '11points' + ap = average_precision(recalls, precisions, mode) + eval_results.append({ + 'num_gts': num_gts, + 'num_dets': num_dets, + 'recall': recalls, + 'precision': precisions, + 'ap': ap + }) + if scale_ranges is not None: + # shape (num_classes, num_scales) + all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results]) + all_num_gts = np.vstack( + [cls_result['num_gts'] for cls_result in eval_results]) + mean_ap = [] + for i in range(num_scales): + if np.any(all_num_gts[:, i] > 0): + mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean()) + else: + mean_ap.append(0.0) + else: + aps = [] + for cls_result in eval_results: + if cls_result['num_gts'] > 0: + aps.append(cls_result['ap']) + mean_ap = np.array(aps).mean().item() if aps else 0.0 + if print_summary: + print_map_summary(mean_ap, eval_results, dataset, area_ranges) + + return mean_ap, eval_results + + +def print_map_summary(mean_ap, results, dataset=None, ranges=None): + """Print mAP and results of each class. + + Args: + mean_ap(float): calculated from `eval_map` + results(list): calculated from `eval_map` + dataset(None or str or list): dataset name or dataset classes. 
+ ranges(list or Tuple): ranges of areas + """ + num_scales = len(results[0]['ap']) if isinstance(results[0]['ap'], + np.ndarray) else 1 + if ranges is not None: + assert len(ranges) == num_scales + + num_classes = len(results) + + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) + precisions = np.zeros((num_scales, num_classes), dtype=np.float32) + aps = np.zeros((num_scales, num_classes), dtype=np.float32) + num_gts = np.zeros((num_scales, num_classes), dtype=int) + for i, cls_result in enumerate(results): + if cls_result['recall'].size > 0: + recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] + precisions[:, i] = np.array( + cls_result['precision'], ndmin=2)[:, -1] + aps[:, i] = cls_result['ap'] + num_gts[:, i] = cls_result['num_gts'] + + if dataset is None: + label_names = [str(i) for i in range(0, num_classes)] + elif mmcv.is_str(dataset): + label_names = get_classes(dataset) + else: + label_names = dataset + + if not isinstance(mean_ap, list): + mean_ap = [mean_ap] + header = ['class', 'gts', 'dets', 'recall', 'precision', 'ap'] + for i in range(num_scales): + if ranges is not None: + print('Area range ', ranges[i]) + table_data = [header] + for j in range(num_classes): + row_data = [ + label_names[j], num_gts[i, j], results[j]['num_dets'], + '{:.3f}'.format(recalls[i, j]), + '{:.3f}'.format(precisions[i, j]), '{:.3f}'.format(aps[i, j]) + ] + table_data.append(row_data) + table_data.append(['mAP', '', '', '', '', '{:.3f}'.format(mean_ap[i])]) + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print(table.table) diff --git a/mmdet3d/core/evaluation/recall.py b/mmdet3d/core/evaluation/recall.py new file mode 100644 index 0000000000..45c2627c6b --- /dev/null +++ b/mmdet3d/core/evaluation/recall.py @@ -0,0 +1,185 @@ +import numpy as np +from terminaltables import AsciiTable + +from ..bbox import bbox_overlaps_2d + + +def _recalls(all_ious, proposal_nums, thrs): + + img_num = all_ious.shape[0] + total_gt_num = sum([ious.shape[0] for ious in all_ious]) + + _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32) + for k, proposal_num in enumerate(proposal_nums): + tmp_ious = np.zeros(0) + for i in range(img_num): + ious = all_ious[i][:, :proposal_num].copy() + gt_ious = np.zeros((ious.shape[0])) + if ious.size == 0: + tmp_ious = np.hstack((tmp_ious, gt_ious)) + continue + for j in range(ious.shape[0]): + gt_max_overlaps = ious.argmax(axis=1) + max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps] + gt_idx = max_ious.argmax() + gt_ious[j] = max_ious[gt_idx] + box_idx = gt_max_overlaps[gt_idx] + ious[gt_idx, :] = -1 + ious[:, box_idx] = -1 + tmp_ious = np.hstack((tmp_ious, gt_ious)) + _ious[k, :] = tmp_ious + + _ious = np.fliplr(np.sort(_ious, axis=1)) + recalls = np.zeros((proposal_nums.size, thrs.size)) + for i, thr in enumerate(thrs): + recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num) + + return recalls + + +def set_recall_param(proposal_nums, iou_thrs): + """Check proposal_nums and iou_thrs and set correct format. 
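+    Both are converted to numpy arrays; iou_thrs defaults to
+    np.array([0.5]) when it is None.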
+ """ + if isinstance(proposal_nums, list): + _proposal_nums = np.array(proposal_nums) + elif isinstance(proposal_nums, int): + _proposal_nums = np.array([proposal_nums]) + else: + _proposal_nums = proposal_nums + + if iou_thrs is None: + _iou_thrs = np.array([0.5]) + elif isinstance(iou_thrs, list): + _iou_thrs = np.array(iou_thrs) + elif isinstance(iou_thrs, float): + _iou_thrs = np.array([iou_thrs]) + else: + _iou_thrs = iou_thrs + + return _proposal_nums, _iou_thrs + + +def eval_recalls(gts, + proposals, + proposal_nums=None, + iou_thrs=None, + print_summary=True): + """Calculate recalls. + + Args: + gts(list or ndarray): a list of arrays of shape (n, 4) + proposals(list or ndarray): a list of arrays of shape (k, 4) or (k, 5) + proposal_nums(int or list of int or ndarray): top N proposals + thrs(float or list or ndarray): iou thresholds + + Returns: + ndarray: recalls of different ious and proposal nums + """ + + img_num = len(gts) + assert img_num == len(proposals) + + proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs) + + all_ious = [] + for i in range(img_num): + if proposals[i].ndim == 2 and proposals[i].shape[1] == 5: + scores = proposals[i][:, 4] + sort_idx = np.argsort(scores)[::-1] + img_proposal = proposals[i][sort_idx, :] + else: + img_proposal = proposals[i] + prop_num = min(img_proposal.shape[0], proposal_nums[-1]) + if gts[i] is None or gts[i].shape[0] == 0: + ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32) + else: + ious = bbox_overlaps_2d(gts[i], img_proposal[:prop_num, :4]) + all_ious.append(ious) + all_ious = np.array(all_ious) + recalls = _recalls(all_ious, proposal_nums, iou_thrs) + if print_summary: + print_recall_summary(recalls, proposal_nums, iou_thrs) + return recalls + + +def print_recall_summary(recalls, + proposal_nums, + iou_thrs, + row_idxs=None, + col_idxs=None): + """Print recalls in a table. + + Args: + recalls(ndarray): calculated from `bbox_recalls` + proposal_nums(ndarray or list): top N proposals + iou_thrs(ndarray or list): iou thresholds + row_idxs(ndarray): which rows(proposal nums) to print + col_idxs(ndarray): which cols(iou thresholds) to print + """ + proposal_nums = np.array(proposal_nums, dtype=np.int32) + iou_thrs = np.array(iou_thrs) + if row_idxs is None: + row_idxs = np.arange(proposal_nums.size) + if col_idxs is None: + col_idxs = np.arange(iou_thrs.size) + row_header = [''] + iou_thrs[col_idxs].tolist() + table_data = [row_header] + for i, num in enumerate(proposal_nums[row_idxs]): + row = [ + '{:.3f}'.format(val) + for val in recalls[row_idxs[i], col_idxs].tolist() + ] + row.insert(0, num) + table_data.append(row) + table = AsciiTable(table_data) + print(table.table) + + +def plot_num_recall(recalls, proposal_nums): + """Plot Proposal_num-Recalls curve. + + Args: + recalls(ndarray or list): shape (k,) + proposal_nums(ndarray or list): same shape as `recalls` + """ + if isinstance(proposal_nums, np.ndarray): + _proposal_nums = proposal_nums.tolist() + else: + _proposal_nums = proposal_nums + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot([0] + _proposal_nums, [0] + _recalls) + plt.xlabel('Proposal num') + plt.ylabel('Recall') + plt.axis([0, proposal_nums.max(), 0, 1]) + f.show() + + +def plot_iou_recall(recalls, iou_thrs): + """Plot IoU-Recalls curve. 
+ + Args: + recalls(ndarray or list): shape (k,) + iou_thrs(ndarray or list): same shape as `recalls` + """ + if isinstance(iou_thrs, np.ndarray): + _iou_thrs = iou_thrs.tolist() + else: + _iou_thrs = iou_thrs + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot(_iou_thrs + [1.0], _recalls + [0.]) + plt.xlabel('IoU') + plt.ylabel('Recall') + plt.axis([iou_thrs.min(), 1, 0, 1]) + f.show() diff --git a/mmdet3d/core/optimizer/__init__.py b/mmdet3d/core/optimizer/__init__.py new file mode 100644 index 0000000000..1643fe10e9 --- /dev/null +++ b/mmdet3d/core/optimizer/__init__.py @@ -0,0 +1,5 @@ +from .builder import build_optimizer +from .mix_optimizer import MixedOptimizer +from .registry import OPTIMIZERS + +__all__ = ['OPTIMIZERS', 'build_optimizer', 'MixedOptimizer'] diff --git a/mmdet3d/core/optimizer/builder.py b/mmdet3d/core/optimizer/builder.py new file mode 100644 index 0000000000..c6ae7f6222 --- /dev/null +++ b/mmdet3d/core/optimizer/builder.py @@ -0,0 +1,135 @@ +import re + +import torch + +from mmdet.utils import build_from_cfg, get_root_logger +from .registry import OPTIMIZERS + + +def build_optimizer(model, optimizer_cfg): + """Build optimizer from configs. + + Args: + model (:obj:`nn.Module`): The model with parameters to be optimized. + optimizer_cfg (dict): The config dict of the optimizer. + Positional fields are: + - type: class name of the optimizer. + - lr: base learning rate. + Optional fields are: + - any arguments of the corresponding optimizer type, e.g., + weight_decay, momentum, etc. + - paramwise_options: a dict with 4 accepted fileds + (bias_lr_mult, bias_decay_mult, norm_decay_mult, + dwconv_decay_mult). + `bias_lr_mult` and `bias_decay_mult` will be multiplied to + the lr and weight decay respectively for all bias parameters + (except for the normalization layers), and + `norm_decay_mult` will be multiplied to the weight decay + for all weight and bias parameters of normalization layers. + `dwconv_decay_mult` will be multiplied to the weight decay + for all weight and bias parameters of depthwise conv layers. + + Returns: + torch.optim.Optimizer: The initialized optimizer. 
+ + Example: + >>> import torch + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9, + >>> weight_decay=0.0001) + >>> optimizer = build_optimizer(model, optimizer_cfg) + """ + if hasattr(model, 'module'): + model = model.module + + optimizer_cfg = optimizer_cfg.copy() + + if isinstance(optimizer_cfg, list): + # Assume paramwise_options is None if optimizer_cfg is list + from .mix_optimizer import MixedOptimizer + logger = get_root_logger() + keys = [optimizer.pop('key') for optimizer in optimizer_cfg] + keys_params = {key: [] for key in keys} + keys_params_name = {key: [] for key in keys} + keys_optimizer = [] + for name, param in model.named_parameters(): + param_group = {'params': [param]} + find_flag = False + for key in keys: + if key in name: + keys_params[key].append(param_group) + keys_params_name[key].append(name) + find_flag = True + break + assert find_flag, 'key {} is not matched to any optimizer'.format( + name) + + step_intervals = [] + for key, single_cfg in zip(keys, optimizer_cfg): + optimizer_cls = getattr(torch.optim, single_cfg.pop('type')) + step_intervals.append(single_cfg.pop('step_interval', 1)) + single_optim = optimizer_cls(keys_params[key], **single_cfg) + keys_optimizer.append(single_optim) + logger.info('{} optimizes key:\n {}\n'.format( + optimizer_cls.__name__, keys_params_name[key])) + + mix_optimizer = MixedOptimizer(keys_optimizer, step_intervals) + return mix_optimizer + else: + paramwise_options = optimizer_cfg.pop('paramwise_options', None) + + # if no paramwise option is specified, just use the global setting + if paramwise_options is None: + params = model.parameters() + else: + assert isinstance(paramwise_options, dict) + # get base lr and weight decay + base_lr = optimizer_cfg['lr'] + base_wd = optimizer_cfg.get('weight_decay', None) + # weight_decay must be explicitly specified if mult is specified + if ('bias_decay_mult' in paramwise_options + or 'norm_decay_mult' in paramwise_options + or 'dwconv_decay_mult' in paramwise_options): + assert base_wd is not None + # get param-wise options + bias_lr_mult = paramwise_options.get('bias_lr_mult', 1.) + bias_decay_mult = paramwise_options.get('bias_decay_mult', 1.) + norm_decay_mult = paramwise_options.get('norm_decay_mult', 1.) + dwconv_decay_mult = paramwise_options.get('dwconv_decay_mult', 1.) 
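+            # Illustrative effect of these multipliers (example values, not
+            # taken from any shipped config): with
+            # paramwise_options=dict(bias_lr_mult=2., norm_decay_mult=0.),
+            # every non-norm '.bias' parameter gets lr = 2 * base_lr and
+            # every bn/gn weight or bias gets weight_decay = 0.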
+ named_modules = dict(model.named_modules()) + # set param-wise lr and weight decay + params = [] + for name, param in model.named_parameters(): + param_group = {'params': [param]} + if not param.requires_grad: + # FP16 training needs to copy gradient/weight between master + # weight copy and model weight, it is convenient to keep all + # parameters here to align with model.parameters() + params.append(param_group) + continue + + # for norm layers, overwrite the weight decay of weight and bias + # TODO: obtain the norm layer prefixes dynamically + if re.search(r'(bn|gn)(\d+)?.(weight|bias)', name): + if base_wd is not None: + param_group['weight_decay'] = base_wd * norm_decay_mult + # for other layers, overwrite both lr and weight decay of bias + elif name.endswith('.bias'): + param_group['lr'] = base_lr * bias_lr_mult + if base_wd is not None: + param_group['weight_decay'] = base_wd * bias_decay_mult + + module_name = name.replace('.weight', '').replace('.bias', '') + if module_name in named_modules and base_wd is not None: + module = named_modules[module_name] + # if this Conv2d is depthwise Conv2d + if isinstance(module, torch.nn.Conv2d) and \ + module.in_channels == module.groups: + param_group['weight_decay'] = base_wd * dwconv_decay_mult + # otherwise use the global settings + + params.append(param_group) + + optimizer_cfg['params'] = params + + return build_from_cfg(optimizer_cfg, OPTIMIZERS) diff --git a/mmdet3d/core/optimizer/mix_optimizer.py b/mmdet3d/core/optimizer/mix_optimizer.py new file mode 100644 index 0000000000..250ce1c767 --- /dev/null +++ b/mmdet3d/core/optimizer/mix_optimizer.py @@ -0,0 +1,99 @@ +from torch.optim import Optimizer + +from .registry import OPTIMIZERS + + +@OPTIMIZERS.register_module +class MixedOptimizer(Optimizer): + """Mixed Optimizer that contains multiple optimizers + + This optimizer applies the cocktail optimzation for multi-modality models. + + """ + + def __init__(self, optimizers, step_intervals=None): + self.optimizers = optimizers + self.param_groups = [] + for optimizer in self.optimizers: + self.param_groups += optimizer.param_groups + if not isinstance(step_intervals, list): + step_intervals = [1] * len(self.optimizers) + self.step_intervals = step_intervals + self.num_step_updated = 0 + + def __getstate__(self): + return { + 'num_step_updated': + self.num_step_updated, + 'defaults': [optimizer.defaults for optimizer in self.optimizers], + 'state': [optimizer.state for optimizer in self.optimizers], + 'param_groups': + [optimizer.param_groups for optimizer in self.optimizers], + } + + def __setstate__(self, state): + self.__dict__.update(state) + + def __repr__(self): + format_string = self.__class__.__name__ + ' (\n' + for optimizer in self.optimizers: + format_string += '\t' + optimizer.__repr__ + ',\n' + format_string += ')' + return format_string + + def state_dict(self): + state_dicts = [optimizer.state_dict() for optimizer in self.optimizers] + return { + 'num_step_updated': + self.num_step_updated, + 'state': [state_dict['state'] for state_dict in state_dicts], + 'param_groups': + [state_dict['param_groups'] for state_dict in state_dicts], + } + + def load_state_dict(self, state_dict): + r"""Loads the optimizer state. + + Arguments: + state_dict (dict): optimizer state. Should be an object returned + from a call to :meth:`state_dict`. 
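+
+        Example:
+            >>> # illustrative round-trip; assumes the wrapped optimizers
+            >>> # are constructed in the same order as when the state was saved
+            >>> state = mixed_optimizer.state_dict()
+            >>> mixed_optimizer.load_state_dict(state)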
+ """ + assert len(state_dict['state']) == len(self.optimizers) + assert len(state_dict['param_groups']) == len(self.optimizers) + for i, (single_state, single_param_groups) in enumerate( + zip(state_dict['state'], state_dict['param_groups'])): + single_state_dict = dict( + state=single_state, param_groups=single_param_groups) + self.optimizers[i].load_state_dict(single_state_dict) + + self.param_groups = [] + for optimizer in self.optimizers: + self.param_groups += optimizer.param_groups + self.num_step_updated = state_dict['num_step_updated'] + + def zero_grad(self): + r"""Clears the gradients of all optimized :class:`torch.Tensor` s.""" + for optimizer in self.optimizers: + optimizer.zero_grad() + + def step(self, closure=None): + r"""Performs a single optimization step (parameter update). + + Arguments: + closure (callable): A closure that reevaluates the model and + returns the loss. Optional for most optimizers. + """ + loss = None + if closure is not None: + loss = closure() + + self.num_step_updated += 1 + for step_interval, optimizer in zip(self.step_intervals, + self.optimizers): + if self.num_step_updated % step_interval == 0: + optimizer.step() + + return loss + + def add_param_group(self, param_group): + raise NotImplementedError diff --git a/mmdet3d/core/optimizer/registry.py b/mmdet3d/core/optimizer/registry.py new file mode 100644 index 0000000000..de9b738989 --- /dev/null +++ b/mmdet3d/core/optimizer/registry.py @@ -0,0 +1,23 @@ +import inspect + +import torch + +from mmdet.utils import Registry + +OPTIMIZERS = Registry('optimizer') + + +def register_torch_optimizers(): + torch_optimizers = [] + for module_name in dir(torch.optim): + if module_name.startswith('__'): + continue + _optim = getattr(torch.optim, module_name) + if inspect.isclass(_optim) and issubclass(_optim, + torch.optim.Optimizer): + OPTIMIZERS.register_module(_optim) + torch_optimizers.append(module_name) + return torch_optimizers + + +TORCH_OPTIMIZERS = register_torch_optimizers() diff --git a/mmdet3d/core/post_processing/__init__.py b/mmdet3d/core/post_processing/__init__.py new file mode 100644 index 0000000000..11c3d30adf --- /dev/null +++ b/mmdet3d/core/post_processing/__init__.py @@ -0,0 +1,8 @@ +from .bbox_nms import multiclass_nms +from .merge_augs import (merge_aug_bboxes, merge_aug_masks, + merge_aug_proposals, merge_aug_scores) + +__all__ = [ + 'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes', + 'merge_aug_scores', 'merge_aug_masks' +] diff --git a/mmdet3d/core/post_processing/bbox_nms.py b/mmdet3d/core/post_processing/bbox_nms.py new file mode 100644 index 0000000000..76f0d6bb62 --- /dev/null +++ b/mmdet3d/core/post_processing/bbox_nms.py @@ -0,0 +1,68 @@ +import torch + +from mmdet.ops.nms import nms_wrapper + + +def multiclass_nms(multi_bboxes, + multi_scores, + score_thr, + nms_cfg, + max_num=-1, + score_factors=None): + """NMS for multi-class bboxes. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class+1), where the last column + contains scores of the background class, but this will be ignored. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + nms_thr (float): NMS IoU threshold + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. + score_factors (Tensor): The factors multiplied to scores before + applying NMS + + Returns: + tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels + are 0-based. 
+ """ + # scores has num_classes + 1 (last one is BG) + num_classes = multi_scores.shape[1] - 1 + bboxes, labels = [], [] + nms_cfg_ = nms_cfg.copy() + nms_type = nms_cfg_.pop('type', 'nms') + nms_op = getattr(nms_wrapper, nms_type) + # the fg class id range: [0, num_classes-1] + for i in range(0, num_classes): + cls_inds = multi_scores[:, i] > score_thr + if not cls_inds.any(): + continue + # get bboxes and scores of this class + if multi_bboxes.shape[1] == 4: + _bboxes = multi_bboxes[cls_inds, :] + else: + _bboxes = multi_bboxes[cls_inds, i * 4:(i + 1) * 4] + _scores = multi_scores[cls_inds, i] + if score_factors is not None: + _scores *= score_factors[cls_inds] + cls_dets = torch.cat([_bboxes, _scores[:, None]], dim=1) + cls_dets, _ = nms_op(cls_dets, **nms_cfg_) + cls_labels = multi_bboxes.new_full((cls_dets.shape[0], ), + i, + dtype=torch.long) + bboxes.append(cls_dets) + labels.append(cls_labels) + if bboxes: + bboxes = torch.cat(bboxes) + labels = torch.cat(labels) + if bboxes.shape[0] > max_num: + _, inds = bboxes[:, -1].sort(descending=True) + inds = inds[:max_num] + bboxes = bboxes[inds] + labels = labels[inds] + else: + bboxes = multi_bboxes.new_zeros((0, 5)) + labels = multi_bboxes.new_zeros((0, ), dtype=torch.long) + + return bboxes, labels diff --git a/mmdet3d/core/post_processing/merge_augs.py b/mmdet3d/core/post_processing/merge_augs.py new file mode 100644 index 0000000000..2dca68654a --- /dev/null +++ b/mmdet3d/core/post_processing/merge_augs.py @@ -0,0 +1,101 @@ +import numpy as np +import torch + +from mmdet3d.ops import nms +from ..bbox import bbox_mapping_back + + +def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg): + """Merge augmented proposals (multiscale, flip, etc.) + + Args: + aug_proposals (list[Tensor]): proposals from different testing + schemes, shape (n, 5). Note that they are not rescaled to the + original image size. + + img_metas (list[dict]): list of image info dict where each dict has: + 'img_shape', 'scale_factor', 'flip', and my also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + + rpn_test_cfg (dict): rpn test config. + + Returns: + Tensor: shape (n, 4), proposals corresponding to original image scale. + """ + recovered_proposals = [] + for proposals, img_info in zip(aug_proposals, img_metas): + img_shape = img_info['img_shape'] + scale_factor = img_info['scale_factor'] + flip = img_info['flip'] + _proposals = proposals.clone() + _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape, + scale_factor, flip) + recovered_proposals.append(_proposals) + aug_proposals = torch.cat(recovered_proposals, dim=0) + merged_proposals, _ = nms(aug_proposals, rpn_test_cfg.nms_thr) + scores = merged_proposals[:, 4] + _, order = scores.sort(0, descending=True) + num = min(rpn_test_cfg.max_num, merged_proposals.shape[0]) + order = order[:num] + merged_proposals = merged_proposals[order, :] + return merged_proposals + + +def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg): + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + rcnn_test_cfg (dict): rcnn test config. 
+ + Returns: + tuple: (bboxes, scores) + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]['img_shape'] + scale_factor = img_info[0]['scale_factor'] + flip = img_info[0]['flip'] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip) + recovered_bboxes.append(bboxes) + bboxes = torch.stack(recovered_bboxes).mean(dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.stack(aug_scores).mean(dim=0) + return bboxes, scores + + +def merge_aug_scores(aug_scores): + """Merge augmented bbox scores.""" + if isinstance(aug_scores[0], torch.Tensor): + return torch.mean(torch.stack(aug_scores), dim=0) + else: + return np.mean(aug_scores, axis=0) + + +def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None): + """Merge augmented mask prediction. + + Args: + aug_masks (list[ndarray]): shape (n, #class, h, w) + img_shapes (list[ndarray]): shape (3, ). + rcnn_test_cfg (dict): rcnn test config. + + Returns: + tuple: (bboxes, scores) + """ + recovered_masks = [ + mask if not img_info[0]['flip'] else mask[..., ::-1] + for mask, img_info in zip(aug_masks, img_metas) + ] + if weights is None: + merged_masks = np.mean(recovered_masks, axis=0) + else: + merged_masks = np.average( + np.array(recovered_masks), axis=0, weights=np.array(weights)) + return merged_masks diff --git a/mmdet3d/core/utils/__init__.py b/mmdet3d/core/utils/__init__.py new file mode 100644 index 0000000000..47c0a9d933 --- /dev/null +++ b/mmdet3d/core/utils/__init__.py @@ -0,0 +1,11 @@ +from .dist_utils import DistOptimizerHook, allreduce_grads +from .misc import tensor2imgs # merge_batch, merge_hook_batch +from .misc import multi_apply, unmap + +__all__ = [ + 'allreduce_grads', + 'DistOptimizerHook', + 'multi_apply', + 'tensor2imgs', + 'unmap', # 'merge_batch', 'merge_hook_batch' +] diff --git a/mmdet3d/core/utils/contextmanagers.py b/mmdet3d/core/utils/contextmanagers.py new file mode 100644 index 0000000000..5705338c51 --- /dev/null +++ b/mmdet3d/core/utils/contextmanagers.py @@ -0,0 +1,121 @@ +import asyncio +import contextlib +import logging +import os +import time +from typing import List + +import torch + +logger = logging.getLogger(__name__) + +DEBUG_COMPLETED_TIME = bool(os.environ.get('DEBUG_COMPLETED_TIME', False)) + + +@contextlib.asynccontextmanager +async def completed(trace_name='', + name='', + sleep_interval=0.05, + streams: List[torch.cuda.Stream] = None): + """ + Async context manager that waits for work to complete on + given CUDA streams. 
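+
+    Args:
+        trace_name, name (str): labels used in debug log messages.
+        sleep_interval (float): polling interval in seconds while waiting
+            for the recorded end events to complete.
+        streams (List[torch.cuda.Stream]): streams to synchronize on; when
+            empty or None the current stream is used, and None entries are
+            replaced by the current stream.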
+ + """ + if not torch.cuda.is_available(): + yield + return + + stream_before_context_switch = torch.cuda.current_stream() + if not streams: + streams = [stream_before_context_switch] + else: + streams = [s if s else stream_before_context_switch for s in streams] + + end_events = [ + torch.cuda.Event(enable_timing=DEBUG_COMPLETED_TIME) for _ in streams + ] + + if DEBUG_COMPLETED_TIME: + start = torch.cuda.Event(enable_timing=True) + stream_before_context_switch.record_event(start) + + cpu_start = time.monotonic() + logger.debug('%s %s starting, streams: %s', trace_name, name, streams) + grad_enabled_before = torch.is_grad_enabled() + try: + yield + finally: + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_end = time.monotonic() + for i, stream in enumerate(streams): + event = end_events[i] + stream.record_event(event) + + grad_enabled_after = torch.is_grad_enabled() + + # observed change of torch.is_grad_enabled() during concurrent run of + # async_test_bboxes code + assert grad_enabled_before == grad_enabled_after, \ + 'Unexpected is_grad_enabled() value change' + + are_done = [e.query() for e in end_events] + logger.debug('%s %s completed: %s streams: %s', trace_name, name, + are_done, streams) + with torch.cuda.stream(stream_before_context_switch): + while not all(are_done): + await asyncio.sleep(sleep_interval) + are_done = [e.query() for e in end_events] + logger.debug('%s %s completed: %s streams: %s', trace_name, + name, are_done, streams) + + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_time = (cpu_end - cpu_start) * 1000 + stream_times_ms = '' + for i, stream in enumerate(streams): + elapsed_time = start.elapsed_time(end_events[i]) + stream_times_ms += ' {stream} {elapsed_time:.2f} ms'.format( + stream, elapsed_time) + logger.info('{trace_name} {name} cpu_time {cpu_time:.2f} ms', + trace_name, name, cpu_time, stream_times_ms) + + +@contextlib.asynccontextmanager +async def concurrent(streamqueue: asyncio.Queue, + trace_name='concurrent', + name='stream'): + """Run code concurrently in different streams. + + :param streamqueue: asyncio.Queue instance. + + Queue tasks define the pool of streams used for concurrent execution. 
+ + """ + if not torch.cuda.is_available(): + yield + return + + initial_stream = torch.cuda.current_stream() + + with torch.cuda.stream(initial_stream): + stream = await streamqueue.get() + assert isinstance(stream, torch.cuda.Stream) + + try: + with torch.cuda.stream(stream): + logger.debug('%s %s is starting, stream: %s', trace_name, name, + stream) + yield + current = torch.cuda.current_stream() + assert current == stream + logger.debug('%s %s has finished, stream: %s', trace_name, + name, stream) + finally: + streamqueue.task_done() + streamqueue.put_nowait(stream) diff --git a/mmdet3d/core/utils/dist_utils.py b/mmdet3d/core/utils/dist_utils.py new file mode 100644 index 0000000000..249f71b342 --- /dev/null +++ b/mmdet3d/core/utils/dist_utils.py @@ -0,0 +1,58 @@ +from collections import OrderedDict + +import torch.distributed as dist +from mmcv.runner import OptimizerHook +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + + +def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): + if bucket_size_mb > 0: + bucket_size_bytes = bucket_size_mb * 1024 * 1024 + buckets = _take_tensors(tensors, bucket_size_bytes) + else: + buckets = OrderedDict() + for tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + buckets = buckets.values() + + for bucket in buckets: + flat_tensors = _flatten_dense_tensors(bucket) + dist.all_reduce(flat_tensors) + flat_tensors.div_(world_size) + for tensor, synced in zip( + bucket, _unflatten_dense_tensors(flat_tensors, bucket)): + tensor.copy_(synced) + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + grads = [ + param.grad.data for param in params + if param.requires_grad and param.grad is not None + ] + world_size = dist.get_world_size() + if coalesce: + _allreduce_coalesced(grads, world_size, bucket_size_mb) + else: + for tensor in grads: + dist.all_reduce(tensor.div_(world_size)) + + +class DistOptimizerHook(OptimizerHook): + + def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + + def after_train_iter(self, runner): + runner.optimizer.zero_grad() + runner.outputs['loss'].backward() + # allreduce_grads(runner.model.parameters(), self.coalesce, + # self.bucket_size_mb) + if self.grad_clip is not None: + self.clip_grads(runner.model.parameters()) + runner.optimizer.step() diff --git a/mmdet3d/core/utils/kitti_utils.py b/mmdet3d/core/utils/kitti_utils.py new file mode 100644 index 0000000000..53c8800f85 --- /dev/null +++ b/mmdet3d/core/utils/kitti_utils.py @@ -0,0 +1,69 @@ +import numpy as np +import scipy +import torch +from scipy.spatial import Delaunay + + +def in_hull(p, hull): + """ + :param p: (N, K) test points + :param hull: (M, K) M corners of a box + :return (N) bool + """ + try: + if not isinstance(hull, Delaunay): + hull = Delaunay(hull) + flag = hull.find_simplex(p) >= 0 + except scipy.spatial.qhull.QhullError: + print('Warning: not a hull %s' % str(hull)) + flag = np.zeros(p.shape[0], dtype=np.bool) + + return flag + + +def enlarge_box3d(boxes3d, extra_width): + """ + :param boxes3d: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coords + """ + if isinstance(boxes3d, np.ndarray): + large_boxes3d = boxes3d.copy() + else: + large_boxes3d = boxes3d.clone() + large_boxes3d[:, 3:6] += extra_width * 2 + # bugfixed: here should be minus, not addion in LiDAR, 20190508 + large_boxes3d[:, 2] -= extra_width + return 
large_boxes3d + + +def rotate_pc_along_z(pc, rot_angle): + """ + params pc: (N, 3+C), (N, 3) is in the LiDAR coordinate + params rot_angle: rad scalar + Output pc: updated pc with XYZ rotated + """ + cosval = np.cos(rot_angle) + sinval = np.sin(rot_angle) + rotmat = np.array([[cosval, -sinval], [sinval, cosval]]) + pc[:, 0:2] = np.dot(pc[:, 0:2], rotmat) + return pc + + +def rotate_pc_along_z_torch(pc, rot_angle): + """ + :param pc: (N, 512, 3 + C) in the LiDAR coordinate + :param rot_angle: (N) + :return: + TODO: merge with rotate_pc_along_y_torch in bbox_transform.py + """ + cosa = torch.cos(rot_angle).view(-1, 1) # (N, 1) + sina = torch.sin(rot_angle).view(-1, 1) # (N, 1) + + raw_1 = torch.cat([cosa, -sina], dim=1) # (N, 2) + raw_2 = torch.cat([sina, cosa], dim=1) # (N, 2) + R = torch.cat((raw_1.unsqueeze(dim=1), raw_2.unsqueeze(dim=1)), + dim=1) # (N, 2, 2) + + pc_temp = pc[:, :, 0:2] # (N, 512, 2) + + pc[:, :, 0:2] = torch.matmul(pc_temp, R) # (N, 512, 2) + return pc diff --git a/mmdet3d/core/utils/misc.py b/mmdet3d/core/utils/misc.py new file mode 100644 index 0000000000..a63170636e --- /dev/null +++ b/mmdet3d/core/utils/misc.py @@ -0,0 +1,65 @@ +from functools import partial + +import mmcv +import numpy as np +import torch +import torch.nn.functional as F +from six.moves import map, zip + + +def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): + num_imgs = tensor.size(0) + mean = np.array(mean, dtype=np.float32) + std = np.array(std, dtype=np.float32) + imgs = [] + for img_id in range(num_imgs): + img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0) + img = mmcv.imdenormalize( + img, mean, std, to_bgr=to_rgb).astype(np.uint8) + imgs.append(np.ascontiguousarray(img)) + return imgs + + +def multi_apply(func, *args, **kwargs): + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def unmap(data, count, inds, fill=0): + """ Unmap a subset of item (data) back to the original set of items (of + size count) """ + if data.dim() == 1: + ret = data.new_full((count, ), fill) + ret[inds] = data + else: + new_size = (count, ) + data.size()[1:] + ret = data.new_full(new_size, fill) + ret[inds, :] = data + return ret + + +def merge_batch(data): + for key, elems in data.items(): + if key in ['voxels', 'num_points', 'voxel_labels', 'voxel_centers']: + data[key]._data[0] = torch.cat(elems._data[0], dim=0) + elif key == 'coors': + coors = [] + for i, coor in enumerate(elems._data[0]): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors.append(coor_pad) + data[key]._data[0] = torch.cat(coors, dim=0) + return data + + +def merge_hook_batch(data): + for key, elems in data.items(): + if key in ['voxels', 'num_points', 'voxel_labels', 'voxel_centers']: + data[key] = torch.cat(elems, dim=0) + elif key == 'coors': + coors = [] + for i, coor in enumerate(elems): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors.append(coor_pad) + data[key] = torch.cat(coors, dim=0) + return data diff --git a/mmdet3d/core/voxel/__init__.py b/mmdet3d/core/voxel/__init__.py new file mode 100644 index 0000000000..7324f2521a --- /dev/null +++ b/mmdet3d/core/voxel/__init__.py @@ -0,0 +1,4 @@ +from .builder import build_voxel_generator +from .voxel_generator import VoxelGenerator + +__all__ = ['build_voxel_generator', 'VoxelGenerator'] diff --git a/mmdet3d/core/voxel/builder.py b/mmdet3d/core/voxel/builder.py new file mode 100644 index 0000000000..cc311a3fe2 --- /dev/null +++ 
b/mmdet3d/core/voxel/builder.py @@ -0,0 +1,14 @@ +import mmcv + +from . import voxel_generator + + +def build_voxel_generator(cfg, **kwargs): + if isinstance(cfg, voxel_generator.VoxelGenerator): + return cfg + elif isinstance(cfg, dict): + return mmcv.runner.obj_from_dict( + cfg, voxel_generator, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) diff --git a/mmdet3d/core/voxel/voxel_generator.py b/mmdet3d/core/voxel/voxel_generator.py new file mode 100644 index 0000000000..c21afd9be7 --- /dev/null +++ b/mmdet3d/core/voxel/voxel_generator.py @@ -0,0 +1,207 @@ +import numba +import numpy as np + + +class VoxelGenerator(object): + + def __init__(self, + voxel_size, + point_cloud_range, + max_num_points, + max_voxels=20000): + point_cloud_range = np.array(point_cloud_range, dtype=np.float32) + # [0, -40, -3, 70.4, 40, 1] + voxel_size = np.array(voxel_size, dtype=np.float32) + grid_size = (point_cloud_range[3:] - + point_cloud_range[:3]) / voxel_size + grid_size = np.round(grid_size).astype(np.int64) + + self._voxel_size = voxel_size + self._point_cloud_range = point_cloud_range + self._max_num_points = max_num_points + self._max_voxels = max_voxels + self._grid_size = grid_size + + def generate(self, points): + return points_to_voxel(points, self._voxel_size, + self._point_cloud_range, self._max_num_points, + True, self._max_voxels) + + @property + def voxel_size(self): + return self._voxel_size + + @property + def max_num_points_per_voxel(self): + return self._max_num_points + + @property + def point_cloud_range(self): + return self._point_cloud_range + + @property + def grid_size(self): + return self._grid_size + + +def points_to_voxel(points, + voxel_size, + coors_range, + max_points=35, + reverse_index=True, + max_voxels=20000): + """convert kitti points(N, >=3) to voxels. This version calculate + everything in one loop. now it takes only 4.2ms(complete point cloud) + with jit and 3.2ghz cpu.(don't calculate other features) + + Args: + points: [N, ndim] float tensor. points[:, :3] contain xyz points and + points[:, 3:] contain other information such as reflectivity. + voxel_size: [3] list/tuple or array, float. xyz, indicate voxel size + coors_range: [6] list/tuple or array, float. indicate voxel range. + format: xyzxyz, minmax + max_points: int. indicate maximum points contained in a voxel. + reverse_index: boolean. indicate whether return reversed coordinates. + if points has xyz format and reverse_index is True, output + coordinates will be zyx format, but points in features always + xyz format. + max_voxels: int. indicate maximum voxels this function create. + for second, 20000 is a good choice. you should shuffle points + before call this function because max_voxels may drop some points. + + Returns: + voxels: [M, max_points, ndim] float tensor. only contain points. + coordinates: [M, 3] int32 tensor. + num_points_per_voxel: [M] int32 tensor. + """ + if not isinstance(voxel_size, np.ndarray): + voxel_size = np.array(voxel_size, dtype=points.dtype) + if not isinstance(coors_range, np.ndarray): + coors_range = np.array(coors_range, dtype=points.dtype) + voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size + voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist()) + if reverse_index: + voxelmap_shape = voxelmap_shape[::-1] + # don't create large array in jit(nopython=True) code. 
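+    # The output buffers below are allocated once here in plain Python and
+    # filled in place by the numba kernels; only the first `voxel_num`
+    # entries are valid and are sliced out before returning.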
+ num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32) + coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32) + voxels = np.zeros( + shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype) + coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32) + if reverse_index: + voxel_num = _points_to_voxel_reverse_kernel( + points, voxel_size, coors_range, num_points_per_voxel, + coor_to_voxelidx, voxels, coors, max_points, max_voxels) + + else: + voxel_num = _points_to_voxel_kernel(points, voxel_size, coors_range, + num_points_per_voxel, + coor_to_voxelidx, voxels, coors, + max_points, max_voxels) + + coors = coors[:voxel_num] + voxels = voxels[:voxel_num] + num_points_per_voxel = num_points_per_voxel[:voxel_num] + + return voxels, coors, num_points_per_voxel + + +@numba.jit(nopython=True) +def _points_to_voxel_reverse_kernel(points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000): + # put all computations to one loop. + # we shouldn't create large array in main jit code, otherwise + # reduce performance + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + ndim_minus_1 = ndim - 1 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # np.round(grid_size) + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + coor = np.zeros(shape=(3, ), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[ndim_minus_1 - j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + break + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num + + +@numba.jit(nopython=True) +def _points_to_voxel_kernel(points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000): + # need mutex if write in cuda, but numba.cuda don't support mutex. + # in addition, pytorch don't support cuda in dataloader. + # put all computations to one loop. 
+ # we shouldn't create large array in main jit code, otherwise + # decrease performance + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + + # lower_bound = coors_range[:3] + # upper_bound = coors_range[3:] + coor = np.zeros(shape=(3, ), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + break + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num diff --git a/mmdet3d/datasets/__init__.py b/mmdet3d/datasets/__init__.py new file mode 100644 index 0000000000..55c93a06b5 --- /dev/null +++ b/mmdet3d/datasets/__init__.py @@ -0,0 +1,16 @@ +from mmdet.datasets.registry import DATASETS +from .builder import build_dataset +from .coco import CocoDataset +from .dataset_wrappers import ConcatDataset, RepeatDataset +from .kitti2d_dataset import Kitti2DDataset +from .kitti_dataset import KittiDataset +from .loader import DistributedGroupSampler, GroupSampler, build_dataloader +from .nuscenes2d_dataset import NuScenes2DDataset +from .nuscenes_dataset import NuScenesDataset + +__all__ = [ + 'KittiDataset', 'GroupSampler', 'DistributedGroupSampler', + 'build_dataloader', 'ConcatDataset', 'RepeatDataset', 'DATASETS', + 'build_dataset', 'CocoDataset', 'Kitti2DDataset', 'NuScenesDataset', + 'NuScenes2DDataset' +] diff --git a/mmdet3d/datasets/builder.py b/mmdet3d/datasets/builder.py new file mode 100644 index 0000000000..e9ef97abc1 --- /dev/null +++ b/mmdet3d/datasets/builder.py @@ -0,0 +1,45 @@ +import copy + +from mmdet.datasets import ConcatDataset, RepeatDataset +from mmdet.utils import build_from_cfg +from .dataset_wrappers import RepeatFactorDataset +from .registry import DATASETS + + +def _concat_dataset(cfg, default_args=None): + ann_files = cfg['ann_file'] + img_prefixes = cfg.get('img_prefix', None) + seg_prefixes = cfg.get('seg_prefix', None) + proposal_files = cfg.get('proposal_file', None) + + datasets = [] + num_dset = len(ann_files) + for i in range(num_dset): + data_cfg = copy.deepcopy(cfg) + data_cfg['ann_file'] = ann_files[i] + if isinstance(img_prefixes, (list, tuple)): + data_cfg['img_prefix'] = img_prefixes[i] + if isinstance(seg_prefixes, (list, tuple)): + data_cfg['seg_prefix'] = seg_prefixes[i] + if isinstance(proposal_files, (list, tuple)): + data_cfg['proposal_file'] = proposal_files[i] + datasets.append(build_dataset(data_cfg, default_args)) + + return ConcatDataset(datasets) + + +def build_dataset(cfg, default_args=None): + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + build_dataset(cfg['dataset'], default_args), cfg['times']) + elif cfg['type'] == 'RepeatFactorDataset': + dataset = RepeatFactorDataset( + build_dataset(cfg['dataset'], default_args), cfg['repeat_thr']) + elif isinstance(cfg.get('ann_file'), 
(list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset diff --git a/mmdet3d/datasets/dataset_wrappers.py b/mmdet3d/datasets/dataset_wrappers.py new file mode 100644 index 0000000000..ca99192583 --- /dev/null +++ b/mmdet3d/datasets/dataset_wrappers.py @@ -0,0 +1,103 @@ +import math +from collections import defaultdict + +import numpy as np + +from mmdet.datasets import DATASETS + + +# Modified from https://github.com/facebookresearch/detectron2/blob/41d475b75a230221e21d9cac5d69655e3415e3a4/detectron2/data/samplers/distributed_sampler.py#L57 # noqa +@DATASETS.register_module +class RepeatFactorDataset(object): + """A wrapper of repeated dataset with repeat factor. + + Suitable for training on class imbalanced datasets like LVIS. In each + epoch, an image may appear multiple times based on its "repeat factor". + The repeat factor for an image is a function of the frequency the rarest + category labeled in that image. The "frequency of category c" in [0, 1] + is defined as the fraction of images in the training set (without repeats) + in which category c appears. + This wrapper will finally be merged into LVIS dataset. + + See https://arxiv.org/abs/1908.03195 (>= v2) Appendix B.2. + Args: + dataset (:obj:`Dataset`): The dataset to be repeated. + repeat_thr (float): frequency threshold below which data is repeated. + """ + + def __init__(self, dataset, repeat_thr): + self.dataset = dataset + self.repeat_thr = repeat_thr + self.CLASSES = dataset.CLASSES + + repeat_factors = self._get_repeat_factors(dataset, repeat_thr) + repeat_indices = [] + for dataset_index, repeat_factor in enumerate(repeat_factors): + repeat_indices.extend([dataset_index] * math.ceil(repeat_factor)) + self.repeat_indices = repeat_indices + + flags = [] + if hasattr(self.dataset, 'flag'): + for flag, repeat_factor in zip(self.dataset.flag, repeat_factors): + flags.extend([flag] * int(math.ceil(repeat_factor))) + assert len(flags) == len(repeat_indices) + self.flag = np.asarray(flags, dtype=np.uint8) + + def _get_repeat_factors(self, dataset, repeat_thr): + # 1. For each category c, compute the fraction # of images + # that contain it: f(c) + category_freq = defaultdict(int) + for idx, img_info in enumerate(dataset.data_infos): + if 'category_ids' in img_info: + cat_ids = set(img_info['category_ids']) + elif 'gt_names' in img_info: + cat_ids = set([ + gt for gt in img_info['gt_names'] + if gt in dataset.class_names + ]) + else: + labels = dataset.get_ann_info(idx)['labels'] + cat_ids = set([label for label in labels]) + for cat_id in cat_ids: + category_freq[cat_id] += 1 + num_images = len(dataset) + for k, v in category_freq.items(): + category_freq[k] = v / num_images + + # 2. For each category c, compute the category-level repeat factor: + # r(c) = max(1, sqrt(t / f(c))) + category_repeat = { + cat_id: max(1.0, math.sqrt(repeat_thr / cat_freq)) + for cat_id, cat_freq in category_freq.items() + } + + # 3. 
For each image I, compute the image-level repeat factor: + # r(I) = max_{c in I} r(c) + repeat_factors = [] + for idx, img_info in enumerate(dataset.data_infos): + if 'category_ids' in img_info: + cat_ids = set(img_info['category_ids']) + elif 'gt_names' in img_info: + cat_ids = set([ + gt for gt in img_info['gt_names'] + if gt in dataset.class_names + ]) + else: + labels = dataset.get_ann_info(idx)['labels'] + cat_ids = set([label for label in labels]) + + if len(cat_ids) == 0: + repeat_factor = 1 + else: + repeat_factor = max( + {category_repeat[cat_id] + for cat_id in cat_ids}) + repeat_factors.append(repeat_factor) + return repeat_factors + + def __getitem__(self, idx): + ori_index = self.repeat_indices[idx] + return self.dataset[ori_index] + + def __len__(self): + return len(self.repeat_indices) diff --git a/mmdet3d/datasets/kitti2d_dataset.py b/mmdet3d/datasets/kitti2d_dataset.py new file mode 100644 index 0000000000..ffcad3c413 --- /dev/null +++ b/mmdet3d/datasets/kitti2d_dataset.py @@ -0,0 +1,143 @@ +import mmcv +import numpy as np + +from mmdet.datasets import DATASETS, CustomDataset + + +@DATASETS.register_module +class Kitti2DDataset(CustomDataset): + + CLASSES = ('car', 'pedestrian', 'cyclist') + """ + Annotation format: + [ + { + 'image': { + 'image_idx': 0, + 'image_path': 'training/image_2/000000.png', + 'image_shape': array([ 370, 1224], dtype=int32) + }, + 'point_cloud': { + 'num_features': 4, + 'velodyne_path': 'training/velodyne/000000.bin' + }, + 'calib': { + 'P0': (4, 4), + 'P1': (4, 4), + 'P2': (4, 4), + 'P3': (4, 4), + 'R0_rect':4x4 np.array, + 'Tr_velo_to_cam': 4x4 np.array, + 'Tr_imu_to_velo': 4x4 np.array + }, + 'annos': { + 'name': (n), + 'truncated': (n), + 'occluded': (n), + 'alpha': (n), + 'bbox': (n, 4), + 'dimensions': (n, 3), + 'location': (n, 3), + 'rotation_y': (n), + 'score': (n), + 'index': array([0], dtype=int32), + 'group_ids': array([0], dtype=int32), + 'difficulty': array([0], dtype=int32), + 'num_points_in_gt': (n), + } + } + ] + """ + + def load_annotations(self, ann_file): + self.data_infos = mmcv.load(ann_file) + self.cat2label = { + cat_name: i + for i, cat_name in enumerate(self.class_names) + } + return self.data_infos + + def _filter_imgs(self, min_size=32): + """Filter images without ground truths.""" + valid_inds = [] + for i, img_info in enumerate(self.data_infos): + if len(img_info['annos']['name']) > 0: + valid_inds.append(i) + return valid_inds + + def get_ann_info(self, index): + # Use index to get the annos, thus the evalhook could also use this api + info = self.data_infos[index] + annos = info['annos'] + gt_names = annos['name'] + gt_bboxes = annos['bbox'] + difficulty = annos['difficulty'] + + # remove classes that is not needed + selected = self.keep_arrays_by_name(gt_names, self.CLASSES) + gt_bboxes = gt_bboxes[selected] + gt_names = gt_names[selected] + difficulty = difficulty[selected] + gt_labels = np.array([self.cat2label[n] for n in gt_names]) + + anns_results = dict( + bboxes=gt_bboxes.astype(np.float32), + labels=gt_labels, + ) + return anns_results + + def prepare_train_img(self, idx): + img_raw_info = self.data_infos[idx]['image'] + img_info = dict(filename=img_raw_info['image_path']) + ann_info = self.get_ann_info(idx) + if len(ann_info['bboxes']) == 0: + return None + results = dict(img_info=img_info, ann_info=ann_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + def prepare_test_img(self, idx): + img_raw_info = 
self.data_infos[idx]['image'] + img_info = dict(filename=img_raw_info['image_path']) + results = dict(img_info=img_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. + In kitti's pcd, they are all the same, thus are all zeros + """ + self.flag = np.zeros(len(self), dtype=np.uint8) + + def drop_arrays_by_name(self, gt_names, used_classes): + inds = [i for i, x in enumerate(gt_names) if x not in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def keep_arrays_by_name(self, gt_names, used_classes): + inds = [i for i, x in enumerate(gt_names) if x in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def reformat_bbox(self, outputs, out=None): + from mmdet3d.core.bbox.transforms import bbox2result_kitti2d + sample_idx = [info['image']['image_idx'] for info in self.data_infos] + result_files = bbox2result_kitti2d(outputs, self.CLASSES, sample_idx, + out) + return result_files + + def evaluate(self, result_files, eval_types=None): + from mmdet3d.core.evaluation import kitti_eval + eval_types = ['bbox'] if not eval_types else eval_types + assert eval_types in ('bbox', ['bbox' + ]), 'KITTI data set only evaluate bbox' + gt_annos = [info['annos'] for info in self.data_infos] + ap_result_str, ap_dict = kitti_eval( + gt_annos, result_files, self.CLASSES, eval_types=['bbox']) + return ap_result_str, ap_dict diff --git a/mmdet3d/datasets/kitti_dataset.py b/mmdet3d/datasets/kitti_dataset.py new file mode 100644 index 0000000000..796eea7b80 --- /dev/null +++ b/mmdet3d/datasets/kitti_dataset.py @@ -0,0 +1,579 @@ +import copy +import os +import pickle + +import mmcv +import numpy as np +import torch +import torch.utils.data as torch_data + +from mmdet.datasets.registry import DATASETS +from ..core.bbox import box_np_ops +from .pipelines import Compose +from .utils import remove_dontcare + + +@DATASETS.register_module +class KittiDataset(torch_data.Dataset): + + CLASSES = ('car', 'pedestrian', 'cyclist') + + def __init__(self, + root_path, + ann_file, + split, + pipeline=None, + training=False, + class_names=None, + modality=None, + with_label=True, + test_mode=False): + """ + :param root_path: KITTI data path + :param split: + """ + super().__init__() + self.root_path = root_path + self.root_split_path = os.path.join( + self.root_path, 'training' if split != 'test' else 'testing') + self.class_names = class_names if class_names else self.CLASSES + self.modality = modality + self.with_label = with_label + assert self.modality is not None + self.modality = modality + self.test_mode = test_mode + # TODO: rm the key training if it is not needed + self.training = training + self.pcd_limit_range = [0, -40, -3, 70.4, 40, 0.0] + + self.ann_file = ann_file + with open(ann_file, 'rb') as f: + self.kitti_infos = pickle.load(f) + + # set group flag for the sampler + if not self.test_mode: + self._set_group_flag() + + # processing pipeline + if pipeline is not None: + self.pipeline = Compose(pipeline) + + def __getitem__(self, idx): + if self.test_mode: + return self.prepare_test_data(idx) + while True: + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def prepare_train_data(self, index): + input_dict = self.get_sensor_data(index) + input_dict = 
self.train_pre_pipeline(input_dict) + if input_dict is None: + return None + example = self.pipeline(input_dict) + if example is None or len(example['gt_bboxes_3d']._data) == 0: + return None + return example + + def train_pre_pipeline(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_bboxes = input_dict['gt_bboxes'] + gt_names = input_dict['gt_names'] + difficulty = input_dict['difficulty'] + input_dict['bbox_fields'] = [] + + selected = self.drop_arrays_by_name(gt_names, ['DontCare']) + # selected = self.keep_arrays_by_name(gt_names, self.class_names) + gt_bboxes_3d = gt_bboxes_3d[selected] + gt_bboxes = gt_bboxes[selected] + gt_names = gt_names[selected] + difficulty = difficulty[selected] + gt_bboxes_mask = np.array([n in self.class_names for n in gt_names], + dtype=np.bool_) + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d.astype('float32') + input_dict['gt_bboxes'] = gt_bboxes.astype('float32') + input_dict['gt_names'] = gt_names + input_dict['gt_names_3d'] = copy.deepcopy(gt_names) + input_dict['difficulty'] = difficulty + input_dict['gt_bboxes_mask'] = gt_bboxes_mask + input_dict['gt_bboxes_3d_mask'] = copy.deepcopy(gt_bboxes_mask) + input_dict['bbox_fields'].append('gt_bboxes') + if len(gt_bboxes) == 0: + return None + return input_dict + + def prepare_test_data(self, index): + input_dict = self.get_sensor_data(index) + # input_dict = self.test_pre_pipeline(input_dict) + example = self.pipeline(input_dict) + return example + + def test_pre_pipeline(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_bboxes = input_dict['gt_bboxes'] + gt_names = input_dict['gt_names'] + + if gt_bboxes_3d is not None: + selected = self.keep_arrays_by_name(gt_names, self.class_names) + gt_bboxes_3d = gt_bboxes_3d[selected] + gt_bboxes = gt_bboxes[selected] + gt_names = gt_names[selected] + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_bboxes'] = gt_bboxes + input_dict['gt_names'] = gt_names + input_dict['gt_names_3d'] = copy.deepcopy(gt_names) + return input_dict + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. 
+ In kitti's pcd, they are all the same, thus are all zeros + """ + self.flag = np.zeros(len(self), dtype=np.uint8) + + def _rand_another(self, idx): + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def get_lidar(self, idx): + lidar_file = os.path.join(self.root_split_path, 'velodyne', + '%06d.bin' % idx) + assert os.path.exists(lidar_file) + return np.fromfile(lidar_file, dtype=np.float32).reshape(-1, 4) + + def get_lidar_reduced(self, idx): + lidar_file = os.path.join(self.root_split_path, 'velodyne_reduced', + '%06d.bin' % idx) + assert os.path.exists(lidar_file) + return np.fromfile(lidar_file, dtype=np.float32).reshape(-1, 4) + + def get_lidar_depth_reduced(self, idx): + lidar_file = os.path.join(self.root_split_path, + 'velodyne_depth_reduced', '%06d.bin' % idx) + assert os.path.exists(lidar_file) + return np.fromfile(lidar_file, dtype=np.float32).reshape(-1, 4) + + def get_pure_depth_reduced(self, idx): + lidar_file = os.path.join(self.root_split_path, 'depth_reduced', + '%06d.bin' % idx) + assert os.path.exists(lidar_file) + return np.fromfile(lidar_file, dtype=np.float32).reshape(-1, 4) + + def get_depth(self, idx): + depth_file = os.path.join(self.root_split_path, 'depth_completion', + '%06d.png' % idx) + assert os.path.exists(depth_file) + depth_img = mmcv.imread(depth_file, -1) / 256.0 + return depth_img + + def __len__(self): + return len(self.kitti_infos) + + def get_sensor_data(self, index): + info = self.kitti_infos[index] + sample_idx = info['image']['image_idx'] + # TODO: consider use torch.Tensor only + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + P2 = info['calib']['P2'].astype(np.float32) + lidar2img = P2 @ rect @ Trv2c + + if self.modality['use_depth'] and self.modality['use_lidar']: + points = self.get_lidar_depth_reduced(sample_idx) + elif self.modality['use_lidar']: + points = self.get_lidar_reduced(sample_idx) + elif self.modality['use_depth']: + points = self.get_pure_depth_reduced(sample_idx) + else: + assert (self.modality['use_depth'] or self.modality['use_lidar']) + + if not self.modality['use_lidar_intensity']: + points = points[:, :3] + + input_dict = dict( + sample_idx=sample_idx, + points=points, + lidar2img=lidar2img, + ) + + # TODO: support image input + if self.modality['use_camera']: + image_info = info['image'] + image_path = image_info['image_path'] + image_path = os.path.join(self.root_path, image_path) + img = mmcv.imread(image_path) + input_dict.update( + dict( + img=img, + img_shape=img.shape, + ori_shape=img.shape, + filename=image_path)) + else: + input_dict.update(dict(img_shape=info['image']['image_shape'])) + if self.with_label: + annos = self.get_ann_info(index) + input_dict.update(annos) + + return input_dict + + def get_ann_info(self, index): + # Use index to get the annos, thus the evalhook could also use this api + info = self.kitti_infos[index] + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + # P2 = info['calib']['P2'].astype(np.float32) + + annos = info['annos'] + # we need other objects to avoid collision when sample + annos = remove_dontcare(annos) + loc = annos['location'] + dims = annos['dimensions'] + rots = annos['rotation_y'] + gt_names = annos['name'] + # print(gt_names, len(loc)) + gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1).astype(np.float32) + difficulty = annos['difficulty'] + # this change gt_bboxes_3d to velodyne 
coordinates + gt_bboxes_3d = box_np_ops.box_camera_to_lidar(gt_bboxes_3d, rect, + Trv2c) + # only center format is allowed. so we need to convert + # kitti [0.5, 0.5, 0] center to [0.5, 0.5, 0.5] + # box_np_ops.change_box3d_center_(gt_bboxes, [0.5, 0.5, 0], + # [0.5, 0.5, 0.5]) + + # For simplicity gt_bboxes means 2D gt bboxes + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_bboxes=annos['bbox'], + gt_names=gt_names, + difficulty=difficulty) + return anns_results + + def drop_arrays_by_name(self, gt_names, used_classes): + inds = [i for i, x in enumerate(gt_names) if x not in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def keep_arrays_by_name(self, gt_names, used_classes): + inds = [i for i, x in enumerate(gt_names) if x in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def reformat_bbox(self, outputs, out=None): + if not isinstance(outputs[0][0], dict): + sample_idx = [ + info['image']['image_idx'] for info in self.kitti_infos + ] + result_files = self.bbox2result_kitti2d(outputs, self.class_names, + sample_idx, out) + else: + result_files = self.bbox2result_kitti(outputs, self.class_names, + out) + return result_files + + def evaluate(self, result_files, eval_types=None): + from mmdet3d.core.evaluation import kitti_eval + gt_annos = [info['annos'] for info in self.kitti_infos] + if eval_types == 'img_bbox': + ap_result_str, ap_dict = kitti_eval( + gt_annos, result_files, self.class_names, eval_types=['bbox']) + else: + ap_result_str, ap_dict = kitti_eval(gt_annos, result_files, + self.class_names) + return ap_result_str, ap_dict + + def bbox2result_kitti(self, net_outputs, class_names, out=None): + if out: + output_dir = out[:-4] if out.endswith(('.pkl', '.pickle')) else out + result_dir = output_dir + '/data' + mmcv.mkdir_or_exist(result_dir) + + det_annos = [] + print('Converting prediction to KITTI format') + for idx, pred_dicts in enumerate( + mmcv.track_iter_progress(net_outputs)): + annos = [] + info = self.kitti_infos[idx] + image_shape = info['image']['image_shape'][:2] + for i, box_dict in enumerate(pred_dicts): + num_example = 0 + sample_idx = box_dict['sample_idx'] + box_dict = self.convert_valid_bboxes(box_dict, info) + if box_dict['bbox'] is not None or box_dict['bbox'].size.numel( + ) != 0: + box_2d_preds = box_dict['bbox'] + box_preds = box_dict['box3d_camera'] + scores = box_dict['scores'] + box_preds_lidar = box_dict['box3d_lidar'] + label_preds = box_dict['label_preds'] + + anno = { + 'name': [], + 'truncated': [], + 'occluded': [], + 'alpha': [], + 'bbox': [], + 'dimensions': [], + 'location': [], + 'rotation_y': [], + 'score': [] + } + gt_iou = scores * 0 + + for box, box_lidar, bbox, score, label, cur_gt_iou in zip( + box_preds, box_preds_lidar, box_2d_preds, scores, + label_preds, gt_iou): + bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) + bbox[:2] = np.maximum(bbox[:2], [0, 0]) + anno['name'].append(class_names[int(label)]) + anno['truncated'].append(0.0) + anno['occluded'].append(0) + anno['alpha'].append( + -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) + anno['bbox'].append(bbox) + anno['dimensions'].append(box[3:6]) + anno['location'].append(box[:3]) + anno['rotation_y'].append(box[6]) + # anno["gt_iou"].append(cur_gt_iou) + anno['score'].append(score) + + num_example += 1 + + if num_example != 0: + anno = {k: np.stack(v) for k, v in anno.items()} + annos.append(anno) + + if out: + cur_det_file = result_dir + '/%06d.txt' % sample_idx + with open(cur_det_file, 'w') as f: + bbox = anno['bbox'] + 
loc = anno['location'] + dims = anno['dimensions'] # lhw -> hwl + + for idx in range(len(bbox)): + print( + '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' + '{:.4f} {:.4f} {:.4f} ' + '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}' + .format(anno['name'][idx], + anno['alpha'][idx], bbox[idx][0], + bbox[idx][1], bbox[idx][2], + bbox[idx][3], dims[idx][1], + dims[idx][2], dims[idx][0], + loc[idx][0], loc[idx][1], + loc[idx][2], + anno['rotation_y'][idx], + anno['score'][idx]), + file=f) + + if num_example == 0: + annos.append({ + 'name': np.array([]), + 'truncated': np.array([]), + 'occluded': np.array([]), + 'alpha': np.array([]), + 'bbox': np.zeros([0, 4]), + 'dimensions': np.zeros([0, 3]), + 'location': np.zeros([0, 3]), + 'rotation_y': np.array([]), + 'score': np.array([]), + }) + annos[-1]['sample_idx'] = np.array( + [sample_idx] * num_example, dtype=np.int64) + + det_annos += annos + + if out: + if not out.endswith(('.pkl', '.pickle')): + out = '{}.pkl'.format(out) + mmcv.dump(det_annos, out) + print('Result is saved to %s' % out) + + return det_annos + + def bbox2result_kitti2d(self, + net_outputs, + class_names, + sample_ids, + out=None): + """Convert results to kitti format for evaluation and test submission + + Args: + net_outputs (List[array]): list of array storing the bbox and score + class_nanes (List[String]): A list of class names + sample_idx (List[Int]): A list of samples' index, + should have the same length as net_outputs. + + Return: + List([dict]): A list of dict have the kitti format + """ + assert len(net_outputs) == len(sample_ids) + + det_annos = [] + print('Converting prediction to KITTI format') + for i, bboxes_per_sample in enumerate( + mmcv.track_iter_progress(net_outputs)): + annos = [] + anno = dict( + name=[], + truncated=[], + occluded=[], + alpha=[], + bbox=[], + dimensions=[], + location=[], + rotation_y=[], + score=[]) + sample_idx = sample_ids[i] + + num_example = 0 + for label in range(len(bboxes_per_sample)): + bbox = bboxes_per_sample[label] + for i in range(bbox.shape[0]): + anno['name'].append(class_names[int(label)]) + anno['truncated'].append(0.0) + anno['occluded'].append(0) + anno['alpha'].append(0.0) + anno['bbox'].append(bbox[i, :4]) + # set dimensions (height, width, length) to zero + anno['dimensions'].append( + np.zeros(shape=[3], dtype=np.float32)) + # set the 3D translation to (-1000, -1000, -1000) + anno['location'].append( + np.ones(shape=[3], dtype=np.float32) * (-1000.0)) + anno['rotation_y'].append(0.0) + anno['score'].append(bbox[i, 4]) + num_example += 1 + + if num_example == 0: + annos.append( + dict( + name=np.array([]), + truncated=np.array([]), + occluded=np.array([]), + alpha=np.array([]), + bbox=np.zeros([0, 4]), + dimensions=np.zeros([0, 3]), + location=np.zeros([0, 3]), + rotation_y=np.array([]), + score=np.array([]), + )) + else: + anno = {k: np.stack(v) for k, v in anno.items()} + annos.append(anno) + + annos[-1]['sample_idx'] = np.array( + [sample_idx] * num_example, dtype=np.int64) + det_annos += annos + + if out: + # save file in submission format + output_dir = out[:-4] if out.endswith(('.pkl', '.pickle')) else out + result_dir = output_dir + '/data' + mmcv.mkdir_or_exist(result_dir) + out = '{}.pkl'.format(result_dir) + mmcv.dump(det_annos, out) + print('Result is saved to {}'.format(out)) + for i, anno in enumerate(det_annos): + sample_idx = sample_ids[i] + cur_det_file = result_dir + '/%06d.txt' % sample_idx + with open(cur_det_file, 'w') as f: + bbox = anno['bbox'] + loc = anno['location'] + dims = anno['dimensions'][::-1] # 
lhw -> hwl + for idx in range(len(bbox)): + print( + '{} -1 -1 {:4f} {:4f} {:4f} {:4f} {:4f} {:4f} ' + '{:4f} {:4f} {:4f} {:4f} {:4f} {:4f} {:4f}'.format( + anno['name'][idx], + anno['alpha'][idx], + *bbox[idx], # 4 float + *dims[idx], # 3 float + *loc[idx], # 3 float + anno['rotation_y'][idx], + anno['score'][idx]), + file=f, + ) + print('Result is saved to {}'.format(result_dir)) + + return det_annos + + def convert_valid_bboxes(self, box_dict, info): + # TODO: refactor this function + final_box_preds = box_dict['box3d_lidar'] + final_scores = box_dict['scores'] + final_labels = box_dict['label_preds'] + sample_idx = info['image']['image_idx'] + final_box_preds[:, -1] = box_np_ops.limit_period( + final_box_preds[:, -1] - np.pi, offset=0.5, period=np.pi * 2) + + if final_box_preds.shape[0] == 0: + return dict( + bbox=final_box_preds.new_zeros([0, 4]).numpy(), + box3d_camera=final_box_preds.new_zeros([0, 7]).numpy(), + box3d_lidar=final_box_preds.new_zeros([0, 7]).numpy(), + scores=final_box_preds.new_zeros([0]).numpy(), + label_preds=final_box_preds.new_zeros([0, 4]).numpy(), + sample_idx=sample_idx, + ) + + from mmdet3d.core.bbox import box_torch_ops + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + P2 = info['calib']['P2'].astype(np.float32) + img_shape = info['image']['image_shape'] + rect = final_box_preds.new_tensor(rect) + Trv2c = final_box_preds.new_tensor(Trv2c) + P2 = final_box_preds.new_tensor(P2) + + final_box_preds_camera = box_torch_ops.box_lidar_to_camera( + final_box_preds, rect, Trv2c) + locs = final_box_preds_camera[:, :3] + dims = final_box_preds_camera[:, 3:6] + angles = final_box_preds_camera[:, 6] + camera_box_origin = [0.5, 1.0, 0.5] + box_corners = box_torch_ops.center_to_corner_box3d( + locs, dims, angles, camera_box_origin, axis=1) + box_corners_in_image = box_torch_ops.project_to_image(box_corners, P2) + # box_corners_in_image: [N, 8, 2] + minxy = torch.min(box_corners_in_image, dim=1)[0] + maxxy = torch.max(box_corners_in_image, dim=1)[0] + box_2d_preds = torch.cat([minxy, maxxy], dim=1) + # Post-processing + # check final_box_preds_camera + image_shape = final_box_preds.new_tensor(img_shape) + valid_cam_inds = ((final_box_preds_camera[:, 0] < image_shape[1]) & + (final_box_preds_camera[:, 1] < image_shape[0]) & + (final_box_preds_camera[:, 2] > 0) & + (final_box_preds_camera[:, 3] > 0)) + # check final_box_preds + limit_range = final_box_preds.new_tensor(self.pcd_limit_range) + valid_pcd_inds = ((final_box_preds[:, :3] > limit_range[:3]) & + (final_box_preds[:, :3] < limit_range[3:])) + valid_inds = valid_cam_inds & valid_pcd_inds.all(-1) + + if valid_inds.sum() > 0: + return dict( + bbox=box_2d_preds[valid_inds, :].numpy(), + box3d_camera=final_box_preds_camera[valid_inds, :].numpy(), + box3d_lidar=final_box_preds[valid_inds, :].numpy(), + scores=final_scores[valid_inds].numpy(), + label_preds=final_labels[valid_inds].numpy(), + sample_idx=sample_idx, + ) + else: + return dict( + bbox=final_box_preds.new_zeros([0, 4]).numpy(), + box3d_camera=final_box_preds.new_zeros([0, 7]).numpy(), + box3d_lidar=final_box_preds.new_zeros([0, 7]).numpy(), + scores=final_box_preds.new_zeros([0]).numpy(), + label_preds=final_box_preds.new_zeros([0, 4]).numpy(), + sample_idx=sample_idx, + ) diff --git a/mmdet3d/datasets/loader/__init__.py b/mmdet3d/datasets/loader/__init__.py new file mode 100644 index 0000000000..4404615be6 --- /dev/null +++ b/mmdet3d/datasets/loader/__init__.py @@ -0,0 +1,4 @@ +from 
.build_loader import build_dataloader +from .sampler import DistributedGroupSampler, GroupSampler + +__all__ = ['GroupSampler', 'DistributedGroupSampler', 'build_dataloader'] diff --git a/mmdet3d/datasets/loader/build_loader.py b/mmdet3d/datasets/loader/build_loader.py new file mode 100644 index 0000000000..14ff9b1375 --- /dev/null +++ b/mmdet3d/datasets/loader/build_loader.py @@ -0,0 +1,57 @@ +import platform +import random +from functools import partial + +import numpy as np +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from torch.utils.data import DataLoader + +from .sampler import DistributedGroupSampler, DistributedSampler, GroupSampler + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) + + +def build_dataloader(dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + seed=None, + **kwargs): + shuffle = kwargs.get('shuffle', True) + if dist: + rank, world_size = get_dist_info() + if shuffle: + sampler = DistributedGroupSampler(dataset, samples_per_gpu, + world_size, rank) + else: + sampler = DistributedSampler( + dataset, world_size, rank, shuffle=False) + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + pin_memory=False, + worker_init_fn=worker_init_fn if seed is not None else None, + **kwargs) + + return data_loader + + +def worker_init_fn(seed): + np.random.seed(seed) + random.seed(seed) diff --git a/mmdet3d/datasets/loader/sampler.py b/mmdet3d/datasets/loader/sampler.py new file mode 100644 index 0000000000..f3dd996207 --- /dev/null +++ b/mmdet3d/datasets/loader/sampler.py @@ -0,0 +1,164 @@ +from __future__ import division +import math + +import numpy as np +import torch +from mmcv.runner import get_dist_info +from torch.utils.data import DistributedSampler as _DistributedSampler +from torch.utils.data import Sampler + + +class DistributedSampler(_DistributedSampler): + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + super().__init__(dataset, num_replicas=num_replicas, rank=rank) + self.shuffle = shuffle + + def __iter__(self): + # deterministically shuffle based on epoch + if self.shuffle: + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) + + +class GroupSampler(Sampler): + + def __init__(self, dataset, samples_per_gpu=1): + assert hasattr(dataset, 'flag') + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.flag = dataset.flag.astype(np.int64) + self.group_sizes = np.bincount(self.flag) + self.num_samples = 0 + for i, size in enumerate(self.group_sizes): + self.num_samples += int(np.ceil( + size / self.samples_per_gpu)) * self.samples_per_gpu + + 
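+ # num_samples rounds every aspect-ratio group up to a multiple of
+ # samples_per_gpu, so each chunk yielded by __iter__ below only contains
+ # indices from a single group (padding indices are re-drawn from the same
+ # group with replacement)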
def __iter__(self): + indices = [] + for i, size in enumerate(self.group_sizes): + if size == 0: + continue + indice = np.where(self.flag == i)[0] + assert len(indice) == size + np.random.shuffle(indice) + num_extra = int(np.ceil(size / self.samples_per_gpu) + ) * self.samples_per_gpu - len(indice) + indice = np.concatenate( + [indice, np.random.choice(indice, num_extra)]) + indices.append(indice) + indices = np.concatenate(indices) + indices = [ + indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu] + for i in np.random.permutation( + range(len(indices) // self.samples_per_gpu)) + ] + indices = np.concatenate(indices) + indices = indices.astype(np.int64).tolist() + assert len(indices) == self.num_samples + return iter(indices) + + def __len__(self): + return self.num_samples + + +class DistributedGroupSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + .. note:: + Dataset is assumed to be of constant size. + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + """ + + def __init__(self, + dataset, + samples_per_gpu=1, + num_replicas=None, + rank=None): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + + assert hasattr(self.dataset, 'flag') + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + + self.num_samples = 0 + for i, j in enumerate(self.group_sizes): + self.num_samples += int( + math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / + self.num_replicas)) * self.samples_per_gpu + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + for i, size in enumerate(self.group_sizes): + if size > 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + indice = indice[list(torch.randperm(int(size), + generator=g))].tolist() + extra = int( + math.ceil( + size * 1.0 / self.samples_per_gpu / self.num_replicas) + ) * self.samples_per_gpu * self.num_replicas - len(indice) + # pad indice + tmp = indice.copy() + for _ in range(extra // size): + indice.extend(tmp) + indice.extend(tmp[:extra % size]) + indices.extend(indice) + + assert len(indices) == self.total_size + + indices = [ + indices[j] for i in list( + torch.randperm( + len(indices) // self.samples_per_gpu, generator=g)) + for j in range(i * self.samples_per_gpu, (i + 1) * + self.samples_per_gpu) + ] + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset:offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/mmdet3d/datasets/nuscenes2d_dataset.py b/mmdet3d/datasets/nuscenes2d_dataset.py new file mode 100644 index 0000000000..636a55e7ee --- /dev/null +++ b/mmdet3d/datasets/nuscenes2d_dataset.py @@ -0,0 
+1,38 @@ +from pycocotools.coco import COCO + +from mmdet3d.core.evaluation.coco_utils import getImgIds +from mmdet.datasets import DATASETS, CocoDataset + + +@DATASETS.register_module +class NuScenes2DDataset(CocoDataset): + + CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + + def load_annotations(self, ann_file): + if not self.class_names: + self.class_names = self.CLASSES + self.coco = COCO(ann_file) + # send class_names into the get id + # in case we only need to train on several classes + # by default self.class_names = CLASSES + self.cat_ids = self.coco.getCatIds(catNms=self.class_names) + + self.cat2label = { + cat_id: i # + 1 rm +1 here thus the 0-79 are fg, 80 is bg + for i, cat_id in enumerate(self.cat_ids) + } + # send cat ids to the get img id + # in case we only need to train on several classes + if len(self.cat_ids) < len(self.CLASSES): + self.img_ids = getImgIds(self.coco, catIds=self.cat_ids) + else: + self.img_ids = self.coco.getImgIds() + img_infos = [] + for i in self.img_ids: + info = self.coco.loadImgs([i])[0] + info['filename'] = info['file_name'] + img_infos.append(info) + return img_infos diff --git a/mmdet3d/datasets/nuscenes_dataset.py b/mmdet3d/datasets/nuscenes_dataset.py new file mode 100644 index 0000000000..b46f687f62 --- /dev/null +++ b/mmdet3d/datasets/nuscenes_dataset.py @@ -0,0 +1,495 @@ +import copy +import os.path as osp +import tempfile + +import mmcv +import numpy as np +import pyquaternion +import torch.utils.data as torch_data +from nuscenes.utils.data_classes import Box as NuScenesBox + +from mmdet.datasets import DATASETS +from ..core.bbox import box_np_ops +from .pipelines import Compose + + +@DATASETS.register_module +class NuScenesDataset(torch_data.Dataset): + NumPointFeatures = 4 # xyz, timestamp. 
set 4 to use kitti pretrain + NameMapping = { + 'movable_object.barrier': 'barrier', + 'vehicle.bicycle': 'bicycle', + 'vehicle.bus.bendy': 'bus', + 'vehicle.bus.rigid': 'bus', + 'vehicle.car': 'car', + 'vehicle.construction': 'construction_vehicle', + 'vehicle.motorcycle': 'motorcycle', + 'human.pedestrian.adult': 'pedestrian', + 'human.pedestrian.child': 'pedestrian', + 'human.pedestrian.construction_worker': 'pedestrian', + 'human.pedestrian.police_officer': 'pedestrian', + 'movable_object.trafficcone': 'traffic_cone', + 'vehicle.trailer': 'trailer', + 'vehicle.truck': 'truck' + } + DefaultAttribute = { + 'car': 'vehicle.parked', + 'pedestrian': 'pedestrian.moving', + 'trailer': 'vehicle.parked', + 'truck': 'vehicle.parked', + 'bus': 'vehicle.moving', + 'motorcycle': 'cycle.without_rider', + 'construction_vehicle': 'vehicle.parked', + 'bicycle': 'cycle.without_rider', + 'barrier': '', + 'traffic_cone': '', + } + AttrMapping = { + 'cycle.with_rider': 0, + 'cycle.without_rider': 1, + 'pedestrian.moving': 2, + 'pedestrian.standing': 3, + 'pedestrian.sitting_lying_down': 4, + 'vehicle.moving': 5, + 'vehicle.parked': 6, + 'vehicle.stopped': 7, + } + AttrMapping_rev = [ + 'cycle.with_rider', + 'cycle.without_rider', + 'pedestrian.moving', + 'pedestrian.standing', + 'pedestrian.sitting_lying_down', + 'vehicle.moving', + 'vehicle.parked', + 'vehicle.stopped', + ] + CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + + def __init__(self, + ann_file, + pipeline=None, + root_path=None, + class_names=None, + load_interval=1, + with_velocity=True, + test_mode=False, + modality=None, + eval_version='detection_cvpr_2019', + with_label=True, + max_sweeps=10, + filter_empty_gt=True): + super().__init__() + self.data_root = root_path + self.class_names = class_names if class_names else self.CLASSES + self.test_mode = test_mode + self.load_interval = load_interval + self.with_label = with_label + self.max_sweeps = max_sweeps + + self.ann_file = ann_file + data = mmcv.load(ann_file) + self.infos = list(sorted(data['infos'], key=lambda e: e['timestamp'])) + self.infos = self.infos[::load_interval] + self.metadata = data['metadata'] + self.version = self.metadata['version'] + self.with_velocity = with_velocity + self.eval_version = eval_version + from nuscenes.eval.detection.config import config_factory + self.eval_detection_configs = config_factory(self.eval_version) + + if modality is None: + modality = dict( + use_camera=False, + use_lidar=True, + use_radar=False, + use_map=False, + use_external=False, + ) + self.modality = modality + # set group flag for the sampler + if not self.test_mode: + self._set_group_flag() + + # processing pipeline + if pipeline is not None: + self.pipeline = Compose(pipeline) + + # kitti map: nusc det name -> kitti eval name + self._kitti_name_mapping = { + 'car': 'car', + 'pedestrian': 'pedestrian', + } # we only eval these classes in kitti + + def __getitem__(self, idx): + if self.test_mode: + return self.prepare_test_data(idx) + while True: + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. 
+ In kitti's pcd, they are all the same, thus are all zeros + """ + self.flag = np.zeros(len(self), dtype=np.uint8) + + def _rand_another(self, idx): + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def __len__(self): + return len(self.infos) + + def prepare_train_data(self, index): + input_dict = self.get_sensor_data(index) + input_dict = self.train_pre_pipeline(input_dict) + if input_dict is None: + return None + example = self.pipeline(input_dict) + if len(example['gt_bboxes_3d']._data) == 0: + return None + return example + + def train_pre_pipeline(self, input_dict): + if len(input_dict['gt_bboxes_3d']) == 0: + return None + return input_dict + + def prepare_test_data(self, index): + input_dict = self.get_sensor_data(index) + # input_dict = self.test_pre_pipeline(input_dict) + example = self.pipeline(input_dict) + return example + + def test_pre_pipeline(self, input_dict): + gt_names = input_dict['gt_names'] + input_dict['gt_names_3d'] = copy.deepcopy(gt_names) + return input_dict + + def get_sensor_data(self, index): + info = self.infos[index] + points = np.fromfile( + info['lidar_path'], dtype=np.float32, count=-1).reshape([-1, 5]) + # standard protocal modified from SECOND.Pytorch + points[:, 3] /= 255 + points[:, 4] = 0 + sweep_points_list = [points] + ts = info['timestamp'] / 1e6 + + for idx, sweep in enumerate(info['sweeps']): + if idx >= self.max_sweeps: + break + points_sweep = np.fromfile( + sweep['data_path'], dtype=np.float32, + count=-1).reshape([-1, 5]) + sweep_ts = sweep['timestamp'] / 1e6 + points_sweep[:, 3] /= 255 + points_sweep[:, :3] = points_sweep[:, :3] @ sweep[ + 'sensor2lidar_rotation'].T + points_sweep[:, :3] += sweep['sensor2lidar_translation'] + points_sweep[:, 4] = ts - sweep_ts + sweep_points_list.append(points_sweep) + + points = np.concatenate(sweep_points_list, axis=0)[:, [0, 1, 2, 4]] + input_dict = dict( + points=points, + sample_idx=info['token'], + ) + + if self.modality['use_camera']: + # TODO support image + imgs = [] + ori_shapes = [] + image_paths = [] + lidar2img_rts = [] + for cam_type, cam_info in info['cams'].items(): + image_path = cam_info['data_path'] + # image_path = osp.join(self.data_root, image_path) + img = mmcv.imread(image_path) + imgs.append(img) + ori_shapes.append(img.shape) + image_paths.append(image_path) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + input_dict.update( + dict( + img=imgs, + img_shape=ori_shapes, + ori_shape=ori_shapes, + filename=image_paths, + lidar2img=lidar2img_rts, + )) + + if self.with_label: + annos = self.get_ann_info(index) + input_dict.update(annos) + + return input_dict + + def get_ann_info(self, index): + info = self.infos[index] + # filter out bbox containing no points + mask = info['num_lidar_pts'] > 0 + gt_bboxes_3d = info['gt_boxes'][mask] + # the nuscenes box center is [0.5, 0.5, 0.5], we keep it + # the same as KITTI [0.5, 0.5, 0] + box_np_ops.change_box3d_center_(gt_bboxes_3d, [0.5, 0.5, 0.5], + [0.5, 0.5, 0]) + gt_names_3d = info['gt_names'][mask] + + if self.with_velocity: + gt_velocity = 
info['gt_velocity'][mask] + nan_mask = np.isnan(gt_velocity[:, 0]) + gt_velocity[nan_mask] = [0.0, 0.0] + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) + + gt_bboxes_3d_mask = np.array( + [n in self.class_names for n in gt_names_3d], dtype=np.bool_) + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_names_3d=gt_names_3d, + gt_bboxes_3d_mask=gt_bboxes_3d_mask, + ) + return anns_results + + def _format_bbox(self, results, jsonfile_prefix=None): + nusc_annos = {} + mapped_class_names = self.class_names + token2info = {} + for info in self.infos: + token2info[info['token']] = info + print('Start to convert detection format...') + for det in mmcv.track_iter_progress(results): + annos = [] + boxes = output_to_nusc_box(det[0]) + boxes = lidar_nusc_box_to_global(token2info[det[0]['sample_idx']], + boxes, mapped_class_names, + self.eval_detection_configs, + self.eval_version) + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2: + if name in [ + 'car', + 'construction_vehicle', + 'bus', + 'truck', + 'trailer', + ]: + attr = 'vehicle.moving' + elif name in ['bicycle', 'motorcycle']: + attr = 'cycle.with_rider' + else: + attr = NuScenesDataset.DefaultAttribute[name] + else: + if name in ['pedestrian']: + attr = 'pedestrian.standing' + elif name in ['bus']: + attr = 'vehicle.stopped' + else: + attr = NuScenesDataset.DefaultAttribute[name] + + nusc_anno = dict( + sample_token=det[0]['sample_idx'], + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr) + annos.append(nusc_anno) + nusc_annos[det[0]['sample_idx']] = annos + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + mmcv.dump(nusc_submissions, res_path) + return res_path + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + result_name='pts_bbox'): + from nuscenes import NuScenes + from nuscenes.eval.detection.evaluate import NuScenesEval + + output_dir = osp.join(*osp.split(result_path)[:-1]) + nusc = NuScenes( + version=self.version, dataroot=self.data_root, verbose=False) + eval_set_map = { + 'v1.0-mini': 'mini_train', + 'v1.0-trainval': 'val', + } + nusc_eval = NuScenesEval( + nusc, + config=self.eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=False) + nusc_eval.main(render_curves=False) + + # record metrics + metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) + detail = dict() + metric_prefix = '{}_NuScenes'.format(result_name) + for name in self.class_names: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + return detail + + def format_results(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list): Testing results of the dataset. 
+ jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: (result_files, tmp_dir), result_files is a dict containing + the json filepaths, tmp_dir is the temporal directory created + for saving json files when jsonfile_prefix is not specified. + """ + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + if not isinstance(results[0], dict): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + result_files = dict() + for name in results[0]: + print('Formating bboxes of {}'.format(name)) + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox']): + """Evaluation in nuScenes protocol. + + Args: + results (list): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + dict[str: float] + """ + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name]) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files) + + if tmp_dir is not None: + tmp_dir.cleanup() + return results_dict + + +def output_to_nusc_box(detection): + box3d = detection['box3d_lidar'].numpy() + scores = detection['scores'].numpy() + labels = detection['label_preds'].numpy() + # TODO: check whether this is necessary + # with dir_offset & dir_limit in the head + box3d[:, 6] = -box3d[:, 6] - np.pi / 2 + # the trained model is in [0.5, 0.5, 0], + # change them back to nuscenes [0.5, 0.5, 0.5] + box_np_ops.change_box3d_center_(box3d, [0.5, 0.5, 0], [0.5, 0.5, 0.5]) + box_list = [] + for i in range(box3d.shape[0]): + quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box3d[i, 6]) + velocity = (*box3d[i, 7:9], 0.0) + # velo_val = np.linalg.norm(box3d[i, 7:9]) + # velo_ori = box3d[i, 6] + # velocity = ( + # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) + box = NuScenesBox( + box3d[i, :3], + box3d[i, 3:6], + quat, + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list + + +def lidar_nusc_box_to_global(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation'])) + box.translate(np.array(info['lidar2ego_translation'])) + # filter det in ego. 
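+ # boxes whose BEV distance to the ego vehicle exceeds the per-class
+ # evaluation range in eval_configs.class_range are dropped before the
+ # transform to global coordinates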
+ cls_range_map = eval_configs.class_range + radius = np.linalg.norm(box.center[:2], 2) + det_range = cls_range_map[classes[box.label]] + if radius > det_range: + continue + # Move box to global coord system + box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + return box_list diff --git a/mmdet3d/datasets/pipelines/__init__.py b/mmdet3d/datasets/pipelines/__init__.py new file mode 100644 index 0000000000..44863eed45 --- /dev/null +++ b/mmdet3d/datasets/pipelines/__init__.py @@ -0,0 +1,13 @@ +from mmdet.dataset import Compose +from .formating import (Collect, Collect3D, ImageToTensor, ToDataContainer, + ToTensor, Transpose, to_tensor) +from .train_aug import (GlobalRotScale, ObjectNoise, ObjectRangeFilter, + ObjectSample, PointShuffle, PointsRangeFilter, + RandomFlip3D) + +__all__ = [ + 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', + 'Transpose', 'Collect', 'PhotoMetricDistortion', 'ObjectSample', + 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScale', 'PointShuffle', + 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D' +] diff --git a/mmdet3d/datasets/pipelines/data_augment_utils.py b/mmdet3d/datasets/pipelines/data_augment_utils.py new file mode 100644 index 0000000000..268958cef6 --- /dev/null +++ b/mmdet3d/datasets/pipelines/data_augment_utils.py @@ -0,0 +1,326 @@ +import numba +import numpy as np + +from mmdet3d.core.bbox import box_np_ops + + +@numba.njit +def _rotation_box2d_jit_(corners, angle, rot_mat_T): + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + corners[:] = corners @ rot_mat_T + + +@numba.jit(nopython=True) +def box_collision_test(boxes, qboxes, clockwise=True): + N = boxes.shape[0] + K = qboxes.shape[0] + ret = np.zeros((N, K), dtype=np.bool_) + slices = np.array([1, 2, 3, 0]) + lines_boxes = np.stack((boxes, boxes[:, slices, :]), + axis=2) # [N, 4, 2(line), 2(xy)] + lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2) + # vec = np.zeros((2,), dtype=boxes.dtype) + boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes) + qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes) + for i in range(N): + for j in range(K): + # calculate standup first + iw = ( + min(boxes_standup[i, 2], qboxes_standup[j, 2]) - + max(boxes_standup[i, 0], qboxes_standup[j, 0])) + if iw > 0: + ih = ( + min(boxes_standup[i, 3], qboxes_standup[j, 3]) - + max(boxes_standup[i, 1], qboxes_standup[j, 1])) + if ih > 0: + for k in range(4): + for l in range(4): + A = lines_boxes[i, k, 0] + B = lines_boxes[i, k, 1] + C = lines_qboxes[j, l, 0] + D = lines_qboxes[j, l, 1] + acd = (D[1] - A[1]) * (C[0] - + A[0]) > (C[1] - A[1]) * ( + D[0] - A[0]) + bcd = (D[1] - B[1]) * (C[0] - + B[0]) > (C[1] - B[1]) * ( + D[0] - B[0]) + if acd != bcd: + abc = (C[1] - A[1]) * (B[0] - A[0]) > ( + B[1] - A[1]) * ( + C[0] - A[0]) + abd = (D[1] - A[1]) * (B[0] - A[0]) > ( + B[1] - A[1]) * ( + D[0] - A[0]) + if abc != abd: + ret[i, j] = True # collision. + break + if ret[i, j] is True: + break + if ret[i, j] is False: + # now check complete overlap. 
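+ # (the segment-intersection test above cannot detect the case where one
+ # box lies completely inside the other, so additionally check whether all
+ # corners of one box fall on the inner side of every edge of the other)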
+ # box overlap qbox: + box_overlap_qbox = True + for l in range(4): # point l in qboxes + for k in range(4): # corner k in boxes + vec = boxes[i, k] - boxes[i, (k + 1) % 4] + if clockwise: + vec = -vec + cross = vec[1] * ( + boxes[i, k, 0] - qboxes[j, l, 0]) + cross -= vec[0] * ( + boxes[i, k, 1] - qboxes[j, l, 1]) + if cross >= 0: + box_overlap_qbox = False + break + if box_overlap_qbox is False: + break + + if box_overlap_qbox is False: + qbox_overlap_box = True + for l in range(4): # point l in boxes + for k in range(4): # corner k in qboxes + vec = qboxes[j, k] - qboxes[j, (k + 1) % 4] + if clockwise: + vec = -vec + cross = vec[1] * ( + qboxes[j, k, 0] - boxes[i, l, 0]) + cross -= vec[0] * ( + qboxes[j, k, 1] - boxes[i, l, 1]) + if cross >= 0: # + qbox_overlap_box = False + break + if qbox_overlap_box is False: + break + if qbox_overlap_box: + ret[i, j] = True # collision. + else: + ret[i, j] = True # collision. + return ret + + +@numba.njit +def noise_per_box(boxes, valid_mask, loc_noises, rot_noises): + # boxes: [N, 5] + # valid_mask: [N] + # loc_noises: [N, M, 3] + # rot_noises: [N, M] + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + current_corners = np.zeros((4, 2), dtype=boxes.dtype) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes, ), dtype=np.int64) + # print(valid_mask) + for i in range(num_boxes): + if valid_mask[i]: + for j in range(num_tests): + current_corners[:] = box_corners[i] + current_corners -= boxes[i, :2] + _rotation_box2d_jit_(current_corners, rot_noises[i, j], + rot_mat_T) + current_corners += boxes[i, :2] + loc_noises[i, j, :2] + coll_mat = box_collision_test( + current_corners.reshape(1, 4, 2), box_corners) + coll_mat[0, i] = False + # print(coll_mat) + if not coll_mat.any(): + success_mask[i] = j + box_corners[i] = current_corners + break + return success_mask + + +@numba.njit +def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises, + global_rot_noises): + # boxes: [N, 5] + # valid_mask: [N] + # loc_noises: [N, M, 3] + # rot_noises: [N, M] + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + current_corners = np.zeros((4, 2), dtype=boxes.dtype) + current_box = np.zeros((1, 5), dtype=boxes.dtype) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + dst_pos = np.zeros((2, ), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes, ), dtype=np.int64) + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners_norm = corners_norm.reshape(4, 2) + for i in range(num_boxes): + if valid_mask[i]: + for j in range(num_tests): + current_box[0, :] = boxes[i] + current_radius = np.sqrt(boxes[i, 0]**2 + boxes[i, 1]**2) + current_grot = np.arctan2(boxes[i, 0], boxes[i, 1]) + dst_grot = current_grot + global_rot_noises[i, j] + dst_pos[0] = current_radius * np.sin(dst_grot) + dst_pos[1] = current_radius * np.cos(dst_grot) + current_box[0, :2] = dst_pos + current_box[0, -1] += (dst_grot - current_grot) + + rot_sin = np.sin(current_box[0, -1]) + rot_cos = np.cos(current_box[0, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + current_corners[:] = current_box[ + 0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2] + current_corners -= current_box[0, :2] + _rotation_box2d_jit_(current_corners, 
rot_noises[i, j], + rot_mat_T) + current_corners += current_box[0, :2] + loc_noises[i, j, :2] + coll_mat = box_collision_test( + current_corners.reshape(1, 4, 2), box_corners) + coll_mat[0, i] = False + if not coll_mat.any(): + success_mask[i] = j + box_corners[i] = current_corners + loc_noises[i, j, :2] += (dst_pos - boxes[i, :2]) + rot_noises[i, j] += (dst_grot - current_grot) + break + return success_mask + + +def _select_transform(transform, indices): + result = np.zeros((transform.shape[0], *transform.shape[2:]), + dtype=transform.dtype) + for i in range(transform.shape[0]): + if indices[i] != -1: + result[i] = transform[i, indices[i]] + return result + + +@numba.njit +def _rotation_matrix_3d_(rot_mat_T, angle, axis): + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + rot_mat_T[:] = np.eye(3) + if axis == 1: + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 2] = -rot_sin + rot_mat_T[2, 0] = rot_sin + rot_mat_T[2, 2] = rot_cos + elif axis == 2 or axis == -1: + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + elif axis == 0: + rot_mat_T[1, 1] = rot_cos + rot_mat_T[1, 2] = -rot_sin + rot_mat_T[2, 1] = rot_sin + rot_mat_T[2, 2] = rot_cos + + +@numba.njit +def points_transform_(points, centers, point_masks, loc_transform, + rot_transform, valid_mask): + num_box = centers.shape[0] + num_points = points.shape[0] + rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype) + for i in range(num_box): + _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2) + for i in range(num_points): + for j in range(num_box): + if valid_mask[j]: + if point_masks[i, j] == 1: + points[i, :3] -= centers[j, :3] + points[i:i + 1, :3] = points[i:i + 1, :3] @ rot_mat_T[j] + points[i, :3] += centers[j, :3] + points[i, :3] += loc_transform[j] + break # only apply first box's transform + + +@numba.njit +def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask): + num_box = boxes.shape[0] + for i in range(num_box): + if valid_mask[i]: + boxes[i, :3] += loc_transform[i] + boxes[i, 6] += rot_transform[i] + + +def noise_per_object_v3_(gt_boxes, + points=None, + valid_mask=None, + rotation_perturb=np.pi / 4, + center_noise_std=1.0, + global_random_rot_range=np.pi / 4, + num_try=100): + """random rotate or remove each groundtrutn independently. + use kitti viewer to test this function points_transform_ + + Args: + gt_boxes: [N, 7], gt box in lidar.points_transform_ + points: [M, 4], point cloud in lidar. 
+ """ + num_boxes = gt_boxes.shape[0] + if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): + rotation_perturb = [-rotation_perturb, rotation_perturb] + if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): + global_random_rot_range = [ + -global_random_rot_range, global_random_rot_range + ] + enable_grot = np.abs(global_random_rot_range[0] - + global_random_rot_range[1]) >= 1e-3 + + if not isinstance(center_noise_std, (list, tuple, np.ndarray)): + center_noise_std = [ + center_noise_std, center_noise_std, center_noise_std + ] + if valid_mask is None: + valid_mask = np.ones((num_boxes, ), dtype=np.bool_) + center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) + + loc_noises = np.random.normal( + scale=center_noise_std, size=[num_boxes, num_try, 3]) + rot_noises = np.random.uniform( + rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try]) + gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) + grot_lowers = global_random_rot_range[0] - gt_grots + grot_uppers = global_random_rot_range[1] - gt_grots + global_rot_noises = np.random.uniform( + grot_lowers[..., np.newaxis], + grot_uppers[..., np.newaxis], + size=[num_boxes, num_try]) + + origin = [0.5, 0.5, 0] + gt_box_corners = box_np_ops.center_to_corner_box3d( + gt_boxes[:, :3], + gt_boxes[:, 3:6], + gt_boxes[:, 6], + origin=origin, + axis=2) + + # TODO: rewrite this noise box function? + if not enable_grot: + selected_noise = noise_per_box(gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, loc_noises, rot_noises) + else: + selected_noise = noise_per_box_v2_(gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, loc_noises, rot_noises, + global_rot_noises) + + loc_transforms = _select_transform(loc_noises, selected_noise) + rot_transforms = _select_transform(rot_noises, selected_noise) + surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) + if points is not None: + # TODO: replace this points_in_convex function by my tools? 
+ point_masks = box_np_ops.points_in_convex_polygon_3d_jit( + points[:, :3], surfaces) + points_transform_(points, gt_boxes[:, :3], point_masks, loc_transforms, + rot_transforms, valid_mask) + + box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask) diff --git a/mmdet3d/datasets/pipelines/dbsampler.py b/mmdet3d/datasets/pipelines/dbsampler.py new file mode 100644 index 0000000000..f0357d2d40 --- /dev/null +++ b/mmdet3d/datasets/pipelines/dbsampler.py @@ -0,0 +1,509 @@ +import copy +import os +import pickle + +import cv2 +import mmcv +import numpy as np + +from mmdet3d.core.bbox import box_np_ops +from mmdet3d.datasets.pipelines import data_augment_utils +from ..registry import OBJECTSAMPLERS + + +class BatchSampler: + + def __init__(self, + sampled_list, + name=None, + epoch=None, + shuffle=True, + drop_reminder=False): + self._sampled_list = sampled_list + self._indices = np.arange(len(sampled_list)) + if shuffle: + np.random.shuffle(self._indices) + self._idx = 0 + self._example_num = len(sampled_list) + self._name = name + self._shuffle = shuffle + self._epoch = epoch + self._epoch_counter = 0 + self._drop_reminder = drop_reminder + + def _sample(self, num): + if self._idx + num >= self._example_num: + ret = self._indices[self._idx:].copy() + self._reset() + else: + ret = self._indices[self._idx:self._idx + num] + self._idx += num + return ret + + def _reset(self): + assert self._name is not None + # print("reset", self._name) + if self._shuffle: + np.random.shuffle(self._indices) + self._idx = 0 + + def sample(self, num): + indices = self._sample(num) + return [self._sampled_list[i] for i in indices] + + +@OBJECTSAMPLERS.register_module +class DataBaseSampler(object): + + def __init__(self, info_path, root_path, rate, prepare, object_rot_range, + sample_groups, use_road_plane): + super().__init__() + self.root_path = root_path + self.info_path = info_path + self.rate = rate + self.prepare = prepare + self.object_rot_range = object_rot_range + + with open(info_path, 'rb') as f: + db_infos = pickle.load(f) + + # filter database infos + from mmdet3d.apis import get_root_logger + logger = get_root_logger() + for k, v in db_infos.items(): + logger.info(f'load {len(v)} {k} database infos') + for prep_func, val in prepare.items(): + db_infos = getattr(self, prep_func)(db_infos, val) + logger.info('After filter database:') + for k, v in db_infos.items(): + logger.info(f'load {len(v)} {k} database infos') + + self.db_infos = db_infos + + # load sample groups + # TODO: more elegant way to load sample groups + self.sample_groups = [] + for name, num in sample_groups.items(): + self.sample_groups.append({name: int(num)}) + + self.group_db_infos = self.db_infos # just use db_infos + self.sample_classes = [] + self.sample_max_nums = [] + for group_info in self.sample_groups: + self.sample_classes += list(group_info.keys()) + self.sample_max_nums += list(group_info.values()) + + self.sampler_dict = {} + for k, v in self.group_db_infos.items(): + self.sampler_dict[k] = BatchSampler(v, k, shuffle=True) + + self.object_rot_range = object_rot_range + self.object_rot_enable = np.abs(self.object_rot_range[0] - + self.object_rot_range[1]) >= 1e-3 + + # TODO: No group_sampling currently + + @staticmethod + def filter_by_difficulty(db_infos, removed_difficulty): + new_db_infos = {} + for key, dinfos in db_infos.items(): + new_db_infos[key] = [ + info for info in dinfos + if info['difficulty'] not in removed_difficulty + ] + return new_db_infos + + @staticmethod + def 
filter_by_min_points(db_infos, min_gt_points_dict): + for name, min_num in min_gt_points_dict.items(): + min_num = int(min_num) + if min_num > 0: + filtered_infos = [] + for info in db_infos[name]: + if info['num_points_in_gt'] >= min_num: + filtered_infos.append(info) + db_infos[name] = filtered_infos + return db_infos + + def sample_all(self, gt_bboxes, gt_names, img=None): + sampled_num_dict = {} + sample_num_per_class = [] + for class_name, max_sample_num in zip(self.sample_classes, + self.sample_max_nums): + sampled_num = int(max_sample_num - + np.sum([n == class_name for n in gt_names])) + sampled_num = np.round(self.rate * sampled_num).astype(np.int64) + sampled_num_dict[class_name] = sampled_num + sample_num_per_class.append(sampled_num) + + sampled = [] + sampled_gt_bboxes = [] + avoid_coll_boxes = gt_bboxes + + for class_name, sampled_num in zip(self.sample_classes, + sample_num_per_class): + if sampled_num > 0: + sampled_cls = self.sample_class_v2(class_name, sampled_num, + avoid_coll_boxes) + + sampled += sampled_cls + if len(sampled_cls) > 0: + if len(sampled_cls) == 1: + sampled_gt_box = sampled_cls[0]['box3d_lidar'][ + np.newaxis, ...] + else: + sampled_gt_box = np.stack( + [s['box3d_lidar'] for s in sampled_cls], axis=0) + + sampled_gt_bboxes += [sampled_gt_box] + avoid_coll_boxes = np.concatenate( + [avoid_coll_boxes, sampled_gt_box], axis=0) + + ret = None + if len(sampled) > 0: + sampled_gt_bboxes = np.concatenate(sampled_gt_bboxes, axis=0) + # center = sampled_gt_bboxes[:, 0:3] + + num_sampled = len(sampled) + s_points_list = [] + count = 0 + for info in sampled: + file_path = os.path.join( + self.root_path, + info['path']) if self.root_path else info['path'] + s_points = np.fromfile( + file_path, dtype=np.float32).reshape([-1, 4]) + + if 'rot_transform' in info: + rot = info['rot_transform'] + s_points[:, :3] = box_np_ops.rotation_points_single_angle( + s_points[:, :3], rot, axis=2) + s_points[:, :3] += info['box3d_lidar'][:3] + + count += 1 + + s_points_list.append(s_points) + + ret = { + 'gt_names': + np.array([s['name'] for s in sampled]), + 'difficulty': + np.array([s['difficulty'] for s in sampled]), + 'gt_bboxes_3d': + sampled_gt_bboxes, + 'points': + np.concatenate(s_points_list, axis=0), + 'gt_masks': + np.ones((num_sampled, ), dtype=np.bool_), + 'group_ids': + np.arange(gt_bboxes.shape[0], + gt_bboxes.shape[0] + len(sampled)) + } + + return ret + + def sample_class_v2(self, name, num, gt_bboxes): + sampled = self.sampler_dict[name].sample(num) + sampled = copy.deepcopy(sampled) + num_gt = gt_bboxes.shape[0] + num_sampled = len(sampled) + gt_bboxes_bv = box_np_ops.center_to_corner_box2d( + gt_bboxes[:, 0:2], gt_bboxes[:, 3:5], gt_bboxes[:, 6]) + + sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0) + + valid_mask = np.zeros([gt_bboxes.shape[0]], dtype=np.bool_) + valid_mask = np.concatenate( + [valid_mask, + np.ones([sp_boxes.shape[0]], dtype=np.bool_)], axis=0) + boxes = np.concatenate([gt_bboxes, sp_boxes], axis=0).copy() + if self.object_rot_enable: + assert False, 'This part needs to be checked' + # place samples to any place in a circle. 
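+ # this branch never executes because of the assert above; note that
+ # self._global_rot_range used below is not defined in this class
+ # (presumably self.object_rot_range was intended)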
+ # TODO: rm it if not needed + data_augment_utils.noise_per_object_v3_( + boxes, + None, + valid_mask, + 0, + 0, + self._global_rot_range, + num_try=100) + + sp_boxes_new = boxes[gt_bboxes.shape[0]:] + sp_boxes_bv = box_np_ops.center_to_corner_box2d( + sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, 6]) + + total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0) + coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv) + diag = np.arange(total_bv.shape[0]) + coll_mat[diag, diag] = False + + valid_samples = [] + for i in range(num_gt, num_gt + num_sampled): + if coll_mat[i].any(): + coll_mat[i] = False + coll_mat[:, i] = False + else: + if self.object_rot_enable: + assert False, 'This part needs to be checked' + sampled[i - num_gt]['box3d_lidar'][:2] = boxes[i, :2] + sampled[i - num_gt]['box3d_lidar'][-1] = boxes[i, -1] + sampled[i - num_gt]['rot_transform'] = ( + boxes[i, -1] - sp_boxes[i - num_gt, -1]) + valid_samples.append(sampled[i - num_gt]) + return valid_samples + + +@OBJECTSAMPLERS.register_module +class MMDataBaseSampler(DataBaseSampler): + + def __init__(self, + info_path, + root_path, + rate, + prepare, + object_rot_range, + sample_groups, + check_2D_collision=False, + collision_thr=0, + collision_in_classes=False, + depth_consistent=False, + blending_type=None): + super(MMDataBaseSampler, self).__init__( + info_path=info_path, + root_path=root_path, + rate=rate, + prepare=prepare, + object_rot_range=object_rot_range, + sample_groups=sample_groups, + use_road_plane=False, + ) + self.blending_type = blending_type + self.depth_consistent = depth_consistent + self.check_2D_collision = check_2D_collision + self.collision_thr = collision_thr + self.collision_in_classes = collision_in_classes + + def sample_all(self, gt_bboxes_3d, gt_names, gt_bboxes_2d=None, img=None): + sampled_num_dict = {} + sample_num_per_class = [] + for class_name, max_sample_num in zip(self.sample_classes, + self.sample_max_nums): + sampled_num = int(max_sample_num - + np.sum([n == class_name for n in gt_names])) + sampled_num = np.round(self.rate * sampled_num).astype(np.int64) + sampled_num_dict[class_name] = sampled_num + sample_num_per_class.append(sampled_num) + + sampled = [] + sampled_gt_bboxes_3d = [] + sampled_gt_bboxes_2d = [] + avoid_coll_boxes_3d = gt_bboxes_3d + avoid_coll_boxes_2d = gt_bboxes_2d + + for class_name, sampled_num in zip(self.sample_classes, + sample_num_per_class): + if sampled_num > 0: + sampled_cls = self.sample_class_v2(class_name, sampled_num, + avoid_coll_boxes_3d, + avoid_coll_boxes_2d) + + sampled += sampled_cls + if len(sampled_cls) > 0: + if len(sampled_cls) == 1: + sampled_gt_box_3d = sampled_cls[0]['box3d_lidar'][ + np.newaxis, ...] + sampled_gt_box_2d = sampled_cls[0]['box2d_camera'][ + np.newaxis, ...] 
+ else: + sampled_gt_box_3d = np.stack( + [s['box3d_lidar'] for s in sampled_cls], axis=0) + sampled_gt_box_2d = np.stack( + [s['box2d_camera'] for s in sampled_cls], axis=0) + + sampled_gt_bboxes_3d += [sampled_gt_box_3d] + sampled_gt_bboxes_2d += [sampled_gt_box_2d] + if self.collision_in_classes: + # TODO: check whether check collision check among + # classes is necessary + avoid_coll_boxes_3d = np.concatenate( + [avoid_coll_boxes_3d, sampled_gt_box_3d], axis=0) + avoid_coll_boxes_2d = np.concatenate( + [avoid_coll_boxes_2d, sampled_gt_box_2d], axis=0) + + ret = None + if len(sampled) > 0: + sampled_gt_bboxes_3d = np.concatenate(sampled_gt_bboxes_3d, axis=0) + sampled_gt_bboxes_2d = np.concatenate(sampled_gt_bboxes_2d, axis=0) + + num_sampled = len(sampled) + s_points_list = [] + count = 0 + + if self.depth_consistent: + # change the paster order based on distance + center = sampled_gt_bboxes_3d[:, 0:3] + paste_order = np.argsort( + -np.power(np.sum(np.power(center, 2), axis=-1), 1 / 2), + axis=-1) + + for idx in range(len(sampled)): + if self.depth_consistent: + inds = np.where(paste_order == idx)[0][0] + info = sampled[inds] + else: + info = sampled[idx] + pcd_file_path = os.path.join( + self.root_path, + info['path']) if self.root_path else info['path'] + img_file_path = pcd_file_path + '.png' + mask_file_path = pcd_file_path + '.mask.png' + s_points = np.fromfile( + pcd_file_path, dtype=np.float32).reshape([-1, 4]) + s_patch = mmcv.imread(img_file_path) + s_mask = mmcv.imread(mask_file_path, 'grayscale') + + if 'rot_transform' in info: + rot = info['rot_transform'] + s_points[:, :3] = box_np_ops.rotation_points_single_angle( + s_points[:, :3], rot, axis=2) + # TODO: might need to rot 2d bbox in the future + + # the points of each sample already minus the object center + # so this time it needs to add the offset back + s_points[:, :3] += info['box3d_lidar'][:3] + img = self.paste_obj( + img, + s_patch, + s_mask, + bbox_2d=info['box2d_camera'].astype(np.int32)) + + count += 1 + s_points_list.append(s_points) + + ret = dict( + img=img, + gt_names=np.array([s['name'] for s in sampled]), + difficulty=np.array([s['difficulty'] for s in sampled]), + gt_bboxes_3d=sampled_gt_bboxes_3d, + gt_bboxes_2d=sampled_gt_bboxes_2d, + points=np.concatenate(s_points_list, axis=0), + gt_masks=np.ones((num_sampled, ), dtype=np.bool_), + group_ids=np.arange(gt_bboxes_3d.shape[0], + gt_bboxes_3d.shape[0] + len(sampled))) + + return ret + + def paste_obj(self, img, obj_img, obj_mask, bbox_2d): + # paste the image patch back + x1, y1, x2, y2 = bbox_2d + # the bbox might exceed the img size because the img is different + img_h, img_w = img.shape[:2] + w = np.maximum(min(x2, img_w - 1) - x1 + 1, 1) + h = np.maximum(min(y2, img_h - 1) - y1 + 1, 1) + obj_mask = obj_mask[:h, :w] + obj_img = obj_img[:h, :w] + + # choose a blend option + if not self.blending_type: + blending_op = 'none' + + else: + blending_choice = np.random.randint(len(self.blending_type)) + blending_op = self.blending_type[blending_choice] + + if blending_op.find('poisson') != -1: + # options: cv2.NORMAL_CLONE=1, or cv2.MONOCHROME_TRANSFER=3 + # cv2.MIXED_CLONE mixed the texture, thus is not used. 
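+ # in OpenCV these flags are cv2.NORMAL_CLONE == 1, cv2.MIXED_CLONE == 2
+ # and cv2.MONOCHROME_TRANSFER == 3, so np.random.choice([1, 3]) below
+ # picks between normal clone and monochrome transfer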
+ if blending_op == 'poisson': + mode = np.random.choice([1, 3], 1)[0] + elif blending_op == 'poisson_normal': + mode = cv2.NORMAL_CLONE + elif blending_op == 'poisson_transfer': + mode = cv2.MONOCHROME_TRANSFER + else: + raise NotImplementedError + center = (int(x1 + w / 2), int(y1 + h / 2)) + img = cv2.seamlessClone(obj_img, img, obj_mask * 255, center, mode) + else: + if blending_op == 'gaussian': + obj_mask = cv2.GaussianBlur( + obj_mask.astype(np.float32), (5, 5), 2) + elif blending_op == 'box': + obj_mask = cv2.blur(obj_mask.astype(np.float32), (3, 3)) + paste_mask = 1 - obj_mask + img[y1:y1 + h, + x1:x1 + w] = (img[y1:y1 + h, x1:x1 + w].astype(np.float32) * + paste_mask[..., None]).astype(np.uint8) + img[y1:y1 + h, x1:x1 + w] += (obj_img.astype(np.float32) * + obj_mask[..., None]).astype(np.uint8) + return img + + def sample_class_v2(self, name, num, gt_bboxes_3d, gt_bboxes_2d): + sampled = self.sampler_dict[name].sample(num) + sampled = copy.deepcopy(sampled) + num_gt = gt_bboxes_3d.shape[0] + num_sampled = len(sampled) + # avoid collision in BEV first + gt_bboxes_bv = box_np_ops.center_to_corner_box2d( + gt_bboxes_3d[:, 0:2], gt_bboxes_3d[:, 3:5], gt_bboxes_3d[:, 6]) + sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0) + sp_boxes_bv = box_np_ops.center_to_corner_box2d( + sp_boxes[:, 0:2], sp_boxes[:, 3:5], sp_boxes[:, 6]) + total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0) + coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv) + + # Then avoid collision in 2D space + if self.check_2D_collision: + sp_boxes_2d = np.stack([i['box2d_camera'] for i in sampled], + axis=0) + total_bbox_2d = np.concatenate([gt_bboxes_2d, sp_boxes_2d], + axis=0) # Nx4 + # random select a collision threshold + if isinstance(self.collision_thr, float): + collision_thr = self.collision_thr + elif isinstance(self.collision_thr, list): + collision_thr = np.random.choice(self.collision_thr) + elif isinstance(self.collision_thr, dict): + mode = self.collision_thr.get('mode', 'value') + if mode == 'value': + collision_thr = np.random.choice( + self.collision_thr['thr_range']) + elif mode == 'range': + collision_thr = np.random.uniform( + self.collision_thr['thr_range'][0], + self.collision_thr['thr_range'][1]) + + if collision_thr == 0: + # use similar collision test as BEV did + # Nx4 (x1, y1, x2, y2) -> corners: Nx4x2 + # ((x1, y1), (x2, y1), (x1, y2), (x2, y2)) + x1y1 = total_bbox_2d[:, :2] + x2y2 = total_bbox_2d[:, 2:] + x1y2 = np.stack([total_bbox_2d[:, 0], total_bbox_2d[:, 3]], + axis=-1) + x2y1 = np.stack([total_bbox_2d[:, 2], total_bbox_2d[:, 1]], + axis=-1) + total_2d = np.stack([x1y1, x2y1, x1y2, x2y2], axis=1) + coll_mat_2d = data_augment_utils.box_collision_test( + total_2d, total_2d) + else: + # use iof rather than iou to protect the foreground + overlaps = box_np_ops.iou_jit(total_bbox_2d, total_bbox_2d, + 'iof') + coll_mat_2d = overlaps > collision_thr + coll_mat = coll_mat + coll_mat_2d + + diag = np.arange(total_bv.shape[0]) + coll_mat[diag, diag] = False + + valid_samples = [] + for i in range(num_gt, num_gt + num_sampled): + if coll_mat[i].any(): + coll_mat[i] = False + coll_mat[:, i] = False + else: + valid_samples.append(sampled[i - num_gt]) + + return valid_samples diff --git a/mmdet3d/datasets/pipelines/formating.py b/mmdet3d/datasets/pipelines/formating.py new file mode 100644 index 0000000000..14eeaa96e3 --- /dev/null +++ b/mmdet3d/datasets/pipelines/formating.py @@ -0,0 +1,165 @@ +import numpy as np +from mmcv.parallel import DataContainer as DC 
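+ # DataContainer lets MMCV's collate and scatter treat fields differently:
+ # fields wrapped with stack=True are stacked into a batched tensor, while
+ # cpu_only=True fields (the img_meta dicts here) stay on CPU unstacked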
+ +from mmdet.datasets.pipelines import PIPELINES, to_tensor + +PIPELINES._module_dict.pop('DefaultFormatBundle') + + +@PIPELINES.register_module +class DefaultFormatBundle(object): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, + (3)to DataContainer (stack=True) + """ + + def __init__(self, ): + return + + def __call__(self, results): + if 'img' in results: + if isinstance(results['img'], list): + # process multiple imgs in single frame + imgs = [img.transpose(2, 0, 1) for img in results['img']] + imgs = np.ascontiguousarray(np.stack(imgs, axis=0)) + results['img'] = DC(to_tensor(imgs), stack=True) + else: + img = np.ascontiguousarray(results['img'].transpose(2, 0, 1)) + results['img'] = DC(to_tensor(img), stack=True) + for key in [ + 'proposals', 'gt_bboxes', 'gt_bboxes_3d', 'gt_bboxes_ignore', + 'gt_labels', 'gt_labels_3d' + ]: + if key not in results: + continue + if isinstance(results[key], list): + results[key] = DC([to_tensor(res) for res in results[key]]) + else: + results[key] = DC(to_tensor(results[key])) + if 'gt_masks' in results: + results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) + if 'gt_semantic_seg' in results: + results['gt_semantic_seg'] = DC( + to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) + return results + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module +class Collect3D(object): + + def __init__(self, + keys, + pcd_shape=[1, 1600, 1408], + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'pad_shape', 'scale_factor', 'flip', 'pcd_flip', + 'img_norm_cfg', 'rect', 'Trv2c', 'P2', 'pcd_trans', + 'sample_idx', 'pcd_scale_factor', 'pcd_rotation')): + self.keys = keys + self.meta_keys = meta_keys + self.pcd_shape = pcd_shape + + def __call__(self, results): + data = {} + img_meta = {} + for key in self.meta_keys: + if key in results: + img_meta[key] = results[key] + img_meta.update(pcd_shape=self.pcd_shape, pcd_pad_shape=self.pcd_shape) + data['img_meta'] = DC(img_meta, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + return self.__class__.__name__ + '(keys={}, meta_keys={})'.format( + self.keys, self.meta_keys) + + +@PIPELINES.register_module +class DefaultFormatBundle3D(DefaultFormatBundle): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields for voxels, + including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and + "gt_semantic_seg". + These fields are formatted as follows. 
+ + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + """ + + def __init__(self, class_names, with_gt=True, with_label=True): + super(DefaultFormatBundle3D, self).__init__() + self.class_names = class_names + self.with_gt = with_gt + self.with_label = with_label + + def __call__(self, results): + # Format 3D data + for key in [ + 'voxels', 'coors', 'voxel_centers', 'num_points', 'points' + ]: + if key not in results: + continue + results[key] = DC(to_tensor(results[key]), stack=False) + + if self.with_gt: + # Clean GT bboxes in the final + if 'gt_bboxes_3d_mask' in results: + gt_bboxes_3d_mask = results['gt_bboxes_3d_mask'] + results['gt_bboxes_3d'] = results['gt_bboxes_3d'][ + gt_bboxes_3d_mask] + results['gt_names_3d'] = results['gt_names_3d'][ + gt_bboxes_3d_mask] + if 'gt_bboxes_mask' in results: + gt_bboxes_mask = results['gt_bboxes_mask'] + if 'gt_bboxes' in results: + results['gt_bboxes'] = results['gt_bboxes'][gt_bboxes_mask] + results['gt_names'] = results['gt_names'][gt_bboxes_mask] + if self.with_label: + if 'gt_names' in results and len(results['gt_names']) == 0: + results['gt_labels'] = np.array([], dtype=np.int64) + elif 'gt_names' in results and isinstance( + results['gt_names'][0], list): + # gt_labels might be a list of list in multi-view setting + results['gt_labels'] = [ + np.array([self.class_names.index(n) for n in res], + dtype=np.int64) for res in results['gt_names'] + ] + elif 'gt_names' in results: + results['gt_labels'] = np.array([ + self.class_names.index(n) for n in results['gt_names'] + ], + dtype=np.int64) + # we still assume one pipeline for one frame LiDAR + # thus, the 3D name is list[string] + results['gt_labels_3d'] = np.array([ + self.class_names.index(n) for n in results['gt_names_3d'] + ], + dtype=np.int64) + results = super(DefaultFormatBundle3D, self).__call__(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += '(class_names={}, '.format(self.class_names) + repr_str += 'with_gt={}, with_label={})'.format( + self.with_gt, self.with_label) + return repr_str diff --git a/mmdet3d/datasets/pipelines/loading.py b/mmdet3d/datasets/pipelines/loading.py new file mode 100644 index 0000000000..31437faae4 --- /dev/null +++ b/mmdet3d/datasets/pipelines/loading.py @@ -0,0 +1,143 @@ +import os.path as osp + +import mmcv +import numpy as np +import pycocotools.mask as maskUtils + +from mmdet.datasets.pipelines import PIPELINES + + +@PIPELINES.register_module +class LoadImageFromFile(object): + + def __init__(self, to_float32=False): + self.to_float32 = to_float32 + + def __call__(self, results): + if results['img_prefix'] is not None: + filename = osp.join(results['img_prefix'], + results['img_info']['filename']) + else: + filename = results['img_info']['filename'] + img = mmcv.imread(filename) + if self.to_float32: + img = img.astype(np.float32) + results['filename'] = filename + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + return results + + def __repr__(self): + return self.__class__.__name__ + '(to_float32={})'.format( + self.to_float32) + + +@PIPELINES.register_module +class LoadAnnotations(object): + + def __init__(self, + with_bbox=True, + with_label=True, + with_mask=False, + with_seg=False, + poly2mask=True): + self.with_bbox = with_bbox + 
self.with_label = with_label + self.with_mask = with_mask + self.with_seg = with_seg + self.poly2mask = poly2mask + + def _load_bboxes(self, results): + ann_info = results['ann_info'] + results['gt_bboxes'] = ann_info['bboxes'] + + gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) + if gt_bboxes_ignore is not None: + results['gt_bboxes_ignore'] = gt_bboxes_ignore + results['bbox_fields'].append('gt_bboxes_ignore') + results['bbox_fields'].append('gt_bboxes') + return results + + def _load_labels(self, results): + results['gt_labels'] = results['ann_info']['labels'] + return results + + def _poly2mask(self, mask_ann, img_h, img_w): + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + def _load_masks(self, results): + h, w = results['img_info']['height'], results['img_info']['width'] + gt_masks = results['ann_info']['masks'] + if self.poly2mask: + gt_masks = [self._poly2mask(mask, h, w) for mask in gt_masks] + results['gt_masks'] = gt_masks + results['mask_fields'].append('gt_masks') + return results + + def _load_semantic_seg(self, results): + results['gt_semantic_seg'] = mmcv.imread( + osp.join(results['seg_prefix'], results['ann_info']['seg_map']), + flag='unchanged').squeeze() + results['seg_fields'].append('gt_semantic_seg') + return results + + def __call__(self, results): + if self.with_bbox: + results = self._load_bboxes(results) + if results is None: + return None + if self.with_label: + results = self._load_labels(results) + if self.with_mask: + results = self._load_masks(results) + if self.with_seg: + results = self._load_semantic_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += ('(with_bbox={}, with_label={}, with_mask={},' + ' with_seg={})').format(self.with_bbox, self.with_label, + self.with_mask, self.with_seg) + return repr_str + + +@PIPELINES.register_module +class LoadProposals(object): + + def __init__(self, num_max_proposals=None): + self.num_max_proposals = num_max_proposals + + def __call__(self, results): + proposals = results['proposals'] + if proposals.shape[1] not in (4, 5): + raise AssertionError( + 'proposals should have shapes (n, 4) or (n, 5), ' + 'but found {}'.format(proposals.shape)) + proposals = proposals[:, :4] + + if self.num_max_proposals is not None: + proposals = proposals[:self.num_max_proposals] + + if len(proposals) == 0: + proposals = np.array([[0, 0, 0, 0]], dtype=np.float32) + results['proposals'] = proposals + results['bbox_fields'].append('proposals') + return results + + def __repr__(self): + return self.__class__.__name__ + '(num_max_proposals={})'.format( + self.num_max_proposals) diff --git a/mmdet3d/datasets/pipelines/train_aug.py b/mmdet3d/datasets/pipelines/train_aug.py new file mode 100644 index 0000000000..cf8eb71de4 --- /dev/null +++ b/mmdet3d/datasets/pipelines/train_aug.py @@ -0,0 +1,326 @@ +import numpy as np + +from mmdet3d.core.bbox import box_np_ops +from mmdet3d.utils import build_from_cfg +from mmdet.datasets.registry import PIPELINES +from ..registry import OBJECTSAMPLERS +from .data_augment_utils import noise_per_object_v3_ +from .transforms import RandomFlip + + +@PIPELINES.register_module 
+class RandomFlip3D(RandomFlip): + """Flip the points & bbox. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + Args: + flip_ratio (float, optional): The flipping probability. + """ + + def __init__(self, sync_2d=True, **kwargs): + super(RandomFlip3D, self).__init__(**kwargs) + self.sync_2d = sync_2d + + def random_flip_points(self, gt_bboxes_3d, points): + gt_bboxes_3d[:, 1] = -gt_bboxes_3d[:, 1] + gt_bboxes_3d[:, 6] = -gt_bboxes_3d[:, 6] + np.pi + points[:, 1] = -points[:, 1] + if gt_bboxes_3d.shape[1] == 9: + # flip velocitys at the same time + gt_bboxes_3d[:, 8] = -gt_bboxes_3d[:, 8] + return gt_bboxes_3d, points + + def __call__(self, input_dict): + super(RandomFlip3D, self).__call__(input_dict) + if self.sync_2d: + input_dict['pcd_flip'] = input_dict['flip'] + else: + flip = True if np.random.rand() < self.flip_ratio else False + input_dict['pcd_flip'] = flip + if input_dict['pcd_flip']: + # flip image + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + points = input_dict['points'] + gt_bboxes_3d, points = self.random_flip_points( + gt_bboxes_3d, points) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['points'] = points + return input_dict + + +@PIPELINES.register_module +class ObjectSample(object): + + def __init__(self, db_sampler, sample_2d=False): + self.sampler_cfg = db_sampler + self.sample_2d = sample_2d + if 'type' not in db_sampler.keys(): + db_sampler['type'] = 'DataBaseSampler' + self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS) + + @staticmethod + def remove_points_in_boxes(points, boxes): + masks = box_np_ops.points_in_rbbox(points, boxes) + points = points[np.logical_not(masks.any(-1))] + return points + + def __call__(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_names_3d = input_dict['gt_names_3d'] + gt_bboxes_3d_mask = input_dict['gt_bboxes_3d_mask'] + # change to float for blending operation + points = input_dict['points'] + # rect = input_dict['rect'] + # Trv2c = input_dict['Trv2c'] + # P2 = input_dict['P2'] + if self.sample_2d: + img = input_dict['img'] # .astype(np.float32) + gt_bboxes_2d = input_dict['gt_bboxes'] + gt_bboxes_mask = input_dict['gt_bboxes_mask'] + gt_names = input_dict['gt_names'] + # Assume for now 3D & 2D bboxes are the same + sampled_dict = self.db_sampler.sample_all( + gt_bboxes_3d, gt_names_3d, gt_bboxes_2d=gt_bboxes_2d, img=img) + else: + sampled_dict = self.db_sampler.sample_all( + gt_bboxes_3d, gt_names_3d, img=None) + + if sampled_dict is not None: + sampled_gt_names = sampled_dict['gt_names'] + sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d'] + sampled_points = sampled_dict['points'] + sampled_gt_masks = sampled_dict['gt_masks'] + + gt_names_3d = np.concatenate([gt_names_3d, sampled_gt_names], + axis=0) + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, sampled_gt_bboxes_3d + ]).astype(np.float32) + gt_bboxes_3d_mask = np.concatenate( + [gt_bboxes_3d_mask, sampled_gt_masks], axis=0) + points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d) + # check the points dimension + dim_inds = points.shape[-1] + points = np.concatenate([sampled_points[:, :dim_inds], points], + axis=0) + + if self.sample_2d: + sampled_gt_bboxes_2d = sampled_dict['gt_bboxes_2d'] + gt_bboxes_2d = np.concatenate( + [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32) + gt_bboxes_mask = np.concatenate( + [gt_bboxes_mask, sampled_gt_masks], axis=0) + gt_names = np.concatenate([gt_names, sampled_gt_names], 
axis=0) + input_dict['gt_names'] = gt_names + input_dict['gt_bboxes'] = gt_bboxes_2d + input_dict['gt_bboxes_mask'] = gt_bboxes_mask + input_dict['img'] = sampled_dict['img'] # .astype(np.uint8) + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_names_3d'] = gt_names_3d + input_dict['points'] = points + input_dict['gt_bboxes_3d_mask'] = gt_bboxes_3d_mask + return input_dict + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module +class ObjectNoise(object): + + def __init__(self, + loc_noise_std=[0.25, 0.25, 0.25], + global_rot_range=[0.0, 0.0], + rot_uniform_noise=[-0.15707963267, 0.15707963267], + num_try=100): + self.loc_noise_std = loc_noise_std + self.global_rot_range = global_rot_range + self.rot_uniform_noise = rot_uniform_noise + self.num_try = num_try + + def __call__(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + points = input_dict['points'] + gt_bboxes_3d_mask = input_dict['gt_bboxes_3d_mask'] + # TODO: check this inplace function + noise_per_object_v3_( + gt_bboxes_3d, + points, + gt_bboxes_3d_mask, + rotation_perturb=self.rot_uniform_noise, + center_noise_std=self.loc_noise_std, + global_random_rot_range=self.global_rot_range, + num_try=self.num_try) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d.astype('float32') + input_dict['points'] = points + return input_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += '(num_try={},'.format(self.num_try) + repr_str += ' loc_noise_std={},'.format(self.loc_noise_std) + repr_str += ' global_rot_range={},'.format(self.global_rot_range) + repr_str += ' rot_uniform_noise={})'.format(self.rot_uniform_noise) + return repr_str + + +@PIPELINES.register_module +class GlobalRotScale(object): + + def __init__(self, + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05], + trans_normal_noise=[0, 0, 0]): + self.rot_uniform_noise = rot_uniform_noise + self.scaling_uniform_noise = scaling_uniform_noise + self.trans_normal_noise = trans_normal_noise + + def _trans_bbox_points(self, gt_boxes, points): + noise_trans = np.random.normal(0, self.trans_normal_noise[0], 3).T + points[:, :3] += noise_trans + gt_boxes[:, :3] += noise_trans + return gt_boxes, points, noise_trans + + def _rot_bbox_points(self, gt_boxes, points, rotation=np.pi / 4): + if not isinstance(rotation, list): + rotation = [-rotation, rotation] + noise_rotation = np.random.uniform(rotation[0], rotation[1]) + points[:, :3], rot_mat_T = box_np_ops.rotation_points_single_angle( + points[:, :3], noise_rotation, axis=2) + gt_boxes[:, :3], _ = box_np_ops.rotation_points_single_angle( + gt_boxes[:, :3], noise_rotation, axis=2) + gt_boxes[:, 6] += noise_rotation + if gt_boxes.shape[1] == 9: + # rotate velo vector + rot_cos = np.cos(noise_rotation) + rot_sin = np.sin(noise_rotation) + rot_mat_T_bev = np.array([[rot_cos, -rot_sin], [rot_sin, rot_cos]], + dtype=points.dtype) + gt_boxes[:, 7:9] = gt_boxes[:, 7:9] @ rot_mat_T_bev + return gt_boxes, points, rot_mat_T + + def _scale_bbox_points(self, + gt_boxes, + points, + min_scale=0.95, + max_scale=1.05): + noise_scale = np.random.uniform(min_scale, max_scale) + points[:, :3] *= noise_scale + gt_boxes[:, :6] *= noise_scale + if gt_boxes.shape[1] == 9: + gt_boxes[:, 7:] *= noise_scale + return gt_boxes, points, noise_scale + + def __call__(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + points = input_dict['points'] + + gt_bboxes_3d, points, rotation_factor = self._rot_bbox_points( + gt_bboxes_3d, points, 
rotation=self.rot_uniform_noise) + gt_bboxes_3d, points, scale_factor = self._scale_bbox_points( + gt_bboxes_3d, points, *self.scaling_uniform_noise) + gt_bboxes_3d, points, trans_factor = self._trans_bbox_points( + gt_bboxes_3d, points) + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d.astype('float32') + input_dict['points'] = points + input_dict['pcd_scale_factor'] = scale_factor + input_dict['pcd_rotation'] = rotation_factor + input_dict['pcd_trans'] = trans_factor + return input_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += '(rot_uniform_noise={},'.format(self.rot_uniform_noise) + repr_str += ' scaling_uniform_noise={},'.format( + self.scaling_uniform_noise) + repr_str += ' trans_normal_noise={})'.format(self.trans_normal_noise) + return repr_str + + +@PIPELINES.register_module +class PointShuffle(object): + + def __call__(self, input_dict): + np.random.shuffle(input_dict['points']) + return input_dict + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module +class ObjectRangeFilter(object): + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + self.bev_range = self.pcd_range[[0, 1, 3, 4]] + + @staticmethod + def limit_period(val, offset=0.5, period=np.pi): + return val - np.floor(val / period + offset) * period + + @staticmethod + def filter_gt_box_outside_range(gt_bboxes_3d, limit_range): + """remove gtbox outside training range. + this function should be applied after other prep functions + Args: + gt_bboxes_3d ([type]): [description] + limit_range ([type]): [description] + """ + gt_bboxes_3d_bv = box_np_ops.center_to_corner_box2d( + gt_bboxes_3d[:, [0, 1]], gt_bboxes_3d[:, [3, 3 + 1]], + gt_bboxes_3d[:, 6]) + bounding_box = box_np_ops.minmax_to_corner_2d( + np.asarray(limit_range)[np.newaxis, ...]) + ret = box_np_ops.points_in_convex_polygon_jit( + gt_bboxes_3d_bv.reshape(-1, 2), bounding_box) + return np.any(ret.reshape(-1, 4), axis=1) + + def __call__(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_names_3d = input_dict['gt_names_3d'] + gt_bboxes_3d_mask = input_dict['gt_bboxes_3d_mask'] + mask = self.filter_gt_box_outside_range(gt_bboxes_3d, self.bev_range) + gt_bboxes_3d = gt_bboxes_3d[mask] + gt_names_3d = gt_names_3d[mask] + # the mask should also be updated + gt_bboxes_3d_mask = gt_bboxes_3d_mask[mask] + + # limit rad to [-pi, pi] + gt_bboxes_3d[:, 6] = self.limit_period( + gt_bboxes_3d[:, 6], offset=0.5, period=2 * np.pi) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d.astype('float32') + input_dict['gt_names_3d'] = gt_names_3d + input_dict['gt_bboxes_3d_mask'] = gt_bboxes_3d_mask + return input_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += '(point_cloud_range={})'.format(self.pcd_range.tolist()) + return repr_str + + +@PIPELINES.register_module +class PointsRangeFilter(object): + + def __init__(self, point_cloud_range): + self.pcd_range = np.array( + point_cloud_range, dtype=np.float32)[np.newaxis, :] + + def __call__(self, input_dict): + points = input_dict['points'] + points_mask = ((points[:, :3] >= self.pcd_range[:, :3]) + & (points[:, :3] < self.pcd_range[:, 3:])) + points_mask = points_mask[:, 0] & points_mask[:, 1] & points_mask[:, 2] + clean_points = points[points_mask, :] + input_dict['points'] = clean_points + return input_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += '(point_cloud_range={})'.format(self.pcd_range.tolist()) + return repr_str diff --git 
a/mmdet3d/datasets/registry.py b/mmdet3d/datasets/registry.py new file mode 100644 index 0000000000..b1acde485c --- /dev/null +++ b/mmdet3d/datasets/registry.py @@ -0,0 +1,3 @@ +from mmdet3d.utils import Registry + +OBJECTSAMPLERS = Registry('object_sampler') diff --git a/mmdet3d/datasets/utils.py b/mmdet3d/datasets/utils.py new file mode 100644 index 0000000000..9e3a7a2fc3 --- /dev/null +++ b/mmdet3d/datasets/utils.py @@ -0,0 +1,37 @@ +from collections import Sequence + +import mmcv +import numpy as np +import torch + + +def remove_dontcare(image_anno): + img_filtered_annotations = {} + relevant_annotation_indices = [ + i for i, x in enumerate(image_anno['name']) if x != 'DontCare' + ] + for key in image_anno.keys(): + img_filtered_annotations[key] = ( + image_anno[key][relevant_annotation_indices]) + return img_filtered_annotations + + +def to_tensor(data): + # TODO: remove this duplicated method in the future + """Convert objects of various python types to :obj:`torch.Tensor`. + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + """ + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not mmcv.is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError('type {} cannot be converted to tensor.'.format( + type(data))) diff --git a/mmdet3d/models/__init__.py b/mmdet3d/models/__init__.py new file mode 100644 index 0000000000..4e2b48972e --- /dev/null +++ b/mmdet3d/models/__init__.py @@ -0,0 +1,21 @@ +from .anchor_heads import * # noqa: F401,F403 +from .backbones import * # noqa: F401,F403 +from .bbox_heads import * # noqa: F401,F403 +from .builder import (build_backbone, build_detector, build_head, build_loss, + build_neck, build_roi_extractor, build_shared_head) +from .detectors import * # noqa: F401,F403 +from .fusion_layers import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .middle_encoders import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .registry import (BACKBONES, DETECTORS, HEADS, LOSSES, MIDDLE_ENCODERS, + NECKS, ROI_EXTRACTORS, SHARED_HEADS, VOXEL_ENCODERS) +from .roi_extractors import * # noqa: F401,F403 +from .voxel_encoders import * # noqa: F401,F403 + +__all__ = [ + 'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'SHARED_HEADS', 'HEADS', 'LOSSES', + 'VOXEL_ENCODERS', 'MIDDLE_ENCODERS', 'DETECTORS', 'build_backbone', + 'build_neck', 'build_roi_extractor', 'build_shared_head', 'build_head', + 'build_loss', 'build_detector' +] diff --git a/mmdet3d/models/anchor_heads/__init__.py b/mmdet3d/models/anchor_heads/__init__.py new file mode 100644 index 0000000000..a86c226f0c --- /dev/null +++ b/mmdet3d/models/anchor_heads/__init__.py @@ -0,0 +1,4 @@ +from .boxvelo_head import Anchor3DVeloHead +from .second_head import SECONDHead + +__all__ = ['Anchor3DVeloHead', 'SECONDHead'] diff --git a/mmdet3d/models/anchor_heads/boxvelo_head.py b/mmdet3d/models/anchor_heads/boxvelo_head.py new file mode 100644 index 0000000000..d30d759784 --- /dev/null +++ b/mmdet3d/models/anchor_heads/boxvelo_head.py @@ -0,0 +1,224 @@ +import numpy as np +import torch +from mmcv.cnn import normal_init + +from mmdet3d.core import box_torch_ops, boxes3d_to_bev_torch_lidar +from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu +from ..registry import HEADS 
+from ..utils import bias_init_with_prob +from .second_head import SECONDHead + + +@HEADS.register_module +class Anchor3DVeloHead(SECONDHead): + """Anchor-based head for 3D anchor with velocity + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of channels of the feature map. + anchor_scales (Iterable): Anchor scales. + anchor_ratios (Iterable): Anchor aspect ratios. + anchor_strides (Iterable): Anchor strides. + anchor_base_sizes (Iterable): Anchor base sizes. + target_means (Iterable): Mean values of regression targets. + target_stds (Iterable): Std values of regression targets. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + """ # noqa: W605 + + def __init__(self, + class_names, + num_classes, + in_channels, + train_cfg, + test_cfg, + cache_anchor=False, + feat_channels=256, + use_direction_classifier=True, + encode_bg_as_zeros=False, + box_code_size=9, + anchor_generator=dict(type='AnchorGeneratorRange', ), + anchor_range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + anchor_custom_values=[0, 0], + assigner_per_size=False, + assign_per_class=False, + diff_rad_by_sin=True, + dir_offset=0, + dir_limit_offset=1, + target_means=(.0, .0, .0, .0), + target_stds=(1.0, 1.0, 1.0, 1.0), + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2)): + super().__init__(class_names, in_channels, train_cfg, test_cfg, + cache_anchor, feat_channels, use_direction_classifier, + encode_bg_as_zeros, box_code_size, anchor_generator, + anchor_range, anchor_strides, anchor_sizes, + anchor_rotations, anchor_custom_values, + assigner_per_size, assign_per_class, diff_rad_by_sin, + dir_offset, dir_limit_offset, target_means, + target_stds, bbox_coder, loss_cls, loss_bbox, + loss_dir) + self.num_classes = num_classes + # build head layers & losses + if not self.use_sigmoid_cls: + self.num_classes += 1 + self._init_layers() + + def init_weights(self): + # pass + # use the initialization when ready + bias_cls = bias_init_with_prob(0.01) + normal_init(self.conv_cls, std=0.01, bias=bias_cls) + normal_init(self.conv_reg, std=0.01) + + @staticmethod + def add_sin_difference(boxes1, boxes2): + # Caution: the 7th dim is the rotation, (last dim without velo) + rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( + boxes2[..., 6:7]) + rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[..., + 6:7]) + boxes1 = torch.cat( + [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1) + boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]], + dim=-1) + return boxes1, boxes2 + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + mlvl_anchors, + input_meta, + rescale=False): + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_dir_scores = [] + for cls_score, bbox_pred, dir_cls_pred, anchors in zip( + cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] + dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + + cls_score = cls_score.permute(1, 2, + 
0).reshape(-1, self.num_classes) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, + 0).reshape(-1, self.box_code_size) + + nms_pre = self.test_cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + dir_cls_score = dir_cls_score[topk_inds] + + bboxes = self.bbox_coder.decode_torch(anchors, bbox_pred, + self.target_means, + self.target_stds) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_dir_scores.append(dir_cls_score) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_bboxes_for_nms = boxes3d_to_bev_torch_lidar(mlvl_bboxes) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + + if self.use_sigmoid_cls: + # Add a dummy background class to the front when using sigmoid + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + + score_thr = self.test_cfg.get('score_thr', 0) + result = self.multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, + mlvl_scores, mlvl_dir_scores, score_thr, + self.test_cfg.max_per_img) + + result.update(dict(sample_idx=input_meta['sample_idx'])) + return result + + def multiclass_nms(self, mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores, + mlvl_dir_scores, score_thr, max_num): + # do multi class nms + # the fg class id range: [0, num_classes-1] + num_classes = mlvl_scores.shape[1] - 1 + bboxes = [] + scores = [] + labels = [] + dir_scores = [] + for i in range(0, num_classes): + # get bboxes and scores of this class + cls_inds = mlvl_scores[:, i] > score_thr + if not cls_inds.any(): + continue + _scores = mlvl_scores[cls_inds, i] + _bboxes_for_nms = mlvl_bboxes_for_nms[cls_inds, :] + if self.test_cfg.use_rotate_nms: + nms_func = nms_gpu + else: + nms_func = nms_normal_gpu + selected = nms_func(_bboxes_for_nms, _scores, + self.test_cfg.nms_thr) + + _mlvl_bboxes = mlvl_bboxes[cls_inds, :] + _mlvl_dir_scores = mlvl_dir_scores[cls_inds] + + if len(selected) > 0: + bboxes.append(_mlvl_bboxes[selected]) + scores.append(_scores[selected]) + dir_scores.append(_mlvl_dir_scores[selected]) + dir_rot = box_torch_ops.limit_period( + bboxes[-1][..., 6] - self.dir_offset, + self.dir_limit_offset, np.pi) + bboxes[-1][..., 6] = ( + dir_rot + self.dir_offset + + np.pi * dir_scores[-1].to(bboxes[-1].dtype)) + + cls_label = mlvl_bboxes.new_full((len(selected), ), + i, + dtype=torch.long) + labels.append(cls_label) + + if bboxes: + bboxes = torch.cat(bboxes, dim=0) + scores = torch.cat(scores, dim=0) + labels = torch.cat(labels, dim=0) + dir_scores = torch.cat(dir_scores, dim=0) + if bboxes.shape[0] > max_num: + _, inds = scores.sort(descending=True) + inds = inds[:max_num] + bboxes = bboxes[inds, :] + labels = labels[inds] + scores = scores[inds] + dir_scores = dir_scores[inds] + return dict( + box3d_lidar=bboxes.cpu(), + scores=scores.cpu(), + label_preds=labels.cpu(), + ) + else: + return dict( + box3d_lidar=mlvl_bboxes.new_zeros([0, + self.box_code_size]).cpu(), + scores=mlvl_bboxes.new_zeros([0]).cpu(), + label_preds=mlvl_bboxes.new_zeros([0, 4]).cpu(), + ) diff --git a/mmdet3d/models/anchor_heads/second_head.py b/mmdet3d/models/anchor_heads/second_head.py new file mode 100644 index 0000000000..fa59ffa717 --- /dev/null 
+++ b/mmdet3d/models/anchor_heads/second_head.py @@ -0,0 +1,405 @@ +from __future__ import division + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import normal_init + +from mmdet3d.core import (PseudoSampler, box_torch_ops, + boxes3d_to_bev_torch_lidar, build_anchor_generator, + build_assigner, build_bbox_coder, build_sampler, + multi_apply) +from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu +from ..builder import build_loss +from ..registry import HEADS +from ..utils import bias_init_with_prob +from .train_mixins import AnchorTrainMixin + + +@HEADS.register_module +class SECONDHead(nn.Module, AnchorTrainMixin): + """Anchor-based head (RPN, RetinaNet, SSD, etc.). + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of channels of the feature map. + anchor_scales (Iterable): Anchor scales. + anchor_ratios (Iterable): Anchor aspect ratios. + anchor_strides (Iterable): Anchor strides. + anchor_base_sizes (Iterable): Anchor base sizes. + target_means (Iterable): Mean values of regression targets. + target_stds (Iterable): Std values of regression targets. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + """ # noqa: W605 + + def __init__(self, + class_name, + in_channels, + train_cfg, + test_cfg, + cache_anchor=False, + feat_channels=256, + use_direction_classifier=True, + encode_bg_as_zeros=False, + box_code_size=7, + anchor_generator=dict(type='AnchorGeneratorRange'), + anchor_range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + anchor_custom_values=[], + assigner_per_size=False, + assign_per_class=False, + diff_rad_by_sin=True, + dir_offset=0, + dir_limit_offset=1, + target_means=(.0, .0, .0, .0), + target_stds=(1.0, 1.0, 1.0, 1.0), + bbox_coder=dict(type='ResidualCoder'), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2)): + super().__init__() + self.in_channels = in_channels + self.num_classes = len(class_name) + self.feat_channels = feat_channels + self.diff_rad_by_sin = diff_rad_by_sin + self.use_direction_classifier = use_direction_classifier + # self.encode_background_as_zeros = encode_bg_as_zeros + self.box_code_size = box_code_size + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.bbox_coder = build_bbox_coder(bbox_coder) + self.assigner_per_size = assigner_per_size + self.assign_per_class = assign_per_class + self.dir_offset = dir_offset + self.dir_limit_offset = dir_limit_offset + + # build target assigner & sampler + if train_cfg is not None: + self.sampling = loss_cls['type'] not in ['FocalLoss', 'GHMC'] + if self.sampling: + self.bbox_sampler = build_sampler(train_cfg.sampler) + else: + self.bbox_sampler = PseudoSampler() + if isinstance(train_cfg.assigner, dict): + self.bbox_assigner = build_assigner(train_cfg.assigner) + elif isinstance(train_cfg.assigner, list): + self.bbox_assigner = [ + build_assigner(res) for res in train_cfg.assigner + ] + + # build anchor generator + self.anchor_range = anchor_range + self.anchor_rotations = anchor_rotations + self.anchor_strides = anchor_strides + self.anchor_sizes = anchor_sizes + self.target_means = target_means + self.target_stds = target_stds + self.anchor_generators = [] + # In 3D detection, the anchor stride is connected with 
anchor size + self.num_anchors = ( + len(self.anchor_rotations) * len(self.anchor_sizes)) + # if len(self.anchor_sizes) != self.anchor_strides: + # # this means different anchor in the same anchor strides + # anchor_sizes = [self.anchor_sizes] + for anchor_stride in self.anchor_strides: + anchor_generator.update( + anchor_ranges=anchor_range, + sizes=self.anchor_sizes, + stride=anchor_stride, + rotations=anchor_rotations, + custom_values=anchor_custom_values, + cache_anchor=cache_anchor) + self.anchor_generators.append( + build_anchor_generator(anchor_generator)) + + self._init_layers() + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if not self.use_sigmoid_cls: + self.num_classes += 1 + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.loss_dir = build_loss(loss_dir) + self.fp16_enabled = False + + def _init_layers(self): + self.cls_out_channels = self.num_anchors * self.num_classes + self.conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1) + self.conv_reg = nn.Conv2d(self.feat_channels, + self.num_anchors * self.box_code_size, 1) + if self.use_direction_classifier: + self.conv_dir_cls = nn.Conv2d(self.feat_channels, + self.num_anchors * 2, 1) + + def init_weights(self): + bias_cls = bias_init_with_prob(0.01) + normal_init(self.conv_cls, std=0.01, bias=bias_cls) + normal_init(self.conv_reg, std=0.01) + + def forward_single(self, x): + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + dir_cls_preds = None + if self.use_direction_classifier: + dir_cls_preds = self.conv_dir_cls(x) + return cls_score, bbox_pred, dir_cls_preds + + def forward(self, feats): + return multi_apply(self.forward_single, feats) + + def get_anchors(self, featmap_sizes, input_metas): + """Get anchors according to feature map sizes. + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + input_metas (list[dict]): contain pcd and img's meta info. 
+ Returns: + tuple: anchors of each image, valid flags of each image + """ + num_imgs = len(input_metas) + num_levels = len(featmap_sizes) + + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = [] + for i in range(num_levels): + anchors = self.anchor_generators[i].grid_anchors(featmap_sizes[i]) + if not self.assigner_per_size: + anchors = anchors.reshape(-1, anchors.size(-1)) + multi_level_anchors.append(anchors) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + return anchor_list + + def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, + label_weights, bbox_targets, bbox_weights, dir_targets, + dir_weights, num_total_samples): + # classification loss + if num_total_samples is None: + num_total_samples = int(cls_score.shape[0]) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.num_classes) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=num_total_samples) + + # regression loss + bbox_targets = bbox_targets.reshape(-1, self.box_code_size) + bbox_weights = bbox_weights.reshape(-1, self.box_code_size) + code_weight = self.train_cfg.get('code_weight', None) + + if code_weight: + bbox_weights = bbox_weights * bbox_weights.new_tensor(code_weight) + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, self.box_code_size) + if self.diff_rad_by_sin: + bbox_pred, bbox_targets = self.add_sin_difference( + bbox_pred, bbox_targets) + loss_bbox = self.loss_bbox( + bbox_pred, + bbox_targets, + bbox_weights, + avg_factor=num_total_samples) + + # direction classification loss + loss_dir = None + if self.use_direction_classifier: + dir_cls_preds = dir_cls_preds.permute(0, 2, 3, 1).reshape(-1, 2) + dir_targets = dir_targets.reshape(-1) + dir_weights = dir_weights.reshape(-1) + loss_dir = self.loss_dir( + dir_cls_preds, + dir_targets, + dir_weights, + avg_factor=num_total_samples) + + return loss_cls, loss_bbox, loss_dir + + @staticmethod + def add_sin_difference(boxes1, boxes2): + rad_pred_encoding = torch.sin(boxes1[..., -1:]) * torch.cos( + boxes2[..., -1:]) + rad_tg_encoding = torch.cos(boxes1[..., -1:]) * torch.sin(boxes2[..., + -1:]) + boxes1 = torch.cat([boxes1[..., :-1], rad_pred_encoding], dim=-1) + boxes2 = torch.cat([boxes2[..., :-1], rad_tg_encoding], dim=-1) + return boxes1, boxes2 + + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + gt_bboxes, + gt_labels, + input_metas, + gt_bboxes_ignore=None): + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == len(self.anchor_generators) + + anchor_list = self.get_anchors(featmap_sizes, input_metas) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.anchor_target_3d( + anchor_list, + gt_bboxes, + input_metas, + self.target_means, + self.target_stds, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + num_classes=self.num_classes, + label_channels=label_channels, + sampling=self.sampling) + + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + dir_targets_list, dir_weights_list, num_total_pos, + num_total_neg) = cls_reg_targets + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + + # num_total_samples = None + losses_cls, losses_bbox, losses_dir = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + dir_cls_preds, 
+ labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + dir_targets_list, + dir_weights_list, + num_total_samples=num_total_samples) + return dict( + loss_cls_3d=losses_cls, + loss_bbox_3d=losses_bbox, + loss_dir_3d=losses_dir) + + def get_bboxes(self, + cls_scores, + bbox_preds, + dir_cls_preds, + input_metas, + rescale=False): + assert len(cls_scores) == len(bbox_preds) + assert len(cls_scores) == len(dir_cls_preds) + num_levels = len(cls_scores) + + mlvl_anchors = [ + self.anchor_generators[i].grid_anchors( + cls_scores[i].size()[-2:]).reshape(-1, self.box_code_size) + for i in range(num_levels) + ] + result_list = [] + for img_id in range(len(input_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + dir_cls_pred_list = [ + dir_cls_preds[i][img_id].detach() for i in range(num_levels) + ] + + input_meta = input_metas[img_id] + proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, + dir_cls_pred_list, mlvl_anchors, + input_meta, rescale) + result_list.append(proposals) + return result_list + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + mlvl_anchors, + input_meta, + rescale=False): + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_dir_scores = [] + mlvl_bboxes_for_nms = [] + for cls_score, bbox_pred, dir_cls_pred, anchors in zip( + cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + if self.use_direction_classifier: + assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] + + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.num_classes) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, + 0).reshape(-1, self.box_code_size) + dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + + score_thr = self.test_cfg.get('score_thr', 0) + if score_thr > 0: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + max_scores, _ = scores[:, 1:].max(dim=1) + thr_inds = (max_scores >= score_thr) + anchors = anchors[thr_inds] + bbox_pred = bbox_pred[thr_inds] + scores = scores[thr_inds] + dir_cls_scores = dir_cls_score[thr_inds] + bboxes = self.bbox_coder.decode_torch(anchors, bbox_pred, + self.target_means, + self.target_stds) + bboxes_for_nms = boxes3d_to_bev_torch_lidar(bboxes) + mlvl_bboxes_for_nms.append(bboxes_for_nms) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_dir_scores.append(dir_cls_scores) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_bboxes_for_nms = torch.cat(mlvl_bboxes_for_nms) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + + if len(mlvl_scores) > 0: + mlvl_scores, mlvl_label_preds = mlvl_scores.max(dim=-1) + if self.test_cfg.use_rotate_nms: + nms_func = nms_gpu + else: + nms_func = nms_normal_gpu + selected = nms_func(mlvl_bboxes_for_nms, mlvl_scores, + self.test_cfg.nms_thr) + else: + selected = [] + + if len(selected) > 0: + selected_bboxes = mlvl_bboxes[selected] + selected_scores = mlvl_scores[selected] + selected_label_preds = mlvl_label_preds[selected] + selected_dir_scores = mlvl_dir_scores[selected] + dir_rot = box_torch_ops.limit_period( + selected_bboxes[..., -1] - self.dir_offset, + self.dir_limit_offset, np.pi) + 
selected_bboxes[..., -1] = ( + dir_rot + self.dir_offset + + np.pi * selected_dir_scores.to(selected_bboxes.dtype)) + + return dict( + box3d_lidar=selected_bboxes.cpu(), + scores=selected_scores.cpu(), + label_preds=selected_label_preds.cpu(), + sample_idx=input_meta['sample_idx'], + ) + + return dict( + box3d_lidar=mlvl_scores.new_zeros([0, 7]).cpu(), + scores=mlvl_scores.new_zeros([0]).cpu(), + label_preds=mlvl_scores.new_zeros([0, 4]).cpu(), + sample_idx=input_meta['sample_idx'], + ) diff --git a/mmdet3d/models/anchor_heads/train_mixins.py b/mmdet3d/models/anchor_heads/train_mixins.py new file mode 100644 index 0000000000..721a7c68db --- /dev/null +++ b/mmdet3d/models/anchor_heads/train_mixins.py @@ -0,0 +1,245 @@ +import numpy as np +import torch + +from mmdet3d.core import box_torch_ops, images_to_levels, multi_apply + + +class AnchorTrainMixin(object): + + def anchor_target_3d(self, + anchor_list, + gt_bboxes_list, + input_metas, + target_means, + target_stds, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + num_classes=1, + sampling=True): + """Compute regression and classification targets for anchors. + + Args: + anchor_list (list[list]): Multi level anchors of each image. + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image. + img_metas (list[dict]): Meta info of each image. + target_means (Iterable): Mean value of regression targets. + target_stds (Iterable): Std value of regression targets. + + Returns: + tuple + """ + num_imgs = len(input_metas) + assert len(anchor_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [ + anchors.view(-1, self.box_code_size).size(0) + for anchors in anchor_list[0] + ] + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + anchor_list[i] = torch.cat(anchor_list[i]) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + + (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, + all_dir_targets, all_dir_weights, pos_inds_list, + neg_inds_list) = multi_apply( + self.anchor_target_3d_single, + anchor_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + input_metas, + target_means=target_means, + target_stds=target_stds, + label_channels=label_channels, + num_classes=num_classes, + sampling=sampling) + + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. 
multiple levels + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + dir_targets_list = images_to_levels(all_dir_targets, num_level_anchors) + dir_weights_list = images_to_levels(all_dir_weights, num_level_anchors) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, dir_targets_list, dir_weights_list, + num_total_pos, num_total_neg) + + def anchor_target_3d_single(self, + anchors, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + input_meta, + target_means, + target_stds, + label_channels=1, + num_classes=1, + sampling=True): + if isinstance(self.bbox_assigner, list): + feat_size = anchors.size(0) * anchors.size(1) * anchors.size(2) + rot_angles = anchors.size(-2) + assert len(self.bbox_assigner) == anchors.size(-3) + (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], [] + current_anchor_num = 0 + for i, assigner in enumerate(self.bbox_assigner): + current_anchors = anchors[..., i, :, :].reshape( + -1, self.box_code_size) + current_anchor_num += current_anchors.size(0) + if self.assign_per_class: + gt_per_cls = (gt_labels == i) + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes[gt_per_cls, :], + gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta, + target_means, target_stds, label_channels, num_classes, + sampling) + else: + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes, gt_bboxes_ignore, + gt_labels, input_meta, target_means, target_stds, + label_channels, num_classes, sampling) + + (labels, label_weights, bbox_targets, bbox_weights, + dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets + total_labels.append(labels.reshape(feat_size, 1, rot_angles)) + total_label_weights.append( + label_weights.reshape(feat_size, 1, rot_angles)) + total_bbox_targets.append( + bbox_targets.reshape(feat_size, 1, rot_angles, + anchors.size(-1))) + total_bbox_weights.append( + bbox_weights.reshape(feat_size, 1, rot_angles, + anchors.size(-1))) + total_dir_targets.append( + dir_targets.reshape(feat_size, 1, rot_angles)) + total_dir_weights.append( + dir_weights.reshape(feat_size, 1, rot_angles)) + total_pos_inds.append(pos_inds) + total_neg_inds.append(neg_inds) + + total_labels = torch.cat(total_labels, dim=-2).reshape(-1) + total_label_weights = torch.cat( + total_label_weights, dim=-2).reshape(-1) + total_bbox_targets = torch.cat( + total_bbox_targets, dim=-3).reshape(-1, anchors.size(-1)) + total_bbox_weights = torch.cat( + total_bbox_weights, dim=-3).reshape(-1, anchors.size(-1)) + total_dir_targets = torch.cat( + total_dir_targets, dim=-2).reshape(-1) + total_dir_weights = torch.cat( + total_dir_weights, dim=-2).reshape(-1) + total_pos_inds = torch.cat(total_pos_inds, dim=0).reshape(-1) + total_neg_inds = torch.cat(total_neg_inds, dim=0).reshape(-1) + return (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) + else: + return self.anchor_target_single_assigner( + self.bbox_assigner, anchors, gt_bboxes, gt_bboxes_ignore, + gt_labels, input_meta, target_means, target_stds, + label_channels, num_classes, 
sampling) + + def anchor_target_single_assigner(self, + bbox_assigner, + anchors, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + input_meta, + target_means, + target_stds, + label_channels=1, + num_classes=1, + sampling=True): + anchors = anchors.reshape(-1, anchors.size(-1)) + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + dir_targets = anchors.new_zeros((anchors.shape[0]), dtype=torch.long) + dir_weights = anchors.new_zeros((anchors.shape[0]), dtype=torch.float) + labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + if len(gt_bboxes) > 0: + assign_result = bbox_assigner.assign(anchors, gt_bboxes, + gt_bboxes_ignore, gt_labels) + sampling_result = self.bbox_sampler.sample(assign_result, anchors, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + else: + pos_inds = torch.nonzero( + anchors.new_zeros((anchors.shape[0], ), dtype=torch.long) > 0 + ).squeeze(-1).unique() + neg_inds = torch.nonzero( + anchors.new_zeros((anchors.shape[0], ), dtype=torch.long) == + 0).squeeze(-1).unique() + + if gt_labels is not None: + labels += num_classes + if len(pos_inds) > 0: + pos_bbox_targets = self.bbox_coder.encode_torch( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes, + target_means, target_stds) + pos_dir_targets = get_direction_target( + sampling_result.pos_bboxes, + pos_bbox_targets, + self.dir_offset, + one_hot=False) + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + dir_targets[pos_inds] = pos_dir_targets + dir_weights[pos_inds] = 1.0 + + if gt_labels is None: + labels[pos_inds] = 1 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + return (labels, label_weights, bbox_targets, bbox_weights, dir_targets, + dir_weights, pos_inds, neg_inds) + + +def get_direction_target(anchors, + reg_targets, + dir_offset=0, + num_bins=2, + one_hot=True): + rot_gt = reg_targets[..., 6] + anchors[..., 6] + offset_rot = box_torch_ops.limit_period(rot_gt - dir_offset, 0, 2 * np.pi) + dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long() + dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) + if one_hot: + dir_targets = torch.zeros( + *list(dir_cls_targets.shape), + num_bins, + dtype=anchors.dtype, + device=dir_cls_targets.device) + dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0) + dir_cls_targets = dir_targets + return dir_cls_targets diff --git a/mmdet3d/models/backbones/__init__.py b/mmdet3d/models/backbones/__init__.py new file mode 100644 index 0000000000..f3070c1133 --- /dev/null +++ b/mmdet3d/models/backbones/__init__.py @@ -0,0 +1,4 @@ +from mmdet.models.backbone import SSDVGG, HRNet, ResNet, ResNetV1d, ResNeXt +from .second import SECOND + +__all__ = ['ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'SECOND'] diff --git a/mmdet3d/models/backbones/second.py b/mmdet3d/models/backbones/second.py new file mode 100644 index 0000000000..0f1e18eb31 --- /dev/null +++ b/mmdet3d/models/backbones/second.py @@ -0,0 +1,84 @@ +from functools import partial + +import torch.nn as nn +from mmcv.runner import load_checkpoint + +from ..registry import BACKBONES +from ..utils import build_norm_layer + + 
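+# Note: `Empty` below is an identity pass-through module. It is used in
+# place of the normalization layer when `norm_cfg` is None, so that the
+# block definition in SECOND keeps the same layout with or without BN.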
+class Empty(nn.Module): + + def __init__(self, *args, **kwargs): + super(Empty, self).__init__() + + def forward(self, *args, **kwargs): + if len(args) == 1: + return args[0] + elif len(args) == 0: + return None + return args + + +@BACKBONES.register_module +class SECOND(nn.Module): + """Compare with RPN, RPNV2 support arbitrary number of stage. + """ + + def __init__(self, + in_channels=128, + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + num_filters=[128, 128, 256], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01)): + super(SECOND, self).__init__() + assert len(layer_strides) == len(layer_nums) + assert len(num_filters) == len(layer_nums) + + if norm_cfg is not None: + Conv2d = partial(nn.Conv2d, bias=False) + else: + Conv2d = partial(nn.Conv2d, bias=True) + + in_filters = [in_channels, *num_filters[:-1]] + # note that when stride > 1, conv2d with same padding isn't + # equal to pad-conv2d. we should use pad-conv2d. + blocks = [] + + for i, layer_num in enumerate(layer_nums): + norm_layer = ( + build_norm_layer(norm_cfg, num_filters[i])[1] + if norm_cfg is not None else Empty) + block = [ + nn.ZeroPad2d(1), + Conv2d( + in_filters[i], num_filters[i], 3, stride=layer_strides[i]), + norm_layer, + nn.ReLU(inplace=True), + ] + for j in range(layer_num): + norm_layer = ( + build_norm_layer(norm_cfg, num_filters[i])[1] + if norm_cfg is not None else Empty) + block.append( + Conv2d(num_filters[i], num_filters[i], 3, padding=1)) + block.append(norm_layer) + block.append(nn.ReLU(inplace=True)) + + block = nn.Sequential(*block) + blocks.append(block) + + self.blocks = nn.ModuleList(blocks) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + from mmdet3d.apis import get_root_logger + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + + def forward(self, x): + outs = [] + for i in range(len(self.blocks)): + x = self.blocks[i](x) + outs.append(x) + return tuple(outs) diff --git a/mmdet3d/models/bbox_heads/__init__.py b/mmdet3d/models/bbox_heads/__init__.py new file mode 100644 index 0000000000..41998d7d17 --- /dev/null +++ b/mmdet3d/models/bbox_heads/__init__.py @@ -0,0 +1,8 @@ +from mmdet.models.bbox_heads import (BBoxHead, ConvFCBBoxHead, + DoubleConvFCBBoxHead, Shared2FCBBoxHead, + Shared4Conv1FCBBoxHead) + +__all__ = [ + 'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead', + 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead' +] diff --git a/mmdet3d/models/builder.py b/mmdet3d/models/builder.py new file mode 100644 index 0000000000..8d101b18cb --- /dev/null +++ b/mmdet3d/models/builder.py @@ -0,0 +1,56 @@ +from torch import nn + +from mmdet.models.registry import (BACKBONES, DETECTORS, HEADS, LOSSES, NECKS, + ROI_EXTRACTORS, SHARED_HEADS) +from ..utils import build_from_cfg +from .registry import FUSION_LAYERS, MIDDLE_ENCODERS, VOXEL_ENCODERS + + +def build(cfg, registry, default_args=None): + if isinstance(cfg, list): + modules = [ + build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg + ] + return nn.Sequential(*modules) + else: + return build_from_cfg(cfg, registry, default_args) + + +def build_backbone(cfg): + return build(cfg, BACKBONES) + + +def build_neck(cfg): + return build(cfg, NECKS) + + +def build_roi_extractor(cfg): + return build(cfg, ROI_EXTRACTORS) + + +def build_shared_head(cfg): + return build(cfg, SHARED_HEADS) + + +def build_head(cfg): + return build(cfg, HEADS) + + +def build_loss(cfg): + return build(cfg, LOSSES) + + +def build_detector(cfg, train_cfg=None, test_cfg=None): + return build(cfg, 
DETECTORS, dict(train_cfg=train_cfg, test_cfg=test_cfg)) + + +def build_voxel_encoder(cfg): + return build(cfg, VOXEL_ENCODERS) + + +def build_middle_encoder(cfg): + return build(cfg, MIDDLE_ENCODERS) + + +def build_fusion_layer(cfg): + return build(cfg, FUSION_LAYERS) diff --git a/mmdet3d/models/detectors/__init__.py b/mmdet3d/models/detectors/__init__.py new file mode 100644 index 0000000000..15fb21656e --- /dev/null +++ b/mmdet3d/models/detectors/__init__.py @@ -0,0 +1,14 @@ +from .base import BaseDetector +from .mvx_faster_rcnn import (DynamicMVXFasterRCNN, DynamicMVXFasterRCNNV2, + DynamicMVXFasterRCNNV3) +from .mvx_single_stage import MVXSingleStageDetector +from .mvx_two_stage import MVXTwoStageDetector +from .single_stage import SingleStageDetector +from .two_stage import TwoStageDetector +from .voxelnet import DynamicVoxelNet, VoxelNet + +__all__ = [ + 'BaseDetector', 'SingleStageDetector', 'VoxelNet', 'DynamicVoxelNet', + 'TwoStageDetector', 'MVXSingleStageDetector', 'MVXTwoStageDetector', + 'DynamicMVXFasterRCNN', 'DynamicMVXFasterRCNNV2', 'DynamicMVXFasterRCNNV3' +] diff --git a/mmdet3d/models/detectors/base.py b/mmdet3d/models/detectors/base.py new file mode 100644 index 0000000000..83df170c51 --- /dev/null +++ b/mmdet3d/models/detectors/base.py @@ -0,0 +1,110 @@ +from abc import ABCMeta, abstractmethod + +import torch.nn as nn + + +class BaseDetector(nn.Module, metaclass=ABCMeta): + """Base class for detectors""" + + def __init__(self): + super(BaseDetector, self).__init__() + self.fp16_enabled = False + + @property + def with_neck(self): + return hasattr(self, 'neck') and self.neck is not None + + @property + def with_voxel_encoder(self): + return hasattr(self, + 'voxel_encoder') and self.voxel_encoder is not None + + @property + def with_middle_encoder(self): + return hasattr(self, + 'middle_encoder') and self.middle_encoder is not None + + @property + def with_shared_head(self): + return hasattr(self, 'shared_head') and self.shared_head is not None + + @property + def with_bbox(self): + return hasattr(self, 'bbox_head') and self.bbox_head is not None + + @property + def with_mask(self): + return hasattr(self, 'mask_head') and self.mask_head is not None + + @abstractmethod + def extract_feat(self, imgs): + pass + + def extract_feats(self, imgs): + assert isinstance(imgs, list) + for img in imgs: + yield self.extract_feat(img) + + @abstractmethod + def forward_train(self, **kwargs): + pass + + @abstractmethod + def simple_test(self, **kwargs): + pass + + @abstractmethod + def aug_test(self, **kwargs): + pass + + def init_weights(self, pretrained=None): + if pretrained is not None: + from mmdet3d.apis import get_root_logger + logger = get_root_logger() + logger.info('load model from: {}'.format(pretrained)) + + def forward_test(self, imgs, img_metas, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_meta (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) 
and the inner list indicates + images in a batch + """ + for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError( + 'num of augmentations ({}) != num of image meta ({})'.format( + len(imgs), len(img_metas))) + # TODO: remove the restriction of imgs_per_gpu == 1 when prepared + imgs_per_gpu = imgs[0].size(0) + assert imgs_per_gpu == 1 + + if num_augs == 1: + return self.simple_test(imgs[0], img_metas[0], **kwargs) + else: + return self.aug_test(imgs, img_metas, **kwargs) + + def forward(self, img, img_meta, return_loss=True, **kwargs): + """ + Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + """ + + # TODO: current version only support 2D detector now, find + # a better way to be compatible with both + if return_loss: + return self.forward_train(img, img_meta, **kwargs) + else: + return self.forward_test(img, img_meta, **kwargs) diff --git a/mmdet3d/models/detectors/mvx_faster_rcnn.py b/mmdet3d/models/detectors/mvx_faster_rcnn.py new file mode 100644 index 0000000000..34cdb420bc --- /dev/null +++ b/mmdet3d/models/detectors/mvx_faster_rcnn.py @@ -0,0 +1,103 @@ +import torch +import torch.nn.functional as F + +from mmdet.models.registry import DETECTORS +from .mvx_two_stage import MVXTwoStageDetector + + +@DETECTORS.register_module +class DynamicMVXFasterRCNN(MVXTwoStageDetector): + + def __init__(self, **kwargs): + super(DynamicMVXFasterRCNN, self).__init__(**kwargs) + + def extract_pts_feat(self, points, img_feats, img_meta): + if not self.with_pts_bbox: + return None + voxels, coors = self.voxelize(points) + # adopt an early fusion strategy + if self.with_fusion: + voxels = self.pts_fusion_layer(img_feats, points, voxels, img_meta) + voxel_features, feature_coors = self.pts_voxel_encoder(voxels, coors) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + @torch.no_grad() + def voxelize(self, points): + coors = [] + # dynamic voxelization only provide a coors mapping + for res in points: + res_coors = self.pts_voxel_layer(res) + coors.append(res_coors) + points = torch.cat(points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return points, coors_batch + + +@DETECTORS.register_module +class DynamicMVXFasterRCNNV2(DynamicMVXFasterRCNN): + + def __init__(self, **kwargs): + super(DynamicMVXFasterRCNNV2, self).__init__(**kwargs) + + def extract_pts_feat(self, points, img_feats, img_meta): + if not self.with_pts_bbox: + return None + voxels, coors = self.voxelize(points) + voxel_features, feature_coors = self.pts_voxel_encoder( + voxels, coors, points, img_feats, img_meta) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + 
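+# Illustrative sketch (added for clarity, not part of the original patch):
+# how the dynamic variants above consume ``voxelize``. The tensor shapes are
+# assumptions for a batch of two point clouds with 4 features per point.
+#
+#   points = [cloud_a, cloud_b]            # e.g. (1000, 4) and (800, 4)
+#   pts, coors = self.voxelize(points)
+#   # pts:   (1800, 4)  all points concatenated over the batch
+#   # coors: (1800, 4)  (batch_idx, z, y, x) after F.pad(..., value=i)
+#   feats, feat_coors = self.pts_voxel_encoder(pts, coors)
+#   batch_size = coors[-1, 0] + 1          # assumes coors ordered by batch
+
+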
+@DETECTORS.register_module +class MVXFasterRCNNV2(MVXTwoStageDetector): + + def __init__(self, **kwargs): + super(MVXFasterRCNNV2, self).__init__(**kwargs) + + def extract_pts_feat(self, pts, img_feats, img_meta): + if not self.with_pts_bbox: + return None + voxels, num_points, coors = self.voxelize(pts) + voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, + img_feats, img_meta) + + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, coors, batch_size) + x = self.pts_backbone(x) + + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + +@DETECTORS.register_module +class DynamicMVXFasterRCNNV3(DynamicMVXFasterRCNN): + + def __init__(self, **kwargs): + super(DynamicMVXFasterRCNNV3, self).__init__(**kwargs) + + def extract_pts_feat(self, points, img_feats, img_meta): + if not self.with_pts_bbox: + return None + voxels, coors = self.voxelize(points) + voxel_features, feature_coors = self.pts_voxel_encoder(voxels, coors) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x, coors, points, img_feats, img_meta) + return x diff --git a/mmdet3d/models/detectors/mvx_single_stage.py b/mmdet3d/models/detectors/mvx_single_stage.py new file mode 100644 index 0000000000..5bb7890d7a --- /dev/null +++ b/mmdet3d/models/detectors/mvx_single_stage.py @@ -0,0 +1,330 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet3d.ops import Voxelization +from mmdet.models.registry import DETECTORS +from .. import builder +from .base import BaseDetector + + +@DETECTORS.register_module +class MVXSingleStageDetector(BaseDetector): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + fusion_layer, + img_backbone, + pts_backbone, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(MVXSingleStageDetector, self).__init__() + self.voxel_layer = Voxelization(**voxel_layer) + self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) + self.middle_encoder = builder.build_middle_encoder(middle_encoder) + self.pts_backbone = builder.build_backbone(pts_backbone) + + if fusion_layer: + self.fusion_layer = builder.build_fusion_layer(fusion_layer) + if img_backbone: + self.img_backbone = builder.build_backbone(img_backbone) + + pts_bbox_head.update(train_cfg=train_cfg) + pts_bbox_head.update(test_cfg=test_cfg) + self.pts_bbox_head = builder.build_head(pts_bbox_head) + if img_neck is not None: + self.img_neck = builder.build_neck(img_neck) + if pts_neck is not None: + self.pts_neck = builder.build_neck(pts_neck) + if img_bbox_head is not None: + self.img_bbox_head = builder.build_head(img_bbox_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + super(MVXSingleStageDetector, self).init_weights(pretrained) + if self.with_img_backbone: + self.img_backbone.init_weights(pretrained=pretrained) + if self.with_img_neck: + if isinstance(self.img_neck, nn.Sequential): + for m in self.img_neck: + m.init_weights() + else: + self.img_neck.init_weights() + if self.with_img_bbox: + self.img_bbox_head.init_weights() + if self.with_pts_bbox: + self.pts_bbox_head.init_weights() + + @property + def with_pts_bbox(self): + return hasattr(self, + 'pts_bbox_head') and self.pts_bbox_head is not None + + @property + def with_img_bbox(self): + 
return hasattr(self, + 'img_bbox_head') and self.img_bbox_head is not None + + @property + def with_img_backbone(self): + return hasattr(self, 'img_backbone') and self.img_backbone is not None + + @property + def with_fusion(self): + return hasattr(self, 'fusion_layer') and self.fusion_layer is not None + + @property + def with_img_neck(self): + return hasattr(self, 'img_neck') and self.img_neck is not None + + @property + def with_pts_neck(self): + return hasattr(self, 'pts_neck') and self.pts_neck is not None + + def extract_feat(self, points, img, img_meta): + if self.with_img_backbone: + img_feats = self.img_backbone(img) + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + voxels, num_points, coors = self.voxelize(points) + voxel_features = self.voxel_encoder(voxels, num_points, coors) + batch_size = coors[-1, 0] + 1 + x = self.middle_encoder(voxel_features, coors, batch_size) + x = self.pts_backbone(x) + if self.with_neck: + x = self.pts_neck(x) + return x + + @torch.no_grad() + def voxelize(self, points): + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train(self, + points, + img_meta, + gt_bboxes_3d, + gt_labels, + img=None, + gt_bboxes_ignore=None): + x = self.extract_feat(points, img=img, img_meta=img_meta) + outs = self.pts_bbox_head(x) + loss_inputs = outs + (gt_bboxes_3d, gt_labels, img_meta) + losses = self.pts_bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def forward_test(self, **kwargs): + return self.simple_test(**kwargs) + + def forward(self, return_loss=True, **kwargs): + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def simple_test(self, + points, + img_meta, + img=None, + gt_bboxes_3d=None, + rescale=False): + x = self.extract_feat(points, img, img_meta) + outs = self.pts_bbox_head(x) + bbox_inputs = outs + (img_meta, rescale) + bbox_list = self.pts_bbox_head.get_bboxes(*bbox_inputs) + return bbox_list + + def aug_test(self, points, imgs, img_metas, rescale=False): + raise NotImplementedError + + +@DETECTORS.register_module +class DynamicMVXNet(MVXSingleStageDetector): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + pts_backbone, + fusion_layer=None, + img_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(DynamicMVXNet, self).__init__( + voxel_layer=voxel_layer, + voxel_encoder=voxel_encoder, + middle_encoder=middle_encoder, + img_backbone=img_backbone, + fusion_layer=fusion_layer, + pts_backbone=pts_backbone, + pts_neck=pts_neck, + img_neck=img_neck, + img_bbox_head=img_bbox_head, + pts_bbox_head=pts_bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + ) + + def extract_feat(self, points, img, img_meta): + if self.with_img_backbone: + img_feats = self.img_backbone(img) + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + voxels, coors = self.voxelize(points) + # adopt an early fusion strategy + if 
self.with_fusion: + voxels = self.fusion_layer(img_feats, points, voxels, img_meta) + + voxel_features, feature_coors = self.voxel_encoder(voxels, coors) + batch_size = coors[-1, 0] + 1 + x = self.middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + @torch.no_grad() + def voxelize(self, points): + coors = [] + # dynamic voxelization only provide a coors mapping + for res in points: + res_coors = self.voxel_layer(res) + coors.append(res_coors) + points = torch.cat(points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return points, coors_batch + + +@DETECTORS.register_module +class DynamicMVXNetV2(DynamicMVXNet): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + pts_backbone, + fusion_layer=None, + img_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(DynamicMVXNetV2, self).__init__( + voxel_layer=voxel_layer, + voxel_encoder=voxel_encoder, + middle_encoder=middle_encoder, + img_backbone=img_backbone, + fusion_layer=fusion_layer, + pts_backbone=pts_backbone, + pts_neck=pts_neck, + img_neck=img_neck, + img_bbox_head=img_bbox_head, + pts_bbox_head=pts_bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + ) + + def extract_feat(self, points, img, img_meta): + if self.with_img_backbone: + img_feats = self.img_backbone(img) + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + voxels, coors = self.voxelize(points) + + voxel_features, feature_coors = self.voxel_encoder( + voxels, coors, points, img_feats, img_meta) + batch_size = coors[-1, 0] + 1 + x = self.middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + +@DETECTORS.register_module +class DynamicMVXNetV3(DynamicMVXNet): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + pts_backbone, + fusion_layer=None, + img_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(DynamicMVXNetV3, self).__init__( + voxel_layer=voxel_layer, + voxel_encoder=voxel_encoder, + middle_encoder=middle_encoder, + img_backbone=img_backbone, + fusion_layer=fusion_layer, + pts_backbone=pts_backbone, + pts_neck=pts_neck, + img_neck=img_neck, + img_bbox_head=img_bbox_head, + pts_bbox_head=pts_bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + ) + + def extract_feat(self, points, img, img_meta): + if self.with_img_backbone: + img_feats = self.img_backbone(img) + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + voxels, coors = self.voxelize(points) + voxel_features, feature_coors = self.voxel_encoder(voxels, coors) + batch_size = coors[-1, 0] + 1 + x = self.middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x, coors, points, img_feats, img_meta) + return x diff --git a/mmdet3d/models/detectors/mvx_two_stage.py b/mmdet3d/models/detectors/mvx_two_stage.py new file mode 100644 index 0000000000..b085c632a2 --- /dev/null +++ b/mmdet3d/models/detectors/mvx_two_stage.py @@ -0,0 +1,376 @@ +import torch +import torch.nn as nn 
+import torch.nn.functional as F + +from mmdet3d.core import (bbox2result_coco, bbox2roi, build_assigner, + build_sampler) +from mmdet3d.ops import Voxelization +from mmdet.models.registry import DETECTORS +from .. import builder +from .base import BaseDetector +from .test_mixins import BBoxTestMixin, RPNTestMixin + + +@DETECTORS.register_module +class MVXTwoStageDetector(BaseDetector, RPNTestMixin, BBoxTestMixin): + + def __init__(self, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_bbox_head=None, + img_shared_head=None, + img_rpn_head=None, + img_bbox_roi_extractor=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(MVXTwoStageDetector, self).__init__() + + if pts_voxel_layer: + self.pts_voxel_layer = Voxelization(**pts_voxel_layer) + if pts_voxel_encoder: + self.pts_voxel_encoder = builder.build_voxel_encoder( + pts_voxel_encoder) + if pts_middle_encoder: + self.pts_middle_encoder = builder.build_middle_encoder( + pts_middle_encoder) + if pts_backbone: + self.pts_backbone = builder.build_backbone(pts_backbone) + if pts_fusion_layer: + self.pts_fusion_layer = builder.build_fusion_layer( + pts_fusion_layer) + if pts_neck is not None: + self.pts_neck = builder.build_neck(pts_neck) + if pts_bbox_head: + pts_train_cfg = train_cfg.pts if train_cfg else None + pts_bbox_head.update(train_cfg=pts_train_cfg) + pts_test_cfg = test_cfg.pts if test_cfg else None + pts_bbox_head.update(test_cfg=pts_test_cfg) + self.pts_bbox_head = builder.build_head(pts_bbox_head) + + if img_backbone: + self.img_backbone = builder.build_backbone(img_backbone) + if img_neck is not None: + self.img_neck = builder.build_neck(img_neck) + if img_shared_head is not None: + self.img_shared_head = builder.build_shared_head(img_shared_head) + if img_rpn_head is not None: + self.img_rpn_head = builder.build_head(img_rpn_head) + if img_bbox_head is not None: + self.img_bbox_roi_extractor = builder.build_roi_extractor( + img_bbox_roi_extractor) + self.img_bbox_head = builder.build_head(img_bbox_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + super(MVXTwoStageDetector, self).init_weights(pretrained) + if self.with_img_backbone: + self.img_backbone.init_weights(pretrained=pretrained) + if self.with_img_neck: + if isinstance(self.img_neck, nn.Sequential): + for m in self.img_neck: + m.init_weights() + else: + self.img_neck.init_weights() + if self.with_shared_head: + self.img_shared_head.init_weights(pretrained=pretrained) + if self.with_img_rpn: + self.img_rpn_head.init_weights() + if self.with_img_bbox: + self.img_bbox_roi_extractor.init_weights() + self.img_bbox_head.init_weights() + if self.with_pts_bbox: + self.pts_bbox_head.init_weights() + + @property + def with_img_shared_head(self): + return hasattr(self, + 'img_shared_head') and self.img_shared_head is not None + + @property + def with_pts_bbox(self): + return hasattr(self, + 'pts_bbox_head') and self.pts_bbox_head is not None + + @property + def with_img_bbox(self): + return hasattr(self, + 'img_bbox_head') and self.img_bbox_head is not None + + @property + def with_img_backbone(self): + return hasattr(self, 'img_backbone') and self.img_backbone is not None + + @property + def with_fusion(self): + return hasattr(self, + 'pts_fusion_layer') and self.fusion_layer is not None + + @property + def 
with_img_neck(self): + return hasattr(self, 'img_neck') and self.img_neck is not None + + @property + def with_pts_neck(self): + return hasattr(self, 'pts_neck') and self.pts_neck is not None + + @property + def with_img_rpn(self): + return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None + + def extract_img_feat(self, img, img_meta): + if self.with_img_backbone: + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.view(B * N, C, H, W) + img_feats = self.img_backbone(img) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + if torch.isnan(img_feats[0]).any(): + import pdb + pdb.set_trace() + return img_feats + + def extract_pts_feat(self, pts, img_feats, img_meta): + if not self.with_pts_bbox: + return None + voxels, num_points, coors = self.voxelize(pts) + voxel_features = self.pts_voxel_encoder(voxels, num_points, coors) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + def extract_feat(self, points, img, img_meta): + img_feats = self.extract_img_feat(img, img_meta) + pts_feats = self.extract_pts_feat(points, img_feats, img_meta) + return (img_feats, pts_feats) + + @torch.no_grad() + def voxelize(self, points): + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train(self, + points=None, + img_meta=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None): + img_feats, pts_feats = self.extract_feat( + points, img=img, img_meta=img_meta) + losses = dict() + if pts_feats: + losses_pts = self.forward_pts_train(pts_feats, gt_bboxes_3d, + gt_labels_3d, img_meta, + gt_bboxes_ignore) + losses.update(losses_pts) + if img_feats: + losses_img = self.forward_img_train( + img_feats, + img_meta=img_meta, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_bboxes_ignore=gt_bboxes_ignore, + proposals=proposals, + ) + losses.update(losses_img) + return losses + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + img_meta, + gt_bboxes_ignore=None): + outs = self.pts_bbox_head(pts_feats) + loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_meta) + losses = self.pts_bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def forward_img_train(self, + x, + img_meta, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + proposals=None): + losses = dict() + # RPN forward and loss + if self.with_img_rpn: + rpn_outs = self.img_rpn_head(x) + rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, + self.train_cfg.img_rpn) + rpn_losses = self.img_rpn_head.loss( + *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + losses.update(rpn_losses) + + proposal_cfg = self.train_cfg.get('img_rpn_proposal', + self.test_cfg.img_rpn) + proposal_inputs = rpn_outs + (img_meta, proposal_cfg) + proposal_list = 
self.img_rpn_head.get_bboxes(*proposal_inputs) + else: + proposal_list = proposals + + # assign gts and sample proposals + if self.with_img_bbox: + bbox_assigner = build_assigner(self.train_cfg.img_rcnn.assigner) + bbox_sampler = build_sampler( + self.train_cfg.img_rcnn.sampler, context=self) + num_imgs = len(img_meta) + if gt_bboxes_ignore is None: + gt_bboxes_ignore = [None for _ in range(num_imgs)] + sampling_results = [] + for i in range(num_imgs): + assign_result = bbox_assigner.assign(proposal_list[i], + gt_bboxes[i], + gt_bboxes_ignore[i], + gt_labels[i]) + sampling_result = bbox_sampler.sample( + assign_result, + proposal_list[i], + gt_bboxes[i], + gt_labels[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + if self.with_img_bbox: + rois = bbox2roi([res.bboxes for res in sampling_results]) + # TODO: a more flexible way to decide which feature maps to use + bbox_feats = self.img_bbox_roi_extractor( + x[:self.img_bbox_roi_extractor.num_inputs], rois) + if self.with_shared_head: + bbox_feats = self.img_shared_head(bbox_feats) + cls_score, bbox_pred = self.img_bbox_head(bbox_feats) + + bbox_targets = self.img_bbox_head.get_target( + sampling_results, gt_bboxes, gt_labels, + self.train_cfg.img_rcnn) + loss_bbox = self.img_bbox_head.loss(cls_score, bbox_pred, + *bbox_targets) + losses.update(loss_bbox) + + return losses + + def forward_test(self, **kwargs): + return self.simple_test(**kwargs) + + def forward(self, return_loss=True, **kwargs): + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def simple_test_img(self, x, img_meta, proposals=None, rescale=False): + """Test without augmentation.""" + if proposals is None: + proposal_list = self.simple_test_rpn(x, img_meta, + self.test_cfg.img_rpn) + else: + proposal_list = proposals + + det_bboxes, det_labels = self.simple_test_bboxes( + x, + img_meta, + proposal_list, + self.test_cfg.img_rcnn, + rescale=rescale) + bbox_results = bbox2result_coco(det_bboxes, det_labels, + self.img_bbox_head.num_classes) + + return bbox_results + + def simple_test_bboxes(self, + x, + img_meta, + proposals, + rcnn_test_cfg, + rescale=False): + """Test only det bboxes without augmentation.""" + rois = bbox2roi(proposals) + roi_feats = self.img_bbox_roi_extractor( + x[:len(self.img_bbox_roi_extractor.featmap_strides)], rois) + if self.with_img_shared_head: + roi_feats = self.img_shared_head(roi_feats) + cls_score, bbox_pred = self.img_bbox_head(roi_feats) + + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + det_bboxes, det_labels = self.img_bbox_head.get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=rescale, + cfg=rcnn_test_cfg) + return det_bboxes, det_labels + + def simple_test_rpn(self, x, img_meta, rpn_test_cfg): + rpn_outs = self.img_rpn_head(x) + proposal_inputs = rpn_outs + (img_meta, rpn_test_cfg) + proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) + return proposal_list + + def simple_test_pts(self, x, img_meta, rescale=False): + outs = self.pts_bbox_head(x) + bbox_inputs = outs + (img_meta, rescale) + bbox_list = self.pts_bbox_head.get_bboxes(*bbox_inputs) + return bbox_list + + def simple_test(self, + points, + img_meta, + img=None, + gt_bboxes_3d=None, + rescale=False): + img_feats, pts_feats = self.extract_feat( + points, img=img, img_meta=img_meta) + + bbox_list = dict() + if pts_feats and self.with_pts_bbox: + bbox_pts = 
self.simple_test_pts( + pts_feats, img_meta, rescale=rescale) + bbox_list.update(pts_bbox=bbox_pts) + if img_feats and self.with_img_bbox: + bbox_img = self.simple_test_img( + img_feats, img_meta, rescale=rescale) + bbox_list.update(img_bbox=bbox_img) + return bbox_list + + def aug_test(self, points, imgs, img_metas, rescale=False): + raise NotImplementedError diff --git a/mmdet3d/models/detectors/single_stage.py b/mmdet3d/models/detectors/single_stage.py new file mode 100644 index 0000000000..e10eb25b61 --- /dev/null +++ b/mmdet3d/models/detectors/single_stage.py @@ -0,0 +1,89 @@ +import torch.nn as nn + +from mmdet3d.core import bbox2result_coco +from mmdet.models.registry import DETECTORS +from .. import builder +from .base import BaseDetector + + +@DETECTORS.register_module +class SingleStageDetector(BaseDetector): + """Base class for single-stage detectors. + + Single-stage detectors directly and densely predict bounding boxes on the + output features of the backbone+neck. + """ + + def __init__(self, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(SingleStageDetector, self).__init__() + self.backbone = builder.build_backbone(backbone) + if neck is not None: + self.neck = builder.build_neck(neck) + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = builder.build_head(bbox_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + super(SingleStageDetector, self).init_weights(pretrained) + self.backbone.init_weights(pretrained=pretrained) + if self.with_neck: + if isinstance(self.neck, nn.Sequential): + for m in self.neck: + m.init_weights() + else: + self.neck.init_weights() + self.bbox_head.init_weights() + + def extract_feat(self, img): + """Directly extract features from the backbone+neck + """ + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def forward_dummy(self, img): + """Used for computing network flops. 
+ + See `mmedetection/tools/get_flops.py` + """ + x = self.extract_feat(img) + outs = self.bbox_head(x) + return outs + + def forward_train(self, + img, + img_metas, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None): + x = self.extract_feat(img) + outs = self.bbox_head(x) + loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, self.train_cfg) + losses = self.bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def simple_test(self, img, img_meta, rescale=False): + x = self.extract_feat(img) + outs = self.bbox_head(x) + bbox_inputs = outs + (img_meta, self.test_cfg, rescale) + bbox_list = self.bbox_head.get_bboxes(*bbox_inputs) + bbox_results = [ + bbox2result_coco(det_bboxes, det_labels, + self.bbox_head.num_classes) + for det_bboxes, det_labels in bbox_list + ] + return bbox_results[0] + + def aug_test(self, imgs, img_metas, rescale=False): + raise NotImplementedError diff --git a/mmdet3d/models/detectors/test_mixins.py b/mmdet3d/models/detectors/test_mixins.py new file mode 100644 index 0000000000..a457e523f3 --- /dev/null +++ b/mmdet3d/models/detectors/test_mixins.py @@ -0,0 +1,266 @@ +import logging +import sys + +import torch + +from mmdet3d.core import (bbox2roi, bbox_mapping, merge_aug_bboxes, + merge_aug_masks, merge_aug_proposals, multiclass_nms) + +logger = logging.getLogger(__name__) + +if sys.version_info >= (3, 7): + from mmdet3d.utils.contextmanagers import completed + + +class RPNTestMixin(object): + + if sys.version_info >= (3, 7): + + async def async_test_rpn(self, x, img_meta, rpn_test_cfg): + sleep_interval = rpn_test_cfg.pop('async_sleep_interval', 0.025) + async with completed( + __name__, 'rpn_head_forward', + sleep_interval=sleep_interval): + rpn_outs = self.rpn_head(x) + + proposal_inputs = rpn_outs + (img_meta, rpn_test_cfg) + + proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) + return proposal_list + + def simple_test_rpn(self, x, img_meta, rpn_test_cfg): + rpn_outs = self.rpn_head(x) + proposal_inputs = rpn_outs + (img_meta, rpn_test_cfg) + proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) + return proposal_list + + def aug_test_rpn(self, feats, img_metas, rpn_test_cfg): + imgs_per_gpu = len(img_metas[0]) + aug_proposals = [[] for _ in range(imgs_per_gpu)] + for x, img_meta in zip(feats, img_metas): + proposal_list = self.simple_test_rpn(x, img_meta, rpn_test_cfg) + for i, proposals in enumerate(proposal_list): + aug_proposals[i].append(proposals) + # reorganize the order of 'img_metas' to match the dimensions + # of 'aug_proposals' + aug_img_metas = [] + for i in range(imgs_per_gpu): + aug_img_meta = [] + for j in range(len(img_metas)): + aug_img_meta.append(img_metas[j][i]) + aug_img_metas.append(aug_img_meta) + # after merging, proposals will be rescaled to the original image size + merged_proposals = [ + merge_aug_proposals(proposals, aug_img_meta, rpn_test_cfg) + for proposals, aug_img_meta in zip(aug_proposals, aug_img_metas) + ] + return merged_proposals + + +class BBoxTestMixin(object): + + if sys.version_info >= (3, 7): + + async def async_test_bboxes(self, + x, + img_meta, + proposals, + rcnn_test_cfg, + rescale=False, + bbox_semaphore=None, + global_lock=None): + """Async test only det bboxes without augmentation.""" + rois = bbox2roi(proposals) + roi_feats = self.bbox_roi_extractor( + x[:len(self.bbox_roi_extractor.featmap_strides)], rois) + if self.with_shared_head: + roi_feats = self.shared_head(roi_feats) + sleep_interval = rcnn_test_cfg.get('async_sleep_interval', 0.017) + + async with 
completed( + __name__, 'bbox_head_forward', + sleep_interval=sleep_interval): + cls_score, bbox_pred = self.bbox_head(roi_feats) + + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + det_bboxes, det_labels = self.bbox_head.get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=rescale, + cfg=rcnn_test_cfg) + return det_bboxes, det_labels + + def simple_test_bboxes(self, + x, + img_meta, + proposals, + rcnn_test_cfg, + rescale=False): + """Test only det bboxes without augmentation.""" + rois = bbox2roi(proposals) + roi_feats = self.bbox_roi_extractor( + x[:len(self.bbox_roi_extractor.featmap_strides)], rois) + if self.with_shared_head: + roi_feats = self.shared_head(roi_feats) + cls_score, bbox_pred = self.bbox_head(roi_feats) + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + det_bboxes, det_labels = self.bbox_head.get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=rescale, + cfg=rcnn_test_cfg) + return det_bboxes, det_labels + + def aug_test_bboxes(self, feats, img_metas, proposal_list, rcnn_test_cfg): + aug_bboxes = [] + aug_scores = [] + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + # TODO more flexible + proposals = bbox_mapping(proposal_list[0][:, :4], img_shape, + scale_factor, flip) + rois = bbox2roi([proposals]) + # recompute feature maps to save GPU memory + roi_feats = self.bbox_roi_extractor( + x[:len(self.bbox_roi_extractor.featmap_strides)], rois) + if self.with_shared_head: + roi_feats = self.shared_head(roi_feats) + cls_score, bbox_pred = self.bbox_head(roi_feats) + bboxes, scores = self.bbox_head.get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=False, + cfg=None) + aug_bboxes.append(bboxes) + aug_scores.append(scores) + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas, rcnn_test_cfg) + det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img) + return det_bboxes, det_labels + + +class MaskTestMixin(object): + + if sys.version_info >= (3, 7): + + async def async_test_mask(self, + x, + img_meta, + det_bboxes, + det_labels, + rescale=False, + mask_test_cfg=None): + # image shape of the first image in the batch (only one) + ori_shape = img_meta[0]['ori_shape'] + scale_factor = img_meta[0]['scale_factor'] + if det_bboxes.shape[0] == 0: + segm_result = [[] + for _ in range(self.mask_head.num_classes - 1)] + else: + _bboxes = ( + det_bboxes[:, :4] * + scale_factor if rescale else det_bboxes) + mask_rois = bbox2roi([_bboxes]) + mask_feats = self.mask_roi_extractor( + x[:len(self.mask_roi_extractor.featmap_strides)], + mask_rois) + + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + if mask_test_cfg and mask_test_cfg.get('async_sleep_interval'): + sleep_interval = mask_test_cfg['async_sleep_interval'] + else: + sleep_interval = 0.035 + async with completed( + __name__, + 'mask_head_forward', + sleep_interval=sleep_interval): + mask_pred = self.mask_head(mask_feats) + segm_result = self.mask_head.get_seg_masks( + mask_pred, _bboxes, det_labels, self.test_cfg.rcnn, + ori_shape, scale_factor, rescale) + return segm_result + + def simple_test_mask(self, + 
x, + img_meta, + det_bboxes, + det_labels, + rescale=False): + # image shape of the first image in the batch (only one) + ori_shape = img_meta[0]['ori_shape'] + scale_factor = img_meta[0]['scale_factor'] + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes)] + else: + # if det_bboxes is rescaled to the original image size, we need to + # rescale it back to the testing scale to obtain RoIs. + if rescale and not isinstance(scale_factor, float): + scale_factor = torch.from_numpy(scale_factor).to( + det_bboxes.device) + _bboxes = ( + det_bboxes[:, :4] * scale_factor if rescale else det_bboxes) + mask_rois = bbox2roi([_bboxes]) + mask_feats = self.mask_roi_extractor( + x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois) + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + mask_pred = self.mask_head(mask_feats) + segm_result = self.mask_head.get_seg_masks(mask_pred, _bboxes, + det_labels, + self.test_cfg.rcnn, + ori_shape, scale_factor, + rescale) + return segm_result + + def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels): + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes)] + else: + aug_masks = [] + for x, img_meta in zip(feats, img_metas): + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, + scale_factor, flip) + mask_rois = bbox2roi([_bboxes]) + mask_feats = self.mask_roi_extractor( + x[:len(self.mask_roi_extractor.featmap_strides)], + mask_rois) + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + mask_pred = self.mask_head(mask_feats) + # convert to numpy array to save memory + aug_masks.append(mask_pred.sigmoid().cpu().numpy()) + merged_masks = merge_aug_masks(aug_masks, img_metas, + self.test_cfg.rcnn) + + ori_shape = img_metas[0][0]['ori_shape'] + segm_result = self.mask_head.get_seg_masks( + merged_masks, + det_bboxes, + det_labels, + self.test_cfg.rcnn, + ori_shape, + scale_factor=1.0, + rescale=False) + return segm_result diff --git a/mmdet3d/models/detectors/two_stage.py b/mmdet3d/models/detectors/two_stage.py new file mode 100644 index 0000000000..91a0e1ba33 --- /dev/null +++ b/mmdet3d/models/detectors/two_stage.py @@ -0,0 +1,314 @@ +import torch +import torch.nn as nn + +from mmdet3d.core import (bbox2result_coco, bbox2roi, build_assigner, + build_sampler) +from mmdet.models.registry import DETECTORS +from .. import builder +from .base import BaseDetector +from .test_mixins import BBoxTestMixin, MaskTestMixin, RPNTestMixin + + +@DETECTORS.register_module +class TwoStageDetector(BaseDetector, RPNTestMixin, BBoxTestMixin, + MaskTestMixin): + """Base class for two-stage detectors. + + Two-stage detectors typically consisting of a region proposal network and a + task-specific regression head. 
+ """ + + def __init__(self, + backbone, + neck=None, + shared_head=None, + rpn_head=None, + bbox_roi_extractor=None, + bbox_head=None, + mask_roi_extractor=None, + mask_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(TwoStageDetector, self).__init__() + self.backbone = builder.build_backbone(backbone) + + if neck is not None: + self.neck = builder.build_neck(neck) + + if shared_head is not None: + self.shared_head = builder.build_shared_head(shared_head) + + if rpn_head is not None: + self.rpn_head = builder.build_head(rpn_head) + + if bbox_head is not None: + self.bbox_roi_extractor = builder.build_roi_extractor( + bbox_roi_extractor) + self.bbox_head = builder.build_head(bbox_head) + + if mask_head is not None: + if mask_roi_extractor is not None: + self.mask_roi_extractor = builder.build_roi_extractor( + mask_roi_extractor) + self.share_roi_extractor = False + else: + self.share_roi_extractor = True + self.mask_roi_extractor = self.bbox_roi_extractor + self.mask_head = builder.build_head(mask_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self.init_weights(pretrained=pretrained) + + @property + def with_rpn(self): + return hasattr(self, 'rpn_head') and self.rpn_head is not None + + def init_weights(self, pretrained=None): + super(TwoStageDetector, self).init_weights(pretrained) + self.backbone.init_weights(pretrained=pretrained) + if self.with_neck: + if isinstance(self.neck, nn.Sequential): + for m in self.neck: + m.init_weights() + else: + self.neck.init_weights() + if self.with_shared_head: + self.shared_head.init_weights(pretrained=pretrained) + if self.with_rpn: + self.rpn_head.init_weights() + if self.with_bbox: + self.bbox_roi_extractor.init_weights() + self.bbox_head.init_weights() + if self.with_mask: + self.mask_head.init_weights() + if not self.share_roi_extractor: + self.mask_roi_extractor.init_weights() + + def extract_feat(self, img): + """Directly extract features from the backbone+neck + """ + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def forward_dummy(self, img): + """Used for computing network flops. + + See `mmedetection/tools/get_flops.py` + """ + outs = () + # backbone + x = self.extract_feat(img) + # rpn + if self.with_rpn: + rpn_outs = self.rpn_head(x) + outs = outs + (rpn_outs, ) + proposals = torch.randn(1000, 4).cuda() + # bbox head + rois = bbox2roi([proposals]) + if self.with_bbox: + bbox_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + if self.with_shared_head: + bbox_feats = self.shared_head(bbox_feats) + cls_score, bbox_pred = self.bbox_head(bbox_feats) + outs = outs + (cls_score, bbox_pred) + # mask head + if self.with_mask: + mask_rois = rois[:100] + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], mask_rois) + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + mask_pred = self.mask_head(mask_feats) + outs = outs + (mask_pred, ) + return outs + + def forward_train(self, + img, + img_meta, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + gt_masks=None, + proposals=None): + """ + Args: + img (Tensor): of shape (N, C, H, W) encoding input images. + Typically these should be mean centered and std scaled. + + img_meta (list[dict]): list of image info dict where each dict has: + 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. 
+ For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + + gt_bboxes (list[Tensor]): each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + + gt_labels (list[Tensor]): class indices corresponding to each box + + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + gt_masks (None | Tensor) : true segmentation masks for each box + used if the architecture supports a segmentation task. + + proposals : override rpn proposals with custom proposals. Use when + `with_rpn` is False. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + x = self.extract_feat(img) + + losses = dict() + + # RPN forward and loss + if self.with_rpn: + rpn_outs = self.rpn_head(x) + rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, + self.train_cfg.rpn) + rpn_losses = self.rpn_head.loss( + *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + losses.update(rpn_losses) + + proposal_cfg = self.train_cfg.get('rpn_proposal', + self.test_cfg.rpn) + proposal_inputs = rpn_outs + (img_meta, proposal_cfg) + proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) + else: + proposal_list = proposals + + # assign gts and sample proposals + if self.with_bbox or self.with_mask: + bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) + bbox_sampler = build_sampler( + self.train_cfg.rcnn.sampler, context=self) + num_imgs = img.size(0) + if gt_bboxes_ignore is None: + gt_bboxes_ignore = [None for _ in range(num_imgs)] + sampling_results = [] + for i in range(num_imgs): + assign_result = bbox_assigner.assign(proposal_list[i], + gt_bboxes[i], + gt_bboxes_ignore[i], + gt_labels[i]) + sampling_result = bbox_sampler.sample( + assign_result, + proposal_list[i], + gt_bboxes[i], + gt_labels[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + if self.with_bbox: + rois = bbox2roi([res.bboxes for res in sampling_results]) + # TODO: a more flexible way to decide which feature maps to use + bbox_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + if self.with_shared_head: + bbox_feats = self.shared_head(bbox_feats) + cls_score, bbox_pred = self.bbox_head(bbox_feats) + + bbox_targets = self.bbox_head.get_target(sampling_results, + gt_bboxes, gt_labels, + self.train_cfg.rcnn) + loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, + *bbox_targets) + losses.update(loss_bbox) + + # mask head forward and loss + if self.with_mask: + if not self.share_roi_extractor: + pos_rois = bbox2roi( + [res.pos_bboxes for res in sampling_results]) + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], pos_rois) + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + else: + pos_inds = [] + device = bbox_feats.device + for res in sampling_results: + pos_inds.append( + torch.ones( + res.pos_bboxes.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds.append( + torch.zeros( + res.neg_bboxes.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds = torch.cat(pos_inds) + mask_feats = bbox_feats[pos_inds] + + if mask_feats.shape[0] > 0: + mask_pred = self.mask_head(mask_feats) + mask_targets = self.mask_head.get_target( + sampling_results, gt_masks, self.train_cfg.rcnn) + pos_labels = torch.cat( + [res.pos_gt_labels for res in sampling_results]) + loss_mask = self.mask_head.loss(mask_pred, mask_targets, + pos_labels) + losses.update(loss_mask) + + 
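+        # ``losses`` now aggregates the RPN, bbox and (optional) mask terms;
+        # the exact key names (e.g. 'loss_rpn_cls', 'loss_cls', 'loss_bbox',
+        # 'loss_mask') depend on the configured heads.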
return losses + + def simple_test(self, img, img_meta, proposals=None, rescale=False): + """Test without augmentation.""" + assert self.with_bbox, 'Bbox head must be implemented.' + x = self.extract_feat(img) + + if proposals is None: + proposal_list = self.simple_test_rpn(x, img_meta, + self.test_cfg.rpn) + else: + proposal_list = proposals + + det_bboxes, det_labels = self.simple_test_bboxes( + x, img_meta, proposal_list, self.test_cfg.rcnn, rescale=rescale) + bbox_results = bbox2result_coco(det_bboxes, det_labels, + self.bbox_head.num_classes) + + if not self.with_mask: + return bbox_results + else: + segm_results = self.simple_test_mask( + x, img_meta, det_bboxes, det_labels, rescale=rescale) + return bbox_results, segm_results + + def aug_test(self, imgs, img_metas, rescale=False): + """Test with augmentations. + + If rescale is False, then returned bboxes and masks will fit the scale + of imgs[0]. + """ + # recompute feats to save memory + proposal_list = self.aug_test_rpn( + self.extract_feats(imgs), img_metas, self.test_cfg.rpn) + det_bboxes, det_labels = self.aug_test_bboxes( + self.extract_feats(imgs), img_metas, proposal_list, + self.test_cfg.rcnn) + + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= img_metas[0][0]['scale_factor'] + bbox_results = bbox2result_coco(_det_bboxes, det_labels, + self.bbox_head.num_classes) + + # det_bboxes always keep the original scale + if self.with_mask: + segm_results = self.aug_test_mask( + self.extract_feats(imgs), img_metas, det_bboxes, det_labels) + return bbox_results, segm_results + else: + return bbox_results diff --git a/mmdet3d/models/detectors/voxelnet.py b/mmdet3d/models/detectors/voxelnet.py new file mode 100644 index 0000000000..5095cf51e1 --- /dev/null +++ b/mmdet3d/models/detectors/voxelnet.py @@ -0,0 +1,140 @@ +import torch +import torch.nn.functional as F + +from mmdet3d.ops import Voxelization +from mmdet.models.registry import DETECTORS +from .. 
import builder +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module +class VoxelNet(SingleStageDetector): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(VoxelNet, self).__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + ) + self.voxel_layer = Voxelization(**voxel_layer) + self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) + self.middle_encoder = builder.build_middle_encoder(middle_encoder) + + def extract_feat(self, points, img_meta): + voxels, num_points, coors = self.voxelize(points) + voxel_features = self.voxel_encoder(voxels, num_points, coors) + batch_size = coors[-1, 0].item() + 1 + x = self.middle_encoder(voxel_features, coors, batch_size) + x = self.backbone(x) + if self.with_neck: + x = self.neck(x) + return x + + @torch.no_grad() + def voxelize(self, points): + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train(self, + points, + img_meta, + gt_bboxes_3d, + gt_labels_3d, + gt_bboxes_ignore=None): + x = self.extract_feat(points, img_meta) + outs = self.bbox_head(x) + loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_meta) + losses = self.bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def forward_test(self, **kwargs): + return self.simple_test(**kwargs) + + def forward(self, return_loss=True, **kwargs): + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def simple_test(self, points, img_meta, gt_bboxes_3d=None, rescale=False): + x = self.extract_feat(points, img_meta) + outs = self.bbox_head(x) + bbox_inputs = outs + (img_meta, rescale) + bbox_list = self.bbox_head.get_bboxes(*bbox_inputs) + return bbox_list + + +@DETECTORS.register_module +class DynamicVoxelNet(VoxelNet): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(DynamicVoxelNet, self).__init__( + voxel_layer=voxel_layer, + voxel_encoder=voxel_encoder, + middle_encoder=middle_encoder, + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + ) + + def extract_feat(self, points, img_meta): + voxels, coors = self.voxelize(points) + voxel_features, feature_coors = self.voxel_encoder(voxels, coors) + batch_size = coors[-1, 0].item() + 1 + x = self.middle_encoder(voxel_features, feature_coors, batch_size) + x = self.backbone(x) + if self.with_neck: + x = self.neck(x) + return x + + @torch.no_grad() + def voxelize(self, points): + coors = [] + # dynamic voxelization only provide a coors mapping + for res in points: + res_coors = self.voxel_layer(res) + coors.append(res_coors) + points = torch.cat(points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), 
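+                # (1, 0) pads one column on the left of the last dimension
+                # and value=i fills it with the sample index, turning the
+                # per-sample (z, y, x) coords into (batch_idx, z, y, x).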
mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return points, coors_batch diff --git a/mmdet3d/models/fusion_layers/__init__.py b/mmdet3d/models/fusion_layers/__init__.py new file mode 100644 index 0000000000..93142ced2f --- /dev/null +++ b/mmdet3d/models/fusion_layers/__init__.py @@ -0,0 +1,3 @@ +from .point_fusion import PointFusion + +__all__ = ['PointFusion'] diff --git a/mmdet3d/models/fusion_layers/point_fusion.py b/mmdet3d/models/fusion_layers/point_fusion.py new file mode 100644 index 0000000000..005306f269 --- /dev/null +++ b/mmdet3d/models/fusion_layers/point_fusion.py @@ -0,0 +1,287 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init + +from mmdet3d.models.utils import ConvModule +from ..plugins import NonLocal2D +from ..registry import FUSION_LAYERS + + +def point_sample( + img_features, + points, + lidar2img_rt, + pcd_rotate_mat, + img_scale_factor, + img_crop_offset, + pcd_trans_factor, + pcd_scale_factor, + pcd_flip, + img_flip, + img_pad_shape, + img_shape, + aligned=True, + padding_mode='zeros', + align_corners=True, +): + """sample image features using point coordinates + + Arguments: + img_features (Tensor): 1xCxHxW image features + points (Tensor): Nx3 point cloud coordinates + P (Tensor): 4x4 transformation matrix + scale_factor (Tensor): scale_factor of images + img_pad_shape (int, int): int tuple indicates the h & w after padding, + this is necessary to obtain features in feature map + img_shape (int, int): int tuple indicates the h & w before padding + after scaling, this is necessary for flipping coordinates + return: + (Tensor): NxC image features sampled by point coordinates + """ + # aug order: flip -> trans -> scale -> rot + # The transformation follows the augmentation order in data pipeline + if pcd_flip: + # if the points are flipped, flip them back first + points[:, 1] = -points[:, 1] + + points -= pcd_trans_factor + # the points should be scaled to the original scale in velo coordinate + points /= pcd_scale_factor + # the points should be rotated back + # pcd_rotate_mat @ pcd_rotate_mat.inverse() is not exactly an identity + # matrix, use angle to create the inverse rot matrix neither. 
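+    # In other words, right-multiplying by the numerical inverse undoes the
+    # rotation applied during augmentation, up to floating point error.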
+ points = points @ pcd_rotate_mat.inverse() + + # project points from velo coordinate to camera coordinate + num_points = points.shape[0] + pts_4d = torch.cat([points, points.new_ones(size=(num_points, 1))], dim=-1) + pts_2d = pts_4d @ lidar2img_rt.t() + + # cam_points is Tensor of Nx4 whose last column is 1 + # transform camera coordinate to image coordinate + + pts_2d[:, 2] = torch.clamp(pts_2d[:, 2], min=1e-5) + pts_2d[:, 0] /= pts_2d[:, 2] + pts_2d[:, 1] /= pts_2d[:, 2] + + # img transformation: scale -> crop -> flip + # the image is resized by img_scale_factor + img_coors = pts_2d[:, 0:2] * img_scale_factor # Nx2 + img_coors -= img_crop_offset + + # grid sample, the valid grid range should be in [-1,1] + coor_x, coor_y = torch.split(img_coors, 1, dim=1) # each is Nx1 + + if img_flip: + # by default we take it as horizontal flip + # use img_shape before padding for flip + orig_h, orig_w = img_shape + coor_x = orig_w - coor_x + + h, w = img_pad_shape + coor_y = coor_y / h * 2 - 1 + coor_x = coor_x / w * 2 - 1 + grid = torch.cat([coor_x, coor_y], + dim=1).unsqueeze(0).unsqueeze(0) # Nx2 -> 1x1xNx2 + + # align_corner=True provides higher performance + mode = 'bilinear' if aligned else 'nearest' + point_features = F.grid_sample( + img_features, + grid, + mode=mode, + padding_mode=padding_mode, + align_corners=align_corners) # 1xCx1xN feats + + return point_features.squeeze().t() + + +@FUSION_LAYERS.register_module +class PointFusion(nn.Module): + """Fuse image features from fused single scale features + """ + + def __init__(self, + img_channels, + pts_channels, + mid_channels, + out_channels, + img_levels=3, + conv_cfg=None, + norm_cfg=None, + activation=None, + activate_out=True, + fuse_out=False, + refine_type=None, + dropout_ratio=0, + aligned=True, + align_corners=True, + padding_mode='zeros', + lateral_conv=True): + super(PointFusion, self).__init__() + if isinstance(img_levels, int): + img_levels = [img_levels] + if isinstance(img_channels, int): + img_channels = [img_channels] * len(img_levels) + assert isinstance(img_levels, list) + assert isinstance(img_channels, list) + assert len(img_channels) == len(img_levels) + + self.img_levels = img_levels + self.activation = activation + self.activate_out = activate_out + self.fuse_out = fuse_out + self.refine_type = refine_type + self.dropout_ratio = dropout_ratio + self.img_channels = img_channels + self.aligned = aligned + self.align_corners = align_corners + self.padding_mode = padding_mode + + self.lateral_convs = None + if lateral_conv: + self.lateral_convs = nn.ModuleList() + for i in range(len(img_channels)): + l_conv = ConvModule( + img_channels[i], + mid_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + activation=self.activation, + inplace=False) + self.lateral_convs.append(l_conv) + self.img_transform = nn.Sequential( + nn.Linear(mid_channels * len(img_channels), out_channels), + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + ) + else: + self.img_transform = nn.Sequential( + nn.Linear(sum(img_channels), out_channels), + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + ) + self.pts_transform = nn.Sequential( + nn.Linear(pts_channels, out_channels), + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + ) + + if self.fuse_out: + self.fuse_conv = nn.Sequential( + nn.Linear(mid_channels, out_channels), + # For pts the BN is initialized differently by default + # TODO: check whether this is necessary + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + nn.ReLU(inplace=False)) + + if 
self.refine_type == 'non_local': + self.refine = NonLocal2D( + out_channels, + reduction=1, + use_scale=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + self.init_weights() + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Linear)): + xavier_init(m, distribution='uniform') + + def forward(self, img_feats, pts, pts_feats, img_meta): + """ + img_feats (List[Tensor]): img features + pts: [List[Tensor]]: a batch of points with shape Nx3 + pts_feats (Tensor): a tensor consist of point features of the + total batch + + """ + img_pts = self.obtain_mlvl_feats(img_feats, pts, img_meta) + img_pre_fuse = self.img_transform(img_pts) + if self.training and self.dropout_ratio > 0: + img_pre_fuse = F.dropout(img_pre_fuse, self.dropout_ratio) + pts_pre_fuse = self.pts_transform(pts_feats) + + fuse_out = img_pre_fuse + pts_pre_fuse + if self.activate_out: + fuse_out = F.relu(fuse_out) + if self.fuse_out: + fuse_out = self.fuse_conv(fuse_out) + + if self.refine_type is not None: + fuse_out_T = fuse_out.t()[None, ..., None] # NxC -> 1xCxNx1 + batch_idx = 0 + attentive = [] + for i in range(len(pts)): + end_idx = batch_idx + len(pts[i]) + attentive.append( + self.refine(fuse_out_T[:, :, batch_idx:end_idx])) + batch_idx = end_idx + fuse_out = torch.cat(attentive, dim=-2).squeeze().t() + return fuse_out + + def obtain_mlvl_feats(self, img_feats, pts, img_meta): + if self.lateral_convs is not None: + img_ins = [ + lateral_conv(img_feats[i]) + for i, lateral_conv in zip(self.img_levels, self.lateral_convs) + ] + else: + img_ins = img_feats + img_feats_per_point = [] + # Sample multi-level features + for i in range(len(img_meta)): + mlvl_img_feats = [] + for level in range(len(self.img_levels)): + if torch.isnan(img_ins[level][i:i + 1]).any(): + import pdb + pdb.set_trace() + mlvl_img_feats.append( + self.sample_single(img_ins[level][i:i + 1], pts[i][:, :3], + img_meta[i])) + mlvl_img_feats = torch.cat(mlvl_img_feats, dim=-1) + img_feats_per_point.append(mlvl_img_feats) + + img_pts = torch.cat(img_feats_per_point, dim=0) + return img_pts + + def sample_single(self, img_feats, pts, img_meta): + pcd_scale_factor = ( + img_meta['pcd_scale_factor'] + if 'pcd_scale_factor' in img_meta.keys() else 1) + pcd_trans_factor = ( + pts.new_tensor(img_meta['pcd_trans']) + if 'pcd_trans' in img_meta.keys() else 0) + pcd_rotate_mat = ( + pts.new_tensor(img_meta['pcd_rotation']) + if 'pcd_rotation' in img_meta.keys() else + torch.eye(3).type_as(pts).to(pts.device)) + img_scale_factor = ( + img_meta['scale_factor'] + if 'scale_factor' in img_meta.keys() else 1) + pcd_flip = img_meta['pcd_flip'] if 'pcd_flip' in img_meta.keys( + ) else False + img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False + img_crop_offset = ( + pts.new_tensor(img_meta['img_crop_offset']) + if 'img_crop_offset' in img_meta.keys() else 0) + img_pts = point_sample( + img_feats, + pts, + pts.new_tensor(img_meta['lidar2img']), + pcd_rotate_mat, + img_scale_factor, + img_crop_offset, + pcd_trans_factor, + pcd_scale_factor, + pcd_flip=pcd_flip, + img_flip=img_flip, + img_pad_shape=img_meta['pad_shape'][:2], + img_shape=img_meta['img_shape'][:2], + aligned=self.aligned, + padding_mode=self.padding_mode, + align_corners=self.align_corners, + ) + return img_pts diff --git a/mmdet3d/models/losses/__init__.py b/mmdet3d/models/losses/__init__.py new file mode 100644 index 0000000000..e2da0a0955 --- /dev/null +++ b/mmdet3d/models/losses/__init__.py @@ 
-0,0 +1,3 @@ +from mmdet.models.losses import FocalLoss, SmoothL1Loss + +__all__ = ['FocalLoss', 'SmoothL1Loss'] diff --git a/mmdet3d/models/middle_encoders/__init__.py b/mmdet3d/models/middle_encoders/__init__.py new file mode 100644 index 0000000000..b20bcb049a --- /dev/null +++ b/mmdet3d/models/middle_encoders/__init__.py @@ -0,0 +1,4 @@ +from .pillar_scatter import PointPillarsScatter +from .sparse_encoder import SparseEncoder + +__all__ = ['PointPillarsScatter', 'SparseEncoder'] diff --git a/mmdet3d/models/middle_encoders/pillar_scatter.py b/mmdet3d/models/middle_encoders/pillar_scatter.py new file mode 100644 index 0000000000..5e502ed4f5 --- /dev/null +++ b/mmdet3d/models/middle_encoders/pillar_scatter.py @@ -0,0 +1,85 @@ +import torch +from torch import nn + +from ..registry import MIDDLE_ENCODERS + + +@MIDDLE_ENCODERS.register_module +class PointPillarsScatter(nn.Module): + + def __init__(self, in_channels, output_shape): + """ + Point Pillar's Scatter. + Converts learned features from dense tensor to sparse pseudo image. + + Args: + output_shape (list[int]): Required output shape of features. + in_channels (int): Number of input features. + """ + + super().__init__() + self.name = 'PointPillarsScatter' + self.output_shape = output_shape + self.ny = output_shape[0] + self.nx = output_shape[1] + self.nchannels = in_channels + + def forward(self, voxel_features, coors, batch_size=None): + # TODO: rewrite the function in a batch manner + # no need to deal with different batch cases + if batch_size is not None: + return self.forward_batch(voxel_features, coors, batch_size) + else: + return self.forward_single(voxel_features, coors) + + def forward_single(self, voxel_features, coors): + # Create the canvas for this sample + canvas = torch.zeros( + self.nchannels, + self.nx * self.ny, + dtype=voxel_features.dtype, + device=voxel_features.device) + + indices = coors[:, 1] * self.nx + coors[:, 2] + indices = indices.long() + voxels = voxel_features.t() + # Now scatter the blob back to the canvas. + canvas[:, indices] = voxels + # Undo the column stacking to final 4-dim tensor + canvas = canvas.view(1, self.nchannels, self.ny, self.nx) + return [canvas] + + def forward_batch(self, voxel_features, coors, batch_size): + + # batch_canvas will be the final output. + batch_canvas = [] + for batch_itt in range(batch_size): + # Create the canvas for this sample + canvas = torch.zeros( + self.nchannels, + self.nx * self.ny, + dtype=voxel_features.dtype, + device=voxel_features.device) + + # Only include non-empty pillars + batch_mask = coors[:, 0] == batch_itt + this_coors = coors[batch_mask, :] + indices = this_coors[:, 2] * self.nx + this_coors[:, 3] + indices = indices.type(torch.long) + voxels = voxel_features[batch_mask, :] + voxels = voxels.t() + + # Now scatter the blob back to the canvas. + canvas[:, indices] = voxels + + # Append to a list for later stacking. 
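+            # each canvas is a dense (nchannels, nx * ny) grid; pillars that
+            # received no points stay zero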
+ batch_canvas.append(canvas) + + # Stack to 3-dim tensor (batch-size, nchannels, nrows*ncols) + batch_canvas = torch.stack(batch_canvas, 0) + + # Undo the column stacking to final 4-dim tensor + batch_canvas = batch_canvas.view(batch_size, self.nchannels, self.ny, + self.nx) + + return batch_canvas diff --git a/mmdet3d/models/middle_encoders/sparse_encoder.py b/mmdet3d/models/middle_encoders/sparse_encoder.py new file mode 100644 index 0000000000..70b437a47a --- /dev/null +++ b/mmdet3d/models/middle_encoders/sparse_encoder.py @@ -0,0 +1,215 @@ +import torch.nn as nn + +import mmdet3d.ops.spconv as spconv +from ..registry import MIDDLE_ENCODERS +from ..utils import build_norm_layer + + +@MIDDLE_ENCODERS.register_module +class SparseEncoder(nn.Module): + + def __init__(self, + in_channels, + output_shape, + pre_act, + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01)): + super().__init__() + self.sparse_shape = output_shape + self.output_shape = output_shape + self.in_channels = in_channels + self.pre_act = pre_act + # Spconv init all weight on its own + # TODO: make the network could be modified + + if pre_act: + self.conv_input = spconv.SparseSequential( + spconv.SubMConv3d( + in_channels, + 16, + 3, + padding=1, + bias=False, + indice_key='subm1'), ) + block = self.pre_act_block + else: + norm_name, norm_layer = build_norm_layer(norm_cfg, 16) + self.conv_input = spconv.SparseSequential( + spconv.SubMConv3d( + in_channels, + 16, + 3, + padding=1, + bias=False, + indice_key='subm1'), + norm_layer, + nn.ReLU(), + ) + block = self.post_act_block + + self.conv1 = spconv.SparseSequential( + block(16, 16, 3, norm_cfg=norm_cfg, padding=1, + indice_key='subm1'), ) + + self.conv2 = spconv.SparseSequential( + # [1600, 1408, 41] -> [800, 704, 21] + block( + 16, + 32, + 3, + norm_cfg=norm_cfg, + stride=2, + padding=1, + indice_key='spconv2', + conv_type='spconv'), + block(32, 32, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm2'), + block(32, 32, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm2'), + ) + + self.conv3 = spconv.SparseSequential( + # [800, 704, 21] -> [400, 352, 11] + block( + 32, + 64, + 3, + norm_cfg=norm_cfg, + stride=2, + padding=1, + indice_key='spconv3', + conv_type='spconv'), + block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm3'), + block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm3'), + ) + + self.conv4 = spconv.SparseSequential( + # [400, 352, 11] -> [200, 176, 5] + block( + 64, + 64, + 3, + norm_cfg=norm_cfg, + stride=2, + padding=(0, 1, 1), + indice_key='spconv4', + conv_type='spconv'), + block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm4'), + block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm4'), + ) + + norm_name, norm_layer = build_norm_layer(norm_cfg, 128) + self.conv_out = spconv.SparseSequential( + # [200, 176, 5] -> [200, 176, 2] + spconv.SparseConv3d( + 128, + 128, (3, 1, 1), + stride=(2, 1, 1), + padding=0, + bias=False, + indice_key='spconv_down2'), + norm_layer, + nn.ReLU(), + ) + + def forward(self, voxel_features, coors, batch_size): + """ + :param voxel_features: (N, C) + :param coors: (N, 4) [batch_idx, z_idx, y_idx, x_idx] + :param batch_size: + :return: + """ + coors = coors.int() + input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors, + self.sparse_shape, + batch_size) + x = self.conv_input(input_sp_tensor) + + x_conv1 = self.conv1(x) + x_conv2 = self.conv2(x_conv1) + x_conv3 = self.conv3(x_conv2) + x_conv4 = self.conv4(x_conv3) + + # for detection head + # [200, 176, 5] -> [200, 176, 2] 
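+        # conv_out only strides the height (D) axis, so the dense output can
+        # be flattened into BEV feature maps of C * D channels below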
+ out = self.conv_out(x_conv4) + spatial_features = out.dense() + + N, C, D, H, W = spatial_features.shape + spatial_features = spatial_features.view(N, C * D, H, W) + + return spatial_features + + def pre_act_block(self, + in_channels, + out_channels, + kernel_size, + indice_key=None, + stride=1, + padding=0, + conv_type='subm', + norm_cfg=None): + norm_name, norm_layer = build_norm_layer(norm_cfg, in_channels) + if conv_type == 'subm': + m = spconv.SparseSequential( + norm_layer, + nn.ReLU(inplace=True), + spconv.SubMConv3d( + in_channels, + out_channels, + kernel_size, + padding=padding, + bias=False, + indice_key=indice_key), + ) + elif conv_type == 'spconv': + m = spconv.SparseSequential( + norm_layer, + nn.ReLU(inplace=True), + spconv.SparseConv3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=False, + indice_key=indice_key), + ) + else: + raise NotImplementedError + return m + + def post_act_block(self, + in_channels, + out_channels, + kernel_size, + indice_key, + stride=1, + padding=0, + conv_type='subm', + norm_cfg=None): + norm_name, norm_layer = build_norm_layer(norm_cfg, out_channels) + if conv_type == 'subm': + m = spconv.SparseSequential( + spconv.SubMConv3d( + in_channels, + out_channels, + kernel_size, + bias=False, + indice_key=indice_key), + norm_layer, + nn.ReLU(inplace=True), + ) + elif conv_type == 'spconv': + m = spconv.SparseSequential( + spconv.SparseConv3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=False, + indice_key=indice_key), + norm_layer, + nn.ReLU(inplace=True), + ) + else: + raise NotImplementedError + return m diff --git a/mmdet3d/models/necks/__init__.py b/mmdet3d/models/necks/__init__.py new file mode 100644 index 0000000000..85904b497c --- /dev/null +++ b/mmdet3d/models/necks/__init__.py @@ -0,0 +1,4 @@ +from mmdet.models.necks.fpn import FPN +from .second_fpn import SECONDFPN + +__all__ = ['FPN', 'SECONDFPN'] diff --git a/mmdet3d/models/necks/second_fpn.py b/mmdet3d/models/necks/second_fpn.py new file mode 100644 index 0000000000..59d676ffea --- /dev/null +++ b/mmdet3d/models/necks/second_fpn.py @@ -0,0 +1,147 @@ +import logging +from functools import partial + +import torch +import torch.nn as nn +from mmcv.cnn import constant_init, kaiming_init +from mmcv.runner import load_checkpoint +from torch.nn import Sequential +from torch.nn.modules.batchnorm import _BatchNorm + +from .. import builder +from ..registry import NECKS +from ..utils import build_norm_layer + + +class Empty(nn.Module): + + def __init__(self, *args, **kwargs): + super(Empty, self).__init__() + + def forward(self, *args, **kwargs): + if len(args) == 1: + return args[0] + elif len(args) == 0: + return None + return args + + +@NECKS.register_module +class SECONDFPN(nn.Module): + """Compare with RPN, RPNV2 support arbitrary number of stage. 
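+    Each input feature map is upsampled by its own deblock (ConvTranspose2d,
+    optional norm, ReLU) and the outputs are concatenated along the channel
+    dimension.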
+ """ + + def __init__(self, + use_norm=True, + in_channels=[128, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[256, 256, 256], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01)): + # if for GroupNorm, + # cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True) + super(SECONDFPN, self).__init__() + assert len(num_upsample_filters) == len(upsample_strides) + self.in_channels = in_channels + + if norm_cfg is not None: + ConvTranspose2d = partial(nn.ConvTranspose2d, bias=False) + else: + ConvTranspose2d = partial(nn.ConvTranspose2d, bias=True) + + deblocks = [] + + for i, num_upsample_filter in enumerate(num_upsample_filters): + norm_layer = ( + build_norm_layer(norm_cfg, num_upsample_filter)[1] + if norm_cfg is not None else Empty) + deblock = Sequential( + ConvTranspose2d( + in_channels[i], + num_upsample_filter, + upsample_strides[i], + stride=upsample_strides[i]), + norm_layer, + nn.ReLU(inplace=True), + ) + deblocks.append(deblock) + self.deblocks = nn.ModuleList(deblocks) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + # keeping the initiation yields better results + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + return + + def forward(self, inputs): + assert len(inputs) == len(self.in_channels) + ups = [deblock(inputs[i]) for i, deblock in enumerate(self.deblocks)] + + if len(ups) > 1: + x = torch.cat(ups, dim=1) + else: + x = ups[0] + return [x] + + +@NECKS.register_module +class SECONDFusionFPN(SECONDFPN): + """Compare with RPN, RPNV2 support arbitrary number of stage. 
+ """ + + def __init__(self, + use_norm=True, + in_channels=[128, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[256, 256, 256], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + down_sample_rate=[40, 8, 8], + fusion_layer=None, + cat_points=False): + super(SECONDFusionFPN, self).__init__( + use_norm, + in_channels, + upsample_strides, + num_upsample_filters, + norm_cfg, + ) + self.fusion_layer = None + if fusion_layer is not None: + self.fusion_layer = builder.build_fusion_layer(fusion_layer) + self.cat_points = cat_points + self.down_sample_rate = down_sample_rate + + def forward(self, + inputs, + coors=None, + points=None, + img_feats=None, + img_meta=None): + assert len(inputs) == len(self.in_channels) + ups = [deblock(inputs[i]) for i, deblock in enumerate(self.deblocks)] + + if len(ups) > 1: + x = torch.cat(ups, dim=1) + else: + x = ups[0] + if (self.fusion_layer is not None and img_feats is not None): + downsample_pts_coors = torch.zeros_like(coors) + downsample_pts_coors[:, 0] = coors[:, 0] + downsample_pts_coors[:, 1] = ( + coors[:, 1] / self.down_sample_rate[0]) + downsample_pts_coors[:, 2] = ( + coors[:, 2] / self.down_sample_rate[1]) + downsample_pts_coors[:, 3] = ( + coors[:, 3] / self.down_sample_rate[2]) + # fusion for each point + x = self.fusion_layer(img_feats, points, x, downsample_pts_coors, + img_meta) + return [x] diff --git a/mmdet3d/models/registry.py b/mmdet3d/models/registry.py new file mode 100644 index 0000000000..9eb47d3ba6 --- /dev/null +++ b/mmdet3d/models/registry.py @@ -0,0 +1,5 @@ +from mmdet.utils import Registry + +VOXEL_ENCODERS = Registry('voxel_encoder') +MIDDLE_ENCODERS = Registry('middle_encoder') +FUSION_LAYERS = Registry('fusion_layer') diff --git a/mmdet3d/models/roi_extractors/__init__.py b/mmdet3d/models/roi_extractors/__init__.py new file mode 100644 index 0000000000..80c3c30f88 --- /dev/null +++ b/mmdet3d/models/roi_extractors/__init__.py @@ -0,0 +1,3 @@ +from mmdet.models.roi_extractors.single_level import SingleRoIExtractor + +__all__ = ['SingleRoIExtractor'] diff --git a/mmdet3d/models/utils/__init__.py b/mmdet3d/models/utils/__init__.py new file mode 100644 index 0000000000..8cd39f7324 --- /dev/null +++ b/mmdet3d/models/utils/__init__.py @@ -0,0 +1,3 @@ +from mmdet.models.utils import ResLayer, bias_init_with_prob + +__all__ = ['bias_init_with_prob', 'ResLayer'] diff --git a/mmdet3d/models/utils/weight_init.py b/mmdet3d/models/utils/weight_init.py new file mode 100644 index 0000000000..17d49880fd --- /dev/null +++ b/mmdet3d/models/utils/weight_init.py @@ -0,0 +1,46 @@ +import numpy as np +import torch.nn as nn + + +def xavier_init(module, gain=1, bias=0, distribution='normal'): + assert distribution in ['uniform', 'normal'] + if distribution == 'uniform': + nn.init.xavier_uniform_(module.weight, gain=gain) + else: + nn.init.xavier_normal_(module.weight, gain=gain) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) + + +def normal_init(module, mean=0, std=1, bias=0): + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) + + +def uniform_init(module, a=0, b=1, bias=0): + nn.init.uniform_(module.weight, a, b) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) + + +def kaiming_init(module, + mode='fan_out', + nonlinearity='relu', + bias=0, + distribution='normal'): + assert distribution in ['uniform', 'normal'] + if distribution == 'uniform': + nn.init.kaiming_uniform_( + module.weight, mode=mode, nonlinearity=nonlinearity) + 
else:
+        nn.init.kaiming_normal_(
+            module.weight, mode=mode, nonlinearity=nonlinearity)
+    if hasattr(module, 'bias'):
+        nn.init.constant_(module.bias, bias)
+
+
+def bias_init_with_prob(prior_prob):
+    """Initialize conv/fc bias value according to a given probability."""
+    bias_init = float(-np.log((1 - prior_prob) / prior_prob))
+    return bias_init
diff --git a/mmdet3d/models/voxel_encoders/__init__.py b/mmdet3d/models/voxel_encoders/__init__.py
new file mode 100644
index 0000000000..96f13579b8
--- /dev/null
+++ b/mmdet3d/models/voxel_encoders/__init__.py
@@ -0,0 +1,8 @@
+from .pillar_encoder import AlignedPillarFeatureNet, PillarFeatureNet
+from .voxel_encoder import (DynamicVFE, VoxelFeatureExtractor,
+                            VoxelFeatureExtractorV2, VoxelFeatureExtractorV3)
+
+__all__ = [
+    'PillarFeatureNet', 'AlignedPillarFeatureNet', 'VoxelFeatureExtractor',
+    'DynamicVFE', 'VoxelFeatureExtractorV2', 'VoxelFeatureExtractorV3'
+]
diff --git a/mmdet3d/models/voxel_encoders/pillar_encoder.py b/mmdet3d/models/voxel_encoders/pillar_encoder.py
new file mode 100644
index 0000000000..21cf57acf8
--- /dev/null
+++ b/mmdet3d/models/voxel_encoders/pillar_encoder.py
@@ -0,0 +1,378 @@
+import torch
+from torch import nn
+
+from mmdet3d.ops import DynamicScatter, build_norm_layer
+from ..registry import VOXEL_ENCODERS
+from .utils import PFNLayer, get_paddings_indicator
+
+
+@VOXEL_ENCODERS.register_module
+class PillarFeatureNet(nn.Module):
+
+    def __init__(self,
+                 num_input_features=4,
+                 use_norm=True,
+                 num_filters=(64, ),
+                 with_distance=False,
+                 with_cluster_center=True,
+                 with_voxel_center=True,
+                 voxel_size=(0.2, 0.2, 4),
+                 point_cloud_range=(0, -40, -3, 70.4, 40, 1),
+                 mode='max'):
+        """ Pillar Feature Net.
+        The network prepares the pillar features and performs forward pass
+        through PFNLayers.
+
+        Args:
+            num_input_features (int): Number of input features,
+                either x, y, z or x, y, z, r.
+            use_norm (bool): Whether to include BatchNorm.
+            num_filters (list[int]): Number of features in each of the
+                N PFNLayers.
+            with_distance (bool): Whether to include Euclidean distance
+                to points.
+            voxel_size (list[float]): Size of voxels, only utilize x and y
+                size.
+            point_cloud_range (list[float]): Point cloud range, only
+                utilize x and y min.
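+            with_cluster_center (bool): Whether to append, for each point,
+                its offset to the mean of the points in its pillar.
+            with_voxel_center (bool): Whether to append, for each point,
+                its offset to the center of its pillar.
+            mode (str): How points in a pillar are pooled by the PFNLayers,
+                'max' or 'avg'.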
+ """ + + super(PillarFeatureNet, self).__init__() + assert len(num_filters) > 0 + if with_cluster_center: + num_input_features += 3 + if with_voxel_center: + num_input_features += 2 + if with_distance: + num_input_features += 1 + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + + # Create PillarFeatureNet layers + self.num_input_features = num_input_features + num_filters = [num_input_features] + list(num_filters) + pfn_layers = [] + for i in range(len(num_filters) - 1): + in_filters = num_filters[i] + out_filters = num_filters[i + 1] + if i < len(num_filters) - 2: + last_layer = False + else: + last_layer = True + pfn_layers.append( + PFNLayer( + in_filters, + out_filters, + use_norm, + last_layer=last_layer, + mode=mode)) + self.pfn_layers = nn.ModuleList(pfn_layers) + + # Need pillar (voxel) size and x/y offset in order to calculate offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.point_cloud_range = point_cloud_range + + def forward(self, features, num_points, coors): + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + points_mean = features[:, :, :3].sum( + dim=1, keepdim=True) / num_points.type_as(features).view( + -1, 1, 1) + f_cluster = features[:, :, :3] - points_mean + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features[:, :, :2] + f_center[:, :, 0] = f_center[:, :, 0] - ( + coors[:, 3].type_as(features).unsqueeze(1) * self.vx + + self.x_offset) + f_center[:, :, 1] = f_center[:, :, 1] - ( + coors[:, 2].type_as(features).unsqueeze(1) * self.vy + + self.y_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + features = torch.cat(features_ls, dim=-1) + # The feature decorations were calculated without regard to whether + # pillar was empty. Need to ensure that + # empty pillars remain set to zeros. + voxel_count = features.shape[1] + mask = get_paddings_indicator(num_points, voxel_count, axis=0) + mask = torch.unsqueeze(mask, -1).type_as(features) + features *= mask + + for pfn in self.pfn_layers: + features = pfn(features, num_points) + + return features.squeeze() + + +@VOXEL_ENCODERS.register_module +class DynamicPillarFeatureNet(PillarFeatureNet): + + def __init__(self, + num_input_features=4, + use_norm=True, + num_filters=(64, ), + with_distance=False, + with_cluster_center=True, + with_voxel_center=True, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + mode='max'): + """ + Dynamic Pillar Feature Net for Dynamic Voxelization. 
+ The difference is in the forward part + """ + + super(DynamicPillarFeatureNet, self).__init__( + num_input_features, + use_norm, + num_filters, + with_distance, + with_cluster_center=with_cluster_center, + with_voxel_center=with_voxel_center, + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + mode=mode) + + num_filters = [self.num_input_features] + list(num_filters) + pfn_layers = [] + # TODO: currently only support one PFNLayer + + for i in range(len(num_filters) - 1): + in_filters = num_filters[i] + out_filters = num_filters[i + 1] + if i > 0: + in_filters *= 2 + norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) + pfn_layers.append( + nn.Sequential( + nn.Linear(in_filters, out_filters, bias=False), norm_layer, + nn.ReLU(inplace=True))) + self.num_pfn = len(pfn_layers) + self.pfn_layers = nn.ModuleList(pfn_layers) + self.pfn_scatter = DynamicScatter(voxel_size, point_cloud_range, + (mode != 'max')) + self.cluster_scatter = DynamicScatter( + voxel_size, point_cloud_range, average_points=True) + + def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): + # Step 1: scatter voxel into canvas + # Calculate necessary things for canvas creation + canvas_y = int( + (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy) + canvas_x = int( + (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx) + canvas_channel = voxel_mean.size(1) + batch_size = pts_coors[-1, 0] + 1 + canvas_len = canvas_y * canvas_x * batch_size + # Create the canvas for this sample + canvas = voxel_mean.new_zeros(canvas_channel, canvas_len) + # Only include non-empty pillars + indices = ( + voxel_coors[:, 0] * canvas_y * canvas_x + + voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]) + # Scatter the blob back to the canvas + canvas[:, indices.long()] = voxel_mean.t() + + # Step 2: get voxel mean for each point + voxel_index = ( + pts_coors[:, 0] * canvas_y * canvas_x + + pts_coors[:, 2] * canvas_x + pts_coors[:, 3]) + center_per_point = canvas[:, voxel_index.long()].t() + return center_per_point + + def forward(self, features, coors): + """ + features (torch.Tensor): NxC + coors (torch.Tensor): Nx(1+NDim) + """ + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + voxel_mean, mean_coors = self.cluster_scatter(features, coors) + points_mean = self.map_voxel_center_to_point( + coors, voxel_mean, mean_coors) + # TODO: maybe also do cluster for reflectivity + f_cluster = features[:, :3] - points_mean[:, :3] + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features.new_zeros(size=(features.size(0), 2)) + f_center[:, 0] = features[:, 0] - ( + coors[:, 3].type_as(features) * self.vx + self.x_offset) + f_center[:, 1] = features[:, 1] - ( + coors[:, 2].type_as(features) * self.vy + self.y_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + features = torch.cat(features_ls, dim=-1) + for i, pfn in enumerate(self.pfn_layers): + point_feats = pfn(features) + voxel_feats, voxel_coors = self.pfn_scatter(point_feats, coors) + if i != len(self.pfn_layers) - 1: + # need to concat voxel feats if it is not the last pfn + feat_per_point = self.map_voxel_center_to_point( + coors, voxel_feats, voxel_coors) + features = torch.cat([point_feats, feat_per_point], dim=1) + + return 
voxel_feats, voxel_coors + + +@VOXEL_ENCODERS.register_module +class AlignedPillarFeatureNet(nn.Module): + + def __init__(self, + num_input_features=4, + use_norm=True, + num_filters=(64, ), + with_distance=False, + with_cluster_center=True, + with_voxel_center=True, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + mode='max'): + """ Pillar Feature Net. + + The network prepares the pillar features and performs forward pass + through PFNLayers. + + Args: + num_input_features (int): Number of input features, either x, y, z + or x, y, z, r. + use_norm (bool): Whether to include BatchNorm. + num_filters (list[int]): Number of features in each of the N + PFNLayers. + with_distance (bool): Whether to include Euclidean distance to + points. + voxel_size (list[float]): Size of voxels, only utilize x and y + size. + point_cloud_range: (list[float]): Point cloud range, only + utilize x and y min. + """ + + super(AlignedPillarFeatureNet, self).__init__() + + assert len(num_filters) > 0 + if with_cluster_center: + print('Use cluster center') + num_input_features += 3 + if with_voxel_center: + print('Use voxel center') + num_input_features += 2 + if with_distance: + num_input_features += 1 + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + + # Create PillarFeatureNet layers + num_filters = [num_input_features] + list(num_filters) + pfn_layers = [] + for i in range(len(num_filters) - 1): + in_filters = num_filters[i] + out_filters = num_filters[i + 1] + if i < len(num_filters) - 2: + last_layer = False + else: + last_layer = True + pfn_layers.append( + PFNLayer( + in_filters, + out_filters, + use_norm, + last_layer=last_layer, + mode=mode)) + self.pfn_layers = nn.ModuleList(pfn_layers) + + # Need pillar (voxel) size and x/y offset in order to + # calculate pillar offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.vz = voxel_size[2] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.z_offset = self.vz / 2 + point_cloud_range[2] + + def forward(self, features, num_points, coors): + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + points_mean = features[:, :, :3].sum( + dim=1, keepdim=True) / num_points.type_as(features).view( + -1, 1, 1) + f_cluster = features[:, :, :3] - points_mean + features_ls.append(f_cluster) + + x_distance = features[:, :, 0] - ( + coors[:, 3].type_as(features).unsqueeze(1) * self.vx + + self.x_offset) + y_distance = features[:, :, 1] - ( + coors[:, 2].type_as(features).unsqueeze(1) * self.vy + + self.y_offset) + z_distance = features[:, :, 2] - ( + coors[:, 1].type_as(features).unsqueeze(1) * self.vz + + self.z_offset) + + normed_x_distance = 1 - torch.abs(x_distance / self.vx) + normed_y_distance = 1 - torch.abs(y_distance / self.vy) + normed_z_distance = 1 - torch.abs(z_distance / self.vz) + + x_mask = torch.gt(normed_x_distance, 0).type_as(features) + y_mask = torch.gt(normed_y_distance, 0).type_as(features) + z_mask = torch.gt(normed_z_distance, 0).type_as(features) + + nonzero_points_mask = x_mask.mul(y_mask).mul(z_mask) + aligned_distance = normed_x_distance.mul(normed_y_distance).mul( + normed_z_distance).mul(nonzero_points_mask) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features[:, :, :2] + f_center[:, :, 0] = f_center[:, :, 0] - ( + coors[:, 
3].type_as(features).unsqueeze(1) * self.vx + + self.x_offset) + f_center[:, :, 1] = f_center[:, :, 1] - ( + coors[:, 2].type_as(features).unsqueeze(1) * self.vy + + self.y_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + features = torch.cat(features_ls, dim=-1) + + # The feature decorations were calculated without regard to + # whether pillar was empty. Need to ensure that + # empty pillars remain set to zeros. + voxel_count = features.shape[1] + mask = get_paddings_indicator(num_points, voxel_count, axis=0) + mask = torch.unsqueeze(mask, -1).type_as(features) + features *= mask + + for pfn in self.pfn_layers: + if pfn.last_vfe: + features = pfn(features, aligned_distance) + else: + features = pfn(features) + + return features.squeeze() diff --git a/mmdet3d/models/voxel_encoders/utils.py b/mmdet3d/models/voxel_encoders/utils.py new file mode 100644 index 0000000000..c81a6b92fb --- /dev/null +++ b/mmdet3d/models/voxel_encoders/utils.py @@ -0,0 +1,148 @@ +import torch +from torch import nn +from torch.nn import functional as F + +from ..utils import build_norm_layer + + +class Empty(nn.Module): + + def __init__(self, *args, **kwargs): + super(Empty, self).__init__() + + def forward(self, *args, **kwargs): + if len(args) == 1: + return args[0] + elif len(args) == 0: + return None + return args + + +def get_paddings_indicator(actual_num, max_num, axis=0): + """Create boolean mask by actually number of a padded tensor. + + Args: + actual_num ([type]): [description] + max_num ([type]): [description] + + Returns: + [type]: [description] + """ + actual_num = torch.unsqueeze(actual_num, axis + 1) + # tiled_actual_num: [N, M, 1] + max_num_shape = [1] * len(actual_num.shape) + max_num_shape[axis + 1] = -1 + max_num = torch.arange( + max_num, dtype=torch.int, device=actual_num.device).view(max_num_shape) + # tiled_actual_num: [[3,3,3,3,3], [4,4,4,4,4], [2,2,2,2,2]] + # tiled_max_num: [[0,1,2,3,4], [0,1,2,3,4], [0,1,2,3,4]] + paddings_indicator = actual_num.int() > max_num + # paddings_indicator shape: [batch_size, max_num] + return paddings_indicator + + +class VFELayer(nn.Module): + + def __init__(self, + in_channels, + out_channels, + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + max_out=True, + cat_max=True): + super(VFELayer, self).__init__() + self.cat_max = cat_max + self.max_out = max_out + # self.units = int(out_channels / 2) + if norm_cfg: + norm_name, norm_layer = build_norm_layer(norm_cfg, out_channels) + self.norm = norm_layer + self.linear = nn.Linear(in_channels, out_channels, bias=False) + else: + self.norm = Empty(out_channels) + self.linear = nn.Linear(in_channels, out_channels, bias=True) + + def forward(self, inputs): + # [K, T, 7] tensordot [7, units] = [K, T, units] + voxel_count = inputs.shape[1] + x = self.linear(inputs) + x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2, + 1).contiguous() + pointwise = F.relu(x) + # [K, T, units] + if self.max_out: + aggregated = torch.max(pointwise, dim=1, keepdim=True)[0] + else: + # this is for fusion layer + return pointwise + + if not self.cat_max: + return aggregated.squeeze(1) + else: + # [K, 1, units] + repeated = aggregated.repeat(1, voxel_count, 1) + concatenated = torch.cat([pointwise, repeated], dim=2) + # [K, T, 2 * units] + return concatenated + + +class PFNLayer(nn.Module): + + def __init__(self, + in_channels, + out_channels, + use_norm=True, + 
last_layer=False,
+                 mode='max'):
+        """ Pillar Feature Net Layer.
+
+        The Pillar Feature Net is composed of a series of these layers, but the
+        PointPillars paper results only used a single PFNLayer.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            use_norm (bool): Whether to include BatchNorm.
+            last_layer (bool): If last_layer, there is no concatenation of
+                features.
+        """
+
+        super().__init__()
+        self.name = 'PFNLayer'
+        self.last_vfe = last_layer
+        if not self.last_vfe:
+            out_channels = out_channels // 2
+        self.units = out_channels
+
+        if use_norm:
+            self.norm = nn.BatchNorm1d(self.units, eps=1e-3, momentum=0.01)
+            self.linear = nn.Linear(in_channels, self.units, bias=False)
+        else:
+            self.norm = Empty(self.units)
+            self.linear = nn.Linear(in_channels, self.units, bias=True)
+
+        self.mode = mode
+
+    def forward(self, inputs, num_voxels=None, aligned_distance=None):
+
+        x = self.linear(inputs)
+        x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2,
+                                                               1).contiguous()
+        x = F.relu(x)
+
+        if self.mode == 'max':
+            if aligned_distance is not None:
+                x = x.mul(aligned_distance.unsqueeze(-1))
+            x_max = torch.max(x, dim=1, keepdim=True)[0]
+        elif self.mode == 'avg':
+            if aligned_distance is not None:
+                x = x.mul(aligned_distance.unsqueeze(-1))
+            x_max = x.sum(
+                dim=1, keepdim=True) / num_voxels.type_as(inputs).view(
+                    -1, 1, 1)
+
+        if self.last_vfe:
+            return x_max
+        else:
+            x_repeat = x_max.repeat(1, inputs.shape[1], 1)
+            x_concatenated = torch.cat([x, x_repeat], dim=2)
+            return x_concatenated
diff --git a/mmdet3d/models/voxel_encoders/voxel_encoder.py b/mmdet3d/models/voxel_encoders/voxel_encoder.py
new file mode 100644
index 0000000000..c8afaf2216
--- /dev/null
+++ b/mmdet3d/models/voxel_encoders/voxel_encoder.py
@@ -0,0 +1,478 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from mmdet3d.ops import DynamicScatter
+from .. 
import builder +from ..registry import VOXEL_ENCODERS +from ..utils import build_norm_layer +from .utils import Empty, VFELayer, get_paddings_indicator + + +@VOXEL_ENCODERS.register_module +class VoxelFeatureExtractor(nn.Module): + + def __init__(self, + num_input_features=4, + use_norm=True, + num_filters=[32, 128], + with_distance=False, + name='VoxelFeatureExtractor'): + super(VoxelFeatureExtractor, self).__init__() + self.name = name + assert len(num_filters) == 2 + num_input_features += 3 # add mean features + if with_distance: + num_input_features += 1 + self._with_distance = with_distance + self.vfe1 = VFELayer(num_input_features, num_filters[0], use_norm) + self.vfe2 = VFELayer(num_filters[0], num_filters[1], use_norm) + + if use_norm: + self.linear = nn.Linear(num_filters[1], num_filters[1], bias=False) + self.norm = nn.BatchNorm1d(num_filters[1], eps=1e-3, momentum=0.01) + else: + self.linear = nn.Linear(num_filters[1], num_filters[1], bias=True) + self.norm = Empty(num_filters[1]) + + def forward(self, features, num_voxels, **kwargs): + # features: [concated_num_points, num_voxel_size, 3(4)] + # num_voxels: [concated_num_points] + # t = time.time() + # torch.cuda.synchronize() + + points_mean = features[:, :, :3].sum( + dim=1, keepdim=True) / num_voxels.type_as(features).view(-1, 1, 1) + features_relative = features[:, :, :3] - points_mean + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) + features = torch.cat([features, features_relative, points_dist], + dim=-1) + else: + features = torch.cat([features, features_relative], dim=-1) + voxel_count = features.shape[1] + mask = get_paddings_indicator(num_voxels, voxel_count, axis=0) + mask = torch.unsqueeze(mask, -1).type_as(features) + # mask = features.max(dim=2, keepdim=True)[0] != 0 + + # torch.cuda.synchronize() + # print("vfe prep forward time", time.time() - t) + x = self.vfe1(features) + x *= mask + x = self.vfe2(x) + x *= mask + x = self.linear(x) + x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2, + 1).contiguous() + x = F.relu(x) + x *= mask + # x: [concated_num_points, num_voxel_size, 128] + voxelwise = torch.max(x, dim=1)[0] + return voxelwise + + +@VOXEL_ENCODERS.register_module +class VoxelFeatureExtractorV2(nn.Module): + + def __init__(self, + num_input_features=4, + use_norm=True, + num_filters=[32, 128], + with_distance=False, + name='VoxelFeatureExtractor'): + super(VoxelFeatureExtractorV2, self).__init__() + self.name = name + assert len(num_filters) > 0 + num_input_features += 3 + if with_distance: + num_input_features += 1 + self._with_distance = with_distance + + num_filters = [num_input_features] + num_filters + filters_pairs = [[num_filters[i], num_filters[i + 1]] + for i in range(len(num_filters) - 1)] + self.vfe_layers = nn.ModuleList( + [VFELayer(i, o, use_norm) for i, o in filters_pairs]) + + if use_norm: + self.linear = nn.Linear( + num_filters[-1], num_filters[-1], bias=False) + self.norm = nn.BatchNorm1d( + num_filters[-1], eps=1e-3, momentum=0.01) + else: + self.linear = nn.Linear( + num_filters[-1], num_filters[-1], bias=True) + self.norm = Empty(num_filters[-1]) + + def forward(self, features, num_voxels, **kwargs): + # features: [concated_num_points, num_voxel_size, 3(4)] + # num_voxels: [concated_num_points] + points_mean = features[:, :, :3].sum( + dim=1, keepdim=True) / num_voxels.type_as(features).view(-1, 1, 1) + features_relative = features[:, :, :3] - points_mean + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 
2, 2, keepdim=True) + features = torch.cat([features, features_relative, points_dist], + dim=-1) + else: + features = torch.cat([features, features_relative], dim=-1) + voxel_count = features.shape[1] + mask = get_paddings_indicator(num_voxels, voxel_count, axis=0) + mask = torch.unsqueeze(mask, -1).type_as(features) + for vfe in self.vfe_layers: + features = vfe(features) + features *= mask + features = self.linear(features) + features = self.norm(features.permute(0, 2, 1).contiguous()).permute( + 0, 2, 1).contiguous() + features = F.relu(features) + features *= mask + # x: [concated_num_points, num_voxel_size, 128] + voxelwise = torch.max(features, dim=1)[0] + return voxelwise + + +@VOXEL_ENCODERS.register_module +class VoxelFeatureExtractorV3(nn.Module): + + def __init__(self, + num_input_features=4, + use_norm=True, + num_filters=[32, 128], + with_distance=False, + name='VoxelFeatureExtractor'): + super(VoxelFeatureExtractorV3, self).__init__() + self.name = name + + def forward(self, features, num_points, coors): + # features: [concated_num_points, num_voxel_size, 3(4)] + # num_points: [concated_num_points] + points_mean = features[:, :, :4].sum( + dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1) + return points_mean.contiguous() + + +@VOXEL_ENCODERS.register_module +class DynamicVFEV3(nn.Module): + + def __init__(self, + num_input_features=4, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1)): + super(DynamicVFEV3, self).__init__() + self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) + + @torch.no_grad() + def forward(self, features, coors): + # This function is used from the start of the voxelnet + # num_points: [concated_num_points] + features, features_coors = self.scatter(features, coors) + return features, features_coors + + +@VOXEL_ENCODERS.register_module +class DynamicVFE(nn.Module): + + def __init__(self, + num_input_features=4, + num_filters=[], + with_distance=False, + with_cluster_center=False, + with_voxel_center=False, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + mode='max', + fusion_layer=None, + return_point_feats=False): + super(DynamicVFE, self).__init__() + assert len(num_filters) > 0 + if with_cluster_center: + num_input_features += 3 + if with_voxel_center: + num_input_features += 3 + if with_distance: + num_input_features += 3 + self.num_input_features = num_input_features + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + self.return_point_feats = return_point_feats + + # Need pillar (voxel) size and x/y offset in order to calculate offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.vz = voxel_size[2] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.z_offset = self.vz / 2 + point_cloud_range[2] + self.point_cloud_range = point_cloud_range + self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) + + num_filters = [self.num_input_features] + list(num_filters) + vfe_layers = [] + for i in range(len(num_filters) - 1): + in_filters = num_filters[i] + out_filters = num_filters[i + 1] + if i > 0: + in_filters *= 2 + norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) + vfe_layers.append( + nn.Sequential( + nn.Linear(in_filters, out_filters, bias=False), norm_layer, + nn.ReLU(inplace=True))) + self.vfe_layers = nn.ModuleList(vfe_layers) + 
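+        # From the second block on the input width is doubled (in_filters *= 2)
+        # because forward() concatenates each point feature with the pooled
+        # feature of its voxel before the next block.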
self.num_vfe = len(vfe_layers) + self.vfe_scatter = DynamicScatter(voxel_size, point_cloud_range, + (mode != 'max')) + self.cluster_scatter = DynamicScatter( + voxel_size, point_cloud_range, average_points=True) + self.fusion_layer = None + if fusion_layer is not None: + self.fusion_layer = builder.build_fusion_layer(fusion_layer) + + def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): + # Step 1: scatter voxel into canvas + # Calculate necessary things for canvas creation + canvas_z = int( + (self.point_cloud_range[5] - self.point_cloud_range[2]) / self.vz) + canvas_y = int( + (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy) + canvas_x = int( + (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx) + # canvas_channel = voxel_mean.size(1) + batch_size = pts_coors[-1, 0] + 1 + canvas_len = canvas_z * canvas_y * canvas_x * batch_size + # Create the canvas for this sample + canvas = voxel_mean.new_zeros(canvas_len, dtype=torch.long) + # Only include non-empty pillars + indices = ( + voxel_coors[:, 0] * canvas_z * canvas_y * canvas_x + + voxel_coors[:, 1] * canvas_y * canvas_x + + voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]) + # Scatter the blob back to the canvas + canvas[indices.long()] = torch.arange( + start=0, end=voxel_mean.size(0), device=voxel_mean.device) + + # Step 2: get voxel mean for each point + voxel_index = ( + pts_coors[:, 0] * canvas_z * canvas_y * canvas_x + + pts_coors[:, 1] * canvas_y * canvas_x + + pts_coors[:, 2] * canvas_x + pts_coors[:, 3]) + voxel_inds = canvas[voxel_index.long()] + center_per_point = voxel_mean[voxel_inds, ...] + return center_per_point + + def forward(self, + features, + coors, + points=None, + img_feats=None, + img_meta=None): + """ + features (torch.Tensor): NxC + coors (torch.Tensor): Nx(1+NDim) + """ + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + voxel_mean, mean_coors = self.cluster_scatter(features, coors) + points_mean = self.map_voxel_center_to_point( + coors, voxel_mean, mean_coors) + # TODO: maybe also do cluster for reflectivity + f_cluster = features[:, :3] - points_mean[:, :3] + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features.new_zeros(size=(features.size(0), 3)) + f_center[:, 0] = features[:, 0] - ( + coors[:, 3].type_as(features) * self.vx + self.x_offset) + f_center[:, 1] = features[:, 1] - ( + coors[:, 2].type_as(features) * self.vy + self.y_offset) + f_center[:, 2] = features[:, 2] - ( + coors[:, 1].type_as(features) * self.vz + self.z_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + features = torch.cat(features_ls, dim=-1) + for i, vfe in enumerate(self.vfe_layers): + point_feats = vfe(features) + if (i == len(self.vfe_layers) - 1 and self.fusion_layer is not None + and img_feats is not None): + point_feats = self.fusion_layer(img_feats, points, point_feats, + img_meta) + voxel_feats, voxel_coors = self.vfe_scatter(point_feats, coors) + if i != len(self.vfe_layers) - 1: + # need to concat voxel feats if it is not the last vfe + feat_per_point = self.map_voxel_center_to_point( + coors, voxel_feats, voxel_coors) + features = torch.cat([point_feats, feat_per_point], dim=1) + + if self.return_point_feats: + return point_feats + return voxel_feats, voxel_coors + + 
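+# HardVFE below is the hard-voxelization counterpart of DynamicVFE: it takes
+# padded (num_voxels, max_points, C) tensors together with per-voxel point
+# counts instead of a flat point list handled by DynamicScatter.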
+@VOXEL_ENCODERS.register_module +class HardVFE(nn.Module): + + def __init__(self, + num_input_features=4, + num_filters=[], + with_distance=False, + with_cluster_center=False, + with_voxel_center=False, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + mode='max', + fusion_layer=None, + return_point_feats=False): + super(HardVFE, self).__init__() + assert len(num_filters) > 0 + if with_cluster_center: + num_input_features += 3 + if with_voxel_center: + num_input_features += 3 + if with_distance: + num_input_features += 3 + self.num_input_features = num_input_features + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + self.return_point_feats = return_point_feats + + # Need pillar (voxel) size and x/y offset to calculate pillar offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.vz = voxel_size[2] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.z_offset = self.vz / 2 + point_cloud_range[2] + self.point_cloud_range = point_cloud_range + self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) + + num_filters = [self.num_input_features] + list(num_filters) + vfe_layers = [] + for i in range(len(num_filters) - 1): + in_filters = num_filters[i] + out_filters = num_filters[i + 1] + if i > 0: + in_filters *= 2 + # TODO: pass norm_cfg to VFE + # norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) + if i == (len(num_filters) - 2): + cat_max = False + max_out = True + if fusion_layer: + max_out = False + else: + max_out = True + cat_max = True + vfe_layers.append( + VFELayer( + in_filters, + out_filters, + norm_cfg=norm_cfg, + max_out=max_out, + cat_max=cat_max)) + self.vfe_layers = nn.ModuleList(vfe_layers) + self.num_vfe = len(vfe_layers) + + self.fusion_layer = None + if fusion_layer is not None: + self.fusion_layer = builder.build_fusion_layer(fusion_layer) + + def forward(self, + features, + num_points, + coors, + img_feats=None, + img_meta=None): + """ + features (torch.Tensor): NxMxC + coors (torch.Tensor): Nx(1+NDim) + """ + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + points_mean = ( + features[:, :, :3].sum(dim=1, keepdim=True) / + num_points.type_as(features).view(-1, 1, 1)) + # TODO: maybe also do cluster for reflectivity + f_cluster = features[:, :, :3] - points_mean + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features.new_zeros( + size=(features.size(0), features.size(1), 3)) + f_center[:, :, 0] = features[:, :, 0] - ( + coors[:, 3].type_as(features).unsqueeze(1) * self.vx + + self.x_offset) + f_center[:, :, 1] = features[:, :, 1] - ( + coors[:, 2].type_as(features).unsqueeze(1) * self.vy + + self.y_offset) + f_center[:, :, 2] = features[:, :, 2] - ( + coors[:, 1].type_as(features).unsqueeze(1) * self.vz + + self.z_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + voxel_feats = torch.cat(features_ls, dim=-1) + # The feature decorations were calculated without regard to whether + # pillar was empty. + # Need to ensure that empty voxels remain set to zeros. 
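+        # get_paddings_indicator marks the first num_points entries of every
+        # voxel as valid so the padded slots are zeroed out below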
+ voxel_count = voxel_feats.shape[1] + mask = get_paddings_indicator(num_points, voxel_count, axis=0) + voxel_feats *= mask.unsqueeze(-1).type_as(voxel_feats) + + for i, vfe in enumerate(self.vfe_layers): + voxel_feats = vfe(voxel_feats) + if torch.isnan(voxel_feats).any(): + import pdb + pdb.set_trace() + if (self.fusion_layer is not None and img_feats is not None): + voxel_feats = self.fusion_with_mask(features, mask, voxel_feats, + coors, img_feats, img_meta) + if torch.isnan(voxel_feats).any(): + import pdb + pdb.set_trace() + return voxel_feats + + def fusion_with_mask(self, features, mask, voxel_feats, coors, img_feats, + img_meta): + # the features is consist of a batch of points + batch_size = coors[-1, 0] + 1 + points = [] + for i in range(batch_size): + single_mask = (coors[:, 0] == i) + points.append(features[single_mask][mask[single_mask]]) + + point_feats = voxel_feats[mask] + if torch.isnan(point_feats).any(): + import pdb + pdb.set_trace() + point_feats = self.fusion_layer(img_feats, points, point_feats, + img_meta) + if torch.isnan(point_feats).any(): + import pdb + pdb.set_trace() + voxel_canvas = voxel_feats.new_zeros( + size=(voxel_feats.size(0), voxel_feats.size(1), + point_feats.size(-1))) + voxel_canvas[mask] = point_feats + out = torch.max(voxel_canvas, dim=1)[0] + if torch.isnan(out).any(): + import pdb + pdb.set_trace() + return out diff --git a/mmdet3d/ops/__init__.py b/mmdet3d/ops/__init__.py new file mode 100644 index 0000000000..6489651139 --- /dev/null +++ b/mmdet3d/ops/__init__.py @@ -0,0 +1,11 @@ +from mmdet.ops import (RoIAlign, SigmoidFocalLoss, build_norm_layer, + get_compiler_version, get_compiling_cuda_version, nms, + roi_align, sigmoid_focal_loss) +from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization + +__all__ = [ + 'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'get_compiler_version', + 'get_compiling_cuda_version', 'build_conv_layer', 'build_norm_layer', + 'batched_nms', 'Voxelization', 'voxelization', 'dynamic_scatter', + 'DynamicScatter', 'sigmoid_focal_loss', 'SigmoidFocalLoss' +] diff --git a/mmdet3d/ops/iou3d/__init__.py b/mmdet3d/ops/iou3d/__init__.py new file mode 100644 index 0000000000..df156f916f --- /dev/null +++ b/mmdet3d/ops/iou3d/__init__.py @@ -0,0 +1,4 @@ +from .iou3d_utils import (boxes_iou3d_gpu, boxes_iou_bev, nms_gpu, + nms_normal_gpu) + +__all__ = ['boxes_iou_bev', 'boxes_iou3d_gpu', 'nms_gpu', 'nms_normal_gpu'] diff --git a/mmdet3d/ops/iou3d/iou3d_utils.py b/mmdet3d/ops/iou3d/iou3d_utils.py new file mode 100644 index 0000000000..a12578e1de --- /dev/null +++ b/mmdet3d/ops/iou3d/iou3d_utils.py @@ -0,0 +1,113 @@ +import torch + +from . import iou3d_cuda + + +def boxes_iou_bev(boxes_a, boxes_b): + """ + :param boxes_a: (M, 5) + :param boxes_b: (N, 5) + :return: + ans_iou: (M, N) + """ + + ans_iou = torch.cuda.FloatTensor( + torch.Size((boxes_a.shape[0], boxes_b.shape[0]))).zero_() + + iou3d_cuda.boxes_iou_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(), + ans_iou) + + return ans_iou + + +def boxes_iou3d_gpu(boxes_a, boxes_b, mode='iou'): + """ + :param boxes_a: (N, 7) [x, y, z, h, w, l, ry] + :param boxes_b: (M, 7) [x, y, z, h, w, l, ry] + :param mode "iou" (intersection over union) or iof (intersection over + foreground). 
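+    As computed here, the vertical extent of each box spans from y - h to y
+    along the camera y axis.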
+ :return: + ans_iou: (M, N) + """ + boxes_a_bev = boxes3d_to_bev_torch(boxes_a) + boxes_b_bev = boxes3d_to_bev_torch(boxes_b) + + # bev overlap + overlaps_bev = torch.cuda.FloatTensor( + torch.Size((boxes_a.shape[0], boxes_b.shape[0]))).zero_() # (N, M) + iou3d_cuda.boxes_overlap_bev_gpu(boxes_a_bev.contiguous(), + boxes_b_bev.contiguous(), overlaps_bev) + + # height overlap + boxes_a_height_min = (boxes_a[:, 1] - boxes_a[:, 3]).view(-1, 1) + boxes_a_height_max = boxes_a[:, 1].view(-1, 1) + boxes_b_height_min = (boxes_b[:, 1] - boxes_b[:, 3]).view(1, -1) + boxes_b_height_max = boxes_b[:, 1].view(1, -1) + + max_of_min = torch.max(boxes_a_height_min, boxes_b_height_min) + min_of_max = torch.min(boxes_a_height_max, boxes_b_height_max) + overlaps_h = torch.clamp(min_of_max - max_of_min, min=0) + + # 3d iou + overlaps_3d = overlaps_bev * overlaps_h + + vol_a = (boxes_a[:, 3] * boxes_a[:, 4] * boxes_a[:, 5]).view(-1, 1) + vol_b = (boxes_b[:, 3] * boxes_b[:, 4] * boxes_b[:, 5]).view(1, -1) + + if mode == 'iou': + # the clamp func is used to avoid division of 0 + iou3d = overlaps_3d / torch.clamp( + vol_a + vol_b - overlaps_3d, min=1e-8) + else: + iou3d = overlaps_3d / torch.clamp(vol_a, min=1e-8) + + return iou3d + + +def nms_gpu(boxes, scores, thresh): + """ + :param boxes: (N, 5) [x1, y1, x2, y2, ry] + :param scores: (N) + :param thresh: + :return: + """ + # areas = (x2 - x1) * (y2 - y1) + order = scores.sort(0, descending=True)[1] + + boxes = boxes[order].contiguous() + + keep = torch.LongTensor(boxes.size(0)) + num_out = iou3d_cuda.nms_gpu(boxes, keep, thresh) + return order[keep[:num_out].cuda()].contiguous() + + +def nms_normal_gpu(boxes, scores, thresh): + """ + :param boxes: (N, 5) [x1, y1, x2, y2, ry] + :param scores: (N) + :param thresh: + :return: + """ + # areas = (x2 - x1) * (y2 - y1) + order = scores.sort(0, descending=True)[1] + + boxes = boxes[order].contiguous() + + keep = torch.LongTensor(boxes.size(0)) + num_out = iou3d_cuda.nms_normal_gpu(boxes, keep, thresh) + return order[keep[:num_out].cuda()].contiguous() + + +def boxes3d_to_bev_torch(boxes3d): + """ + :param boxes3d: (N, 7) [x, y, z, h, w, l, ry] in camera coords + :return: + boxes_bev: (N, 5) [x1, y1, x2, y2, ry] + """ + boxes_bev = boxes3d.new(torch.Size((boxes3d.shape[0], 5))) + cu, cv = boxes3d[:, 0], boxes3d[:, 2] + half_l, half_w = boxes3d[:, 5] / 2, boxes3d[:, 4] / 2 + boxes_bev[:, 0], boxes_bev[:, 1] = cu - half_l, cv - half_w + boxes_bev[:, 2], boxes_bev[:, 3] = cu + half_l, cv + half_w + boxes_bev[:, 4] = boxes3d[:, 6] + return boxes_bev diff --git a/mmdet3d/ops/iou3d/setup.py b/mmdet3d/ops/iou3d/setup.py new file mode 100644 index 0000000000..bd148e6ddc --- /dev/null +++ b/mmdet3d/ops/iou3d/setup.py @@ -0,0 +1,18 @@ +from setuptools import setup + +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name='iou3d', + ext_modules=[ + CUDAExtension( + 'iou3d_cuda', [ + 'src/iou3d.cpp', + 'src/iou3d_kernel.cu', + ], + extra_compile_args={ + 'cxx': ['-g', '-I /usr/local/cuda/include'], + 'nvcc': ['-O2'] + }) + ], + cmdclass={'build_ext': BuildExtension}) diff --git a/mmdet3d/ops/iou3d/src/iou3d.cpp b/mmdet3d/ops/iou3d/src/iou3d.cpp new file mode 100644 index 0000000000..2cf4b650c7 --- /dev/null +++ b/mmdet3d/ops/iou3d/src/iou3d.cpp @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include + +#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") +#define CHECK_INPUT(x) 
CHECK_CUDA(x);CHECK_CONTIGUOUS(x) + +#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) + +#define CHECK_ERROR(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; + + +void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap); +void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou); +void nmsLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh); +void nmsNormalLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh); + +int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_overlap){ + // params boxes_a: (N, 5) [x1, y1, x2, y2, ry] + // params boxes_b: (M, 5) + // params ans_overlap: (N, M) + + CHECK_INPUT(boxes_a); + CHECK_INPUT(boxes_b); + CHECK_INPUT(ans_overlap); + + int num_a = boxes_a.size(0); + int num_b = boxes_b.size(0); + + const float * boxes_a_data = boxes_a.data(); + const float * boxes_b_data = boxes_b.data(); + float * ans_overlap_data = ans_overlap.data(); + + boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_overlap_data); + + return 1; +} + +int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou){ + // params boxes_a: (N, 5) [x1, y1, x2, y2, ry] + // params boxes_b: (M, 5) + // params ans_overlap: (N, M) + + CHECK_INPUT(boxes_a); + CHECK_INPUT(boxes_b); + CHECK_INPUT(ans_iou); + + int num_a = boxes_a.size(0); + int num_b = boxes_b.size(0); + + const float * boxes_a_data = boxes_a.data(); + const float * boxes_b_data = boxes_b.data(); + float * ans_iou_data = ans_iou.data(); + + boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data); + + return 1; +} + +int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){ + // params boxes: (N, 5) [x1, y1, x2, y2, ry] + // params keep: (N) + + CHECK_INPUT(boxes); + CHECK_CONTIGUOUS(keep); + + int boxes_num = boxes.size(0); + const float * boxes_data = boxes.data(); + long * keep_data = keep.data(); + + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + + unsigned long long *mask_data = NULL; + CHECK_ERROR(cudaMalloc((void**)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long))); + nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh); + + // unsigned long long mask_cpu[boxes_num * col_blocks]; + // unsigned long long *mask_cpu = new unsigned long long [boxes_num * col_blocks]; + std::vector mask_cpu(boxes_num * col_blocks); + +// printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks); + CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long), + cudaMemcpyDeviceToHost)); + + cudaFree(mask_data); + + unsigned long long remv_cpu[col_blocks]; + memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long)); + + int num_to_keep = 0; + + for (int i = 0; i < boxes_num; i++){ + int nblock = i / THREADS_PER_BLOCK_NMS; + int inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_cpu[nblock] & (1ULL << inblock))){ + keep_data[num_to_keep++] = i; + unsigned long long *p = &mask_cpu[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++){ + remv_cpu[j] 
|= p[j]; + } + } + } + if ( cudaSuccess != cudaGetLastError() ) printf( "Error!\n" ); + + return num_to_keep; +} + + +int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){ + // params boxes: (N, 5) [x1, y1, x2, y2, ry] + // params keep: (N) + + CHECK_INPUT(boxes); + CHECK_CONTIGUOUS(keep); + + int boxes_num = boxes.size(0); + const float * boxes_data = boxes.data(); + long * keep_data = keep.data(); + + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + + unsigned long long *mask_data = NULL; + CHECK_ERROR(cudaMalloc((void**)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long))); + nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh); + + // unsigned long long mask_cpu[boxes_num * col_blocks]; + // unsigned long long *mask_cpu = new unsigned long long [boxes_num * col_blocks]; + std::vector mask_cpu(boxes_num * col_blocks); + +// printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks); + CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long), + cudaMemcpyDeviceToHost)); + + cudaFree(mask_data); + + unsigned long long remv_cpu[col_blocks]; + memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long)); + + int num_to_keep = 0; + + for (int i = 0; i < boxes_num; i++){ + int nblock = i / THREADS_PER_BLOCK_NMS; + int inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_cpu[nblock] & (1ULL << inblock))){ + keep_data[num_to_keep++] = i; + unsigned long long *p = &mask_cpu[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++){ + remv_cpu[j] |= p[j]; + } + } + } + if ( cudaSuccess != cudaGetLastError() ) printf( "Error!\n" ); + + return num_to_keep; +} + + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu, "oriented boxes overlap"); + m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou"); + m.def("nms_gpu", &nms_gpu, "oriented nms gpu"); + m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu"); +} diff --git a/mmdet3d/ops/iou3d/src/iou3d_kernel.cu b/mmdet3d/ops/iou3d/src/iou3d_kernel.cu new file mode 100644 index 0000000000..7aac72ed03 --- /dev/null +++ b/mmdet3d/ops/iou3d/src/iou3d_kernel.cu @@ -0,0 +1,381 @@ +#include +#define THREADS_PER_BLOCK 16 +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +//#define DEBUG +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; +const float EPS = 1e-8; +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(double _x, double _y){ + x = _x, y = _y; + } + + __device__ void set(float _x, float _y){ + x = _x; y = _y; + } + + __device__ Point operator +(const Point &b)const{ + return Point(x + b.x, y + b.y); + } + + __device__ Point operator -(const Point &b)const{ + return Point(x - b.x, y - b.y); + } +}; + +__device__ inline float cross(const Point &a, const Point &b){ + return a.x * b.y - a.y * b.x; +} + +__device__ inline float cross(const Point &p1, const Point &p2, const Point &p0){ + return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); +} + +__device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){ + int ret = min(p1.x,p2.x) <= max(q1.x,q2.x) && + min(q1.x,q2.x) <= max(p1.x,p2.x) && + min(p1.y,p2.y) <= max(q1.y,q2.y) && + min(q1.y,q2.y) <= max(p1.y,p2.y); + return ret; +} + +__device__ inline int check_in_box2d(const float *box, const Point &p){ + //params: box (5) [x1, y1, x2, y2, angle] + const float MARGIN = 1e-5; + + float center_x = (box[0] + box[2]) / 2; + float 
center_y = (box[1] + box[3]) / 2; + float angle_cos = cos(-box[4]), angle_sin = sin(-box[4]); // rotate the point in the opposite direction of box + float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * angle_sin + center_x; + float rot_y = -(p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y; +#ifdef DEBUG + printf("box: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", box[0], box[1], box[2], box[3], box[4]); + printf("center: (%.3f, %.3f), cossin(%.3f, %.3f), src(%.3f, %.3f), rot(%.3f, %.3f)\n", center_x, center_y, + angle_cos, angle_sin, p.x, p.y, rot_x, rot_y); +#endif + return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN && rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN); +} + +__device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){ + // fast exclusion + if (check_rect_cross(p0, p1, q0, q1) == 0) return 0; + + // check cross standing + float s1 = cross(q0, p1, p0); + float s2 = cross(p1, q1, p0); + float s3 = cross(p0, q1, q0); + float s4 = cross(q1, p1, q0); + + if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; + + // calculate intersection of two lines + float s5 = cross(q1, p1, p0); + if(fabs(s5 - s1) > EPS){ + ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); + ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); + + } + else{ + float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; + float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; + float D = a0 * b1 - a1 * b0; + + ans.x = (b0 * c1 - b1 * c0) / D; + ans.y = (a1 * c0 - a0 * c1) / D; + } + + return 1; +} + +__device__ inline void rotate_around_center(const Point ¢er, const float angle_cos, const float angle_sin, Point &p){ + float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * angle_sin + center.x; + float new_y = -(p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + p.set(new_x, new_y); +} + +__device__ inline int point_cmp(const Point &a, const Point &b, const Point ¢er){ + return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x); +} + +__device__ inline float box_overlap(const float *box_a, const float *box_b){ + // params: box_a (5) [x1, y1, x2, y2, angle] + // params: box_b (5) [x1, y1, x2, y2, angle] + + float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3], a_angle = box_a[4]; + float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3], b_angle = box_b[4]; + + Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2); + Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2); +#ifdef DEBUG + printf("a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", a_x1, a_y1, a_x2, a_y2, a_angle, + b_x1, b_y1, b_x2, b_y2, b_angle); + printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y); +#endif + + Point box_a_corners[5]; + box_a_corners[0].set(a_x1, a_y1); + box_a_corners[1].set(a_x2, a_y1); + box_a_corners[2].set(a_x2, a_y2); + box_a_corners[3].set(a_x1, a_y2); + + Point box_b_corners[5]; + box_b_corners[0].set(b_x1, b_y1); + box_b_corners[1].set(b_x2, b_y1); + box_b_corners[2].set(b_x2, b_y2); + box_b_corners[3].set(b_x1, b_y2); + + // get oriented corners + float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); + float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); + + for (int k = 0; k < 4; k++){ +#ifdef DEBUG + printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); 
+#endif + rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); + rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); +#ifdef DEBUG + printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); +#endif + } + + box_a_corners[4] = box_a_corners[0]; + box_b_corners[4] = box_b_corners[0]; + + // get intersection of lines + Point cross_points[16]; + Point poly_center; + int cnt = 0, flag = 0; + + poly_center.set(0, 0); + for (int i = 0; i < 4; i++){ + for (int j = 0; j < 4; j++){ + flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]); + if (flag){ + poly_center = poly_center + cross_points[cnt]; + cnt++; + } + } + } + + // check corners + for (int k = 0; k < 4; k++){ + if (check_in_box2d(box_a, box_b_corners[k])){ + poly_center = poly_center + box_b_corners[k]; + cross_points[cnt] = box_b_corners[k]; + cnt++; + } + if (check_in_box2d(box_b, box_a_corners[k])){ + poly_center = poly_center + box_a_corners[k]; + cross_points[cnt] = box_a_corners[k]; + cnt++; + } + } + + poly_center.x /= cnt; + poly_center.y /= cnt; + + // sort the points of polygon + Point temp; + for (int j = 0; j < cnt - 1; j++){ + for (int i = 0; i < cnt - j - 1; i++){ + if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){ + temp = cross_points[i]; + cross_points[i] = cross_points[i + 1]; + cross_points[i + 1] = temp; + } + } + } + +#ifdef DEBUG + printf("cnt=%d\n", cnt); + for (int i = 0; i < cnt; i++){ + printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y); + } +#endif + + // get the overlap areas + float area = 0; + for (int k = 0; k < cnt - 1; k++){ + area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]); + } + + return fabs(area) / 2.0; +} + +__device__ inline float iou_bev(const float *box_a, const float *box_b){ + // params: box_a (5) [x1, y1, x2, y2, angle] + // params: box_b (5) [x1, y1, x2, y2, angle] + float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]); + float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]); + float s_overlap = box_overlap(box_a, box_b); + return s_overlap / fmaxf(sa + sb - s_overlap, EPS); +} + +__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){ + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b){ + return; + } + const float * cur_box_a = boxes_a + a_idx * 5; + const float * cur_box_b = boxes_b + b_idx * 5; + float s_overlap = box_overlap(cur_box_a, cur_box_b); + ans_overlap[a_idx * num_b + b_idx] = s_overlap; +} + +__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){ + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b){ + return; + } + + const float * cur_box_a = boxes_a + a_idx * 5; + const float * cur_box_b = boxes_b + b_idx * 5; + float cur_iou_bev = iou_bev(cur_box_a, cur_box_b); + ans_iou[a_idx * num_b + b_idx] = cur_iou_bev; +} + +__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh, + const float *boxes, unsigned long long *mask){ + //params: boxes (N, 5) [x1, y1, x2, y2, ry] + //params: mask 
(N, N/THREADS_PER_BLOCK_NMS) + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 5; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh){ + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + + +__device__ inline float iou_normal(float const * const a, float const * const b) { + float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); + float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); + float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0]) * (a[3] - a[1]); + float Sb = (b[2] - b[0]) * (b[3] - b[1]); + return interS / fmaxf(Sa + Sb - interS, EPS); +} + + +__global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_thresh, + const float *boxes, unsigned long long *mask){ + //params: boxes (N, 5) [x1, y1, x2, y2, ry] + //params: mask (N, N/THREADS_PER_BLOCK_NMS) + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 5; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh){ + t 
|= 1ULL << i; + } + } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + + + + + +void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){ + + dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK); + + boxes_overlap_kernel<<>>(num_a, boxes_a, num_b, boxes_b, ans_overlap); +#ifdef DEBUG + cudaDeviceSynchronize(); // for using printf in kernel function +#endif +} + +void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){ + + dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK); + + boxes_iou_bev_kernel<<>>(num_a, boxes_a, num_b, boxes_b, ans_iou); +} + + +void nmsLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh){ + dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), + DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); + dim3 threads(THREADS_PER_BLOCK_NMS); + nms_kernel<<>>(boxes_num, nms_overlap_thresh, boxes, mask); +} + + +void nmsNormalLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh){ + dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), + DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); + dim3 threads(THREADS_PER_BLOCK_NMS); + nms_normal_kernel<<>>(boxes_num, nms_overlap_thresh, boxes, mask); +} diff --git a/mmdet3d/ops/norm.py b/mmdet3d/ops/norm.py new file mode 100644 index 0000000000..c054e62e45 --- /dev/null +++ b/mmdet3d/ops/norm.py @@ -0,0 +1,10 @@ +import torch.nn as nn + +from mmdet.ops.norm import norm_cfg +from .sync_bn import NaiveSyncBatchNorm1d, NaiveSyncBatchNorm2d + +norm_cfg.update({ + 'BN1d': ('bn', nn.BatchNorm1d), + 'naiveSyncBN2d': ('bn', NaiveSyncBatchNorm2d), + 'naiveSyncBN1d': ('bn', NaiveSyncBatchNorm1d), +}) diff --git a/mmdet3d/ops/spconv/__init__.py b/mmdet3d/ops/spconv/__init__.py new file mode 100644 index 0000000000..20214baa7d --- /dev/null +++ b/mmdet3d/ops/spconv/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2019 Yan Yan +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
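+
+# Illustrative usage sketch for the spconv classes re-exported below. Everything in
+# this comment is an assumption for demonstration only (the import path, tensor
+# shapes, spatial_shape and the .cuda() calls are not defined by this patch):
+#
+#     import torch
+#     from mmdet3d.ops import spconv
+#
+#     features = torch.randn(1000, 4).cuda()                        # (num_voxels, C)
+#     coors = torch.randint(0, 40, (1000, 4), dtype=torch.int32).cuda()
+#     coors[:, 0] = 0                                               # (batch_idx, z, y, x), one sample
+#     x = spconv.SparseConvTensor(features, coors, [41, 1600, 1408], 1)
+#     conv = spconv.SubMConv3d(4, 16, 3, padding=1, indice_key='subm1').cuda()
+#     out = conv(x)   # another SparseConvTensor with the same active sites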
+ +from .conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d, + SparseConvTranspose3d, SparseInverseConv2d, + SparseInverseConv3d, SubMConv2d, SubMConv3d) +from .modules import SparseModule, SparseSequential +from .pool import SparseMaxPool2d, SparseMaxPool3d +from .structure import SparseConvTensor, scatter_nd + +__all__ = [ + 'SparseConv2d', + 'SparseConv3d', + 'SubMConv2d', + 'SubMConv3d', + 'SparseConvTranspose2d', + 'SparseConvTranspose3d', + 'SparseInverseConv2d', + 'SparseInverseConv3d', + 'SparseModule', + 'SparseSequential', + 'SparseMaxPool2d', + 'SparseMaxPool3d', + 'SparseConvTensor', + 'scatter_nd', +] diff --git a/mmdet3d/ops/spconv/conv.py b/mmdet3d/ops/spconv/conv.py new file mode 100644 index 0000000000..3655749fe6 --- /dev/null +++ b/mmdet3d/ops/spconv/conv.py @@ -0,0 +1,446 @@ +# Copyright 2019 Yan Yan +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +import torch +from torch.nn import init +from torch.nn.parameter import Parameter + +from . import functional as Fsp +from . import ops +from .modules import SparseModule +from .structure import SparseConvTensor + + +def _calculate_fan_in_and_fan_out_hwio(tensor): + dimensions = tensor.ndimension() + if dimensions < 2: + raise ValueError('fan in and fan out can not be computed for tensor' + 'with fewer than 2 dimensions') + + if dimensions == 2: # Linear + fan_in = tensor.size(-2) + fan_out = tensor.size(-1) + else: + num_input_fmaps = tensor.size(-2) + num_output_fmaps = tensor.size(-1) + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[..., 0, 0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +class SparseConvolution(SparseModule): + + def __init__(self, + ndim, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + subm=False, + output_padding=0, + transposed=False, + inverse=False, + indice_key=None, + fused_bn=False): + super(SparseConvolution, self).__init__() + assert groups == 1 + if not isinstance(kernel_size, (list, tuple)): + kernel_size = [kernel_size] * ndim + if not isinstance(stride, (list, tuple)): + stride = [stride] * ndim + if not isinstance(padding, (list, tuple)): + padding = [padding] * ndim + if not isinstance(dilation, (list, tuple)): + dilation = [dilation] * ndim + if not isinstance(output_padding, (list, tuple)): + output_padding = [output_padding] * ndim + + for d, s in zip(dilation, stride): + assert any([s == 1, d == 1]), "don't support this." 
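+        # The loop above enforces that, for every spatial dimension, stride and
+        # dilation are never both greater than 1; that combination is unsupported.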
+ + self.ndim = ndim + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.conv1x1 = np.prod(kernel_size) == 1 + self.stride = stride + self.padding = padding + self.dilation = dilation + self.transposed = transposed + self.inverse = inverse + self.output_padding = output_padding + self.groups = groups + self.subm = subm + self.indice_key = indice_key + self.fused_bn = fused_bn + + self.weight = Parameter( + torch.Tensor(*kernel_size, in_channels, out_channels)) + if bias: + self.bias = Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + self.reset_parameters() + + def reset_parameters(self): + init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + if self.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight) + bound = 1 / math.sqrt(fan_in) + init.uniform_(self.bias, -bound, bound) + + def forward(self, input): + assert isinstance(input, SparseConvTensor) + features = input.features + device = features.device + indices = input.indices + spatial_shape = input.spatial_shape + batch_size = input.batch_size + if not self.subm: + if self.transposed: + out_spatial_shape = ops.get_deconv_output_size( + spatial_shape, self.kernel_size, self.stride, self.padding, + self.dilation, self.output_padding) + else: + out_spatial_shape = ops.get_conv_output_size( + spatial_shape, self.kernel_size, self.stride, self.padding, + self.dilation) + + else: + out_spatial_shape = spatial_shape + # input.update_grid(out_spatial_shape) + # t = time.time() + if self.conv1x1: + features = torch.mm( + input.features, + self.weight.view(self.in_channels, self.out_channels)) + if self.bias is not None: + features += self.bias + out_tensor = SparseConvTensor(features, input.indices, + input.spatial_shape, + input.batch_size) + out_tensor.indice_dict = input.indice_dict + out_tensor.grid = input.grid + return out_tensor + datas = input.find_indice_pair(self.indice_key) + if self.inverse: + assert datas is not None and self.indice_key is not None + _, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas + assert indice_pairs.shape[0] == np.prod( + self.kernel_size + ), 'inverse conv must have same kernel size as its couple conv' + else: + if self.indice_key is not None and datas is not None: + outids, _, indice_pairs, indice_pair_num, _ = datas + else: + outids, indice_pairs, indice_pair_num = ops.get_indice_pairs( + indices, + batch_size, + spatial_shape, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + self.output_padding, + self.subm, + self.transposed, + grid=input.grid) + input.indice_dict[self.indice_key] = (outids, indices, + indice_pairs, + indice_pair_num, + spatial_shape) + if self.fused_bn: + assert self.bias is not None + out_features = ops.fused_indice_conv(features, self.weight, + self.bias, + indice_pairs.to(device), + indice_pair_num, + outids.shape[0], self.inverse, + self.subm) + else: + if self.subm: + out_features = Fsp.indice_subm_conv(features, self.weight, + indice_pairs.to(device), + indice_pair_num, + outids.shape[0]) + else: + if self.inverse: + out_features = Fsp.indice_inverse_conv( + features, self.weight, indice_pairs.to(device), + indice_pair_num, outids.shape[0]) + else: + out_features = Fsp.indice_conv(features, self.weight, + indice_pairs.to(device), + indice_pair_num, + outids.shape[0]) + + if self.bias is not None: + out_features += self.bias + out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape, + batch_size) + 
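+        # Propagate the cached indice pairs and the dense grid so that later layers
+        # sharing the same indice_key can reuse them instead of recomputing.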
out_tensor.indice_dict = input.indice_dict + out_tensor.grid = input.grid + return out_tensor + + +class SparseConv2d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SparseConv2d, self).__init__( + 2, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + indice_key=indice_key) + + +class SparseConv3d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SparseConv3d, self).__init__( + 3, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + indice_key=indice_key) + + +class SparseConv4d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SparseConv4d, self).__init__( + 4, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + indice_key=indice_key) + + +class SparseConvTranspose2d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SparseConvTranspose2d, self).__init__( + 2, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + transposed=True, + indice_key=indice_key) + + +class SparseConvTranspose3d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SparseConvTranspose3d, self).__init__( + 3, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + transposed=True, + indice_key=indice_key) + + +class SparseInverseConv2d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + indice_key, + bias=True): + super(SparseInverseConv2d, self).__init__( + 2, + in_channels, + out_channels, + kernel_size, + bias=bias, + inverse=True, + indice_key=indice_key) + + +class SparseInverseConv3d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + indice_key, + bias=True): + super(SparseInverseConv3d, self).__init__( + 3, + in_channels, + out_channels, + kernel_size, + bias=bias, + inverse=True, + indice_key=indice_key) + + +class SubMConv2d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SubMConv2d, self).__init__( + 2, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + True, + indice_key=indice_key) + + +class SubMConv3d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SubMConv3d, self).__init__( + 3, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + True, + indice_key=indice_key) + + +class SubMConv4d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SubMConv4d, self).__init__( + 4, + 
in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + True, + indice_key=indice_key) diff --git a/mmdet3d/ops/spconv/functional.py b/mmdet3d/ops/spconv/functional.py new file mode 100644 index 0000000000..92daf190dc --- /dev/null +++ b/mmdet3d/ops/spconv/functional.py @@ -0,0 +1,98 @@ +# Copyright 2019 Yan Yan +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from torch.autograd import Function + +from . import ops as ops + + +class SparseConvFunction(Function): + + @staticmethod + def forward(ctx, features, filters, indice_pairs, indice_pair_num, + num_activate_out): + ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters) + return ops.indice_conv(features, filters, indice_pairs, + indice_pair_num, num_activate_out, False) + + @staticmethod + def backward(ctx, grad_output): + indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors + input_bp, filters_bp = ops.indice_conv_backward( + features, filters, grad_output, indice_pairs, indice_pair_num, + False) + + return input_bp, filters_bp, None, None, None + + +class SparseInverseConvFunction(Function): + + @staticmethod + def forward(ctx, features, filters, indice_pairs, indice_pair_num, + num_activate_out): + ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters) + return ops.indice_conv(features, filters, indice_pairs, + indice_pair_num, num_activate_out, True, False) + + @staticmethod + def backward(ctx, grad_output): + indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors + input_bp, filters_bp = ops.indice_conv_backward( + features, filters, grad_output, indice_pairs, indice_pair_num, + True, False) + + return input_bp, filters_bp, None, None, None + + +class SubMConvFunction(Function): + + @staticmethod + def forward(ctx, features, filters, indice_pairs, indice_pair_num, + num_activate_out): + ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters) + return ops.indice_conv(features, filters, indice_pairs, + indice_pair_num, num_activate_out, False, True) + + @staticmethod + def backward(ctx, grad_output): + indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors + input_bp, filters_bp = ops.indice_conv_backward( + features, filters, grad_output, indice_pairs, indice_pair_num, + False, True) + + return input_bp, filters_bp, None, None, None + + +class SparseMaxPoolFunction(Function): + + @staticmethod + def forward(ctx, features, indice_pairs, indice_pair_num, + num_activate_out): + out = ops.indice_maxpool(features, indice_pairs, indice_pair_num, + num_activate_out) + ctx.save_for_backward(indice_pairs, indice_pair_num, features, out) + return out + + @staticmethod + def backward(ctx, grad_output): + indice_pairs, indice_pair_num, features, out = ctx.saved_tensors + input_bp = ops.indice_maxpool_backward(features, out, grad_output, + indice_pairs, indice_pair_num) + return input_bp, None, None, None + + +indice_conv = SparseConvFunction.apply +indice_inverse_conv = SparseInverseConvFunction.apply 
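+# The .apply aliases collected here are the autograd-aware entry points used by
+# SparseConvolution.forward: indice_conv / indice_inverse_conv for regular and
+# inverse sparse convolution, indice_subm_conv for submanifold convolution and
+# indice_maxpool for sparse max pooling.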
+indice_subm_conv = SubMConvFunction.apply +indice_maxpool = SparseMaxPoolFunction.apply diff --git a/mmdet3d/ops/spconv/include/paramsgrid.h b/mmdet3d/ops/spconv/include/paramsgrid.h new file mode 100644 index 0000000000..9dafd417af --- /dev/null +++ b/mmdet3d/ops/spconv/include/paramsgrid.h @@ -0,0 +1,62 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PARAMS_GRID_H_ +#define PARAMS_GRID_H_ +#include +#include + +namespace detail { +template int getTotalSize(std::vector arg) { return arg.size(); } + +template +int getTotalSize(std::vector arg, std::vector... args) { + return arg.size() * getTotalSize(args...); +} +template int getSize(std::vector arg) { return arg.size(); } + +template +void assigner(TT &src, std::vector counter, std::vector &arg) { + std::get(src) = arg[counter[Idx]]; +} + +template +void assigner(TT &src, std::vector counter, std::vector &arg, + std::vector &... args) { + std::get(src) = arg[counter[Idx]]; + assigner(src, counter, args...); +} +} // namespace detail +template +std::vector> paramsGrid(std::vector... args) { + int length = detail::getTotalSize(args...); + std::vector sizes = {detail::getSize(args)...}; + int size = sizes.size(); + + std::vector> params(length); + std::vector counter(size); + for (int i = 0; i < length; ++i) { + detail::assigner<0>(params[i], counter, args...); + counter[size - 1] += 1; + for (int c = size - 1; c >= 0; --c) { + if (counter[c] == sizes[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return params; +} + +#endif diff --git a/mmdet3d/ops/spconv/include/prettyprint.h b/mmdet3d/ops/spconv/include/prettyprint.h new file mode 100644 index 0000000000..0bc06189f3 --- /dev/null +++ b/mmdet3d/ops/spconv/include/prettyprint.h @@ -0,0 +1,445 @@ +// Copyright Louis Delacroix 2010 - 2014. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// A pretty printing library for C++ +// +// Usage: +// Include this header, and operator<< will "just work". + +#ifndef H_PRETTY_PRINT +#define H_PRETTY_PRINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace pretty_print +{ + namespace detail + { + // SFINAE type trait to detect whether T::const_iterator exists. 
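+        // The sizeof(yes)/sizeof(no) trick below distinguishes types that expose a
+        // const_iterator and const begin()/end() members, which is what is_container
+        // further down keys on.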
+ + struct sfinae_base + { + using yes = char; + using no = yes[2]; + }; + + template + struct has_const_iterator : private sfinae_base + { + private: + template static yes & test(typename C::const_iterator*); + template static no & test(...); + public: + static const bool value = sizeof(test(nullptr)) == sizeof(yes); + using type = T; + }; + + template + struct has_begin_end : private sfinae_base + { + private: + template + static yes & f(typename std::enable_if< + std::is_same(&C::begin)), + typename C::const_iterator(C::*)() const>::value>::type *); + + template static no & f(...); + + template + static yes & g(typename std::enable_if< + std::is_same(&C::end)), + typename C::const_iterator(C::*)() const>::value, void>::type*); + + template static no & g(...); + + public: + static bool const beg_value = sizeof(f(nullptr)) == sizeof(yes); + static bool const end_value = sizeof(g(nullptr)) == sizeof(yes); + }; + + } // namespace detail + + + // Holds the delimiter values for a specific character type + + template + struct delimiters_values + { + using char_type = TChar; + const char_type * prefix; + const char_type * delimiter; + const char_type * postfix; + }; + + + // Defines the delimiter values for a specific container and character type + + template + struct delimiters + { + using type = delimiters_values; + static const type values; + }; + + + // Functor to print containers. You can use this directly if you want + // to specificy a non-default delimiters type. The printing logic can + // be customized by specializing the nested template. + + template , + typename TDelimiters = delimiters> + struct print_container_helper + { + using delimiters_type = TDelimiters; + using ostream_type = std::basic_ostream; + + template + struct printer + { + static void print_body(const U & c, ostream_type & stream) + { + using std::begin; + using std::end; + + auto it = begin(c); + const auto the_end = end(c); + + if (it != the_end) + { + for ( ; ; ) + { + stream << *it; + + if (++it == the_end) break; + + if (delimiters_type::values.delimiter != NULL) + stream << delimiters_type::values.delimiter; + } + } + } + }; + + print_container_helper(const T & container) + : container_(container) + { } + + inline void operator()(ostream_type & stream) const + { + if (delimiters_type::values.prefix != NULL) + stream << delimiters_type::values.prefix; + + printer::print_body(container_, stream); + + if (delimiters_type::values.postfix != NULL) + stream << delimiters_type::values.postfix; + } + + private: + const T & container_; + }; + + // Specialization for pairs + + template + template + struct print_container_helper::printer> + { + using ostream_type = typename print_container_helper::ostream_type; + + static void print_body(const std::pair & c, ostream_type & stream) + { + stream << c.first; + if (print_container_helper::delimiters_type::values.delimiter != NULL) + stream << print_container_helper::delimiters_type::values.delimiter; + stream << c.second; + } + }; + + // Specialization for tuples + + template + template + struct print_container_helper::printer> + { + using ostream_type = typename print_container_helper::ostream_type; + using element_type = std::tuple; + + template struct Int { }; + + static void print_body(const element_type & c, ostream_type & stream) + { + tuple_print(c, stream, Int<0>()); + } + + static void tuple_print(const element_type &, ostream_type &, Int) + { + } + + static void tuple_print(const element_type & c, ostream_type & stream, + typename std::conditional, 
std::nullptr_t>::type) + { + stream << std::get<0>(c); + tuple_print(c, stream, Int<1>()); + } + + template + static void tuple_print(const element_type & c, ostream_type & stream, Int) + { + if (print_container_helper::delimiters_type::values.delimiter != NULL) + stream << print_container_helper::delimiters_type::values.delimiter; + + stream << std::get(c); + + tuple_print(c, stream, Int()); + } + }; + + // Prints a print_container_helper to the specified stream. + + template + inline std::basic_ostream & operator<<( + std::basic_ostream & stream, + const print_container_helper & helper) + { + helper(stream); + return stream; + } + + + // Basic is_container template; specialize to derive from std::true_type for all desired container types + + template + struct is_container : public std::integral_constant::value && + detail::has_begin_end::beg_value && + detail::has_begin_end::end_value> { }; + + template + struct is_container : std::true_type { }; + + template + struct is_container : std::false_type { }; + + template + struct is_container> : std::true_type { }; + + template + struct is_container> : std::true_type { }; + + template + struct is_container> : std::true_type { }; + + + // Default delimiters + + template struct delimiters { static const delimiters_values values; }; + template const delimiters_values delimiters::values = { "[", ", ", "]" }; + template struct delimiters { static const delimiters_values values; }; + template const delimiters_values delimiters::values = { L"[", L", ", L"]" }; + + + // Delimiters for (multi)set and unordered_(multi)set + + template + struct delimiters< ::std::set, char> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::set, char>::values = { "{", ", ", "}" }; + + template + struct delimiters< ::std::set, wchar_t> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::set, wchar_t>::values = { L"{", L", ", L"}" }; + + template + struct delimiters< ::std::multiset, char> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::multiset, char>::values = { "{", ", ", "}" }; + + template + struct delimiters< ::std::multiset, wchar_t> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::multiset, wchar_t>::values = { L"{", L", ", L"}" }; + + template + struct delimiters< ::std::unordered_set, char> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::unordered_set, char>::values = { "{", ", ", "}" }; + + template + struct delimiters< ::std::unordered_set, wchar_t> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::unordered_set, wchar_t>::values = { L"{", L", ", L"}" }; + + template + struct delimiters< ::std::unordered_multiset, char> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::unordered_multiset, char>::values = { "{", ", ", "}" }; + + template + struct delimiters< ::std::unordered_multiset, wchar_t> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::unordered_multiset, wchar_t>::values = { L"{", L", ", L"}" }; + + + // Delimiters for pair and tuple + + template struct delimiters, char> { static const delimiters_values values; }; + template const delimiters_values delimiters, char>::values = { "(", ", ", ")" }; + template struct delimiters< ::std::pair, 
wchar_t> { static const delimiters_values values; }; + template const delimiters_values delimiters< ::std::pair, wchar_t>::values = { L"(", L", ", L")" }; + + template struct delimiters, char> { static const delimiters_values values; }; + template const delimiters_values delimiters, char>::values = { "(", ", ", ")" }; + template struct delimiters< ::std::tuple, wchar_t> { static const delimiters_values values; }; + template const delimiters_values delimiters< ::std::tuple, wchar_t>::values = { L"(", L", ", L")" }; + + + // Type-erasing helper class for easy use of custom delimiters. + // Requires TCharTraits = std::char_traits and TChar = char or wchar_t, and MyDelims needs to be defined for TChar. + // Usage: "cout << pretty_print::custom_delims(x)". + + struct custom_delims_base + { + virtual ~custom_delims_base() { } + virtual std::ostream & stream(::std::ostream &) = 0; + virtual std::wostream & stream(::std::wostream &) = 0; + }; + + template + struct custom_delims_wrapper : custom_delims_base + { + custom_delims_wrapper(const T & t_) : t(t_) { } + + std::ostream & stream(std::ostream & s) + { + return s << print_container_helper, Delims>(t); + } + + std::wostream & stream(std::wostream & s) + { + return s << print_container_helper, Delims>(t); + } + + private: + const T & t; + }; + + template + struct custom_delims + { + template + custom_delims(const Container & c) : base(new custom_delims_wrapper(c)) { } + + std::unique_ptr base; + }; + + template + inline std::basic_ostream & operator<<(std::basic_ostream & s, const custom_delims & p) + { + return p.base->stream(s); + } + + + // A wrapper for a C-style array given as pointer-plus-size. + // Usage: std::cout << pretty_print_array(arr, n) << std::endl; + + template + struct array_wrapper_n + { + typedef const T * const_iterator; + typedef T value_type; + + array_wrapper_n(const T * const a, size_t n) : _array(a), _n(n) { } + inline const_iterator begin() const { return _array; } + inline const_iterator end() const { return _array + _n; } + + private: + const T * const _array; + size_t _n; + }; + + + // A wrapper for hash-table based containers that offer local iterators to each bucket. + // Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket 5 of container m.) + + template + struct bucket_print_wrapper + { + typedef typename T::const_local_iterator const_iterator; + typedef typename T::size_type size_type; + + const_iterator begin() const + { + return m_map.cbegin(n); + } + + const_iterator end() const + { + return m_map.cend(n); + } + + bucket_print_wrapper(const T & m, size_type bucket) : m_map(m), n(bucket) { } + + private: + const T & m_map; + const size_type n; + }; + +} // namespace pretty_print + + +// Global accessor functions for the convenience wrappers + +template +inline pretty_print::array_wrapper_n pretty_print_array(const T * const a, size_t n) +{ + return pretty_print::array_wrapper_n(a, n); +} + +template pretty_print::bucket_print_wrapper +bucket_print(const T & m, typename T::size_type n) +{ + return pretty_print::bucket_print_wrapper(m, n); +} + + +// Main magic entry point: An overload snuck into namespace std. +// Can we do better? 
+ +namespace std +{ + // Prints a container to the stream using default delimiters + + template + inline typename enable_if< ::pretty_print::is_container::value, + basic_ostream &>::type + operator<<(basic_ostream & stream, const T & container) + { + return stream << ::pretty_print::print_container_helper(container); + } +} + + + +#endif // H_PRETTY_PRINT diff --git a/mmdet3d/ops/spconv/include/pybind11_utils.h b/mmdet3d/ops/spconv/include/pybind11_utils.h new file mode 100644 index 0000000000..d23a0f6dbf --- /dev/null +++ b/mmdet3d/ops/spconv/include/pybind11_utils.h @@ -0,0 +1,61 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include // everything needed for embedding +#include +#include +#include +#include + +#include + +namespace py = pybind11; + +template +std::vector array2Vector(TPyObject arr){ + py::array arr_np = arr; + size_t size = arr.attr("size").template cast(); + py::array_t arr_cc = arr_np; + std::vector data(arr_cc.data(), arr_cc.data() + size); + return data; +} + +template +std::vector arrayT2Vector(py::array_t arr) +{ + std::vector data(arr.data(), arr.data() + arr.size()); + return data; +} + +template +tv::TensorView array2TensorView(TPyObject arr){ + py::array arr_np = arr; + py::array_t arr_cc = arr_np; + tv::Shape shape; + for (int i = 0; i < arr_cc.ndim(); ++i){ + shape.push_back(arr_cc.shape(i)); + } + return tv::TensorView(arr_cc.mutable_data(), shape); +} +template +tv::TensorView arrayT2TensorView(py::array_t arr){ + tv::Shape shape; + for (int i = 0; i < arr.ndim(); ++i){ + shape.push_back(arr.shape(i)); + } + return tv::TensorView(arr.mutable_data(), shape); +} diff --git a/mmdet3d/ops/spconv/include/spconv/box_iou.h b/mmdet3d/ops/spconv/include/spconv/box_iou.h new file mode 100644 index 0000000000..937013374b --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/box_iou.h @@ -0,0 +1,157 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef BOX_IOU_H +#define BOX_IOU_H + +#include +// must include pybind11/eigen.h if using eigen matrix as arguments. +#include +#include +#include + +namespace spconv { +// #include "voxelnet/core/cc/pybind11_helper.h" +namespace py = pybind11; +using namespace pybind11::literals; +template +inline py::array_t constant(ShapeContainer shape, DType value) { + // create ROWMAJOR array. 
+ py::array_t array(shape); + std::fill(array.mutable_data(), array.mutable_data() + array.size(), value); + return array; +} + +template +inline py::array_t zeros(std::vector shape) { + return constant>(shape, 0); +} + +template +py::array_t +rbbox_iou(py::array_t box_corners, py::array_t qbox_corners, + py::array_t standup_iou, DType standup_thresh) { + namespace bg = boost::geometry; + typedef bg::model::point point_t; + typedef bg::model::polygon polygon_t; + polygon_t poly, qpoly; + std::vector poly_inter, poly_union; + DType inter_area, union_area; + auto box_corners_r = box_corners.template unchecked<3>(); + auto qbox_corners_r = qbox_corners.template unchecked<3>(); + auto standup_iou_r = standup_iou.template unchecked<2>(); + auto N = box_corners_r.shape(0); + auto K = qbox_corners_r.shape(0); + py::array_t overlaps = zeros({int(N), int(K)}); + auto overlaps_rw = overlaps.template mutable_unchecked<2>(); + if (N == 0 || K == 0) { + return overlaps; + } + for (int k = 0; k < K; ++k) { + for (int n = 0; n < N; ++n) { + if (standup_iou_r(n, k) <= standup_thresh) + continue; + bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1))); + bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1))); + bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1))); + bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1))); + bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); + + bg::intersection(poly, qpoly, poly_inter); + + if (!poly_inter.empty()) { + inter_area = bg::area(poly_inter.front()); + bg::union_(poly, qpoly, poly_union); + if (!poly_union.empty()) { + union_area = bg::area(poly_union.front()); + overlaps_rw(n, k) = inter_area / union_area; + } + poly_union.clear(); + } + poly.clear(); + qpoly.clear(); + poly_inter.clear(); + } + } + return overlaps; +} + +template +py::array_t +rbbox_intersection(py::array_t box_corners, py::array_t qbox_corners, + py::array_t standup_iou, DType standup_thresh) { + namespace bg = boost::geometry; + typedef bg::model::point point_t; + typedef bg::model::polygon polygon_t; + polygon_t poly, qpoly; + std::vector poly_inter, poly_union; + DType inter_area, union_area; + auto box_corners_r = box_corners.template unchecked<3>(); + auto qbox_corners_r = qbox_corners.template unchecked<3>(); + auto standup_iou_r = standup_iou.template unchecked<2>(); + auto N = box_corners_r.shape(0); + auto K = qbox_corners_r.shape(0); + py::array_t overlaps = zeros({int(N), int(K)}); + auto overlaps_rw = overlaps.template mutable_unchecked<2>(); + if (N == 0 || K == 0) { + return overlaps; + } + for (int k = 0; k < K; ++k) { + for (int n = 0; n < N; ++n) { + if (standup_iou_r(n, k) <= standup_thresh) + continue; + bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1))); + bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1))); + bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1))); + bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1))); + bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 
0, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); + + bg::intersection(poly, qpoly, poly_inter); + + if (!poly_inter.empty()) { + inter_area = bg::area(poly_inter.front()); + overlaps_rw(n, k) = inter_area; + } + poly.clear(); + qpoly.clear(); + poly_inter.clear(); + } + } + return overlaps; +} + + +} // namespace spconv +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h b/mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h new file mode 100644 index 0000000000..526127d2ac --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h @@ -0,0 +1,127 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef FUSED_SPARSE_CONV_OP_H_ +#define FUSED_SPARSE_CONV_OP_H_ + +#include +#include +#include +#include +#include +#include + +namespace spconv { +// torch.jit's doc says only support int64, so we need to convert to int32. + +template +torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters, torch::Tensor bias, + torch::Tensor indicePairs, torch::Tensor indiceNum, + int64_t numActOut, int64_t _inverse, int64_t _subM) { + bool subM = _subM != 0; + bool inverse = _inverse != 0; + auto device = features.device().type(); + auto ndim = filters.dim() - 2; + auto kernelVolume = indicePairs.size(0); + auto numInPlanes = features.size(1); + auto numOutPlanes = filters.size(ndim + 1); + auto indicePairNumCpu = indiceNum.to({torch::kCPU}); + auto indicePairMaxSizeIter = std::max_element( + indicePairNumCpu.data(), indicePairNumCpu.data() + kernelVolume); + int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data(); + int indicePairMaxSize = *indicePairMaxSizeIter; + + /*if (_subM){ + std::vector indicePairNumVec(indicePairNumCpu.data(), indicePairNumCpu.data() + kernelVolume); + indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset); + + auto indicePairVecMaxSizeIter = std::max_element( + indicePairNumVec.begin(), indicePairNumVec.end()); + indicePairMaxSize = *indicePairVecMaxSizeIter; + }*/ + + auto options = + torch::TensorOptions().dtype(features.dtype()).device(features.device()); + // auto indicePairOptions = + // torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device()); + + torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options).copy_(bias); + torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options); + torch::Tensor outputBuffer = + torch::zeros({indicePairMaxSize, numOutPlanes}, options); + filters = filters.view({-1, numInPlanes, numOutPlanes}); + if (subM) { // the center index of subm conv don't need gather and scatter + // add. 
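+    // For submanifold convolution the output sites coincide with the input sites,
+    // so the center kernel offset (indicePairMaxOffset) reduces to a dense GEMM over
+    // all features; gather/scatter is only needed for the other offsets handled in
+    // the loop below.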
+ torch::mm_out(output, features, filters[indicePairMaxOffset]); + } + double totalGatherTime = 0; + double totalGEMMTime = 0; + double totalSAddTime = 0; + for (int i = 0; i < kernelVolume; ++i) { + auto nHot = indicePairNumCpu.data()[i]; + if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { + continue; + } + // auto timer = spconv::CudaContextTimer<>(); + auto outputBufferBlob = + torch::from_blob(outputBuffer.data(), {nHot, numOutPlanes}, options); + auto inputBufferBlob = + torch::from_blob(inputBuffer.data(), {nHot, numInPlanes}, options); + + if (device == torch::kCPU) { + functor::SparseGatherFunctor gatherFtor; + gatherFtor(tv::CPU(), tv::torch2tv(inputBuffer), + tv::torch2tv(features), + tv::torch2tv(indicePairs).subview(i, inverse), nHot); + } else { + functor::SparseGatherFunctor gatherFtor; + gatherFtor(tv::TorchGPU(), tv::torch2tv(inputBuffer), + tv::torch2tv(features), + tv::torch2tv(indicePairs).subview(i, inverse), nHot); + TV_CHECK_CUDA_ERR(); + /* slower than SparseGatherFunctor, may due to int->long conversion + auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64); + auto indicePairBlob = torch::from_blob(indicePairLong.data(), {nHot}, + indicePairOptions); + torch::index_select_out(inputBufferBlob, features, 0, + indicePairBlob);*/ + } + // totalGatherTime += timer.report() / 1000.0; + torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]); + // totalGEMMTime += timer.report() / 1000.0; + + if (device == torch::kCPU) { + functor::SparseScatterAddFunctor scatterFtor; + scatterFtor(tv::CPU(), tv::torch2tv(output), + tv::torch2tv(outputBuffer), + tv::torch2tv(indicePairs).subview(i, !inverse), nHot, + true); + } else { + functor::SparseScatterAddFunctor scatterFtor; + scatterFtor(tv::TorchGPU(), tv::torch2tv(output), + tv::torch2tv(outputBuffer), + tv::torch2tv(indicePairs).subview(i, !inverse), nHot, + true); + TV_CHECK_CUDA_ERR(); + } + // totalSAddTime += timer.report() / 1000.0; + } + // std::cout << "gather time " << totalGatherTime << std::endl; + // std::cout << "gemm time " << totalGEMMTime << std::endl; + // std::cout << "scatteradd time " << totalSAddTime << std::endl; + return output; +} +} // namespace spconv + +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/geometry.h b/mmdet3d/ops/spconv/include/spconv/geometry.h new file mode 100644 index 0000000000..e193e037d7 --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/geometry.h @@ -0,0 +1,301 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
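
The fusedIndiceConvBatchNorm routine above follows the usual gather, GEMM, scatter-add dataflow of sparse convolution: for every kernel offset it gathers the active input rows named by the indice pairs, multiplies them by that offset's filter slice, and scatter-adds the products into the output rows, while the submanifold center offset is handled up front with one dense matrix multiply into the bias-initialised output. Below is a dependency-free CPU sketch of the same dataflow; the buffer names, the row-major layout, and the per-pair loop are illustrative assumptions, not the patch's API.

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Reference gather -> GEMM -> scatter-add over indice pairs.
// pairs[k] lists (input_row, output_row) pairs for kernel offset k,
// features/output are dense row-major buffers, filters[k] is the
// [nIn x nOut] weight slice for that offset.
void sparse_conv_reference(
    const std::vector<float>& features,                         // numActIn  * nIn
    const std::vector<std::vector<float>>& filters,              // K x (nIn * nOut)
    std::vector<float>& output,                                  // numActOut * nOut
    const std::vector<std::vector<std::pair<int, int>>>& pairs,  // per-offset pairs
    std::size_t nIn, std::size_t nOut) {
  for (std::size_t k = 0; k < filters.size(); ++k) {
    for (const auto& p : pairs[k]) {
      const float* in = &features[p.first * nIn];  // gather one active input row
      float* out = &output[p.second * nOut];       // row that receives the result
      for (std::size_t o = 0; o < nOut; ++o) {
        float acc = 0.f;
        for (std::size_t i = 0; i < nIn; ++i)
          acc += in[i] * filters[k][i * nOut + o]; // one row of the per-offset GEMM
        out[o] += acc;                             // scatter-add into the output
      }
    }
  }
}
```

The sketch processes one pair at a time for clarity; the patch instead gathers all nHot rows of an offset into inputBuffer so the multiply runs as a single dense GEMM (torch::mm_out) before the scatter-add, and for submanifold convolution the center offset skips the loop entirely because its mapping is the identity.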
+ +#ifndef SPCONV_GEOMETRY_H_ +#define SPCONV_GEOMETRY_H_ + +#include +#include +#include + +namespace spconv { +template +TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos, + const Index *kernelSize, + const Index *stride, const Index *padding, + const Index *dilation, + const Index *outSpatialShape, Index *out) { + Index lowers[NDim]; + Index uppers[NDim]; + Index counter[NDim]; + Index counterSize[NDim]; + Index pointCounter = 0; + Index val; + Index numPoints = 1; + Index m, offset; + bool valid = false; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + + stride[i] + padding[i]) / + stride[i]; + uppers[i] = (input_pos[i] + padding[i]) / stride[i]; + } + +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); + numPoints *= counterSize[i]; + } + +#pragma unroll + for (int i = 0; i < NDim; ++i) { + counter[i] = 0; + } + for (int i = 0; i < numPoints; ++i) { + valid = true; + m = 1; + offset = 0; +#pragma unroll + for (int j = NDim - 1; j >= 0; --j) { + val = uppers[j] - counter[j] * dilation[j]; + out[pointCounter * (NDim + 1) + j] = val; + if (val < 0 || (val > outSpatialShape[j] - 1)) { + valid = false; + // break; + } + offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j]; + m *= kernelSize[j]; + } + + out[pointCounter * (NDim + 1) + NDim] = offset; + if (valid) + ++pointCounter; + counter[NDim - 1] += 1; +#pragma unroll + for (int c = NDim - 1; c >= 0; --c) { + if (counter[c] == counterSize[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return pointCounter; +} + +template +TV_HOST_DEVICE Index getValidOutPosTranspose( + const Index *input_pos, const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, const Index *outSpatialShape, + Index *out) { + Index lowers[NDim]; + Index uppers[NDim]; + Index counter[NDim]; + Index counterSize[NDim]; + Index pointCounter = 0; + Index val; + Index numPoints = 1; + Index m, offset; + bool valid = false; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + lowers[i] = input_pos[i] * stride[i] - padding[i]; + uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i]; + } +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); + numPoints *= counterSize[i]; + } +#pragma unroll + for (int i = 0; i < NDim; ++i) { + counter[i] = 0; + } + for (int i = 0; i < numPoints; ++i) { + valid = true; + m = 1; + offset = 0; +#pragma unroll + for (int j = NDim - 1; j >= 0; --j) { + val = uppers[j] - counter[j] * dilation[j]; + out[pointCounter * (NDim + 1) + j] = val; + if (val < 0 || (val > outSpatialShape[j] - 1)) { + valid = false; + // break; + } + offset += m * (val - lowers[j]) / dilation[j]; + m *= kernelSize[j]; + } + out[pointCounter * (NDim + 1) + NDim] = offset; + if (valid) + ++pointCounter; + counter[NDim - 1] += 1; +#pragma unroll + for (int c = NDim - 1; c >= 0; --c) { + if (counter[c] == counterSize[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return pointCounter; +} + +template +Index getIndicePairsConv(tv::TensorView indicesIn, + tv::TensorView indicesOut, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, + const Index *outSpatialShape) { + // indicesOut: num_active * kernelVolume * (NDim + 1) + Index numAct = 0; + 
auto numActIn = indicesIn.dim(0); + Index batchIdx = 0; + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + std::vector validPoints_(kernelVolume * (NDim + 1)); + Index* validPoints = validPoints_.data(); + Index *pointPtr = nullptr; + for (int j = 0; j < numActIn; ++j) { + batchIdx = indicesIn(j, 0); + numValidPoints = getValidOutPos( + indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, + dilation, outSpatialShape, validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + + spatialVolume * batchIdx; + if (gridsOut[index] == -1) { + for (unsigned k = 1; k < NDim + 1; ++k) { + indicesOut(numAct, k) = pointPtr[k - 1]; + } + indicesOut(numAct, 0) = batchIdx; + gridsOut[index] = numAct++; + } + // indicePairs: [K, 2, L] + indicePairs(offset, 0, indiceNum[offset]) = j; + indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; + } + } + return numAct; +} + +template +Index getIndicePairsDeConv(tv::TensorView indicesIn, + tv::TensorView indicesOut, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, + const Index *outSpatialShape) { + Index numAct = 0; + auto numActIn = indicesIn.dim(0); + Index batchIdx = 0; + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + std::vector validPoints_(kernelVolume * (NDim + 1)); + Index* validPoints = validPoints_.data(); + Index *pointPtr = nullptr; + for (int j = 0; j < numActIn; ++j) { + batchIdx = indicesIn(j, 0); + numValidPoints = getValidOutPosTranspose( + indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, + dilation, outSpatialShape, validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + + spatialVolume * batchIdx; + if (gridsOut[index] == -1) { + for (unsigned k = 1; k < NDim + 1; ++k) { + indicesOut(numAct, k) = pointPtr[k - 1]; + } + indicesOut(numAct, 0) = batchIdx; + gridsOut[index] = numAct++; + } + // indicePairs: [K, 2, L] + indicePairs(offset, 0, indiceNum[offset]) = j; + indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; + } + } + return numAct; +} + +template +Index getIndicePairsSubM(tv::TensorView indicesIn, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const Index *const kernelSize, + const Index *const stride, const Index *const padding, + const Index *dilation, const Index *const outSpatialShape) { + Index numAct = 0; + auto numActIn = indicesIn.dim(0); + Index batchIdx = 0; + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + // Index validPoints[kernelVolume * (NDim + 1)]; + std::vector validPoints_(kernelVolume * (NDim 
+ 1)); + Index* validPoints = validPoints_.data(); + Index *pointPtr = nullptr; + Index index = 0; + for (int j = 0; j < numActIn; ++j) { + index = tv::rowArrayIdx(indicesIn.data() + j * (NDim + 1) + 1, + outSpatialShape) + + spatialVolume * indicesIn(j, 0); + gridsOut[index] = j; + } + for (int j = 0; j < numActIn; ++j) { + numValidPoints = getValidOutPos( + indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, + dilation, outSpatialShape, validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + index = tv::rowArrayIdx(pointPtr, outSpatialShape) + + spatialVolume * indicesIn(j, 0); + if (gridsOut[index] > -1) { + indicePairs(offset, 0, indiceNum[offset]) = j; + indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; + } + } + } + return numActIn; +} + +} // namespace spconv + +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/indice.cu.h b/mmdet3d/ops/spconv/include/spconv/indice.cu.h new file mode 100644 index 0000000000..b9ceaef409 --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/indice.cu.h @@ -0,0 +1,243 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef INDICE_CU_H_ +#define INDICE_CU_H_ +#include +#include +#include + +namespace spconv { +template +__global__ void prepareIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView indicesOut, + tv::TensorView gridsOut, tv::TensorView indicePairs, + tv::TensorView indiceNum, tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + auto indicePairsDim2 = indicePairs.dim(2); + Index index; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPos( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 0, oldNum) = ix; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + indicePairs(offset, 1, oldNum) = index; + indicePairUnique[offset * indicePairsDim2 + oldNum] = index; + } + } +} + +template +__global__ void prepareDeConvIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView indicesOut, + tv::TensorView gridsOut, tv::TensorView indicePairs, + tv::TensorView indiceNum, 
tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + auto indicePairsDim2 = indicePairs.dim(2); + Index index; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPosTranspose( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 0, oldNum) = ix; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + indicePairs(offset, 1, oldNum) = index; + indicePairUnique[offset * indicePairsDim2 + oldNum] = index; + } + } +} + +template +__global__ void assignGridAndIndiceOutKernel( + tv::TensorView indicesOut, tv::TensorView gridsOut, + int numAct, tv::TensorView indicePairs, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape, int batchSize) { + + Index index; + auto indicesOutPtr = indicesOut.data(); + for (int ix : tv::KernelLoopX(numAct)) { + index = indicePairUnique[ix]; + gridsOut[index] = ix; + index = tv::rowArrayIdxInv( + index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data()); + indicesOut[ix * (NDim + 1)] = index % batchSize; + } +} + +template +__global__ void +assignIndicePairsKernel(tv::TensorView indicesOut, + tv::TensorView gridsOut, int numActIn, + tv::TensorView indicePairs, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape) { + + Index index; + int kernelVolume = indicePairs.dim(0); + for (int ix : tv::KernelLoopX(numActIn)) { + for (int i = 0; i < kernelVolume; ++i) { + index = indicePairs(i, 1, ix); + if (index > -1) { + indicePairs(i, 1, ix) = gridsOut[index]; + } + } + } +} + +template +__global__ void +prepareSubMGridKernel(tv::TensorView indicesIn, + tv::TensorView gridsOut, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index index = 0; + for (int ix : tv::KernelLoopX(numActIn)) { + index = tv::rowArrayIdx(indicesIn.data() + ix * (NDim + 1) + 1, + outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + gridsOut[index] = ix; + } +} + +template +__global__ void getSubMIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + Index index = 0; + 
for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPos( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (int i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + if (gridsOut[index] > -1) { + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 1, oldNum) = gridsOut[index]; + indicePairs(offset, 0, oldNum) = ix; + } + } + } +} + +template +__global__ void resetGridKernel(const Index *indicePairUnique, + tv::TensorView gridsOut, + int numAct) { + for (int ix : tv::KernelLoopX(numAct)) { + gridsOut[indicePairUnique[ix]] = -1; + } +} + +template +__global__ void +resetGridSubMKernel(const Index *indices, tv::TensorView gridsOut, + const tv::SimpleVector outSpatialShape, + int numAct) { + int outSpatialShapeReg[NDim]; + for (int i = 0; i < NDim; ++i) { + outSpatialShapeReg[i] = outSpatialShape[i]; + } + Index spatialVolume = 1; + auto indsPtr = indices; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index index; + for (int ix : tv::KernelLoopX(numAct)) { + indsPtr = indices + ix * (NDim + 1); + index = tv::rowArrayIdx(indsPtr + 1, outSpatialShapeReg); + gridsOut[index + spatialVolume * indsPtr[0]] = -1; + } +} + +} // namespace spconv + +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/indice.h b/mmdet3d/ops/spconv/include/spconv/indice.h new file mode 100644 index 0000000000..809c56f734 --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/indice.h @@ -0,0 +1,79 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
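
The CUDA kernels in indice.cu.h above all rely on the same dense lookup grid: an output coordinate is flattened with tv::rowArrayIdx over outSpatialShape and offset by batchIdx * spatialVolume, and gridsOut at that slot holds either the row already assigned to that location or -1. For submanifold convolution the grid is first seeded with each input row's own index, so a candidate output position produces an indice pair only when it lands on an occupied input cell. A small host-side sketch of that flattening and lookup follows, assuming a single batch, 3-D coordinates, and candidates already inside the spatial bounds; all names are illustrative.

```cpp
#include <array>
#include <cstddef>
#include <vector>

// Row-major flattening of a 3-D spatial coordinate, the role that
// tv::rowArrayIdx plays for N dimensions in the patch.
inline int flatten3d(const std::array<int, 3>& c, const std::array<int, 3>& shape) {
  return (c[0] * shape[1] + c[1]) * shape[2] + c[2];
}

struct IndicePair { int inRow; int outRow; };

// Submanifold rule: seed a dense grid with the row index of every occupied
// input voxel, then keep a candidate output position only if it hits an
// occupied cell.
std::vector<IndicePair> subm_pairs_for_offset(
    const std::vector<std::array<int, 3>>& inCoords,    // coordinate per input row
    const std::vector<std::array<int, 3>>& candCoords,  // candidate output per row
    const std::array<int, 3>& spatialShape) {
  const std::size_t volume = static_cast<std::size_t>(spatialShape[0]) *
                             spatialShape[1] * spatialShape[2];
  std::vector<int> grid(volume, -1);
  for (std::size_t row = 0; row < inCoords.size(); ++row)
    grid[flatten3d(inCoords[row], spatialShape)] = static_cast<int>(row);  // seed
  std::vector<IndicePair> pairs;
  for (std::size_t row = 0; row < candCoords.size(); ++row) {
    const int hit = grid[flatten3d(candCoords[row], spatialShape)];
    if (hit > -1) pairs.push_back({static_cast<int>(row), hit});  // occupied cell
  }
  return pairs;
}
```

In the real kernels the batch index is folded into the flattened slot, the pair counters are bumped with atomicAdd so many threads can append safely, and the resetGrid kernels restore the touched slots to -1 so the grid buffer can be reused across calls.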
+ +#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_ +#define SPARSE_CONV_INDICE_FUNCTOR_H_ +#include + +namespace spconv +{ +namespace functor +{ +template +struct CreateConvIndicePairFunctorP1 +{ + Index operator()( + const Device& d, tv::TensorView indicesIn, + tv::TensorView indicesOut, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape, bool transpose); +}; + +template +struct CreateConvIndicePairFunctorP2 +{ + Index operator()( + const Device& d, tv::TensorView indicesIn, + tv::TensorView indicesOut, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape, bool transpose, + bool resetGrid=false); +}; + +template +struct CreateConvIndicePairFunctor +{ + Index operator()( + const Device& d, tv::TensorView indicesIn, + tv::TensorView indicesOut, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape, bool transpose, bool resetGrid=false); +}; + +template +struct CreateSubMIndicePairFunctor +{ + Index operator()( + const Device& d, tv::TensorView indicesIn, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape, bool transpose, bool resetGrid=false); +}; +} // namespace functor +} // namespace spconv + +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/maxpool.h b/mmdet3d/ops/spconv/include/spconv/maxpool.h new file mode 100644 index 0000000000..5ee91353da --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/maxpool.h @@ -0,0 +1,44 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
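
Both indice.h above and maxpool.h below only declare functors templated on a Device tag; the definitions live in the backend translation units, and call sites pick the backend by instantiating the functor with the matching tag, as fusedIndiceConvBatchNorm does earlier with tv::CPU() and tv::TorchGPU(). A minimal, self-contained sketch of that tag-dispatch pattern is shown here; ScaleFunctor, CPUDevice, and GPUDevice are invented names for illustration only.

```cpp
#include <iostream>

// Device tags standing in for tv::CPU / tv::TorchGPU.
struct CPUDevice {};
struct GPUDevice {};

// The header declares only the primary template, exactly like the
// functor declarations in indice.h and maxpool.h.
template <typename Device, typename T>
struct ScaleFunctor;

// CPU backend: a plain loop.
template <typename T>
struct ScaleFunctor<CPUDevice, T> {
  void operator()(const CPUDevice&, T* data, int n, T s) const {
    for (int i = 0; i < n; ++i) data[i] *= s;
  }
};

// Stand-in for the CUDA backend: a real build would launch a kernel here.
template <typename T>
struct ScaleFunctor<GPUDevice, T> {
  void operator()(const GPUDevice&, T* data, int n, T s) const {
    for (int i = 0; i < n; ++i) data[i] *= s;
  }
};

int main() {
  float v[3] = {1.f, 2.f, 3.f};
  ScaleFunctor<CPUDevice, float>()(CPUDevice{}, v, 3, 2.f);
  std::cout << v[0] << " " << v[1] << " " << v[2] << "\n";  // prints: 2 4 6
  return 0;
}
```

Keeping the declarations device-agnostic lets the op code in the .cc files branch once on the tensor's device and reuse the same surrounding logic for CPU and GPU, while each backend's .cc/.cu file supplies its own specialisation.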
+ +#ifndef SPARSE_MAXPOOL_FUNCTOR_H_ +#define SPARSE_MAXPOOL_FUNCTOR_H_ +#include + +namespace spconv +{ +namespace functor +{ +template +struct SparseMaxPoolForwardFunctor +{ + void operator()(const Device& d, tv::TensorView outFeatures, + tv::TensorView inFeatures, + tv::TensorView indices, int size); +}; + +template +struct SparseMaxPoolBackwardFunctor +{ + void operator()(const Device& d, tv::TensorView outFeatures, + tv::TensorView inFeatures, + tv::TensorView dout, + tv::TensorView din, + tv::TensorView indices, int size); +}; + +} // namespace functor +} // namespace spconv + +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/mp_helper.h b/mmdet3d/ops/spconv/include/spconv/mp_helper.h new file mode 100644 index 0000000000..cff8dccffe --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/mp_helper.h @@ -0,0 +1,47 @@ +#ifndef MP_HELPER_H_ +#define MP_HELPER_H_ +#include +#include + +namespace spconv { +template struct mp_list {}; + +template +using mp_list_c = mp_list...>; + +namespace detail { + +template +constexpr F mp_for_each_impl(mp_list, F &&f) { + return std::initializer_list{(f(T()), 0)...}, std::forward(f); +} + +template constexpr F mp_for_each_impl(mp_list<>, F &&f) { + return std::forward(f); +} + +} // namespace detail + +namespace detail { + +template class B> struct mp_rename_impl { + // An error "no type named 'type'" here means that the first argument to + // mp_rename is not a list +}; + +template