From d1aac35d68a203955a32bca4635429f620fc08dd Mon Sep 17 00:00:00 2001 From: zhangwenwei Date: Tue, 14 Apr 2020 21:21:42 +0800 Subject: [PATCH] Initial commit --- .gitignore | 127 ++ .gitlab-ci.yml | 43 + .isort.cfg | 8 + .pre-commit-config.yaml | 27 + .style.yapf | 4 + .travis.yml | 43 + README.md | 58 + ...pn-fusion_adamw_2x8_80e_kitti-3d-3class.py | 283 ++++ ...intpillars_secfpn_6x8_160e_kitti-3d-car.py | 203 +++ ...d_secfpn_2x8_cosine_80e_kitti-3d-3class.py | 231 ++++ .../dv_second_secfpn_6x8_80e_kitti-3d-car.py | 199 +++ ...ffe_1x_kitti-2d-3class_coco-3x-pretrain.py | 194 +++ ...intpillars_secfpn_6x8_160e_kitti-3d-car.py | 204 +++ .../hv_second_secfpn_6x8_80e_kitti-3d-car.py | 197 +++ .../faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py | 187 +++ ...ntpillars_secfpn_sbn-all_4x8_20e_nus-3d.py | 236 ++++ ...pn_sbn-all_freeze_adamw_1x16_20e_nus-mm.py | 267 ++++ .../nus/retinanet_r50_fpn_caffe_2x8_1x_nus.py | 138 ++ docs/CHANGELOG.md | 209 +++ docs/CODE_OF_CONDUCT.md | 76 ++ docs/CONTRIBUTING.md | 35 + docs/GETTING_STARTED.md | 510 ++++++++ docs/INSTALL.md | 161 +++ docs/MODEL_ZOO.md | 532 ++++++++ docs/Makefile | 20 + docs/ROBUSTNESS_BENCHMARKING.md | 109 ++ docs/TECHNICAL_DETAILS.md | 226 ++++ docs/conf.py | 70 + docs/index.rst | 19 + docs/make.bat | 35 + docs/requirements.txt | 4 + mmdet3d/__init__.py | 3 + mmdet3d/apis/__init__.py | 5 + mmdet3d/apis/train.py | 199 +++ mmdet3d/core/__init__.py | 8 + mmdet3d/core/anchor/__init__.py | 19 + mmdet3d/core/anchor/anchor_generator.py | 288 +++++ mmdet3d/core/bbox/__init__.py | 49 + mmdet3d/core/bbox/assign_sampling.py | 43 + mmdet3d/core/bbox/assigners/__init__.py | 8 + .../bbox/assigners/approx_max_iou_assigner.py | 114 ++ mmdet3d/core/bbox/assigners/assign_result.py | 19 + mmdet3d/core/bbox/assigners/base_assigner.py | 8 + .../core/bbox/assigners/max_iou_assigner.py | 169 +++ mmdet3d/core/bbox/box_np_ops.py | 568 ++++++++ mmdet3d/core/bbox/box_torch_ops.py | 192 +++ mmdet3d/core/bbox/coders/__init__.py | 3 + mmdet3d/core/bbox/coders/box_coder.py | 116 ++ mmdet3d/core/bbox/geometry.py | 131 ++ mmdet3d/core/bbox/samplers/__init__.py | 14 + mmdet3d/core/bbox/samplers/base_sampler.py | 78 ++ .../core/bbox/samplers/combined_sampler.py | 16 + .../samplers/instance_balanced_pos_sampler.py | 41 + .../bbox/samplers/iou_balanced_neg_sampler.py | 133 ++ mmdet3d/core/bbox/samplers/ohem_sampler.py | 73 ++ mmdet3d/core/bbox/samplers/pseudo_sampler.py | 26 + mmdet3d/core/bbox/samplers/random_sampler.py | 53 + mmdet3d/core/bbox/samplers/sampling_result.py | 24 + mmdet3d/core/bbox/transforms.py | 269 ++++ mmdet3d/core/evaluation/__init__.py | 14 + mmdet3d/core/evaluation/bbox_overlaps.py | 47 + mmdet3d/core/evaluation/class_names.py | 127 ++ mmdet3d/core/evaluation/coco_utils.py | 251 ++++ mmdet3d/core/evaluation/eval_hooks.py | 204 +++ .../core/evaluation/kitti_utils/__init__.py | 3 + mmdet3d/core/evaluation/kitti_utils/eval.py | 814 ++++++++++++ .../core/evaluation/kitti_utils/rotate_iou.py | 341 +++++ mmdet3d/core/evaluation/mean_ap.py | 385 ++++++ mmdet3d/core/evaluation/recall.py | 185 +++ mmdet3d/core/optimizer/__init__.py | 5 + mmdet3d/core/optimizer/builder.py | 135 ++ mmdet3d/core/optimizer/mix_optimizer.py | 99 ++ mmdet3d/core/optimizer/registry.py | 23 + mmdet3d/core/post_processing/__init__.py | 8 + mmdet3d/core/post_processing/bbox_nms.py | 68 + mmdet3d/core/post_processing/merge_augs.py | 101 ++ mmdet3d/core/utils/__init__.py | 11 + mmdet3d/core/utils/contextmanagers.py | 121 ++ mmdet3d/core/utils/dist_utils.py | 58 + mmdet3d/core/utils/kitti_utils.py 
| 69 + mmdet3d/core/utils/misc.py | 65 + mmdet3d/core/voxel/__init__.py | 4 + mmdet3d/core/voxel/builder.py | 14 + mmdet3d/core/voxel/voxel_generator.py | 207 +++ mmdet3d/datasets/__init__.py | 16 + mmdet3d/datasets/builder.py | 45 + mmdet3d/datasets/dataset_wrappers.py | 103 ++ mmdet3d/datasets/kitti2d_dataset.py | 143 +++ mmdet3d/datasets/kitti_dataset.py | 579 +++++++++ mmdet3d/datasets/loader/__init__.py | 4 + mmdet3d/datasets/loader/build_loader.py | 57 + mmdet3d/datasets/loader/sampler.py | 164 +++ mmdet3d/datasets/nuscenes2d_dataset.py | 38 + mmdet3d/datasets/nuscenes_dataset.py | 495 +++++++ mmdet3d/datasets/pipelines/__init__.py | 13 + .../datasets/pipelines/data_augment_utils.py | 326 +++++ mmdet3d/datasets/pipelines/dbsampler.py | 509 ++++++++ mmdet3d/datasets/pipelines/formating.py | 165 +++ mmdet3d/datasets/pipelines/loading.py | 143 +++ mmdet3d/datasets/pipelines/train_aug.py | 326 +++++ mmdet3d/datasets/registry.py | 3 + mmdet3d/datasets/utils.py | 37 + mmdet3d/models/__init__.py | 21 + mmdet3d/models/anchor_heads/__init__.py | 4 + mmdet3d/models/anchor_heads/boxvelo_head.py | 224 ++++ mmdet3d/models/anchor_heads/second_head.py | 405 ++++++ mmdet3d/models/anchor_heads/train_mixins.py | 245 ++++ mmdet3d/models/backbones/__init__.py | 4 + mmdet3d/models/backbones/second.py | 84 ++ mmdet3d/models/bbox_heads/__init__.py | 8 + mmdet3d/models/builder.py | 56 + mmdet3d/models/detectors/__init__.py | 14 + mmdet3d/models/detectors/base.py | 110 ++ mmdet3d/models/detectors/mvx_faster_rcnn.py | 103 ++ mmdet3d/models/detectors/mvx_single_stage.py | 330 +++++ mmdet3d/models/detectors/mvx_two_stage.py | 376 ++++++ mmdet3d/models/detectors/single_stage.py | 89 ++ mmdet3d/models/detectors/test_mixins.py | 266 ++++ mmdet3d/models/detectors/two_stage.py | 314 +++++ mmdet3d/models/detectors/voxelnet.py | 140 ++ mmdet3d/models/fusion_layers/__init__.py | 3 + mmdet3d/models/fusion_layers/point_fusion.py | 287 +++++ mmdet3d/models/losses/__init__.py | 3 + mmdet3d/models/middle_encoders/__init__.py | 4 + .../models/middle_encoders/pillar_scatter.py | 85 ++ .../models/middle_encoders/sparse_encoder.py | 215 ++++ mmdet3d/models/necks/__init__.py | 4 + mmdet3d/models/necks/second_fpn.py | 147 +++ mmdet3d/models/registry.py | 5 + mmdet3d/models/roi_extractors/__init__.py | 3 + mmdet3d/models/utils/__init__.py | 3 + mmdet3d/models/utils/weight_init.py | 46 + mmdet3d/models/voxel_encoders/__init__.py | 8 + .../models/voxel_encoders/pillar_encoder.py | 378 ++++++ mmdet3d/models/voxel_encoders/utils.py | 148 +++ .../models/voxel_encoders/voxel_encoder.py | 478 +++++++ mmdet3d/ops/__init__.py | 11 + mmdet3d/ops/iou3d/__init__.py | 4 + mmdet3d/ops/iou3d/iou3d_utils.py | 113 ++ mmdet3d/ops/iou3d/setup.py | 18 + mmdet3d/ops/iou3d/src/iou3d.cpp | 179 +++ mmdet3d/ops/iou3d/src/iou3d_kernel.cu | 381 ++++++ mmdet3d/ops/norm.py | 10 + mmdet3d/ops/spconv/__init__.py | 37 + mmdet3d/ops/spconv/conv.py | 446 +++++++ mmdet3d/ops/spconv/functional.py | 98 ++ mmdet3d/ops/spconv/include/paramsgrid.h | 62 + mmdet3d/ops/spconv/include/prettyprint.h | 445 +++++++ mmdet3d/ops/spconv/include/pybind11_utils.h | 61 + mmdet3d/ops/spconv/include/spconv/box_iou.h | 157 +++ .../spconv/include/spconv/fused_spconv_ops.h | 127 ++ mmdet3d/ops/spconv/include/spconv/geometry.h | 301 +++++ mmdet3d/ops/spconv/include/spconv/indice.cu.h | 243 ++++ mmdet3d/ops/spconv/include/spconv/indice.h | 79 ++ mmdet3d/ops/spconv/include/spconv/maxpool.h | 44 + mmdet3d/ops/spconv/include/spconv/mp_helper.h | 47 + mmdet3d/ops/spconv/include/spconv/nms.h | 
201 +++ .../ops/spconv/include/spconv/nms_functor.h | 42 + mmdet3d/ops/spconv/include/spconv/nms_gpu.h | 18 + mmdet3d/ops/spconv/include/spconv/nms_ops.h | 75 ++ .../ops/spconv/include/spconv/point2voxel.h | 414 ++++++ mmdet3d/ops/spconv/include/spconv/pool_ops.h | 97 ++ .../ops/spconv/include/spconv/reordering.cu.h | 161 +++ .../ops/spconv/include/spconv/reordering.h | 40 + .../ops/spconv/include/spconv/spconv_ops.h | 561 ++++++++ .../include/tensorview/helper_kernel.cu.h | 81 ++ .../spconv/include/tensorview/helper_launch.h | 21 + .../spconv/include/tensorview/tensorview.h | 1144 +++++++++++++++++ mmdet3d/ops/spconv/include/torch_utils.h | 70 + mmdet3d/ops/spconv/include/utility/timer.h | 54 + mmdet3d/ops/spconv/modules.py | 205 +++ mmdet3d/ops/spconv/ops.py | 183 +++ mmdet3d/ops/spconv/pool.py | 85 ++ mmdet3d/ops/spconv/src/all.cc | 51 + mmdet3d/ops/spconv/src/indice.cc | 89 ++ mmdet3d/ops/spconv/src/indice_cuda.cu | 158 +++ mmdet3d/ops/spconv/src/maxpool.cc | 82 ++ mmdet3d/ops/spconv/src/maxpool_cuda.cu | 471 +++++++ mmdet3d/ops/spconv/src/reordering.cc | 69 + mmdet3d/ops/spconv/src/reordering_cuda.cu | 155 +++ mmdet3d/ops/spconv/structure.py | 69 + mmdet3d/ops/spconv/test_utils.py | 193 +++ mmdet3d/ops/sync_bn.py | 110 ++ mmdet3d/ops/voxel/__init__.py | 4 + mmdet3d/ops/voxel/scatter_points.py | 129 ++ mmdet3d/ops/voxel/src/scatter_points_cpu.cpp | 131 ++ mmdet3d/ops/voxel/src/scatter_points_cuda.cu | 284 ++++ mmdet3d/ops/voxel/src/voxelization.cpp | 13 + mmdet3d/ops/voxel/src/voxelization.h | 113 ++ mmdet3d/ops/voxel/src/voxelization_cpu.cpp | 208 +++ mmdet3d/ops/voxel/src/voxelization_cuda.cu | 373 ++++++ mmdet3d/ops/voxel/voxelize.py | 122 ++ mmdet3d/utils/__init__.py | 8 + mmdet3d/utils/collect_env.py | 65 + requirements.txt | 4 + requirements/build.txt | 3 + requirements/optional.txt | 2 + requirements/runtime.txt | 9 + requirements/tests.txt | 12 + setup.py | 271 ++++ tests/test_config.py | 293 +++++ tools/create_data.py | 106 ++ tools/create_data.sh | 25 + tools/data_converter/__init__.py | 0 tools/data_converter/create_gt_database.py | 263 ++++ tools/data_converter/kitti_converter.py | 204 +++ tools/data_converter/kitti_data_utils.py | 355 +++++ tools/data_converter/nuscenes_converter.py | 503 ++++++++ tools/dist_train.sh | 9 + tools/publish_model.py | 35 + tools/slurm_test.sh | 22 + tools/slurm_train.sh | 23 + tools/test.py | 170 +++ tools/train.py | 149 +++ 214 files changed, 30129 insertions(+) create mode 100644 .gitignore create mode 100644 .gitlab-ci.yml create mode 100644 .isort.cfg create mode 100644 .pre-commit-config.yaml create mode 100644 .style.yapf create mode 100644 .travis.yml create mode 100644 README.md create mode 100644 configs/kitti/dv_mvx-v2_second_secfpn_fpn-fusion_adamw_2x8_80e_kitti-3d-3class.py create mode 100644 configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py create mode 100644 configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py create mode 100644 configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py create mode 100644 configs/kitti/faster_rcnn_r50_fpn_caffe_1x_kitti-2d-3class_coco-3x-pretrain.py create mode 100644 configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py create mode 100644 configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py create mode 100644 configs/nus/faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py create mode 100644 configs/nus/hv_pointpillars_secfpn_sbn-all_4x8_20e_nus-3d.py create mode 100644 configs/nus/hv_pointpillars_secfpn_sbn-all_freeze_adamw_1x16_20e_nus-mm.py create mode 100644 
configs/nus/retinanet_r50_fpn_caffe_2x8_1x_nus.py create mode 100644 docs/CHANGELOG.md create mode 100644 docs/CODE_OF_CONDUCT.md create mode 100644 docs/CONTRIBUTING.md create mode 100644 docs/GETTING_STARTED.md create mode 100644 docs/INSTALL.md create mode 100644 docs/MODEL_ZOO.md create mode 100644 docs/Makefile create mode 100644 docs/ROBUSTNESS_BENCHMARKING.md create mode 100644 docs/TECHNICAL_DETAILS.md create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 docs/requirements.txt create mode 100644 mmdet3d/__init__.py create mode 100644 mmdet3d/apis/__init__.py create mode 100644 mmdet3d/apis/train.py create mode 100644 mmdet3d/core/__init__.py create mode 100644 mmdet3d/core/anchor/__init__.py create mode 100644 mmdet3d/core/anchor/anchor_generator.py create mode 100644 mmdet3d/core/bbox/__init__.py create mode 100644 mmdet3d/core/bbox/assign_sampling.py create mode 100644 mmdet3d/core/bbox/assigners/__init__.py create mode 100644 mmdet3d/core/bbox/assigners/approx_max_iou_assigner.py create mode 100644 mmdet3d/core/bbox/assigners/assign_result.py create mode 100644 mmdet3d/core/bbox/assigners/base_assigner.py create mode 100644 mmdet3d/core/bbox/assigners/max_iou_assigner.py create mode 100644 mmdet3d/core/bbox/box_np_ops.py create mode 100644 mmdet3d/core/bbox/box_torch_ops.py create mode 100644 mmdet3d/core/bbox/coders/__init__.py create mode 100644 mmdet3d/core/bbox/coders/box_coder.py create mode 100644 mmdet3d/core/bbox/geometry.py create mode 100644 mmdet3d/core/bbox/samplers/__init__.py create mode 100644 mmdet3d/core/bbox/samplers/base_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/combined_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/instance_balanced_pos_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/iou_balanced_neg_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/ohem_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/pseudo_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/random_sampler.py create mode 100644 mmdet3d/core/bbox/samplers/sampling_result.py create mode 100644 mmdet3d/core/bbox/transforms.py create mode 100644 mmdet3d/core/evaluation/__init__.py create mode 100644 mmdet3d/core/evaluation/bbox_overlaps.py create mode 100644 mmdet3d/core/evaluation/class_names.py create mode 100644 mmdet3d/core/evaluation/coco_utils.py create mode 100644 mmdet3d/core/evaluation/eval_hooks.py create mode 100644 mmdet3d/core/evaluation/kitti_utils/__init__.py create mode 100644 mmdet3d/core/evaluation/kitti_utils/eval.py create mode 100644 mmdet3d/core/evaluation/kitti_utils/rotate_iou.py create mode 100644 mmdet3d/core/evaluation/mean_ap.py create mode 100644 mmdet3d/core/evaluation/recall.py create mode 100644 mmdet3d/core/optimizer/__init__.py create mode 100644 mmdet3d/core/optimizer/builder.py create mode 100644 mmdet3d/core/optimizer/mix_optimizer.py create mode 100644 mmdet3d/core/optimizer/registry.py create mode 100644 mmdet3d/core/post_processing/__init__.py create mode 100644 mmdet3d/core/post_processing/bbox_nms.py create mode 100644 mmdet3d/core/post_processing/merge_augs.py create mode 100644 mmdet3d/core/utils/__init__.py create mode 100644 mmdet3d/core/utils/contextmanagers.py create mode 100644 mmdet3d/core/utils/dist_utils.py create mode 100644 mmdet3d/core/utils/kitti_utils.py create mode 100644 mmdet3d/core/utils/misc.py create mode 100644 mmdet3d/core/voxel/__init__.py create mode 100644 mmdet3d/core/voxel/builder.py create mode 100644 
mmdet3d/core/voxel/voxel_generator.py create mode 100644 mmdet3d/datasets/__init__.py create mode 100644 mmdet3d/datasets/builder.py create mode 100644 mmdet3d/datasets/dataset_wrappers.py create mode 100644 mmdet3d/datasets/kitti2d_dataset.py create mode 100644 mmdet3d/datasets/kitti_dataset.py create mode 100644 mmdet3d/datasets/loader/__init__.py create mode 100644 mmdet3d/datasets/loader/build_loader.py create mode 100644 mmdet3d/datasets/loader/sampler.py create mode 100644 mmdet3d/datasets/nuscenes2d_dataset.py create mode 100644 mmdet3d/datasets/nuscenes_dataset.py create mode 100644 mmdet3d/datasets/pipelines/__init__.py create mode 100644 mmdet3d/datasets/pipelines/data_augment_utils.py create mode 100644 mmdet3d/datasets/pipelines/dbsampler.py create mode 100644 mmdet3d/datasets/pipelines/formating.py create mode 100644 mmdet3d/datasets/pipelines/loading.py create mode 100644 mmdet3d/datasets/pipelines/train_aug.py create mode 100644 mmdet3d/datasets/registry.py create mode 100644 mmdet3d/datasets/utils.py create mode 100644 mmdet3d/models/__init__.py create mode 100644 mmdet3d/models/anchor_heads/__init__.py create mode 100644 mmdet3d/models/anchor_heads/boxvelo_head.py create mode 100644 mmdet3d/models/anchor_heads/second_head.py create mode 100644 mmdet3d/models/anchor_heads/train_mixins.py create mode 100644 mmdet3d/models/backbones/__init__.py create mode 100644 mmdet3d/models/backbones/second.py create mode 100644 mmdet3d/models/bbox_heads/__init__.py create mode 100644 mmdet3d/models/builder.py create mode 100644 mmdet3d/models/detectors/__init__.py create mode 100644 mmdet3d/models/detectors/base.py create mode 100644 mmdet3d/models/detectors/mvx_faster_rcnn.py create mode 100644 mmdet3d/models/detectors/mvx_single_stage.py create mode 100644 mmdet3d/models/detectors/mvx_two_stage.py create mode 100644 mmdet3d/models/detectors/single_stage.py create mode 100644 mmdet3d/models/detectors/test_mixins.py create mode 100644 mmdet3d/models/detectors/two_stage.py create mode 100644 mmdet3d/models/detectors/voxelnet.py create mode 100644 mmdet3d/models/fusion_layers/__init__.py create mode 100644 mmdet3d/models/fusion_layers/point_fusion.py create mode 100644 mmdet3d/models/losses/__init__.py create mode 100644 mmdet3d/models/middle_encoders/__init__.py create mode 100644 mmdet3d/models/middle_encoders/pillar_scatter.py create mode 100644 mmdet3d/models/middle_encoders/sparse_encoder.py create mode 100644 mmdet3d/models/necks/__init__.py create mode 100644 mmdet3d/models/necks/second_fpn.py create mode 100644 mmdet3d/models/registry.py create mode 100644 mmdet3d/models/roi_extractors/__init__.py create mode 100644 mmdet3d/models/utils/__init__.py create mode 100644 mmdet3d/models/utils/weight_init.py create mode 100644 mmdet3d/models/voxel_encoders/__init__.py create mode 100644 mmdet3d/models/voxel_encoders/pillar_encoder.py create mode 100644 mmdet3d/models/voxel_encoders/utils.py create mode 100644 mmdet3d/models/voxel_encoders/voxel_encoder.py create mode 100644 mmdet3d/ops/__init__.py create mode 100644 mmdet3d/ops/iou3d/__init__.py create mode 100644 mmdet3d/ops/iou3d/iou3d_utils.py create mode 100644 mmdet3d/ops/iou3d/setup.py create mode 100644 mmdet3d/ops/iou3d/src/iou3d.cpp create mode 100644 mmdet3d/ops/iou3d/src/iou3d_kernel.cu create mode 100644 mmdet3d/ops/norm.py create mode 100644 mmdet3d/ops/spconv/__init__.py create mode 100644 mmdet3d/ops/spconv/conv.py create mode 100644 mmdet3d/ops/spconv/functional.py create mode 100644 
mmdet3d/ops/spconv/include/paramsgrid.h create mode 100644 mmdet3d/ops/spconv/include/prettyprint.h create mode 100644 mmdet3d/ops/spconv/include/pybind11_utils.h create mode 100644 mmdet3d/ops/spconv/include/spconv/box_iou.h create mode 100644 mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h create mode 100644 mmdet3d/ops/spconv/include/spconv/geometry.h create mode 100644 mmdet3d/ops/spconv/include/spconv/indice.cu.h create mode 100644 mmdet3d/ops/spconv/include/spconv/indice.h create mode 100644 mmdet3d/ops/spconv/include/spconv/maxpool.h create mode 100644 mmdet3d/ops/spconv/include/spconv/mp_helper.h create mode 100644 mmdet3d/ops/spconv/include/spconv/nms.h create mode 100644 mmdet3d/ops/spconv/include/spconv/nms_functor.h create mode 100644 mmdet3d/ops/spconv/include/spconv/nms_gpu.h create mode 100644 mmdet3d/ops/spconv/include/spconv/nms_ops.h create mode 100644 mmdet3d/ops/spconv/include/spconv/point2voxel.h create mode 100644 mmdet3d/ops/spconv/include/spconv/pool_ops.h create mode 100644 mmdet3d/ops/spconv/include/spconv/reordering.cu.h create mode 100644 mmdet3d/ops/spconv/include/spconv/reordering.h create mode 100644 mmdet3d/ops/spconv/include/spconv/spconv_ops.h create mode 100644 mmdet3d/ops/spconv/include/tensorview/helper_kernel.cu.h create mode 100644 mmdet3d/ops/spconv/include/tensorview/helper_launch.h create mode 100644 mmdet3d/ops/spconv/include/tensorview/tensorview.h create mode 100644 mmdet3d/ops/spconv/include/torch_utils.h create mode 100644 mmdet3d/ops/spconv/include/utility/timer.h create mode 100644 mmdet3d/ops/spconv/modules.py create mode 100644 mmdet3d/ops/spconv/ops.py create mode 100644 mmdet3d/ops/spconv/pool.py create mode 100644 mmdet3d/ops/spconv/src/all.cc create mode 100644 mmdet3d/ops/spconv/src/indice.cc create mode 100644 mmdet3d/ops/spconv/src/indice_cuda.cu create mode 100644 mmdet3d/ops/spconv/src/maxpool.cc create mode 100644 mmdet3d/ops/spconv/src/maxpool_cuda.cu create mode 100644 mmdet3d/ops/spconv/src/reordering.cc create mode 100644 mmdet3d/ops/spconv/src/reordering_cuda.cu create mode 100644 mmdet3d/ops/spconv/structure.py create mode 100644 mmdet3d/ops/spconv/test_utils.py create mode 100644 mmdet3d/ops/sync_bn.py create mode 100644 mmdet3d/ops/voxel/__init__.py create mode 100644 mmdet3d/ops/voxel/scatter_points.py create mode 100644 mmdet3d/ops/voxel/src/scatter_points_cpu.cpp create mode 100644 mmdet3d/ops/voxel/src/scatter_points_cuda.cu create mode 100644 mmdet3d/ops/voxel/src/voxelization.cpp create mode 100644 mmdet3d/ops/voxel/src/voxelization.h create mode 100644 mmdet3d/ops/voxel/src/voxelization_cpu.cpp create mode 100644 mmdet3d/ops/voxel/src/voxelization_cuda.cu create mode 100644 mmdet3d/ops/voxel/voxelize.py create mode 100644 mmdet3d/utils/__init__.py create mode 100644 mmdet3d/utils/collect_env.py create mode 100644 requirements.txt create mode 100644 requirements/build.txt create mode 100644 requirements/optional.txt create mode 100644 requirements/runtime.txt create mode 100644 requirements/tests.txt create mode 100644 setup.py create mode 100644 tests/test_config.py create mode 100644 tools/create_data.py create mode 100644 tools/create_data.sh create mode 100644 tools/data_converter/__init__.py create mode 100644 tools/data_converter/create_gt_database.py create mode 100644 tools/data_converter/kitti_converter.py create mode 100644 tools/data_converter/kitti_data_utils.py create mode 100644 tools/data_converter/nuscenes_converter.py create mode 100644 tools/dist_train.sh create mode 100644 
tools/publish_model.py create mode 100755 tools/slurm_test.sh create mode 100755 tools/slurm_train.sh create mode 100644 tools/test.py create mode 100644 tools/train.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..4de053d75f --- /dev/null +++ b/.gitignore @@ -0,0 +1,127 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +*.ipynb + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# cython generated cpp +mmdet3d/ops/nms/src/soft_nms_cpu.cpp +mmdet3d/version.py +data +.vscode +.idea + +# custom +*.pkl +*.pkl.json +*.log.json +work_dirs/ +exps/ +*~ + +# Pytorch +*.pth + +# demo +*.jpg +*.png diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000..6595452b13 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,43 @@ +variables: + PYTORCH_IMAGE: registry.sensetime.com/eig-research/pytorch:pytorch1.3.1-cuda10.1-devel + +stages: + - linting + - test + +before_script: + - echo $PATH + - gcc --version + - nvcc --version + - python --version + - pip --version + - python -c "import torch; print(torch.__version__)" + +.linting_template: &linting_template_def + stage: linting + script: + - pip install flake8 yapf isort + - flake8 . + - isort -rc --check-only --diff mmdet3d/ tools/ tests/ + - yapf -r -d mmdet3d/ tools/ tests/ configs/ + +.test_template: &test_template_def + stage: test + script: + - echo "Start building..." + - conda install av -c conda-forge -y + - pip install git+https://github.com/open-mmlab/mmdetection.git@v2.0 + - python -c "import mmdet; print(mmdet.__version__)" + - pip install -v -e .[all] + - python -c "import mmdet3d; print(mmdet3d.__version__)" + - echo "Start testing..." 
+ - coverage run --branch --source mmdet3d -m pytest tests/ + - coverage report -m + +linting:pytorch1.3-cuda10: + image: $PYTORCH_IMAGE + <<: *linting_template_def + +test:pytorch1.3-cuda10: + image: $PYTORCH_IMAGE + <<: *test_template_def diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000000..09a0e57266 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,8 @@ +[isort] +line_length = 79 +multi_line_output = 0 +known_standard_library = setuptools +known_first_party = mmdet,mmdet3d +known_third_party = Cython,cv2,mmcv,numba,numpy,nuscenes,pycocotools,pyquaternion,scipy,shapely,six,skimage,terminaltables,torch,torchvision +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..8362bc545d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +repos: + - repo: https://gitlab.com/pycqa/flake8.git + rev: 3.7.9 + hooks: + - id: flake8 + - repo: https://github.com/asottile/seed-isort-config + rev: v2.1.0 + hooks: + - id: seed-isort-config + - repo: https://github.com/timothycrosley/isort + rev: 4.3.21 + hooks: + - id: isort + - repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.29.0 + hooks: + - id: yapf + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.5.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: fix-encoding-pragma + args: ["--remove"] diff --git a/.style.yapf b/.style.yapf new file mode 100644 index 0000000000..286a3f1d7a --- /dev/null +++ b/.style.yapf @@ -0,0 +1,4 @@ +[style] +BASED_ON_STYLE = pep8 +BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true +SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000..68f49ccc07 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,43 @@ +dist: bionic # ubuntu 18.04 +language: python + +python: + - "3.5" + - "3.6" + - "3.7" + +env: CUDA=10.1.105-1 CUDA_SHORT=10.1 UBUNTU_VERSION=ubuntu1804 FORCE_CUDA=1 +cache: pip + +# Ref to CUDA installation in Travis: https://github.com/jeremad/cuda-travis +before_install: + - INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb + - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER} + - sudo dpkg -i ${INSTALLER} + - wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub + - sudo apt-key add 7fa2af80.pub + - sudo apt update -qq + - sudo apt install -y cuda-${CUDA_SHORT/./-} cuda-cufft-dev-${CUDA_SHORT/./-} + - sudo apt clean + - CUDA_HOME=/usr/local/cuda-${CUDA_SHORT} + - LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${CUDA_HOME}/include:${LD_LIBRARY_PATH} + - PATH=${CUDA_HOME}/bin:${PATH} + +install: + - pip install Pillow==6.2.2 # remove this line when torchvision>=0.5 + - pip install torch==1.2 torchvision==0.4.0 # TODO: fix CI for pytorch>1.2 + - pip install "git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI" + - pip install -r requirements.txt + +before_script: + - flake8 . 
+  - isort -rc --check-only --diff mmdet3d/ tools/ tests/
+  - yapf -r -d --style .style.yapf mmdet3d/ tools/ tests/ configs/
+
+script:
+  - python setup.py check -m -s
+  - python setup.py build_ext --inplace
+  - coverage run --source mmdet3d -m py.test -v --xdoctest-modules tests mmdet3d
+
+after_success:
+  - coverage report
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000..40e35a9e7a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,58 @@
+
+# MMDetection3D
+
+
+## Introduction
+
+The master branch works with **PyTorch 1.1** or higher.
+
+MMDetection3D is an open source 3D object detection toolbox based on PyTorch. It is
+part of the open-mmlab project developed by [Multimedia Laboratory, CUHK](http://mmlab.ie.cuhk.edu.hk/).
+
+
+### Major features
+
+
+
+## License
+
+This project is released under the [Apache 2.0 license](LICENSE).
+
+## Updates
+
+
+v0.0.1 (07/08/2019)
+- The project is initiated.
+
+## Benchmark and model zoo
+
+Supported methods and backbones are shown in the table below.
+Results and models are available in the [Model zoo](MODEL_ZOO.md).
+
+
+## Installation
+
+Please refer to [INSTALL.md](INSTALL.md) for installation and dataset preparation.
+
+
+## Get Started
+
+Please see [GETTING_STARTED.md](GETTING_STARTED.md) for the basic usage of MMDetection3D.
+
+## Contributing
+
+We appreciate all contributions to improve MMDetection3D. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guidelines.
+
+## Acknowledgement
+
+MMDetection3D is an open source project contributed to by researchers and engineers from various colleges and companies. We appreciate all the contributors who implement their methods or add new features, as well as users who give valuable feedback.
+We hope that the toolbox and benchmark can serve the growing research community by providing a flexible toolkit to reimplement existing methods and to develop new detectors.
+
+
+## Citation
+
+
+
+## Contact
+
+This repo is currently maintained by Wenwei Zhang ([@ZwwWayne](http://github.com/ZwwWayne)).
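The configuration files added below are plain Python modules that are parsed with mmcv's `Config` utility. As a rough illustration of how such a config is consumed, here is a minimal sketch (assuming `mmcv` is installed and the command is run from the repository root; the printed fields and the override are illustrative only, not part of this commit):

```python
# Minimal sketch: load one of the configs added in this commit with mmcv.
# Assumes mmcv is installed and the working directory is the repository root;
# the override at the end is only an example of adjusting a field in code.
from mmcv import Config

cfg = Config.fromfile(
    'configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py')

# Config exposes the nested dicts of the file as attributes.
print(cfg.model.type)            # 'VoxelNet' for this config
print(cfg.data.samples_per_gpu)  # per-GPU batch size used by the data loaders
print(cfg.total_epochs)          # length of the training schedule

# Fields can be overridden before the model and datasets are built,
# e.g. to run a quick small-batch debug pass.
cfg.data.samples_per_gpu = 1
```

The `tools/train.py` and `tools/test.py` scripts added in this commit load the config file passed on the command line in the same way before building the model and datasets from its fields.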
diff --git a/configs/kitti/dv_mvx-v2_second_secfpn_fpn-fusion_adamw_2x8_80e_kitti-3d-3class.py b/configs/kitti/dv_mvx-v2_second_secfpn_fpn-fusion_adamw_2x8_80e_kitti-3d-3class.py new file mode 100644 index 0000000000..79e2d6f837 --- /dev/null +++ b/configs/kitti/dv_mvx-v2_second_secfpn_fpn-fusion_adamw_2x8_80e_kitti-3d-3class.py @@ -0,0 +1,283 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z + +model = dict( + type='DynamicMVXFasterRCNNV2', + pretrained=('./pretrain_detectron/' + 'ImageNetPretrained/MSRA/resnet50_msra.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + pts_voxel_layer=dict( + max_num_points=-1, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(-1, -1), # (training, testing) max_coxels + ), + pts_voxel_encoder=dict( + type='DynamicVFE', + num_input_features=4, + num_filters=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=point_cloud_range, + fusion_layer=dict( + type='PointFusion', + img_channels=256, + pts_channels=64, + mid_channels=128, + out_channels=128, + img_levels=[0, 1, 2, 3, 4], + align_corners=False, + activate_out=True, + fuse_out=False), + ), + pts_middle_encoder=dict( + type='SparseEncoder', + in_channels=128, + output_shape=[41, 1600, 1408], # checked from PointCloud3D + pre_act=False, + ), + pts_backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + num_filters=[128, 256], + ), + pts_neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + num_upsample_filters=[256, 256], + ), + pts_bbox_head=dict( + type='SECONDHead', + class_name=['Pedestrian', 'Cyclist', 'Car'], + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[ + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78], + ], + assigner_per_size=True, + anchor_strides=[2], + anchor_sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + assign_per_class=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + )) +# model training and testing settings +train_cfg = dict( + pts=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False)) +test_cfg = dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + 
post_center_limit_range=[0, -40, -3, 70.4, 40, 0.0], + ), ) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=True, +) +db_sampler = dict( + type='MMDataBaseSampler', + root_path=data_root, + info_path=data_root + 'kitti_mm_dbinfos_train.pkl', + rate=1.0, + object_rot_range=[0.0, 0.0], + blending_type=['box', 'gaussian', 'poisson'], + depth_consistent=True, + check_2D_collision=True, + collision_thr=[0, 0.3, 0.5, 0.7], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + Car=5, + Pedestrian=10, + Cyclist=10, + ), + ), + sample_groups=dict( + Car=12, + Pedestrian=6, + Cyclist=6, + ), +) +train_pipeline = [ + dict( + type='Resize', + img_scale=[(640, 192), (2560, 768)], + multiscale_mode='range', + keep_ratio=True), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05], + trans_normal_noise=[0.2, 0.2, 0.2]), + dict(type='RandomFlip3D', flip_ratio=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'points', 'img', 'gt_bboxes_3d', 'gt_bboxes', 'gt_labels', + 'gt_labels_3d' + ]), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='Resize', + img_scale=[ + (1280, 384), + ], + multiscale_mode='value', + keep_ratio=True), + dict( + type='GlobalRotScale', + rot_uniform_noise=[0, 0], + scaling_uniform_noise=[1, 1]), + dict(type='RandomFlip3D', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'img']), +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True, + test_mode=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True, + test_mode=True)) +# Training settings +optimizer = dict(type='AdamW', lr=0.003, betas=(0.95, 0.99), weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='cosine', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 10, + target_lr=1e-5, + as_ratio=True, +) +momentum_config = None +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable 
+evaluation = dict(interval=1) +# runtime settings +total_epochs = 80 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/sec_secfpn_80e' +load_from = './pretrain_mmdet/mvx_faster_rcnn_r50_fpn_detectron2-caffe_freezeBN_l1-loss_roialign-v2_1x_coco-3-class_44.7_20200205-b1c1533f.pth' # noqa +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py b/configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py new file mode 100644 index 0000000000..43e7b0bbd8 --- /dev/null +++ b/configs/kitti/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py @@ -0,0 +1,203 @@ +# model settings +voxel_size = [0.16, 0.16, 4] +point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1] + +model = dict( + type='DynamicVoxelNet', + voxel_layer=dict( + max_num_points=-1, # set -1 for dynamic voxel + point_cloud_range=point_cloud_range, # velodyne coordinates, x, y, z + voxel_size=voxel_size, + max_voxels=(-1, -1), # set -1 for dynamic voxel + ), + voxel_encoder=dict( + type='DynamicPillarFeatureNet', + num_input_features=4, + num_filters=[64], + with_distance=False, + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + ), + middle_encoder=dict( + type='PointPillarsScatter', + in_channels=64, + output_shape=[496, 432], + ), + backbone=dict( + type='SECOND', + in_channels=64, + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + num_filters=[64, 128, 256], + ), + neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[128, 128, 128], + ), + bbox_head=dict( + type='SECONDHead', + class_name=['Car'], + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + allowed_border=0, + pos_weight=-1, + debug=False) +test_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + post_center_limit_range=point_cloud_range, + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) +) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5), + ), + sample_groups=dict(Car=15), +) + +train_pipeline = [ + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + loc_noise_std=[0.25, 0.25, 0.25], + global_rot_range=[0.0, 0.0], + 
rot_uniform_noise=[-0.15707963267, 0.15707963267]), + dict(type='PointsRandomFlip', flip_ratio=0.5), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'gt_bboxes']), +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True)) +# optimizer +lr = 0.001 # max learning rate +optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='cyclic', + target_ratio=[10, 1e-4], + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=[0.85 / 0.95, 1], + cyclic_times=1, + step_ratio_up=0.4, +) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 160 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/pp_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py b/configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py new file mode 100644 index 0000000000..2ae9164c7b --- /dev/null +++ b/configs/kitti/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py @@ -0,0 +1,231 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z + +model = dict( + type='DynamicVoxelNet', + voxel_layer=dict( + max_num_points=-1, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(-1, -1), # (training, testing) max_coxels + ), + voxel_encoder=dict( + type='DynamicVFEV3', + num_input_features=4, + voxel_size=voxel_size, + point_cloud_range=point_cloud_range), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + output_shape=[41, 1600, 1408], + pre_act=False, + ), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + num_filters=[128, 256], + ), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + num_upsample_filters=[256, 256], + ), + bbox_head=dict( + type='SECONDHead', + 
class_name=['Pedestrian', 'Cyclist', 'Car'], + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[ + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78], + ], + anchor_strides=[2], + anchor_sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + assigner_per_size=True, + assign_per_class=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False) +test_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + post_center_limit_range=[0, -40, -3, 70.4, 40, 0.0], + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) +) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + Car=5, + Pedestrian=10, + Cyclist=10, + ), + ), + sample_groups=dict( + Car=12, + Pedestrian=6, + Cyclist=6, + ), +) +train_pipeline = [ + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + loc_noise_std=[0, 0, 0], + global_rot_range=[0.0, 0.0], + rot_uniform_noise=[-0.39269908, 0.39269908]), + dict(type='RandomFlip3D', flip_ratio=0.5), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05], + trans_normal_noise=[0.2, 0.2, 0.2]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d']), +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + 
pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True)) +# optimizer +lr = 0.003 # max learning rate +optimizer = dict( + type='AdamW', + lr=lr, + betas=(0.95, 0.99), # the momentum is change during training + weight_decay=0.001) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +lr_config = dict( + policy='cosine', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 10, + target_lr=1e-5, + as_ratio=True, +) +momentum_config = None +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 80 +dist_params = dict(backend='nccl', port=29502) +log_level = 'INFO' +work_dir = './work_dirs/sec_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py b/configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py new file mode 100644 index 0000000000..d0d8fed3a9 --- /dev/null +++ b/configs/kitti/dv_second_secfpn_6x8_80e_kitti-3d-car.py @@ -0,0 +1,199 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z + +model = dict( + type='DynamicVoxelNet', + voxel_layer=dict( + max_num_points=-1, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(-1, -1), # (training, testing) max_coxels + ), + voxel_encoder=dict( + type='DynamicVFEV3', + num_input_features=4, + voxel_size=voxel_size, + point_cloud_range=point_cloud_range), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + output_shape=[41, 1600, 1408], # checked from PointCloud3D + pre_act=False, + ), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + num_filters=[128, 256], + ), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + num_upsample_filters=[256, 256], + ), + bbox_head=dict( + type='SECONDHead', + class_name=['Car'], + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[0, -40.0, -1.78, 70.4, 40.0, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + allowed_border=0, + pos_weight=-1, + debug=False) +test_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + post_center_limit_range=[0, 
-40, -3, 70.4, 40, 0.0], + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) +) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5), + ), + sample_groups=dict(Car=15), +) +train_pipeline = [ + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + loc_noise_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_uniform_noise=[-0.78539816, 0.78539816]), + dict(type='PointsRandomFlip', flip_ratio=0.5), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'gt_bboxes']), +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True)) +# optimizer +lr = 0.0018 # max learning rate +optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +lr_config = dict( + policy='cyclic', + target_ratio=[10, 1e-4], + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=[0.85 / 0.95, 1], + cyclic_times=1, + step_ratio_up=0.4, +) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 80 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/sec_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/faster_rcnn_r50_fpn_caffe_1x_kitti-2d-3class_coco-3x-pretrain.py b/configs/kitti/faster_rcnn_r50_fpn_caffe_1x_kitti-2d-3class_coco-3x-pretrain.py new file mode 100644 index 0000000000..fa09f66a11 --- /dev/null +++ b/configs/kitti/faster_rcnn_r50_fpn_caffe_1x_kitti-2d-3class_coco-3x-pretrain.py @@ -0,0 
+1,194 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='FasterRCNN', + pretrained=('./pretrain_detectron/' + 'ImageNetPretrained/MSRA/resnet50_msra.pth'), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[1 / 3, 0.5, 1.0, 2.0, 3.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.0, loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + # following the setting of detectron, + # which improves ~0.2 bbox mAP. + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) +) +# dataset settings +dataset_type = 'Kitti2DDataset' +data_root = 'data/kitti/' +class_names = ['Car', 'Pedestrian', 'Cyclist'] +# Values to be used for image normalization (BGR order) +# Default mean pixel value from ImageNet: [103.53, 116.28, 123.675] +# When using pre-trained models in Detectron1 or any MSRA models, +# std has been absorbed into its conv1 weights, so the std needs to be set 1. 
+img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=False), + dict( + type='Resize', + img_scale=[(640, 192), (2560, 768)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1280, 384), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + class_names=class_names, + ann_file='kitti_infos_train.pkl', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + data_root=data_root, + class_names=class_names, + ann_file='kitti_infos_val.pkl', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + data_root=data_root, + class_names=class_names, + ann_file='kitti_infos_val.pkl', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +evaluation = dict(interval=1) +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl', port=29501) +log_level = 'INFO' +work_dir = './work_dirs/faster_rcnn_r50_fpn_1x' +load_from = './pretrain_mmdet/faster_r50_fpn_detectron2-caffe_freezeBN_l1-loss_roialign-v2_3x-4767dd8e.pth' # noqa +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py b/configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py new file mode 100644 index 0000000000..946620b0a3 --- /dev/null +++ b/configs/kitti/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py @@ -0,0 +1,204 @@ +# model settings +point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1] +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=64, # max_points_per_voxel + point_cloud_range=point_cloud_range, # velodyne coordinates, x, y, z + voxel_size=[0.16, 0.16, 4], + max_voxels=(12000, 20000), # (training, testing) max_coxels + ), + voxel_encoder=dict( + type='PillarFeatureNet', + num_input_features=4, + num_filters=[64], + with_distance=False, + # these two arguments should be consistent with the voxel_generator + voxel_size=[0.16, 0.16, 4], + point_cloud_range=point_cloud_range, + ), + middle_encoder=dict( + type='PointPillarsScatter', + in_channels=64, + output_shape=[496, 432], + ), + backbone=dict( + type='SECOND', + in_channels=64, + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + num_filters=[64, 128, 256], + ), + neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[128, 128, 128], + ), + bbox_head=dict( + type='SECONDHead', 
+ class_name=['Car'], + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + allowed_border=0, + pos_weight=-1, + debug=False) +test_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + post_center_limit_range=point_cloud_range, +) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5), + ), + sample_groups=dict(Car=15), +) + +train_pipeline = [ + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + loc_noise_std=[0.25, 0.25, 0.25], + global_rot_range=[0.0, 0.0], + rot_uniform_noise=[-0.15707963267, 0.15707963267]), + dict(type='PointsRandomFlip', flip_ratio=0.5), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'gt_bboxes']), +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True)) +# optimizer +lr = 0.001 # max learning rate +optimizer = dict( + type='AdamW', + lr=lr, + betas=(0.95, 0.99), # the momentum is change during training + weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) 
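+# Note on the cyclic schedules below (assuming mmcv's cyclic LR/momentum hooks): with
+# target_ratio=[10, 1e-4] the learning rate is expected to climb towards 10x the base lr
+# during the first 40% of iterations (step_ratio_up=0.4) and then anneal towards 1e-4x of
+# it, while the optimizer momentum is cycled in the opposite direction.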
+# learning policy +lr_config = dict( + policy='cyclic', + target_ratio=[10, 1e-4], + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=[0.85 / 0.95, 1], + cyclic_times=1, + step_ratio_up=0.4, +) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 160 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/pp_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py b/configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py new file mode 100644 index 0000000000..c616a86a98 --- /dev/null +++ b/configs/kitti/hv_second_secfpn_6x8_80e_kitti-3d-car.py @@ -0,0 +1,197 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=5, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(16000, 40000), # (training, testing) max_coxels + ), + voxel_encoder=dict( + type='VoxelFeatureExtractorV3', + num_input_features=4, + num_filters=[4], + with_distance=False), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + output_shape=[41, 1600, 1408], # checked from PointCloud3D + pre_act=False, + ), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + num_filters=[128, 256], + ), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + num_upsample_filters=[256, 256], + ), + bbox_head=dict( + type='SECONDHead', + class_name=['Car'], + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[0, -40.0, -1.78, 70.4, 40.0, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + diff_rad_by_sin=True, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + allowed_border=0, + pos_weight=-1, + debug=False) +test_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.3, + min_bbox_size=0, + post_center_limit_range=[0, -40, -3, 70.4, 40, 0.0], +) + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5), + ), + sample_groups=dict(Car=15), +) +train_pipeline = [ + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + 
num_try=100, + loc_noise_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_uniform_noise=[-0.78539816, 0.78539816]), + dict(type='PointsRandomFlip', flip_ratio=0.5), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'gt_bboxes']), +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + training=True, + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='testing', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True)) +# optimizer +lr = 0.001 # max learning rate +optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +lr_config = dict( + policy='cyclic', + target_ratio=[10, 1e-4], + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=[0.85 / 0.95, 1], + cyclic_times=1, + step_ratio_up=0.4, +) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 80 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/sec_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/nus/faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py b/configs/nus/faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py new file mode 100644 index 0000000000..42757071a4 --- /dev/null +++ b/configs/nus/faster_rcnn_r50_fpn_caffe_2x8_1x_nus.py @@ -0,0 +1,187 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='FasterRCNN', + pretrained=('./pretrain_detectron/' + 'ImageNetPretrained/MSRA/resnet50_msra.pth'), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.0, loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + 
type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + # following the setting of detectron, + # which improves ~0.2 bbox mAP. + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) +) +# dataset settings +dataset_type = 'NuScenes2DDataset' +data_root = 'data/nuscenes/' +# Values to be used for image normalization (BGR order) +# Default mean pixel values are from ImageNet: [103.53, 116.28, 123.675] +# When using pre-trained models in Detectron1 or any MSRA models, +# std has been absorbed into its conv1 weights, so the std needs to be set 1. 
+img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=False), + dict( + type='Resize', + img_scale=[(1200, 720), (1920, 1080)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1600, 900), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_train.coco.json', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.coco.json', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.coco.json', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +evaluation = dict(interval=1) +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl', port=29501) +log_level = 'INFO' +work_dir = './work_dirs/faster_rcnn_r50_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/nus/hv_pointpillars_secfpn_sbn-all_4x8_20e_nus-3d.py b/configs/nus/hv_pointpillars_secfpn_sbn-all_4x8_20e_nus-3d.py new file mode 100644 index 0000000000..34ccedb7eb --- /dev/null +++ b/configs/nus/hv_pointpillars_secfpn_sbn-all_4x8_20e_nus-3d.py @@ -0,0 +1,236 @@ +# model settings +voxel_size = [0.25, 0.25, 8] +point_cloud_range = [-50, -50, -5, 50, 50, 3] +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +model = dict( + type='MVXFasterRCNNV2', + pts_voxel_layer=dict( + max_num_points=64, # max_points_per_voxel + point_cloud_range=point_cloud_range, # velodyne coordinates, x, y, z + voxel_size=voxel_size, + max_voxels=(30000, 40000), # (training, testing) max_coxels + ), + pts_voxel_encoder=dict( + type='HardVFE', + num_input_features=4, + num_filters=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=point_cloud_range, + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', + in_channels=64, + output_shape=[400, 400], # checked from PointCloud3D + ), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + num_filters=[64, 128, 256], + ), + pts_neck=dict( + type='SECONDFPN', + 
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[128, 128, 128], + ), + pts_bbox_head=dict( + type='Anchor3DVeloHead', + class_names=class_names, + num_classes=10, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_range=[ + [-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795], # car + [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365], # truck + [-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504], # trailer + [-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111], # bicycle + [-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072], # pedestrian + [-49.6, -49.6, -1.80984986, 49.6, 49.6, + -1.80984986], # traffic_cone + [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965], # barrier + ], + anchor_strides=[2], + anchor_sizes=[ + [1.95017717, 4.60718145, 1.72270761], # car + [2.4560939, 6.73778078, 2.73004906], # truck + [2.87427237, 12.01320693, 3.81509561], # trailer + [0.60058911, 1.68452161, 1.27192197], # bicycle + [0.66344886, 0.7256437, 1.75748069], # pedestrian + [0.39694519, 0.40359262, 1.06232151], # traffic_cone + [2.49008838, 0.48578221, 0.98297065], # barrier + ], + anchor_custom_values=[0, 0], + anchor_rotations=[0, 1.57], + assigner_per_size=False, + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + pts=dict( + assigner=dict( # for Car + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + pos_weight=-1, + debug=False)) +test_cfg = dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.2, + score_thr=0.05, + min_bbox_size=0, + max_per_img=500, + post_center_limit_range=point_cloud_range, + # TODO: check whether need to change this + # post_center_limit_range=[-59.6, -59.6, -6, 59.6, 59.6, 4], + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) + )) + +# dataset settings +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +input_modality = dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=True, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict(), + sample_groups=dict( + bus=4, + trailer=4, + truck=4, + ), +) + +train_pipeline = [ + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.3925, 0.3925], + scaling_uniform_noise=[0.95, 1.05], + trans_normal_noise=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + 
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='Resize', + img_scale=[ + (1280, 720), + ], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']), +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=False)) +# optimizer +optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[16, 19]) +momentum_config = None +checkpoint_config = dict(interval=1) +# yapf:disable +evaluation = dict(interval=20) +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 20 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/pp_secfpn_80e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/nus/hv_pointpillars_secfpn_sbn-all_freeze_adamw_1x16_20e_nus-mm.py b/configs/nus/hv_pointpillars_secfpn_sbn-all_freeze_adamw_1x16_20e_nus-mm.py new file mode 100644 index 0000000000..5d26d2560f --- /dev/null +++ b/configs/nus/hv_pointpillars_secfpn_sbn-all_freeze_adamw_1x16_20e_nus-mm.py @@ -0,0 +1,267 @@ +# model settings +voxel_size = [0.25, 0.25, 8] +point_cloud_range = [-50, -50, -5, 50, 50, 3] +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +model = dict( + type='MVXFasterRCNNV2', + pretrained=('./pretrain_detectron/' + 'ImageNetPretrained/MSRA/resnet50_msra.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=4, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + pts_voxel_layer=dict( + max_num_points=64, # max_points_per_voxel + point_cloud_range=point_cloud_range, # velodyne coordinates, x, y, z + voxel_size=voxel_size, + max_voxels=(30000, 40000), # (training, testing) max_coxels + ), + pts_voxel_encoder=dict( + type='HardVFE', + num_input_features=4, + num_filters=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=point_cloud_range, + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01), + fusion_layer=dict( + type='MultiViewPointFusion', + img_channels=2048, + pts_channels=64, + mid_channels=128, + out_channels=128, + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, 
momentum=0.01), + img_levels=[3], + align_corners=False, + activate_out=True, + fuse_out=False), + ), + pts_middle_encoder=dict( + type='PointPillarsScatter', + in_channels=128, + output_shape=[400, 400], # checked from PointCloud3D + ), + pts_backbone=dict( + type='SECOND', + in_channels=128, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + num_filters=[64, 128, 256], + ), + pts_neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[128, 128, 128], + ), + pts_bbox_head=dict( + type='Anchor3DVeloHead', + class_names=class_names, + num_classes=10, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + encode_bg_as_zeros=True, + anchor_generator=dict(type='AlignedAnchorGeneratorRange', ), + anchor_range=[ + [-50, -50, -1.80032795, 50, 50, -1.80032795], # car + [-50, -50, -1.74440365, 50, 50, -1.74440365], # truck + [-50, -50, -1.68526504, 50, 50, -1.68526504], # trailer + [-50, -50, -1.67339111, 50, 50, -1.67339111], # bicycle + [-50, -50, -1.61785072, 50, 50, -1.61785072], # pedestrian + [-50, -50, -1.80984986, 50, 50, -1.80984986], # traffic_cone + [-50, -50, -1.763965, 50, 50, -1.763965], # barrier + ], + anchor_strides=[2], + anchor_sizes=[ + [1.95017717, 4.60718145, 1.72270761], # car + [2.4560939, 6.73778078, 2.73004906], # truck + [2.87427237, 12.01320693, 3.81509561], # trailer + [0.60058911, 1.68452161, 1.27192197], # bicycle + [0.66344886, 0.7256437, 1.75748069], # pedestrian + [0.39694519, 0.40359262, 1.06232151], # traffic_cone + [2.49008838, 0.48578221, 0.98297065], # barrier + ], + anchor_custom_values=[0, 0], + anchor_rotations=[0, 1.57], + assigner_per_size=False, + assign_per_class=False, + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2), + ), +) +# model training and testing settings +train_cfg = dict( + pts=dict( + assigner=dict( # for Car + type='MaxIoUAssigner', + iou_type='nearest_3d', + pos_iou_thr=0.6, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + pos_weight=-1, + debug=False)) +test_cfg = dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.2, + score_thr=0.05, + min_bbox_size=0, + max_per_img=500, + post_center_limit_range=point_cloud_range, + # TODO: check whether need to change this + # post_center_limit_range=[-59.6, -59.6, -6, 59.6, 59.6, 4], + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) + )) + +# dataset settings +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +input_modality = dict( + use_lidar=True, + use_radar=False, + use_map=False, + use_external=False, + use_camera=True, +) +db_sampler = dict( + root_path=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + use_road_plane=False, + object_rot_range=[0.0, 0.0], + prepare=dict(), + sample_groups=dict( + bus=4, + trailer=4, + truck=4, + ), +) + 
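+# Note: db_sampler above describes a ground-truth database augmentation (as consumed by
+# the ObjectSample transform in the KITTI configs); the train_pipeline below does not
+# currently include such a step, so this block presumably has no effect unless one is added.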
+train_pipeline = [ + dict( + type='Resize', + img_scale=(1280, 720), + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict( + type='GlobalRotScale', + rot_uniform_noise=[-0.3925, 0.3925], + scaling_uniform_noise=[0.95, 1.05], + trans_normal_noise=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']), +] +test_pipeline = [ + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='Resize', + img_scale=[ + (1280, 720), + ], + multiscale_mode='value', + keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points', 'img']), +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=2, + train=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + val=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=True), + test=dict( + type=dataset_type, + root_path=data_root, + ann_file=data_root + 'nuscenes_infos_test.pkl', + pipeline=test_pipeline, + modality=input_modality, + class_names=class_names, + with_label=False)) +# optimizer +optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[16, 19]) +momentum_config = None +checkpoint_config = dict(interval=1) +# yapf:disable +evaluation = dict(interval=20) +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 20 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/pp_secfpn_80e' +load_from = './pretrain_mmdet/mvx_faster_rcnn_r50_fpn_detectron2-caffe_freezeBN_l1-loss_roialign-v2_nus_1x_coco-3x-pre_ap-28.8-4e72d8c7.pth' # noqa +resume_from = None +workflow = [('train', 1)] diff --git a/configs/nus/retinanet_r50_fpn_caffe_2x8_1x_nus.py b/configs/nus/retinanet_r50_fpn_caffe_2x8_1x_nus.py new file mode 100644 index 0000000000..f93e120313 --- /dev/null +++ b/configs/nus/retinanet_r50_fpn_caffe_2x8_1x_nus.py @@ -0,0 +1,138 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='RetinaNet', + pretrained=('./pretrain_detectron/' + 'ImageNetPretrained/MSRA/resnet50_msra.pth'), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5), + bbox_head=dict( + type='RetinaHead', + num_classes=10, + in_channels=256, + stacked_convs=4, + feat_channels=256, + 
octave_base_scale=4, + scales_per_octave=3, + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[8, 16, 32, 64, 128], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.0, loss_weight=1.0))) +# training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'NuScenes2DDataset' +data_root = 'data/nuscenes/' +# Values to be used for image normalization (BGR order) +# Default mean pixel value are from ImageNet: [103.53, 116.28, 123.675] +# When using pre-trained models in Detectron1 or any MSRA models, +# std has been absorbed into its conv1 weights, so the std needs to be set 1. +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=(1600, 900), + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1600, 900), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_train.coco.json', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.coco.json', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.coco.json', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/retinanet_r50_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md new file mode 100644 index 0000000000..632b97d77f --- /dev/null +++ b/docs/CHANGELOG.md @@ -0,0 +1,209 @@ +## Changelog + +### v1.1.0 (24/2/2020) + +**Highlights** +- Dataset evaluation is rewritten with a unified api, which is used by both evaluation hooks and test scripts. +- Support new methods: [CARAFE](https://arxiv.org/abs/1905.02188). 
+ +**Breaking Changes** +- The new MMDDP inherits from the official DDP, thus the `__init__` api is changed to be the same as official DDP. +- The `mask_head` field in HTC config files is modified. +- The evaluation and testing script is updated. +- In all transforms, instance masks are stored as a numpy array shaped (n, h, w) instead of a list of (h, w) arrays, where n is the number of instances. + +**Bug Fixes** +- Fix IOU assigners when ignore_iof_thr > 0 and there is no pred boxes. (#2135) +- Fix mAP evaluation when there are no ignored boxes. (#2116) +- Fix the empty RoI input for Deformable RoI Pooling. (#2099) +- Fix the dataset settings for multiple workflows. (#2103) +- Fix the warning related to `torch.uint8` in PyTorch 1.4. (#2105) +- Fix the inference demo on devices other than gpu:0. (#2098) +- Fix Dockerfile. (#2097) +- Fix the bug that `pad_val` is unused in Pad transform. (#2093) +- Fix the albumentation transform when there is no ground truth bbox. (#2032) + +**Improvements** +- Use torch instead of numpy for random sampling. (#2094) +- Migrate to the new MMDDP implementation in MMCV v0.3. (#2090) +- Add meta information in logs. (#2086) +- Rewrite Soft NMS with pytorch extension and remove cython as a dependency. (#2056) +- Rewrite dataset evaluation. (#2042, #2087, #2114, #2128) +- Use numpy array for masks in transforms. (#2030) + +**New Features** +- Implement "CARAFE: Content-Aware ReAssembly of FEatures". (#1583) +- Add `worker_init_fn()` in data_loader when seed is set. (#2066, #2111) +- Add logging utils. (#2035) + +### v1.0.0 (30/1/2020) + +This release mainly improves the code quality and add more docstrings. + +**Highlights** +- Documentation is online now: https://mmdetection.readthedocs.io. +- Support new models: [ATSS](https://arxiv.org/abs/1912.02424). +- DCN is now available with the api `build_conv_layer` and `ConvModule` like the normal conv layer. +- A tool to collect environment information is available for trouble shooting. + +**Bug Fixes** +- Fix the incompatibility of the latest numpy and pycocotools. (#2024) +- Fix the case when distributed package is unavailable, e.g., on Windows. (#1985) +- Fix the dimension issue for `refine_bboxes()`. (#1962) +- Fix the typo when `seg_prefix` is a list. (#1906) +- Add segmentation map cropping to RandomCrop. (#1880) +- Fix the return value of `ga_shape_target_single()`. (#1853) +- Fix the loaded shape of empty proposals. (#1819) +- Fix the mask data type when using albumentation. (#1818) + +**Improvements** +- Enhance AssignResult and SamplingResult. (#1995) +- Add ability to overwrite existing module in Registry. (#1982) +- Reorganize requirements and make albumentations and imagecorruptions optional. (#1969) +- Check NaN in `SSDHead`. (#1935) +- Encapsulate the DCN in ResNe(X)t into a ConvModule & Conv_layers. (#1894) +- Refactoring for mAP evaluation and support multiprocessing and logging. (#1889) +- Init the root logger before constructing Runner to log more information. (#1865) +- Split `SegResizeFlipPadRescale` into different existing transforms. (#1852) +- Move `init_dist()` to MMCV. (#1851) +- Documentation and docstring improvements. (#1971, #1938, #1869, #1838) +- Fix the color of the same class for mask visualization. (#1834) +- Remove the option `keep_all_stages` in HTC and Cascade R-CNN. (#1806) + +**New Features** +- Add two test-time options `crop_mask` and `rle_mask_encode` for mask heads. (#2013) +- Support loading grayscale images as single channel. 
(#1975) +- Implement "Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection". (#1872) +- Add sphinx generated docs. (#1859, #1864) +- Add GN support for flops computation. (#1850) +- Collect env info for trouble shooting. (#1812) + + +### v1.0rc1 (13/12/2019) + +The RC1 release mainly focuses on improving the user experience, and fixing bugs. + +**Highlights** +- Support new models: [FoveaBox](https://arxiv.org/abs/1904.03797), [RepPoints](https://arxiv.org/abs/1904.11490) and [FreeAnchor](https://arxiv.org/abs/1909.02466). +- Add a Dockerfile. +- Add a jupyter notebook demo and a webcam demo. +- Setup the code style and CI. +- Add lots of docstrings and unit tests. +- Fix lots of bugs. + +**Breaking Changes** +- There was a bug for computing COCO-style mAP w.r.t different scales (AP_s, AP_m, AP_l), introduced by #621. (#1679) + +**Bug Fixes** +- Fix a sampling interval bug in Libra R-CNN. (#1800) +- Fix the learning rate in SSD300 WIDER FACE. (#1781) +- Fix the scaling issue when `keep_ratio=False`. (#1730) +- Fix typos. (#1721, #1492, #1242, #1108, #1107) +- Fix the shuffle argument in `build_dataloader`. (#1693) +- Clip the proposal when computing mask targets. (#1688) +- Fix the "index out of range" bug for samplers in some corner cases. (#1610, #1404) +- Fix the NMS issue on devices other than GPU:0. (#1603) +- Fix SSD Head and GHM Loss on CPU. (#1578) +- Fix the OOM error when there are too many gt bboxes. (#1575) +- Fix the wrong keyword argument `nms_cfg` in HTC. (#1573) +- Process masks and semantic segmentation in Expand and MinIoUCrop transforms. (#1550, #1361) +- Fix a scale bug in the Non Local op. (#1528) +- Fix a bug in transforms when `gt_bboxes_ignore` is None. (#1498) +- Fix a bug when `img_prefix` is None. (#1497) +- Pass the device argument to `grid_anchors` and `valid_flags`. (#1478) +- Fix the data pipeline for test_robustness. (#1476) +- Fix the argument type of deformable pooling. (#1390) +- Fix the coco_eval when there are only two classes. (#1376) +- Fix a bug in Modulated DeformableConv when deformable_group>1. (#1359) +- Fix the mask cropping in RandomCrop. (#1333) +- Fix zero outputs in DeformConv when not running on cuda:0. (#1326) +- Fix the type issue in Expand. (#1288) +- Fix the inference API. (#1255) +- Fix the inplace operation in Expand. (#1249) +- Fix the from-scratch training config. (#1196) +- Fix inplace add in RoIExtractor which cause an error in PyTorch 1.2. (#1160) +- Fix FCOS when input images has no positive sample. (#1136) +- Fix recursive imports. (#1099) + +**Improvements** +- Print the config file and mmdet version in the log. (#1721) +- Lint the code before compiling in travis CI. (#1715) +- Add a probability argument for the `Expand` transform. (#1651) +- Update the PyTorch and CUDA version in the docker file. (#1615) +- Raise a warning when specifying `--validate` in non-distributed training. (#1624, #1651) +- Beautify the mAP printing. (#1614) +- Add pre-commit hook. (#1536) +- Add the argument `in_channels` to backbones. (#1475) +- Add lots of docstrings and unit tests, thanks to [@Erotemic](https://github.com/Erotemic). (#1603, #1517, #1506, #1505, #1491, #1479, #1477, #1475, #1474) +- Add support for multi-node distributed test when there is no shared storage. (#1399) +- Optimize Dockerfile to reduce the image size. (#1306) +- Update new results of HRNet. (#1284, #1182) +- Add an argument `no_norm_on_lateral` in FPN. (#1240) +- Test the compiling in CI. 
(#1235) +- Move docs to a separate folder. (#1233) +- Add a jupyter notebook demo. (#1158) +- Support different type of dataset for training. (#1133) +- Use int64_t instead of long in cuda kernels. (#1131) +- Support unsquare RoIs for bbox and mask heads. (#1128) +- Manually add type promotion to make compatible to PyTorch 1.2. (#1114) +- Allowing validation dataset for computing validation loss. (#1093) +- Use `.scalar_type()` instead of `.type()` to suppress some warnings. (#1070) + +**New Features** +- Add an option `--with_ap` to compute the AP for each class. (#1549) +- Implement "FreeAnchor: Learning to Match Anchors for Visual Object Detection". (#1391) +- Support [Albumentations](https://github.com/albumentations-team/albumentations) for augmentations in the data pipeline. (#1354) +- Implement "FoveaBox: Beyond Anchor-based Object Detector". (#1339) +- Support horizontal and vertical flipping. (#1273, #1115) +- Implement "RepPoints: Point Set Representation for Object Detection". (#1265) +- Add test-time augmentation to HTC and Cascade R-CNN. (#1251) +- Add a COCO result analysis tool. (#1228) +- Add Dockerfile. (#1168) +- Add a webcam demo. (#1155, #1150) +- Add FLOPs counter. (#1127) +- Allow arbitrary layer order for ConvModule. (#1078) + + +### v1.0rc0 (27/07/2019) +- Implement lots of new methods and components (Mixed Precision Training, HTC, Libra R-CNN, Guided Anchoring, Empirical Attention, Mask Scoring R-CNN, Grid R-CNN (Plus), GHM, GCNet, FCOS, HRNet, Weight Standardization, etc.). Thank all collaborators! +- Support two additional datasets: WIDER FACE and Cityscapes. +- Refactoring for loss APIs and make it more flexible to adopt different losses and related hyper-parameters. +- Speed up multi-gpu testing. +- Integrate all compiling and installing in a single script. + +### v0.6.0 (14/04/2019) +- Up to 30% speedup compared to the model zoo. +- Support both PyTorch stable and nightly version. +- Replace NMS and SigmoidFocalLoss with Pytorch CUDA extensions. + +### v0.6rc0(06/02/2019) +- Migrate to PyTorch 1.0. + +### v0.5.7 (06/02/2019) +- Add support for Deformable ConvNet v2. (Many thanks to the authors and [@chengdazhi](https://github.com/chengdazhi)) +- This is the last release based on PyTorch 0.4.1. + +### v0.5.6 (17/01/2019) +- Add support for Group Normalization. +- Unify RPNHead and single stage heads (RetinaHead, SSDHead) with AnchorHead. + +### v0.5.5 (22/12/2018) +- Add SSD for COCO and PASCAL VOC. +- Add ResNeXt backbones and detection models. +- Refactoring for Samplers/Assigners and add OHEM. +- Add VOC dataset and evaluation scripts. + +### v0.5.4 (27/11/2018) +- Add SingleStageDetector and RetinaNet. + +### v0.5.3 (26/11/2018) +- Add Cascade R-CNN and Cascade Mask R-CNN. +- Add support for Soft-NMS in config files. + +### v0.5.2 (21/10/2018) +- Add support for custom datasets. +- Add a script to convert PASCAL VOC annotations to the expected format. + +### v0.5.1 (20/10/2018) +- Add BBoxAssigner and BBoxSampler, the `train_cfg` field in config files are restructured. +- `ConvFCRoIHead` / `SharedFCRoIHead` are renamed to `ConvFCBBoxHead` / `SharedFCBBoxHead` for consistency. 
diff --git a/docs/CODE_OF_CONDUCT.md b/docs/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..efd4305798 --- /dev/null +++ b/docs/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at chenkaidev@gmail.com. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md new file mode 100644 index 0000000000..7a24fb56ca --- /dev/null +++ b/docs/CONTRIBUTING.md @@ -0,0 +1,35 @@ +# Contributing to mmdetection + +All kinds of contributions are welcome, including but not limited to the following. + +- Fixes (typo, bugs) +- New features and components + +## Workflow + +1. fork and pull the latest mmdetection +2. checkout a new branch (do not use master branch for PRs) +3. commit your changes +4. create a PR + +Note +- If you plan to add some new features that involve large changes, it is encouraged to open an issue for discussion first. +- If you are the author of some papers and would like to include your method to mmdetection, +please contact Wenwei Zhang (zwwdev[at]gmail[dot]com). We will much appreciate your contribution. + +## Code style + +### Python +We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. + +We use the following tools for linting and formatting: +- [flake8](http://flake8.pycqa.org/en/latest/): linter +- [yapf](https://github.com/google/yapf): formatter +- [isort](https://github.com/timothycrosley/isort): sort imports + +Style configurations of yapf and isort can be found in [.style.yapf](.style.yapf) and [.isort.cfg](.isort.cfg). + +>Before you create a PR, make sure that your code lints and is formatted by yapf. + +### C++ and CUDA +We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md new file mode 100644 index 0000000000..077def3e39 --- /dev/null +++ b/docs/GETTING_STARTED.md @@ -0,0 +1,510 @@ +# Getting Started + +This page provides basic tutorials about the usage of MMDetection. +For installation instructions, please see [INSTALL.md](INSTALL.md). + +## Inference with pretrained models + +We provide testing scripts to evaluate a whole dataset (COCO, PASCAL VOC, Cityscapes, etc.), +and also some high-level apis for easier integration to other projects. + +### Test a dataset + +- [x] single GPU testing +- [x] multiple GPU testing +- [x] visualize detection results + +You can use the following commands to test a dataset. + +```shell +# single-gpu testing +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show] + +# multi-gpu testing +./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] +``` + +Optional arguments: +- `RESULT_FILE`: Filename of the output results in pickle format. If not specified, the results will not be saved to a file. +- `EVAL_METRICS`: Items to be evaluated on the results. Allowed values depend on the dataset, e.g., `proposal_fast`, `proposal`, `bbox`, `segm` are available for COCO, `mAP`, `recall` for PASCAL VOC. Cityscapes could be evaluated by `cityscapes` as well as all COCO metrics. +- `--show`: If specified, detection results will be plotted on the images and shown in a new window. It is only applicable to single GPU testing and used for debugging and visualization. 
Please make sure that GUI is available in your environment, otherwise you may encounter an error like `cannot connect to X server`.
+
+If you would like to evaluate the dataset, do not specify `--show` at the same time.
+
+Examples:
+
+Assume that you have already downloaded the checkpoints to the directory `checkpoints/`.
+
+1. Test Faster R-CNN and visualize the results. Press any key for the next image.
+
+```shell
+python tools/test.py configs/faster_rcnn_r50_fpn_1x.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth \
+    --show
+```
+
+2. Test Faster R-CNN on PASCAL VOC (without saving the test results) and evaluate the mAP.
+
+```shell
+python tools/test.py configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc.py \
+    checkpoints/SOME_CHECKPOINT.pth \
+    --eval mAP
+```
+
+3. Test Mask R-CNN with 8 GPUs, and evaluate the bbox and mask AP.
+
+```shell
+./tools/dist_test.sh configs/mask_rcnn_r50_fpn_1x.py \
+    checkpoints/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth \
+    8 --out results.pkl --eval bbox segm
+```
+
+4. Test Mask R-CNN on COCO test-dev with 8 GPUs, and generate the json file to be submitted to the official evaluation server.
+
+```shell
+./tools/dist_test.sh configs/mask_rcnn_r50_fpn_1x.py \
+    checkpoints/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth \
+    8 --format-only --options "jsonfile_prefix=./mask_rcnn_test-dev_results"
+```
+
+You will get two json files `mask_rcnn_test-dev_results.bbox.json` and `mask_rcnn_test-dev_results.segm.json`.
+
+5. Test Mask R-CNN on Cityscapes test with 8 GPUs, and generate the txt and png files to be submitted to the official evaluation server.
+
+```shell
+./tools/dist_test.sh configs/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py \
+    checkpoints/mask_rcnn_r50_fpn_1x_cityscapes_20200227-afe51d5a.pth \
+    8 --format_only --options "outfile_prefix=./mask_rcnn_cityscapes_test_results"
+```
+
+The generated png and txt files will be under the `./mask_rcnn_cityscapes_test_results` directory.
+
+### Webcam demo
+
+We provide a webcam demo to illustrate the results.
+
+```shell
+python demo/webcam_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${GPU_ID}] [--camera-id ${CAMERA-ID}] [--score-thr ${SCORE_THR}]
+```
+
+Examples:
+
+```shell
+python demo/webcam_demo.py configs/faster_rcnn_r50_fpn_1x.py \
+    checkpoints/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth
+```
+
+### High-level APIs for testing images
+
+#### Synchronous interface
+Here is an example of building the model and testing given images.
+
+```python
+from mmdet.apis import init_detector, inference_detector, show_result
+import mmcv
+
+config_file = 'configs/faster_rcnn_r50_fpn_1x.py'
+checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth'
+
+# build the model from a config file and a checkpoint file
+model = init_detector(config_file, checkpoint_file, device='cuda:0')
+
+# test a single image and show the results
+img = 'test.jpg'  # or img = mmcv.imread(img), which will only load it once
+result = inference_detector(model, img)
+# visualize the results in a new window
+show_result(img, result, model.CLASSES)
+# or save the visualization results to image files
+show_result(img, result, model.CLASSES, out_file='result.jpg')
+
+# test a video and show the results
+video = mmcv.VideoReader('video.mp4')
+for frame in video:
+    result = inference_detector(model, frame)
+    show_result(frame, result, model.CLASSES, wait_time=1)
+```
+
+A notebook demo can be found in [demo/inference_demo.ipynb](https://github.com/open-mmlab/mmdetection/blob/master/demo/inference_demo.ipynb).
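+
+As a small extension of the synchronous interface, the sketch below (a hypothetical helper, not part of the released API) loops the same `init_detector`/`inference_detector` calls over a folder of images and saves the visualizations to disk:
+
+```python
+import os
+
+import mmcv
+from mmdet.apis import inference_detector, init_detector, show_result
+
+
+def detect_folder(config_file, checkpoint_file, img_dir, out_dir):
+    """Run a detector on every jpg/png in img_dir and save the drawn results."""
+    model = init_detector(config_file, checkpoint_file, device='cuda:0')
+    mmcv.mkdir_or_exist(out_dir)
+    for name in sorted(os.listdir(img_dir)):
+        if not name.lower().endswith(('.jpg', '.png')):
+            continue
+        img = os.path.join(img_dir, name)
+        result = inference_detector(model, img)
+        # save the visualization to a file, as in the example above
+        show_result(img, result, model.CLASSES,
+                    out_file=os.path.join(out_dir, name))
+```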
+
+#### Asynchronous interface - supported for Python 3.7+
+
+The async interface allows inference code not to block the CPU on GPU-bound operations, enabling better CPU/GPU utilization for single-threaded applications. Inference can be done concurrently either between different input data samples or between different models of an inference pipeline.
+
+See `tests/async_benchmark.py` to compare the speed of the synchronous and asynchronous interfaces.
+
+```python
+import asyncio
+import torch
+from mmdet.apis import init_detector, async_inference_detector, show_result
+from mmdet.utils.contextmanagers import concurrent
+
+async def main():
+    config_file = 'configs/faster_rcnn_r50_fpn_1x.py'
+    checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth'
+    device = 'cuda:0'
+    model = init_detector(config_file, checkpoint=checkpoint_file, device=device)
+
+    # queue is used for concurrent inference of multiple images
+    streamqueue = asyncio.Queue()
+    # queue size defines concurrency level
+    streamqueue_size = 3
+
+    for _ in range(streamqueue_size):
+        streamqueue.put_nowait(torch.cuda.Stream(device=device))
+
+    # test a single image and show the results
+    img = 'test.jpg'  # or img = mmcv.imread(img), which will only load it once
+
+    async with concurrent(streamqueue):
+        result = await async_inference_detector(model, img)
+
+    # visualize the results in a new window
+    show_result(img, result, model.CLASSES)
+    # or save the visualization results to image files
+    show_result(img, result, model.CLASSES, out_file='result.jpg')
+
+
+asyncio.run(main())
+```
+
+## Train a model
+
+MMDetection implements distributed training and non-distributed training,
+which use `MMDistributedDataParallel` and `MMDataParallel` respectively.
+
+All outputs (log files and checkpoints) will be saved to the working directory,
+which is specified by `work_dir` in the config file.
+
+By default we evaluate the model on the validation set after each epoch; you can change the evaluation interval by adding the `interval` argument to the training config.
+```python
+evaluation = dict(interval=12)  # This evaluates the model every 12 epochs.
+```
+
+**\*Important\***: The default learning rate in config files is for 8 GPUs and 2 img/gpu (batch size = 8*2 = 16).
+According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you need to set the learning rate proportional to the batch size if you use different GPUs or images per GPU, e.g., lr=0.01 for 4 GPUs * 2 img/gpu and lr=0.08 for 16 GPUs * 4 img/gpu.
+
+### Train with a single GPU
+
+```shell
+python tools/train.py ${CONFIG_FILE}
+```
+
+If you want to specify the working directory in the command, you can add an argument `--work_dir ${YOUR_WORK_DIR}`.
+
+### Train with multiple GPUs
+
+```shell
+./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
+```
+
+Optional arguments are:
+
+- `--validate` (**strongly recommended**): Perform evaluation every k (default value is 1, which can be modified like [this](https://github.com/open-mmlab/mmdetection/blob/master/configs/mask_rcnn_r50_fpn_1x.py#L174)) epochs during training.
+- `--work_dir ${WORK_DIR}`: Override the working directory specified in the config file.
+- `--resume_from ${CHECKPOINT_FILE}`: Resume from a previous checkpoint file.
+
+Difference between `resume_from` and `load_from`:
+`resume_from` loads both the model weights and the optimizer status, and the epoch is also inherited from the specified checkpoint.
It is usually used for resuming a training process that was interrupted accidentally.
+`load_from` only loads the model weights and the training epoch starts from 0. It is usually used for finetuning.
+
+### Train with multiple machines
+
+If you run MMDetection on a cluster managed with [slurm](https://slurm.schedmd.com/), you can use the script `slurm_train.sh`. (This script also supports single machine training.)
+
+```shell
+./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR} [${GPUS}]
+```
+
+Here is an example of using 16 GPUs to train Mask R-CNN on the dev partition.
+
+```shell
+./tools/slurm_train.sh dev mask_r50_1x configs/mask_rcnn_r50_fpn_1x.py /nfs/xxxx/mask_rcnn_r50_fpn_1x 16
+```
+
+You can check [slurm_train.sh](https://github.com/open-mmlab/mmdetection/blob/master/tools/slurm_train.sh) for full arguments and environment variables.
+
+If you have multiple machines connected only with ethernet, you can refer to the
+pytorch [launch utility](https://pytorch.org/docs/stable/distributed_deprecated.html#launch-utility).
+Usually it is slow if you do not have high speed networking like infiniband.
+
+### Launch multiple jobs on a single machine
+
+If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs,
+you need to specify different ports (29500 by default) for each job to avoid communication conflicts.
+
+If you use `dist_train.sh` to launch training jobs, you can set the ports in the commands.
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4
+```
+
+If you launch training jobs with slurm, you need to modify the config files (usually the 6th line from the bottom in config files) to set different communication ports.
+
+In `config1.py`,
+```python
+dist_params = dict(backend='nccl', port=29500)
+```
+
+In `config2.py`,
+```python
+dist_params = dict(backend='nccl', port=29501)
+```
+
+Then you can launch two jobs with `config1.py` and `config2.py`.
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR} 4
+```
+
+## Useful tools
+
+We provide lots of useful tools under the `tools/` directory.
+
+### Analyze logs
+
+You can plot loss/mAP curves given a training log file. Run `pip install seaborn` first to install the dependency.
+
+![loss curve image](../demo/loss_curve.png)
+
+```shell
+python tools/analyze_logs.py plot_curve [--keys ${KEYS}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}]
+```
+
+Examples:
+
+- Plot the classification loss of some run.
+
+```shell
+python tools/analyze_logs.py plot_curve log.json --keys loss_cls --legend loss_cls
+```
+
+- Plot the classification and regression loss of some run, and save the figure to a pdf.
+
+```shell
+python tools/analyze_logs.py plot_curve log.json --keys loss_cls loss_reg --out losses.pdf
+```
+
+- Compare the bbox mAP of two runs in the same figure.
+
+```shell
+python tools/analyze_logs.py plot_curve log1.json log2.json --keys bbox_mAP --legend run1 run2
+```
+
+You can also compute the average training speed.
+
+```shell
+python tools/analyze_logs.py cal_train_time ${CONFIG_FILE} [--include-outliers]
+```
+
+The output is expected to be like the following.
+ +``` +-----Analyze train time of work_dirs/some_exp/20190611_192040.log.json----- +slowest epoch 11, average time is 1.2024 +fastest epoch 1, average time is 1.1909 +time std over epochs is 0.0028 +average iter time: 1.1959 s/iter + +``` + +### Get the FLOPs and params (experimental) + +We provide a script adapted from [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) to compute the FLOPs and params of a given model. + +```shell +python tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] +``` + +You will get the result like this. + +``` +============================== +Input shape: (3, 1280, 800) +Flops: 239.32 GMac +Params: 37.74 M +============================== +``` + +**Note**: This tool is still experimental and we do not guarantee that the number is correct. You may well use the result for simple comparisons, but double check it before you adopt it in technical reports or papers. + +(1) FLOPs are related to the input shape while parameters are not. The default input shape is (1, 3, 1280, 800). +(2) Some operators are not counted into FLOPs like GN and custom operators. +You can add support for new operators by modifying [`mmdet/utils/flops_counter.py`](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/utils/flops_counter.py). +(3) The FLOPs of two-stage detectors is dependent on the number of proposals. + +### Publish a model + +Before you upload a model to AWS, you may want to +(1) convert model weights to CPU tensors, (2) delete the optimizer states and +(3) compute the hash of the checkpoint file and append the hash id to the filename. + +```shell +python tools/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME} +``` + +E.g., + +```shell +python tools/publish_model.py work_dirs/faster_rcnn/latest.pth faster_rcnn_r50_fpn_1x_20190801.pth +``` + +The final output filename will be `faster_rcnn_r50_fpn_1x_20190801-{hash id}.pth`. + +### Test the robustness of detectors + +Please refer to [ROBUSTNESS_BENCHMARKING.md](ROBUSTNESS_BENCHMARKING.md). + + +## How-to + +### Use my own datasets + +The simplest way is to convert your dataset to existing dataset formats (COCO or PASCAL VOC). + +Here we show an example of adding a custom dataset of 5 classes, assuming it is also in COCO format. + +In `mmdet/datasets/my_dataset.py`: + +```python +from .coco import CocoDataset +from .registry import DATASETS + + +@DATASETS.register_module +class MyDataset(CocoDataset): + + CLASSES = ('a', 'b', 'c', 'd', 'e') +``` + +In `mmdet/datasets/__init__.py`: + +```python +from .my_dataset import MyDataset +``` + +Then you can use `MyDataset` in config files, with the same API as CocoDataset. + + +It is also fine if you do not want to convert the annotation format to COCO or PASCAL format. +Actually, we define a simple annotation format and all existing datasets are +processed to be compatible with it, either online or offline. + +The annotation of a dataset is a list of dict, each dict corresponds to an image. +There are 3 field `filename` (relative path), `width`, `height` for testing, +and an additional field `ann` for training. `ann` is also a dict containing at least 2 fields: +`bboxes` and `labels`, both of which are numpy arrays. Some datasets may provide +annotations like crowd/difficult/ignored bboxes, we use `bboxes_ignore` and `labels_ignore` +to cover them. + +Here is an example. 
+``` +[ + { + 'filename': 'a.jpg', + 'width': 1280, + 'height': 720, + 'ann': { + 'bboxes': (n, 4), + 'labels': (n, ), + 'bboxes_ignore': (k, 4), + 'labels_ignore': (k, ) (optional field) + } + }, + ... +] +``` + +There are two ways to work with custom datasets. + +- online conversion + + You can write a new Dataset class inherited from `CustomDataset`, and overwrite two methods + `load_annotations(self, ann_file)` and `get_ann_info(self, idx)`, + like [CocoDataset](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/coco.py) and [VOCDataset](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/voc.py). + +- offline conversion + + You can convert the annotation format to the expected format above and save it to + a pickle or json file, like [pascal_voc.py](https://github.com/open-mmlab/mmdetection/blob/master/tools/convert_datasets/pascal_voc.py). + Then you can simply use `CustomDataset`. + +### Customize optimizer + +An example of customized optimizer `CopyOfSGD` is defined in `mmdet/core/optimizer/copy_of_sgd.py`. +More generally, a customized optimizer could be defined as following. + +In `mmdet/core/optimizer/my_optimizer.py`: + +```python +from .registry import OPTIMIZERS +from torch.optim import Optimizer + + +@OPTIMIZERS.register_module +class MyOptimizer(Optimizer): + +``` + +In `mmdet/core/optimizer/__init__.py`: + +```python +from .my_optimizer import MyOptimizer +``` + +Then you can use `MyOptimizer` in `optimizer` field of config files. + +### Develop new components + +We basically categorize model components into 4 types. + +- backbone: usually an FCN network to extract feature maps, e.g., ResNet, MobileNet. +- neck: the component between backbones and heads, e.g., FPN, PAFPN. +- head: the component for specific tasks, e.g., bbox prediction and mask prediction. +- roi extractor: the part for extracting RoI features from feature maps, e.g., RoI Align. + +Here we show how to develop new components with an example of MobileNet. + +1. Create a new file `mmdet/models/backbones/mobilenet.py`. + +```python +import torch.nn as nn + +from ..registry import BACKBONES + + +@BACKBONES.register_module +class MobileNet(nn.Module): + + def __init__(self, arg1, arg2): + pass + + def forward(self, x): # should return a tuple + pass + + def init_weights(self, pretrained=None): + pass +``` + +2. Import the module in `mmdet/models/backbones/__init__.py`. + +```python +from .mobilenet import MobileNet +``` + +3. Use it in your config file. + +```python +model = dict( + ... + backbone=dict( + type='MobileNet', + arg1=xxx, + arg2=xxx), + ... +``` + +For more information on how it works, you can refer to [TECHNICAL_DETAILS.md](TECHNICAL_DETAILS.md) (TODO). diff --git a/docs/INSTALL.md b/docs/INSTALL.md new file mode 100644 index 0000000000..b22b970b97 --- /dev/null +++ b/docs/INSTALL.md @@ -0,0 +1,161 @@ +## Installation + +### Requirements + +- Linux (Windows is not officially supported) +- Python 3.5+ +- PyTorch 1.1 or higher +- CUDA 9.0 or higher +- NCCL 2 +- GCC 4.9 or higher +- [mmcv](https://github.com/open-mmlab/mmcv) + +We have tested the following versions of OS and softwares: + +- OS: Ubuntu 16.04/18.04 and CentOS 7.2 +- CUDA: 9.0/9.2/10.0/10.1 +- NCCL: 2.1.15/2.2.13/2.3.7/2.4.2 +- GCC(G++): 4.9/5.3/5.4/7.3 + +### Install mmdetection + +a. Create a conda virtual environment and activate it. + +```shell +conda create -n open-mmlab python=3.7 numba=0.45.1 -y +conda activate open-mmlab +``` + +b. 
Install PyTorch and torchvision following the [official instructions](https://pytorch.org/), e.g., + +```shell +conda install pytorch torchvision -c pytorch +``` + +c. Clone the mmdetection repository. + +```shell +git clone https://github.com/open-mmlab/mmdetection.git +cd mmdetection +``` + +d. Install build requirements and then install mmdetection. +(We install pycocotools via the github repo instead of pypi because the pypi version is old and not compatible with the latest numpy.) + +```shell +pip install -r requirements/build.txt +pip install "git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI" +pip install -v -e . # or "python setup.py develop" +``` + +e. Clone the MMDetection3D repository. + +```shell +git clone https://github.com/open-mmlab/mmdetection3d.git +cd mmdetection3d +``` + +f. Install build requirements and then install MMDetection3D. + +```shell +pip install -r requirements/build.txt +pip install -v -e . # or "python setup.py develop" +``` + +Note: + +1. The git commit id will be written to the version number with step d, e.g. 0.6.0+2e7045c. The version will also be saved in trained models. +It is recommended that you run step d each time you pull some updates from github. If C++/CUDA codes are modified, then this step is compulsory. + +2. Following the above instructions, mmdetection is installed on `dev` mode, any local modifications made to the code will take effect without the need to reinstall it (unless you submit some commits and want to update the version number). + +3. If you would like to use `opencv-python-headless` instead of `opencv-python`, +you can install it before installing MMCV. + +4. Some dependencies are optional. Simply running `pip install -v -e .` will only install the minimum runtime requirements. To use optional dependencies like `albumentations` and `imagecorruptions` either install them manually with `pip install -r requirements/optional.txt` or specify desired extras when calling `pip` (e.g. `pip install -v -e .[optional]`). Valid keys for the extras field are: `all`, `tests`, `build`, and `optional`. + +### Another option: Docker Image + +We provide a [Dockerfile](https://github.com/open-mmlab/mmdetection/blob/master/docker/Dockerfile) to build an image. + +```shell +# build an image with PyTorch 1.1, CUDA 10.0 and CUDNN 7.5 +docker build -t mmdetection docker/ +``` + +### Prepare datasets + +It is recommended to symlink the dataset root to `$MMDETECTION/data`. +If your folder structure is different, you may need to change the corresponding paths in config files. + +``` +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ ├── cityscapes +│ │ ├── annotations +│ │ ├── leftImg8bit +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── gtFine +│ │ │ ├── train +│ │ │ ├── val +│ ├── VOCdevkit +│ │ ├── VOC2007 +│ │ ├── VOC2012 + +``` +The cityscapes annotations have to be converted into the coco format using `tools/convert_datasets/cityscapes.py`: +```shell +pip install cityscapesscripts +python tools/convert_datasets/cityscapes.py ./data/cityscapes --nproc 8 --out_dir ./data/cityscapes/annotations +``` +Current the config files in `cityscapes` use COCO pre-trained weights to initialize. +You could download the pre-trained models in advance if network is unavailable or slow, otherwise it would cause errors at the beginning of training. 
+ +### A from-scratch setup script + +Here is a full script for setting up mmdetection with conda and link the dataset path (supposing that your COCO dataset path is $COCO_ROOT). + +```shell +conda create -n open-mmlab python=3.7 numba=0.45.1 -y +conda activate open-mmlab + +conda install -c pytorch pytorch torchvision -y +git clone https://github.com/open-mmlab/mmdetection.git +cd mmdetection +pip install -r requirements/build.txt +pip install "git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI" +pip install -v -e . + +git clone https://github.com/open-mmlab/mmdetection3d.git +cd mmdetection3d +pip install -r requirements/build.txt +pip install -v -e . + +mkdir data +ln -s $COCO_ROOT data +``` + +### Using multiple MMDetection3D versions + +If there are more than one mmdetection on your machine, and you want to use them alternatively, the recommended way is to create multiple conda environments and use different environments for different versions. + +Another way is to insert the following code to the main scripts (`train.py`, `test.py` or any other scripts you run) +```python +import os.path as osp +import sys +sys.path.insert(0, osp.join(osp.dirname(osp.abspath(__file__)), '../')) +``` + +Or run the following command in the terminal of corresponding folder to temporally use the current one. +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` diff --git a/docs/MODEL_ZOO.md b/docs/MODEL_ZOO.md new file mode 100644 index 0000000000..c15a00b6fa --- /dev/null +++ b/docs/MODEL_ZOO.md @@ -0,0 +1,532 @@ +# Benchmark and Model Zoo + +## Environment + +### Hardware + +- 8 NVIDIA Tesla V100 GPUs +- Intel Xeon 4114 CPU @ 2.20GHz + +### Software environment + +- Python 3.6 / 3.7 +- PyTorch 1.1 +- CUDA 9.0.176 +- CUDNN 7.0.4 +- NCCL 2.1.15 + +## Mirror sites + +We use AWS as the main site to host our model zoo, and maintain a mirror on aliyun. +You can replace `https://s3.ap-northeast-2.amazonaws.com/open-mmlab` with `https://open-mmlab.oss-cn-beijing.aliyuncs.com` in model urls. + +## Common settings + +- All FPN baselines and RPN-C4 baselines were trained using 8 GPU with a batch size of 16 (2 images per GPU). Other C4 baselines were trained using 8 GPU with a batch size of 8 (1 image per GPU). +- All models were trained on `coco_2017_train`, and tested on the `coco_2017_val`. +- We use distributed training and BN layer stats are fixed. +- We adopt the same training schedules as Detectron. 1x indicates 12 epochs and 2x indicates 24 epochs, which corresponds to slightly less iterations than Detectron and the difference can be ignored. +- All pytorch-style pretrained backbones on ImageNet are from PyTorch model zoo. +- For fair comparison with other codebases, we report the GPU memory as the maximum value of `torch.cuda.max_memory_allocated()` for all 8 GPUs. Note that this value is usually less than what `nvidia-smi` shows. +- We report the inference time as the overall time including data loading, network forwarding and post processing. + + +## Baselines + +More models with different backbones will be added to the model zoo. 
+ +### RPN + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | AR1000 | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | 1x | - | - | 20.5 | 51.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_caffe_c4_1x-ea7d3428.pth) | +| R-50-C4 | caffe | 2x | 2.2 | 0.17 | 20.3 | 52.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_caffe_c4_2x-c6d5b958.pth) | +| R-50-C4 | pytorch | 1x | - | - | 20.1 | 50.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_c4_1x-eb38972b.pth) | +| R-50-C4 | pytorch | 2x | - | - | 20.0 | 51.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_c4_2x-3d4c1e14.pth) | +| R-50-FPN | caffe | 1x | 3.3 | 0.253 | 16.9 | 58.2 | - | +| R-50-FPN | pytorch | 1x | 3.5 | 0.276 | 17.7 | 57.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_fpn_1x_20181010-4a9c0712.pth) | +| R-50-FPN | pytorch | 2x | - | - | - | 57.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r50_fpn_2x_20181010-88a4a471.pth) | +| R-101-FPN | caffe | 1x | 5.2 | 0.379 | 13.9 | 59.4 | - | +| R-101-FPN | pytorch | 1x | 5.4 | 0.396 | 14.4 | 58.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r101_fpn_1x_20181129-f50da4bd.pth) | +| R-101-FPN | pytorch | 2x | - | - | - | 59.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_r101_fpn_2x_20181129-e42c6c9a.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 6.6 | 0.589 | 11.8 | 59.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_32x4d_fpn_1x_20181218-7e379d26.pth) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | - | 59.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_32x4d_fpn_2x_20181218-0510af40.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 9.5 | 0.955 | 8.3 | 59.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_64x4d_fpn_1x_20181218-c1a24f1f.pth) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | - | 60.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/rpn_x101_64x4d_fpn_2x_20181218-c22bdd70.pth) | + +### Faster R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :--------------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | 1x | - | - | 9.5 | 34.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_caffe_c4_1x-75ecfdfa.pth) | +| R-50-C4 | caffe | 2x | 4.0 | 0.39 | 9.3 | 36.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_caffe_c4_2x-71c67f27.pth) | +| R-50-C4 | pytorch | 1x | - | - | 9.3 | 33.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_c4_1x-642cf91f.pth) | +| R-50-C4 | pytorch | 2x | - | - | 9.4 | 35.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_c4_2x-6e4fdf4f.pth) 
| +| R-50-FPN | caffe | 1x | 3.6 | 0.333 | 13.5 | 36.6 | - | +| R-50-FPN | pytorch | 1x | 3.8 | 0.353 | 13.6 | 36.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_fpn_1x_20181010-3d1b3351.pth) | +| R-50-FPN | pytorch | 2x | - | - | - | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r50_fpn_2x_20181010-443129e1.pth) | +| R-101-FPN | caffe | 1x | 5.5 | 0.465 | 11.5 | 38.8 | - | +| R-101-FPN | pytorch | 1x | 5.7 | 0.474 | 11.9 | 38.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r101_fpn_1x_20181129-d1468807.pth) | +| R-101-FPN | pytorch | 2x | - | - | - | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_r101_fpn_2x_20181129-73e7ade7.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 6.9 | 0.672 | 10.3 | 40.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_32x4d_fpn_1x_20181218-ad81c133.pth) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | - | 40.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_32x4d_fpn_2x_20181218-0ed58946.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 9.8 | 1.040 | 7.3 | 41.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_64x4d_fpn_1x_20181218-c9c69c8f.pth) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | - | 40.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/faster_rcnn_x101_64x4d_fpn_2x_20181218-fe94f9b8.pth) | +| HRNetV2p-W18 | pytorch | 1x | - | - | - | 36.1 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w18_1x_20190522-e368c387.pth) | +| HRNetV2p-W18 | pytorch | 2x | - | - | - | 38.3 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w18_2x_20190810-9c8615d5.pth) | +| HRNetV2p-W32 | pytorch | 1x | - | - | - | 39.5 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w32_1x_20190522-d22f1fef.pth) | +| HRNetV2p-W32 | pytorch | 2x | - | - | - | 40.6 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w32_2x_20190810-24e8912a.pth) | +| HRNetV2p-W48 | pytorch | 1x | - | - | - | 40.9 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w48_1x_20190820-5c6d0903.pth) | +| HRNetV2p-W48 | pytorch | 2x | - | - | - | 41.5 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/faster_rcnn_hrnetv2p_w48_2x_20190820-79fb8bfc.pth) | + + +### Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :------------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | 1x | - | - | 8.1 | 35.9 | 31.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_caffe_c4_1x-02a4ad3b.pth) | +| R-50-C4 | caffe | 2x | 4.2 | 0.43 | 8.1 | 37.9 | 32.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_caffe_c4_2x-d150973a.pth) | +| R-50-C4 | pytorch | 1x | - | - | 7.9 | 35.1 | 31.2 | 
[model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_c4_1x-a83bdd40.pth) | +| R-50-C4 | pytorch | 2x | - | - | 8.0 | 37.2 | 32.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_c4_2x-3cf169a9.pth) | +| R-50-FPN | caffe | 1x | 3.8 | 0.430 | 10.2 | 37.4 | 34.3 | - | +| R-50-FPN | pytorch | 1x | 3.9 | 0.453 | 10.6 | 37.3 | 34.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth) | +| R-50-FPN | pytorch | 2x | - | - | - | 38.5 | 35.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_2x_20181010-41d35c05.pth) | +| R-101-FPN | caffe | 1x | 5.7 | 0.534 | 9.4 | 39.9 | 36.1 | - | +| R-101-FPN | pytorch | 1x | 5.8 | 0.571 | 9.5 | 39.4 | 35.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r101_fpn_1x_20181129-34ad1961.pth) | +| R-101-FPN | pytorch | 2x | - | - | - | 40.3 | 36.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r101_fpn_2x_20181129-a254bdfc.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 7.1 | 0.759 | 8.3 | 41.1 | 37.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_32x4d_fpn_1x_20181218-44e635cc.pth) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | - | 41.4 | 37.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_32x4d_fpn_2x_20181218-f023dffa.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 10.0 | 1.102 | 6.5 | 42.1 | 38.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_64x4d_fpn_1x_20181218-cb159987.pth) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | - | 42.0 | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_x101_64x4d_fpn_2x_20181218-ea936e44.pth) | +| HRNetV2p-W18 | pytorch | 1x | - | - | - | 37.3 | 34.2 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w18_1x_20190522-c8ad459f.pth) | +| HRNetV2p-W18 | pytorch | 2x | - | - | - | 39.2 | 35.7 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w18_2x_20190810-1e4747eb.pth) | +| HRNetV2p-W32 | pytorch | 1x | - | - | - | 40.7 | 36.8 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w32_1x_20190522-374aaa00.pth) | +| HRNetV2p-W32 | pytorch | 2x | - | - | - | 41.7 | 37.5 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w32_2x_20190810-773eca75.pth) | +| HRNetV2p-W48 | pytorch | 1x | - | - | - | 42.4 | 38.1 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w48_1x_20190820-0923d1ad.pth) | +| HRNetV2p-W48 | pytorch | 2x | - | - | - | 42.9 | 38.3 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/mask_rcnn_hrnetv2p_w48_2x_20190820-70df51b2.pth) | + +### Fast R-CNN (with pre-computed proposals) + +| Backbone | Style | Type | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | +| :-------: | :-----: | :----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | Faster | 1x | - 
| - | 6.7 | 35.0 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_caffe_c4_1x-0ef9a60b.pth) | +| R-50-C4 | caffe | Faster | 2x | 3.8 | 0.34 | 6.6 | 36.4 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_c4_2x-657a9fc6.pth) | +| R-50-C4 | pytorch | Faster | 1x | - | - | 6.3 | 34.2 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_c4_1x-2bc00ca9.pth) | +| R-50-C4 | pytorch | Faster | 2x | - | - | 6.1 | 35.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_caffe_c4_2x-9171d0fc.pth) | +| R-50-FPN | caffe | Faster | 1x | 3.3 | 0.242 | 18.4 | 36.6 | - | - | +| R-50-FPN | pytorch | Faster | 1x | 3.5 | 0.250 | 16.5 | 35.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_fpn_1x_20181010-08160859.pth) | +| R-50-C4 | caffe | Mask | 1x | - | - | 8.1 | 35.9 | 31.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_caffe_c4_1x-b43f7f3c.pth) | +| R-50-C4 | caffe | Mask | 2x | 4.2 | 0.43 | 8.1 | 37.9 | 32.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_caffe_c4_2x-e3580184.pth) | +| R-50-C4 | pytorch | Mask | 1x | - | - | 7.9 | 35.1 | 31.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_c4_1x-bc7fa8c8.pth) | +| R-50-C4 | pytorch | Mask | 2x | - | - | 8.0 | 37.2 | 32.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_fpn_2x_20181010-5048cb03.pth) | +| R-50-FPN | pytorch | Faster | 2x | - | - | - | 37.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r50_fpn_2x_20181010-d263ada5.pth) | +| R-101-FPN | caffe | Faster | 1x | 5.2 | 0.355 | 14.4 | 38.6 | - | - | +| R-101-FPN | pytorch | Faster | 1x | 5.4 | 0.388 | 13.2 | 38.1 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r101_fpn_1x_20181129-ffaa2eb0.pth) | +| R-101-FPN | pytorch | Faster | 2x | - | - | - | 38.8 | - | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_rcnn_r101_fpn_2x_20181129-9dba92ce.pth) | +| R-50-FPN | caffe | Mask | 1x | 3.4 | 0.328 | 12.8 | 37.3 | 34.5 | - | +| R-50-FPN | pytorch | Mask | 1x | 3.5 | 0.346 | 12.7 | 36.8 | 34.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_fpn_1x_20181010-e030a38f.pth) | +| R-50-FPN | pytorch | Mask | 2x | - | - | - | 37.9 | 34.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r50_fpn_2x_20181010-5048cb03.pth) | +| R-101-FPN | caffe | Mask | 1x | 5.2 | 0.429 | 11.2 | 39.4 | 36.1 | - | +| R-101-FPN | pytorch | Mask | 1x | 5.4 | 0.462 | 10.9 | 38.9 | 35.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r101_fpn_1x_20181129-2273fa9b.pth) | +| R-101-FPN | pytorch | Mask | 2x | - | - | - | 39.9 | 36.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fast_mask_rcnn_r101_fpn_2x_20181129-bf63ec5e.pth) | + +### RetinaNet + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | 
:------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | caffe | 1x | 3.4 | 0.285 | 12.5 | 35.8 | - | +| R-50-FPN | pytorch | 1x | 3.6 | 0.308 | 12.1 | 35.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r50_fpn_1x_20181125-7b0c2548.pth) | +| R-50-FPN | pytorch | 2x | - | - | - | 36.4 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/retinanet_r50_fpn_2x_20190616-75574209.pth) | +| R-101-FPN | caffe | 1x | 5.3 | 0.410 | 10.4 | 37.8 | - | +| R-101-FPN | pytorch | 1x | 5.5 | 0.429 | 10.9 | 37.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r101_fpn_1x_20181129-f016f384.pth) | +| R-101-FPN | pytorch | 2x | - | - | - | 38.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r101_fpn_2x_20181129-72c14526.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 6.7 | 0.632 | 9.3 | 39.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_32x4d_fpn_1x_20190501-967812ba.pth) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | - | 39.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_32x4d_fpn_2x_20181218-8596452d.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 9.6 | 0.993 | 7.0 | 40.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_64x4d_fpn_1x_20181218-a0a22662.pth) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | - | 39.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_x101_64x4d_fpn_2x_20181218-5e88d045.pth) | + +### Cascade R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :---------------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | 1x | 8.7 | 0.92 | 5.0 | 38.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r50_caffe_c4_1x-7c85c62b.pth) | +| R-50-FPN | caffe | 1x | 3.9 | 0.464 | 10.9 | 40.5 | - | +| R-50-FPN | pytorch | 1x | 4.1 | 0.455 | 11.9 | 40.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r50_fpn_1x_20190501-3b6211ab.pth) | +| R-50-FPN | pytorch | 20e | - | - | - | 41.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r50_fpn_20e_20181123-db483a09.pth) | +| R-101-FPN | caffe | 1x | 5.8 | 0.569 | 9.6 | 42.4 | - | +| R-101-FPN | pytorch | 1x | 6.0 | 0.584 | 10.3 | 42.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r101_fpn_1x_20181129-d64ebac7.pth) | +| R-101-FPN | pytorch | 20e | - | - | - | 42.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_r101_fpn_20e_20181129-b46dcede.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 7.2 | 0.770 | 8.9 | 43.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_32x4d_fpn_1x_20190501-af628be5.pth) | +| X-101-32x4d-FPN | pytorch | 20e | - | - | - | 44.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_32x4d_fpn_2x_20181218-28f73c4c.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 10.0 | 1.133 | 6.7 | 44.5 | 
[model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_64x4d_fpn_1x_20181218-e2dc376a.pth) | +| X-101-64x4d-FPN | pytorch | 20e | - | - | - | 44.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_rcnn_x101_64x4d_fpn_2x_20181218-5add321e.pth) | +| HRNetV2p-W18 | pytorch | 20e | - | - | - | 41.2 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_rcnn_hrnetv2p_w18_20e_20190810-132012d0.pth) | +| HRNetV2p-W32 | pytorch | 20e | - | - | - | 43.7 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_rcnn_hrnetv2p_w32_20e_20190522-55bec4ee.pth)| +| HRNetV2p-W48 | pytorch | 20e | - | - | - | 44.6 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_rcnn_hrnetv2p_w48_20e_20190810-f40ed8e1.pth) | + +### Cascade Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :---------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | 1x | 9.1 | 0.99 | 4.5 | 39.3 | 32.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_caffe_c4_1x-f72cc254.pth) | +| R-50-FPN | caffe | 1x | 5.1 | 0.692 | 7.6 | 40.9 | 35.5 | - | +| R-50-FPN | pytorch | 1x | 5.3 | 0.683 | 7.4 | 41.2 | 35.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_fpn_1x_20181123-88b170c9.pth) | +| R-50-FPN | pytorch | 20e | - | - | - | 42.3 | 36.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_fpn_20e_20181123-6e0c9713.pth) | +| R-101-FPN | caffe | 1x | 7.0 | 0.803 | 7.2 | 43.1 | 37.2 | - | +| R-101-FPN | pytorch | 1x | 7.2 | 0.807 | 6.8 | 42.6 | 37.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r101_fpn_1x_20181129-64f00602.pth) | +| R-101-FPN | pytorch | 20e | - | - | - | 43.3 | 37.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r101_fpn_20e_20181129-cb85151d.pth) | +| X-101-32x4d-FPN | pytorch | 1x | 8.4 | 0.976 | 6.6 | 44.4 | 38.2 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_32x4d_fpn_1x_20181218-1d944c89.pth) | +| X-101-32x4d-FPN | pytorch | 20e | - | - | - | 44.7 | 38.6 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_32x4d_fpn_20e_20181218-761a3473.pth) | +| X-101-64x4d-FPN | pytorch | 1x | 11.4 | 1.33 | 5.3 | 45.4 | 39.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_64x4d_fpn_1x_20190501-827e0a70.pth) | +| X-101-64x4d-FPN | pytorch | 20e | - | - | - | 45.7 | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_x101_64x4d_fpn_20e_20181218-630773a7.pth) | +| HRNetV2p-W18 | pytorch | 20e | - | - | - | 41.9 | 36.4 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_20190810-054fb7bf.pth) | +| HRNetV2p-W32 | pytorch | 20e | - | - | - | 44.5 | 38.5 | 
[model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_20190810-76f61cd0.pth) | +| HRNetV2p-W48 | pytorch | 20e | - | - | - | 46.0 | 39.5 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/cascade_mask_rcnn_hrnetv2p_w48_20e_20190810-d04a1415.pth) | + +**Notes:** + +- The `20e` schedule in Cascade (Mask) R-CNN indicates decreasing the lr at 16 and 19 epochs, with a total of 20 epochs. + +### Hybrid Task Cascade (HTC) + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | +| :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 1x | 7.4 | 0.936 | 4.1 | 42.1 | 37.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r50_fpn_1x_20190408-878c1712.pth) | +| R-50-FPN | pytorch | 20e | - | - | - | 43.2 | 38.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r50_fpn_20e_20190408-c03b7015.pth) | +| R-101-FPN | pytorch | 20e | 9.3 | 1.051 | 4.0 | 44.9 | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r101_fpn_20e_20190408-a2e586db.pth) | +| X-101-32x4d-FPN | pytorch | 20e | 5.8 | 0.769 | 3.8 | 46.1 | 40.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_x101_32x4d_fpn_20e_20190408-9eae4d0b.pth) | +| X-101-64x4d-FPN | pytorch | 20e | 7.5 | 1.120 | 3.5 | 46.9 | 40.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_x101_64x4d_fpn_20e_20190408-497f2561.pth) | +| HRNetV2p-W18 | pytorch | 20e | - | - | - | 43.1 | 37.9 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/htc_hrnetv2p_w18_20e_20190810-d70072af.pth) | +| HRNetV2p-W32 | pytorch | 20e | - | - | - | 45.3 | 39.6 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/htc_hrnetv2p_w32_20e_20190810-82f9ef5a.pth) | +| HRNetV2p-W48 | pytorch | 20e | - | - | - | 46.8 | 40.7 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/htc_hrnetv2p_w48_20e_20190810-f6d2c3fd.pth) | +| HRNetV2p-W48 | pytorch | 28e | - | - | - | 47.0 | 41.0 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/hrnet/htc_hrnetv2p_w48_28e_20190810-a4274b38.pth) | + +**Notes:** + +- Please refer to [Hybrid Task Cascade](https://github.com/open-mmlab/mmdetection/blob/master/configs/htc) for details and more a powerful model (50.7/43.9). 
+ +### SSD + +| Backbone | Size | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | +| :------: | :---: | :---: | :-----: | :------: | :-----------------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------------------: | +| VGG16 | 300 | caffe | 120e | 3.5 | 0.256 | 25.9 / 34.6 | 25.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd300_coco_vgg16_caffe_120e_20181221-84d7110b.pth) | +| VGG16 | 512 | caffe | 120e | 7.6 | 0.412 | 20.7 / 25.4 | 29.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd512_coco_vgg16_caffe_120e_20181221-d48b0be8.pth) | + +**Notes:** + +- `cudnn.benchmark` is set as `True` for SSD training and testing. +- Inference time is reported for batch size = 1 and batch size = 8. +- The speed on COCO and VOC are different due to model parameters and nms. + +### Group Normalization (GN) + +Please refer to [Group Normalization](https://github.com/open-mmlab/mmdetection/blob/master/configs/gn) for details. + +### Weight Standardization + +Please refer to [Weight Standardization](https://github.com/open-mmlab/mmdetection/blob/master/configs/gn+ws) for details. + +### Deformable Convolution v2 + +Please refer to [Deformable Convolutional Networks](https://github.com/open-mmlab/mmdetection/blob/master/configs/dcn) for details. + +### CARAFE: Content-Aware ReAssembly of FEatures +Please refer to [CARAFE](https://github.com/open-mmlab/mmdetection/blob/master/configs/carafe) for details. + +### Instaboost + +Please refer to [Instaboost](https://github.com/open-mmlab/mmdetection/blob/master/configs/instaboost) for details. + +### Libra R-CNN + +Please refer to [Libra R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/libra_rcnn) for details. + +### Guided Anchoring + +Please refer to [Guided Anchoring](https://github.com/open-mmlab/mmdetection/blob/master/configs/guided_anchoring) for details. + +### FCOS + +Please refer to [FCOS](https://github.com/open-mmlab/mmdetection/blob/master/configs/fcos) for details. + +### FoveaBox + +Please refer to [FoveaBox](https://github.com/open-mmlab/mmdetection/blob/master/configs/foveabox) for details. + +### RepPoints + +Please refer to [RepPoints](https://github.com/open-mmlab/mmdetection/blob/master/configs/reppoints) for details. + +### FreeAnchor + +Please refer to [FreeAnchor](https://github.com/open-mmlab/mmdetection/blob/master/configs/free_anchor) for details. + +### Grid R-CNN (plus) + +Please refer to [Grid R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/grid_rcnn) for details. + +### GHM + +Please refer to [GHM](https://github.com/open-mmlab/mmdetection/blob/master/configs/ghm) for details. + +### GCNet + +Please refer to [GCNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/gcnet) for details. + +### HRNet +Please refer to [HRNet](https://github.com/open-mmlab/mmdetection/blob/master/configs/hrnet) for details. + +### Mask Scoring R-CNN + +Please refer to [Mask Scoring R-CNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/ms_rcnn) for details. + +### Train from Scratch + +Please refer to [Rethinking ImageNet Pre-training](https://github.com/open-mmlab/mmdetection/blob/master/configs/scratch) for details. + +### NAS-FPN +Please refer to [NAS-FPN](https://github.com/open-mmlab/mmdetection/blob/master/configs/nas_fpn) for details. 
+ +### ATSS +Please refer to [ATSS](https://github.com/open-mmlab/mmdetection/blob/master/configs/atss) for details. + +### Other datasets + +We also benchmark some methods on [PASCAL VOC](https://github.com/open-mmlab/mmdetection/blob/master/configs/pascal_voc), [Cityscapes](https://github.com/open-mmlab/mmdetection/blob/master/configs/cityscapes) and [WIDER FACE](https://github.com/open-mmlab/mmdetection/blob/master/configs/wider_face). + + +## Comparison with Detectron and maskrcnn-benchmark + +We compare mmdetection with [Detectron](https://github.com/facebookresearch/Detectron) +and [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark). The backbone used is R-50-FPN. + +In general, mmdetection has 3 advantages over Detectron. + +- **Higher performance** (especially in terms of mask AP) +- **Faster training speed** +- **Memory efficient** + +### Performance + +Detectron and maskrcnn-benchmark use caffe-style ResNet as the backbone. +We report results using both caffe-style (weights converted from +[here](https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md#imagenet-pretrained-models)) +and pytorch-style (weights from the official model zoo) ResNet backbone, +indicated as *pytorch-style results* / *caffe-style results*. + +We find that pytorch-style ResNet usually converges slower than caffe-style ResNet, +thus leading to slightly lower results in 1x schedule, but the final results +of 2x schedule is higher. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TypeLr schdDetectronmaskrcnn-benchmarkmmdetection
RPN1x57.2-57.1 / 58.2
2x--57.6 / -
Faster R-CNN1x36.736.836.4 / 36.6
2x37.9-37.7 / -
Mask R-CNN1x37.7 & 33.937.8 & 34.237.3 & 34.2 / 37.4 & 34.3
2x38.6 & 34.5-38.5 & 35.1 / -
Fast R-CNN1x36.4-35.8 / 36.6
2x36.8-37.1 / -
Fast R-CNN (w/mask)1x37.3 & 33.7-36.8 & 34.1 / 37.3 & 34.5
2x37.7 & 34.0-37.9 & 34.8 / -
+ +### Training Speed + +The training speed is measure with s/iter. The lower, the better. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDetectron (P1001)maskrcnn-benchmark (V100)mmdetection (V1002)
RPN0.416-0.253
Faster R-CNN0.5440.3530.333
Mask R-CNN0.8890.4540.430
Fast R-CNN0.285-0.242
Fast R-CNN (w/mask)0.377-0.328
+ +\*1. Facebook's Big Basin servers (P100/V100) is slightly faster than the servers we use. mmdetection can also run slightly faster on FB's servers. + +\*2. For fair comparison, we list the caffe-style results here. + + +### Inference Speed + +The inference speed is measured with fps (img/s) on a single GPU. The higher, the better. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDetectron (P100)maskrcnn-benchmark (V100)mmdetection (V100)
RPN12.5-16.9
Faster R-CNN10.37.913.5
Mask R-CNN8.57.710.2
Fast R-CNN12.5-18.4
Fast R-CNN (w/mask)9.9-12.8
+ +### Training memory + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TypeDetectronmaskrcnn-benchmarkmmdetection
RPN6.4-3.3
Faster R-CNN7.24.43.6
Mask R-CNN8.65.23.8
Fast R-CNN6.0-3.3
Fast R-CNN (w/mask)7.9-3.4
+ +There is no doubt that maskrcnn-benchmark and mmdetection is more memory efficient than Detectron, +and the main advantage is PyTorch itself. We also perform some memory optimizations to push it forward. + +Note that Caffe2 and PyTorch have different apis to obtain memory usage with different implementations. +For all codebases, `nvidia-smi` shows a larger memory usage than the reported number in the above table. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000000..d4bb2cbb9e --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/ROBUSTNESS_BENCHMARKING.md b/docs/ROBUSTNESS_BENCHMARKING.md new file mode 100644 index 0000000000..1ed441ab5a --- /dev/null +++ b/docs/ROBUSTNESS_BENCHMARKING.md @@ -0,0 +1,109 @@ +# Corruption Benchmarking + +## Introduction + +We provide tools to test object detection and instance segmentation models on the image corruption benchmark defined in [Benchmarking Robustness in Object Detection: Autonomous Driving when Winter is Coming](https://arxiv.org/abs/1907.07484). +This page provides basic tutorials how to use the benchmark. + +``` +@article{michaelis2019winter, + title={Benchmarking Robustness in Object Detection: + Autonomous Driving when Winter is Coming}, + author={Michaelis, Claudio and Mitzkus, Benjamin and + Geirhos, Robert and Rusak, Evgenia and + Bringmann, Oliver and Ecker, Alexander S. and + Bethge, Matthias and Brendel, Wieland}, + journal={arXiv:1907.07484}, + year={2019} +} +``` + +![image corruption example](../demo/corruptions_sev_3.png) + +## About the benchmark + +To submit results to the benchmark please visit the [benchmark homepage](https://github.com/bethgelab/robust-detection-benchmark) + +The benchmark is modelled after the [imagenet-c benchmark](https://github.com/hendrycks/robustness) which was originally +published in [Benchmarking Neural Network Robustness to Common Corruptions and Perturbations](https://arxiv.org/abs/1903.12261) (ICLR 2019) by Dan Hendrycks and Thomas Dietterich. + +The image corruption functions are included in this library but can be installed separately using: + +```shell +pip install imagecorruptions +``` + +Compared to imagenet-c a few changes had to be made to handle images of arbitrary size and greyscale images. +We also modfied the 'motion blur' and 'snow' corruptions to remove dependency from a linux specific library, +which would have to be installed separately otherwise. For details please refer to the [imagecorruptions repository](https://github.com/bethgelab/imagecorruptions). + +## Inference with pretrained models + +We provide a testing script to evaluate a models performance on any combination of the corruptions provided in the benchmark. 
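+
+For reference, the corruptions themselves come from the `imagecorruptions` package mentioned above, which exposes a `corrupt` function operating on HxWxC uint8 images. The following is a minimal sketch of applying a single corruption outside the testing script; the image path and the chosen corruption/severity are arbitrary examples.
+
+```python
+import mmcv
+from imagecorruptions import corrupt
+
+# load an image as an HxWxC uint8 array and apply one corruption to it
+img = mmcv.imread('demo/demo.jpg')
+corrupted = corrupt(img, corruption_name='gaussian_noise', severity=3)
+mmcv.imwrite(corrupted, 'demo_gaussian_noise_sev3.jpg')
+```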
+
+### Test a dataset
+
+- [x] single GPU testing
+- [ ] multiple GPU testing
+- [ ] visualize detection results
+
+You can use the following commands to test a model's performance under the 15 corruptions used in the benchmark.
+
+```shell
+# single-gpu testing
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}]
+```
+
+Alternatively, different groups of corruptions can be selected.
+
+```shell
+# noise
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions noise
+
+# blur
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions blur
+
+# weather
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions weather
+
+# digital
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions digital
+```
+
+Or a custom set of corruptions, e.g.:
+```shell
+# gaussian noise, zoom blur and snow
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions gaussian_noise zoom_blur snow
+```
+
+Finally, the corruption severities to evaluate can be chosen.
+Severity 0 corresponds to clean data and the effect increases from 1 to 5.
+
+```shell
+# severity 1
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 1
+
+# severities 0,2,4
+python tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 0 2 4
+```
+
+## Results for modelzoo models
+
+The results on COCO 2017val are shown in the table below.
+
+Model | Backbone | Style | Lr schd | box AP clean | box AP corr. | box % | mask AP clean | mask AP corr. | mask % |
+:-----:|:---------:|:-------:|:-------:|:------------:|:------------:|:-----:|:-------------:|:-------------:|:------:|
+Faster R-CNN | R-50-FPN | pytorch | 1x | 36.3 | 18.2 | 50.2 | - | - | - |
+Faster R-CNN | R-101-FPN | pytorch | 1x | 38.5 | 20.9 | 54.2 | - | - | - |
+Faster R-CNN | X-101-32x4d-FPN | pytorch | 1x | 40.1 | 22.3 | 55.5 | - | - | - |
+Faster R-CNN | X-101-64x4d-FPN | pytorch | 1x | 41.3 | 23.4 | 56.6 | - | - | - |
+Faster R-CNN | R-50-FPN-DCN | pytorch | 1x | 40.0 | 22.4 | 56.1 | - | - | - |
+Faster R-CNN | X-101-32x4d-FPN-DCN | pytorch | 1x | 43.4 | 26.7 | 61.6 | - | - | - |
+Mask R-CNN | R-50-FPN | pytorch | 1x | 37.3 | 18.7 | 50.1 | 34.2 | 16.8 | 49.1 |
+Mask R-CNN | R-50-FPN-DCN | pytorch | 1x | 41.1 | 23.3 | 56.7 | 37.2 | 20.7 | 55.7 |
+Cascade R-CNN | R-50-FPN | pytorch | 1x | 40.4 | 20.1 | 49.7 | - | - | - |
+Cascade Mask R-CNN | R-50-FPN | pytorch | 1x | 41.2 | 20.7 | 50.2 | 35.7 | 17.6 | 49.3 |
+RetinaNet | R-50-FPN | pytorch | 1x | 35.6 | 17.8 | 50.1 | - | - | - |
+Hybrid Task Cascade | X-101-64x4d-FPN-DCN | pytorch | 1x | 50.6 | 32.7 | 64.7 | 43.8 | 28.1 | 64.0 |
+
+Results may vary slightly due to the stochastic application of the corruptions.
diff --git a/docs/TECHNICAL_DETAILS.md b/docs/TECHNICAL_DETAILS.md
new file mode 100644
index 0000000000..91b0cfb941
--- /dev/null
+++ b/docs/TECHNICAL_DETAILS.md
@@ -0,0 +1,226 @@
+# Technical Details
+
+In this section, we will introduce the main units of training a detector:
+data pipeline, model and iteration pipeline.
+
+## Data pipeline
+
+Following typical conventions, we use `Dataset` and `DataLoader` for data loading
+with multiple workers. `Dataset` returns a dict of data items corresponding to
+the arguments of the model's forward method.
+Since the data in object detection may not be the same size (image size, gt bbox size, etc.),
+we introduce a new `DataContainer` type in MMCV to help collect and distribute
+data of different sizes.
+See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details.
+
+The data preparation pipeline and the dataset are decoupled. Usually a dataset
+defines how to process the annotations and a data pipeline defines all the steps to prepare a data dict.
+A pipeline consists of a sequence of operations. Each operation takes a dict as input and also outputs a dict for the next transform.
+
+We present a classical pipeline in the following figure. The blue blocks are pipeline operations. As the pipeline proceeds, each operator can add new keys (marked as green) to the result dict or update the existing keys (marked as orange).
+![pipeline figure](../demo/data_pipeline.png)
+
+The operations are categorized into data loading, pre-processing, formatting and test-time augmentation.
+
+Here is a pipeline example for Faster R-CNN.
+```python
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+```
+
+For each operation, we list the related dict fields that are added/updated/removed.
+
+### Data loading
+
+`LoadImageFromFile`
+- add: img, img_shape, ori_shape
+
+`LoadAnnotations`
+- add: gt_bboxes, gt_bboxes_ignore, gt_labels, gt_masks, gt_semantic_seg, bbox_fields, mask_fields
+
+`LoadProposals`
+- add: proposals
+
+### Pre-processing
+
+`Resize`
+- add: scale, scale_idx, pad_shape, scale_factor, keep_ratio
+- update: img, img_shape, *bbox_fields, *mask_fields, *seg_fields
+
+`RandomFlip`
+- add: flip
+- update: img, *bbox_fields, *mask_fields, *seg_fields
+
+`Pad`
+- add: pad_fixed_size, pad_size_divisor
+- update: img, pad_shape, *mask_fields, *seg_fields
+
+`RandomCrop`
+- update: img, pad_shape, gt_bboxes, gt_labels, gt_masks, *bbox_fields
+
+`Normalize`
+- add: img_norm_cfg
+- update: img
+
+`SegRescale`
+- update: gt_semantic_seg
+
+`PhotoMetricDistortion`
+- update: img
+
+`Expand`
+- update: img, gt_bboxes
+
+`MinIoURandomCrop`
+- update: img, gt_bboxes, gt_labels
+
+`Corrupt`
+- update: img
+
+### Formatting
+
+`ToTensor`
+- update: specified by `keys`.
+
+`ImageToTensor`
+- update: specified by `keys`.
+
+`Transpose`
+- update: specified by `keys`.
+
+`ToDataContainer`
+- update: specified by `fields`.
+
+`DefaultFormatBundle`
+- update: img, proposals, gt_bboxes, gt_bboxes_ignore, gt_labels, gt_masks, gt_semantic_seg
+
+`Collect`
+- add: img_meta (the keys of img_meta are specified by `meta_keys`)
+- remove: all other keys except for those specified by `keys`
+
+### Test time augmentation
+
+`MultiScaleFlipAug`
+
+## Model
+
+In MMDetection, model components are basically categorized as 4 types.
+
+- backbone: usually an FCN to extract feature maps, e.g., ResNet.
+- neck: the part between backbones and heads, e.g., FPN, ASPP.
+- head: the part for specific tasks, e.g., bbox prediction and mask prediction.
+- roi extractor: the part for extracting features from feature maps, e.g., RoI Align.
+
+We also implement some general detection pipelines with the above components,
+such as `SingleStageDetector` and `TwoStageDetector`.
+
+### Build a model with basic components
+
+Following some basic pipelines (e.g., two-stage detectors), the model structure
+can be customized through config files with minimal effort.
+
+If we want to implement some new components, e.g., the path aggregation
+FPN structure in [Path Aggregation Network for Instance Segmentation](https://arxiv.org/abs/1803.01534), there are three things to do.
+
+1. Create a new file in `mmdet/models/necks/pafpn.py`.
+
+    ```python
+    import torch.nn as nn
+
+    from ..registry import NECKS
+
+    @NECKS.register_module
+    class PAFPN(nn.Module):
+
+        def __init__(self,
+                     in_channels,
+                     out_channels,
+                     num_outs,
+                     start_level=0,
+                     end_level=-1,
+                     add_extra_convs=False):
+            pass
+
+        def forward(self, inputs):
+            # implementation is ignored
+            pass
+    ```
+
+2. Import the module in `mmdet/models/necks/__init__.py`.
+
+    ```python
+    from .pafpn import PAFPN
+    ```
+
+3. Modify the config file from
+
+    ```python
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5)
+    ```
+
+    to
+
+    ```python
+    neck=dict(
+        type='PAFPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5)
+    ```
+
+We will release more components (backbones, necks, heads) for research purposes.
+
+### Write a new model
+
+To write a new detection pipeline, you need to inherit from `BaseDetector`,
+which defines the following abstract methods.
+
+- `extract_feat()`: given an image batch of shape (n, c, h, w), extract the feature map(s).
+- `forward_train()`: forward method of the training mode
+- `simple_test()`: single scale testing without augmentation
+- `aug_test()`: testing with augmentation (multi-scale, flip, etc.)
+
+[TwoStageDetector](https://github.com/hellock/mmdetection/blob/master/mmdet/models/detectors/two_stage.py)
+is a good example which shows how to do that.
+
+## Iteration pipeline
+
+We adopt distributed training for both single machine and multiple machines.
+Supposing that the server has 8 GPUs, 8 processes will be started and each process runs on a single GPU.
+
+Each process keeps an isolated model, data loader, and optimizer.
+Model parameters are only synchronized once at the beginning.
+After a forward and backward pass, gradients will be allreduced among all GPUs,
+and the optimizer will update the model parameters.
+Since the gradients are allreduced, the model parameters stay the same for all processes after the iteration.
+
+## Other information
+
+For more information, please refer to our [technical report](https://arxiv.org/abs/1906.07155).
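+
+As a complement to the data pipeline section above, the following is a minimal sketch of what a custom dict-in/dict-out pipeline operation could look like.
+The class name and the added key are illustrative assumptions rather than part of this codebase; in practice such a class would also need to be registered and referenced from a config like the built-in operations.
+
+```python
+import numpy as np
+
+
+class HypotheticalChannelSwap(object):
+    """Illustrative pipeline operation: takes a result dict and returns a dict."""
+
+    def __call__(self, results):
+        img = results['img']
+        # update an existing key: reverse the channel order (e.g. BGR -> RGB)
+        results['img'] = np.ascontiguousarray(img[..., ::-1])
+        # add a new key so that later operations (e.g. Collect) could use it
+        results['channel_swapped'] = True
+        return results
+
+    def __repr__(self):
+        return self.__class__.__name__
+```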
diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000000..aad51b6ae3 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,70 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +# -- Project information ----------------------------------------------------- + +project = 'MMDetection' +copyright = '2018-2020, OpenMMLab' +author = 'OpenMMLab' + +# The full version, including alpha/beta/rc tags +release = '1.0.0' + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'recommonmark', + 'sphinx_markdown_tables', +] + +autodoc_mock_imports = ['torch', 'torchvision', 'mmcv'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +# The master toctree document. +master_doc = 'index' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000000..6e56b1432e --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,19 @@ +Welcome to MMDetection's documentation! +======================================= + +.. toctree:: + :maxdepth: 2 + + INSTALL.md + GETTING_STARTED.md + MODEL_ZOO.md + TECHNICAL_DETAILS.md + CHANGELOG.md + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000000..2119f51099 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000000..89fbf86c01 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,4 @@ +recommonmark +sphinx +sphinx_markdown_tables +sphinx_rtd_theme diff --git a/mmdet3d/__init__.py b/mmdet3d/__init__.py new file mode 100644 index 0000000000..1c4f7e8fcc --- /dev/null +++ b/mmdet3d/__init__.py @@ -0,0 +1,3 @@ +from .version import __version__, short_version + +__all__ = ['__version__', 'short_version'] diff --git a/mmdet3d/apis/__init__.py b/mmdet3d/apis/__init__.py new file mode 100644 index 0000000000..4833e520f4 --- /dev/null +++ b/mmdet3d/apis/__init__.py @@ -0,0 +1,5 @@ +from .train import train_detector + +__all__ = [ + 'train_detector', +] diff --git a/mmdet3d/apis/train.py b/mmdet3d/apis/train.py new file mode 100644 index 0000000000..d85f6c1784 --- /dev/null +++ b/mmdet3d/apis/train.py @@ -0,0 +1,199 @@ +import torch +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import DistSamplerSeedHook, Runner + +from mmdet3d.core import build_optimizer +from mmdet3d.datasets import build_dataloader, build_dataset +from mmdet.apis.train import parse_losses +from mmdet.core import (DistEvalHook, DistOptimizerHook, EvalHook, + Fp16OptimizerHook) +from mmdet.utils import get_root_logger + + +def batch_processor(model, data, train_mode): + """Process a data batch. + + This method is required as an argument of Runner, which defines how to + process a data batch and obtain proper outputs. The first 3 arguments of + batch_processor are fixed. + + Args: + model (nn.Module): A PyTorch model. + data (dict): The data batch in a dict. + train_mode (bool): Training mode or not. It may be useless for some + models. + + Returns: + dict: A dict containing losses and log vars. 
+ """ + losses = model(**data) + loss, log_vars = parse_losses(losses) + + if 'img_meta' in data: + num_samples = len(data['img_meta'].data) + else: + num_samples = len(data['img'].data) + outputs = dict(loss=loss, log_vars=log_vars, num_samples=num_samples) + + return outputs + + +def train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # start training + if distributed: + _dist_train( + model, + dataset, + cfg, + validate=validate, + logger=logger, + timestamp=timestamp, + meta=meta) + else: + _non_dist_train( + model, + dataset, + cfg, + validate=validate, + logger=logger, + timestamp=timestamp, + meta=meta) + + +def _dist_train(model, + dataset, + cfg, + validate=False, + logger=None, + timestamp=None, + meta=None): + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + dist=True, + seed=cfg.seed) for ds in dataset + ] + # put model on gpus + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + runner = Runner( + model, + batch_processor, + optimizer, + cfg.work_dir, + logger=logger, + meta=meta) + # an ugly walkaround to make the .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config, + **fp16_cfg) + else: + optimizer_config = DistOptimizerHook(**cfg.optimizer_config) + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config) + runner.register_hook(DistSamplerSeedHook()) + # register eval hooks + if validate: + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=True, + shuffle=False) + eval_cfg = cfg.get('evaluation', {}) + runner.register_hook(DistEvalHook(val_dataloader, **eval_cfg)) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow, cfg.total_epochs) + + +def _non_dist_train(model, + dataset, + cfg, + validate=False, + logger=None, + timestamp=None, + meta=None): + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + cfg.gpus, + dist=False, + seed=cfg.seed) for ds in dataset + ] + # put model on gpus + model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + runner = Runner( + model, + batch_processor, + optimizer, + cfg.work_dir, + logger=logger, + meta=meta) + # an ugly walkaround to make the .log and .log.json filenames the same + runner.timestamp = timestamp + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + 
**cfg.optimizer_config, **fp16_cfg, distributed=False) + else: + optimizer_config = cfg.optimizer_config + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config) + + # register eval hooks + if validate: + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=False, + shuffle=False) + eval_cfg = cfg.get('evaluation', {}) + runner.register_hook(EvalHook(val_dataloader, **eval_cfg)) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow, cfg.total_epochs) diff --git a/mmdet3d/core/__init__.py b/mmdet3d/core/__init__.py new file mode 100644 index 0000000000..1d7bf519c1 --- /dev/null +++ b/mmdet3d/core/__init__.py @@ -0,0 +1,8 @@ +from .anchor import * # noqa: F401, F403 +from .bbox import * # noqa: F401, F403 +from .evaluation import * # noqa: F401, F403 +from .optimizer import * # noqa: F401, F403 +from .post_processing import * # noqa: F401, F403 +from .utils import * # noqa: F401, F403 + +# from .voxel import * # noqa: F401, F403 diff --git a/mmdet3d/core/anchor/__init__.py b/mmdet3d/core/anchor/__init__.py new file mode 100644 index 0000000000..4693853eb8 --- /dev/null +++ b/mmdet3d/core/anchor/__init__.py @@ -0,0 +1,19 @@ +from .anchor_generator import (AlignedAnchorGeneratorRange, AnchorGenerator, + AnchorGeneratorRange) + +__all__ = [ + 'AnchorGenerator', 'anchor_inside_flags', 'images_to_levels', 'unmap', + 'AlignedAnchorGeneratorRange', 'AnchorGeneratorRange', + 'build_anchor_generator' +] + + +def build_anchor_generator(cfg, **kwargs): + from . import anchor_generator + import mmcv + if isinstance(cfg, dict): + return mmcv.runner.obj_from_dict( + cfg, anchor_generator, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) diff --git a/mmdet3d/core/anchor/anchor_generator.py b/mmdet3d/core/anchor/anchor_generator.py new file mode 100644 index 0000000000..21af0f92dc --- /dev/null +++ b/mmdet3d/core/anchor/anchor_generator.py @@ -0,0 +1,288 @@ +import torch + + +class AnchorGenerator(object): + """ + Examples: + >>> from mmdet.core import AnchorGenerator + >>> self = AnchorGenerator(9, [1.], [1.]) + >>> all_anchors = self.grid_anchors((2, 2), device='cpu') + >>> print(all_anchors) + tensor([[ 0., 0., 8., 8.], + [16., 0., 24., 8.], + [ 0., 16., 8., 24.], + [16., 16., 24., 24.]]) + """ + + def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None): + self.base_size = base_size + self.scales = torch.Tensor(scales) + self.ratios = torch.Tensor(ratios) + self.scale_major = scale_major + self.ctr = ctr + self.base_anchors = self.gen_base_anchors() + + @property + def num_base_anchors(self): + return self.base_anchors.size(0) + + def gen_base_anchors(self): + w = self.base_size + h = self.base_size + + h_ratios = torch.sqrt(self.ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * self.scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * self.scales[None, :]).view(-1) + else: + ws = (w * self.scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * self.scales[:, None] * h_ratios[None, :]).view(-1) + + # yapf: disable + base_anchors = torch.stack( + [ + -0.5 * ws, -0.5 * hs, + 0.5 * ws, 0.5 * hs + ], + dim=-1) + # yapf: enable + return base_anchors + + def _meshgrid(self, x, y, row_major=True): + xx 
= x.repeat(len(y)) + yy = y.view(-1, 1).repeat(1, len(x)).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_anchors(self, featmap_size, stride=16, device='cuda'): + base_anchors = self.base_anchors.to(device) + + feat_h, feat_w = featmap_size + shift_x = torch.arange(0, feat_w, device=device) * stride + shift_y = torch.arange(0, feat_h, device=device) * stride + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + shifts = shifts.type_as(base_anchors) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + return all_anchors + + def valid_flags(self, featmap_size, valid_size, device='cuda'): + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.uint8, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.uint8, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + valid = valid[:, None].expand( + valid.size(0), self.num_base_anchors).contiguous().view(-1).bool() + return valid + + +class AnchorGeneratorRange(object): + + def __init__(self, + anchor_ranges, + sizes=((1.6, 3.9, 1.56), ), + stride=2, + rotations=(0, 3.1415926 / 2), + custom_values=(), + cache_anchor=False): + self.sizes = sizes + self.stride = stride + self.anchor_ranges = anchor_ranges + if len(anchor_ranges) != len(sizes): + self.anchor_ranges = anchor_ranges * len(sizes) + self.rotations = rotations + self.custom_values = custom_values + self.cache_anchor = cache_anchor + self.cached_anchors = None + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'anchor_range={}, '.format(self.anchor_ranges) + s += 'stride={}, '.format(self.stride) + s += 'sizes={}, '.format(self.sizes) + s += 'rotations={})'.format(self.rotations) + return s + + @property + def num_base_anchors(self): + num_rot = len(self.rotations) + num_size = torch.tensor(self.sizes).reshape(-1, 3).size(0) + return num_rot * num_size + + def grid_anchors(self, feature_map_size, device='cuda'): + # We reimplement the anchor generator using torch in cuda + # torch: 0.6975 s for 1000 times + # numpy: 4.3345 s for 1000 times + # which is ~5 times faster than numpy implementation + if (self.cache_anchor and self.cached_anchors): + return self.cached_anchors + if not isinstance(self.anchor_ranges[0], list): + return self.anchors_single_range( + feature_map_size, + self.anchor_ranges, + self.sizes, + self.rotations, + device=device) + assert len(self.sizes) == len(self.anchor_ranges) + mr_anchors = [] + for anchor_range, anchor_size in zip(self.anchor_ranges, self.sizes): + mr_anchors.append( + self.anchors_single_range( + feature_map_size, + anchor_range, + anchor_size, + self.rotations, + device=device)) + mr_anchors = torch.cat(mr_anchors, dim=-3) + if self.cache_anchor and not self.cached_anchors: + self.cached_anchors = mr_anchors + return mr_anchors + + def anchors_single_range(self, + feature_size, + anchor_range, + sizes=((1.6, 3.9, 1.56), ), + rotations=(0, 3.1415927 / 2), + device='cuda'): + """Generate anchors in a single range + 
Args: + feature_size: list [D, H, W](zyx) + sizes: [N, 3] list of list or array, size of anchors, xyz + + Returns: + anchors: [*feature_size, num_sizes, num_rots, 7] tensor. + """ + if len(feature_size) == 2: + feature_size = [1, feature_size[0], feature_size[1]] + anchor_range = torch.tensor(anchor_range, device=device) + z_centers = torch.linspace( + anchor_range[2], anchor_range[5], feature_size[0], device=device) + y_centers = torch.linspace( + anchor_range[1], anchor_range[4], feature_size[1], device=device) + x_centers = torch.linspace( + anchor_range[0], anchor_range[3], feature_size[2], device=device) + sizes = torch.tensor(sizes, device=device).reshape(-1, 3) + rotations = torch.tensor(rotations, device=device) + + # torch.meshgrid default behavior is 'id', np's default is 'xy' + rets = torch.meshgrid(x_centers, y_centers, z_centers, rotations) + # torch.meshgrid returns a tuple rather than list + rets = list(rets) + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) + + sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = sizes.repeat(tile_size_shape) + rets.insert(3, sizes) + + ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) + # [1, 200, 176, N, 2, 7] for kitti after permute + # ret = ret.reshape(-1, 7) + + if len(self.custom_values) > 0: + custom_ndim = len(self.custom_values) + custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) + # custom[:] = self.custom_values + ret = torch.cat([ret, custom], dim=-1) + # [1, 200, 176, N, 2, 9] for nus dataset after permute + return ret + + +class AlignedAnchorGeneratorRange(AnchorGeneratorRange): + + def __init__(self, shift_center=True, **kwargs): + super(AlignedAnchorGeneratorRange, self).__init__(**kwargs) + self.shift_center = shift_center + + def anchors_single_range(self, + feature_size, + anchor_range, + sizes=((1.6, 3.9, 1.56), ), + rotations=(0, 3.1415927 / 2), + device='cuda'): + """Generate anchors in a single range + Args: + feature_size: list [D, H, W](zyx) + sizes: [N, 3] list of list or array, size of anchors, xyz + + Returns: + anchors: [*feature_size, num_sizes, num_rots, 7] tensor. 
+ """ + if len(feature_size) == 2: + feature_size = [1, feature_size[0], feature_size[1]] + anchor_range = torch.tensor(anchor_range, device=device) + z_centers = torch.linspace( + anchor_range[2], + anchor_range[5], + feature_size[0] + 1, + device=device) + y_centers = torch.linspace( + anchor_range[1], + anchor_range[4], + feature_size[1] + 1, + device=device) + x_centers = torch.linspace( + anchor_range[0], + anchor_range[3], + feature_size[2] + 1, + device=device) + sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * self.stride + rotations = torch.tensor(rotations, device=device) + + # shift the anchor center + if self.shift_center: + z_shift = (z_centers[1] - z_centers[0]) / 2 + y_shift = (y_centers[1] - y_centers[0]) / 2 + x_shift = (x_centers[1] - x_centers[0]) / 2 + z_centers += z_shift + y_centers += y_shift + x_centers += x_shift + + # torch.meshgrid default behavior is 'id', np's default is 'xy' + rets = torch.meshgrid(x_centers[:feature_size[2]], + y_centers[:feature_size[1]], + z_centers[:feature_size[0]], rotations) + + # torch.meshgrid returns a tuple rather than list + rets = list(rets) + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) + + sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = sizes.repeat(tile_size_shape) + rets.insert(3, sizes) + + ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) + # [1, 200, 176, N, 2, 7] for kitti after permute + # ret = ret.reshape(-1, 7) + + if len(self.custom_values) > 0: + custom_ndim = len(self.custom_values) + custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) + # custom[:] = self.custom_values + ret = torch.cat([ret, custom], dim=-1) + # [1, 200, 176, N, 2, 9] for nus dataset after permute + return ret diff --git a/mmdet3d/core/bbox/__init__.py b/mmdet3d/core/bbox/__init__.py new file mode 100644 index 0000000000..95efe671fe --- /dev/null +++ b/mmdet3d/core/bbox/__init__.py @@ -0,0 +1,49 @@ +from . 
import box_torch_ops +from .assigners import AssignResult, BaseAssigner, MaxIoUAssigner +from .coders import ResidualCoder +# from .bbox_target import bbox_target +from .geometry import (bbox_overlaps_2d, bbox_overlaps_3d, + bbox_overlaps_nearest_3d) +from .samplers import (BaseSampler, CombinedSampler, + InstanceBalancedPosSampler, IoUBalancedNegSampler, + PseudoSampler, RandomSampler, SamplingResult) +from .transforms import delta2bbox # bbox2result_kitti, +from .transforms import (bbox2delta, bbox2result_coco, bbox2roi, bbox_flip, + bbox_mapping, bbox_mapping_back, + boxes3d_to_bev_torch_lidar, distance2bbox, roi2bbox) + +from .assign_sampling import ( # isort:skip, avoid recursive imports + build_bbox_coder, # temporally settings + assign_and_sample, build_assigner, build_sampler) + +__all__ = [ + 'BaseAssigner', + 'MaxIoUAssigner', + 'AssignResult', + 'BaseSampler', + 'PseudoSampler', + 'RandomSampler', + 'InstanceBalancedPosSampler', + 'IoUBalancedNegSampler', + 'CombinedSampler', + 'SamplingResult', + 'bbox2delta', + 'delta2bbox', + 'bbox_flip', + 'bbox_mapping', + 'bbox_mapping_back', + 'bbox2roi', + 'roi2bbox', + 'bbox2result_coco', + 'distance2bbox', # 'bbox2result_kitti', + 'build_assigner', + 'build_sampler', + 'assign_and_sample', + 'bbox_overlaps_2d', + 'bbox_overlaps_3d', + 'bbox_overlaps_nearest_3d', + 'box_torch_ops', + 'build_bbox_coder', + 'ResidualCoder', + 'boxes3d_to_bev_torch_lidar' +] diff --git a/mmdet3d/core/bbox/assign_sampling.py b/mmdet3d/core/bbox/assign_sampling.py new file mode 100644 index 0000000000..ed7632984c --- /dev/null +++ b/mmdet3d/core/bbox/assign_sampling.py @@ -0,0 +1,43 @@ +import mmcv + +from . import assigners, coders, samplers + + +def build_assigner(cfg, **kwargs): + if isinstance(cfg, assigners.BaseAssigner): + return cfg + elif isinstance(cfg, dict): + return mmcv.runner.obj_from_dict(cfg, assigners, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) + + +def build_bbox_coder(cfg, **kwargs): + if isinstance(cfg, coders.ResidualCoder): + return cfg + elif isinstance(cfg, dict): + return mmcv.runner.obj_from_dict(cfg, coders, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) + + +def build_sampler(cfg, **kwargs): + if isinstance(cfg, samplers.BaseSampler): + return cfg + elif isinstance(cfg, dict): + return mmcv.runner.obj_from_dict(cfg, samplers, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) + + +def assign_and_sample(bboxes, gt_bboxes, gt_bboxes_ignore, gt_labels, cfg): + bbox_assigner = build_assigner(cfg.assigner) + bbox_sampler = build_sampler(cfg.sampler) + assign_result = bbox_assigner.assign(bboxes, gt_bboxes, gt_bboxes_ignore, + gt_labels) + sampling_result = bbox_sampler.sample(assign_result, bboxes, gt_bboxes, + gt_labels) + return assign_result, sampling_result diff --git a/mmdet3d/core/bbox/assigners/__init__.py b/mmdet3d/core/bbox/assigners/__init__.py new file mode 100644 index 0000000000..594e8406b5 --- /dev/null +++ b/mmdet3d/core/bbox/assigners/__init__.py @@ -0,0 +1,8 @@ +from .approx_max_iou_assigner import ApproxMaxIoUAssigner +from .assign_result import AssignResult +from .base_assigner import BaseAssigner +from .max_iou_assigner import MaxIoUAssigner + +__all__ = [ + 'BaseAssigner', 'MaxIoUAssigner', 'ApproxMaxIoUAssigner', 'AssignResult' +] diff --git a/mmdet3d/core/bbox/assigners/approx_max_iou_assigner.py 
b/mmdet3d/core/bbox/assigners/approx_max_iou_assigner.py new file mode 100644 index 0000000000..e308a1b1c2 --- /dev/null +++ b/mmdet3d/core/bbox/assigners/approx_max_iou_assigner.py @@ -0,0 +1,114 @@ +import torch + +from ..geometry import bbox_overlaps_2d +from .max_iou_assigner import MaxIoUAssigner + + +class ApproxMaxIoUAssigner(MaxIoUAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with `-1`, `0`, or a positive integer + indicating the ground truth index. + + - -1: don't care + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple): IoU threshold for negative bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. + ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. + ignore_wrt_candidates (bool): Whether to compute the iof between + `bboxes` and `gt_bboxes_ignore`, or the contrary. + """ + + def __init__(self, + pos_iou_thr, + neg_iou_thr, + min_pos_iou=.0, + gt_max_assign_all=True, + ignore_iof_thr=-1, + ignore_wrt_candidates=True): + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + + def assign(self, + approxs, + squares, + approxs_per_octave, + gt_bboxes, + gt_bboxes_ignore=None, + gt_labels=None): + """Assign gt to approxs. + + This method assign a gt bbox to each group of approxs (bboxes), + each group of approxs is represent by a base approx (bbox) and + will be assigned with -1, 0, or a positive number. + -1 means don't care, 0 means negative sample, + positive number is the index (1-based) of assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every bbox to -1 + 2. use the max IoU of each group of approxs to assign + 2. assign proposals whose iou with all gts < neg_iou_thr to 0 + 3. for each bbox, if the iou with its nearest gt >= pos_iou_thr, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals (may be more than + one) to itself + + Args: + approxs (Tensor): Bounding boxes to be assigned, + shape(approxs_per_octave*n, 4). + squares (Tensor): Base Bounding boxes to be assigned, + shape(n, 4). + approxs_per_octave (int): number of approxs per octave + gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`, e.g., crowd boxes in COCO. + gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result. 
+ """ + + if squares.shape[0] == 0 or gt_bboxes.shape[0] == 0: + raise ValueError('No gt or approxs') + num_squares = squares.size(0) + num_gts = gt_bboxes.size(0) + # re-organize anchors by approxs_per_octave x num_squares + approxs = torch.transpose( + approxs.view(num_squares, approxs_per_octave, 4), 0, + 1).contiguous().view(-1, 4) + all_overlaps = bbox_overlaps_2d(approxs, gt_bboxes) + + overlaps, _ = all_overlaps.view(approxs_per_octave, num_squares, + num_gts).max(dim=0) + overlaps = torch.transpose(overlaps, 0, 1) + + bboxes = squares[:, :4] + + if (self.ignore_iof_thr > 0) and (gt_bboxes_ignore is not None) and ( + gt_bboxes_ignore.numel() > 0): + if self.ignore_wrt_candidates: + ignore_overlaps = bbox_overlaps_2d( + bboxes, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + else: + ignore_overlaps = bbox_overlaps_2d( + gt_bboxes_ignore, bboxes, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) + overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 + + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + return assign_result diff --git a/mmdet3d/core/bbox/assigners/assign_result.py b/mmdet3d/core/bbox/assigners/assign_result.py new file mode 100644 index 0000000000..33c761dde2 --- /dev/null +++ b/mmdet3d/core/bbox/assigners/assign_result.py @@ -0,0 +1,19 @@ +import torch + + +class AssignResult(object): + + def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): + self.num_gts = num_gts + self.gt_inds = gt_inds + self.max_overlaps = max_overlaps + self.labels = labels + + def add_gt_(self, gt_labels): + self_inds = torch.arange( + 1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device) + self.gt_inds = torch.cat([self_inds, self.gt_inds]) + self.max_overlaps = torch.cat( + [self.max_overlaps.new_ones(self.num_gts), self.max_overlaps]) + if self.labels is not None: + self.labels = torch.cat([gt_labels, self.labels]) diff --git a/mmdet3d/core/bbox/assigners/base_assigner.py b/mmdet3d/core/bbox/assigners/base_assigner.py new file mode 100644 index 0000000000..7bd02dce14 --- /dev/null +++ b/mmdet3d/core/bbox/assigners/base_assigner.py @@ -0,0 +1,8 @@ +from abc import ABCMeta, abstractmethod + + +class BaseAssigner(metaclass=ABCMeta): + + @abstractmethod + def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): + pass diff --git a/mmdet3d/core/bbox/assigners/max_iou_assigner.py b/mmdet3d/core/bbox/assigners/max_iou_assigner.py new file mode 100644 index 0000000000..53e3df1307 --- /dev/null +++ b/mmdet3d/core/bbox/assigners/max_iou_assigner.py @@ -0,0 +1,169 @@ +import torch + +from .. import geometry +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +class MaxIoUAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with `-1`, `0`, or a positive integer + indicating the ground truth index. + + - -1: don't care + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple): IoU threshold for negative bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. 
+ ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. + ignore_wrt_candidates (bool): Whether to compute the iof between + `bboxes` and `gt_bboxes_ignore`, or the contrary. + """ + + def __init__(self, + pos_iou_thr, + neg_iou_thr, + min_pos_iou=.0, + gt_max_assign_all=True, + ignore_iof_thr=-1, + iou_type='2d', + ignore_wrt_candidates=True): + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + # iou_type could be 2d, 3d, nearest_3d + self.iou_type = iou_type + self.bbox_overlaps = getattr(geometry, + 'bbox_overlaps_{}'.format(iou_type)) + + def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): + """Assign gt to bboxes. + + This method assign a gt bbox to every bbox (proposal/anchor), each bbox + will be assigned with -1, 0, or a positive number. -1 means don't care, + 0 means negative sample, positive number is the index (1-based) of + assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every bbox to -1 + 2. assign proposals whose iou with all gts < neg_iou_thr to 0 + 3. for each bbox, if the iou with its nearest gt >= pos_iou_thr, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals (may be more than + one) to itself + + Args: + bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4). + gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`, e.g., crowd boxes in COCO. + gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result. + """ + if self.iou_type == '2d': + bboxes = bboxes[:, :4] + overlaps = self.bbox_overlaps(gt_bboxes, bboxes) + if (self.ignore_iof_thr > 0) and (gt_bboxes_ignore is not None) and ( + gt_bboxes_ignore.numel() > 0): + if self.ignore_wrt_candidates: + ignore_overlaps = self.bbox_overlaps( + bboxes, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + else: + ignore_overlaps = self.bbox_overlaps( + gt_bboxes_ignore, bboxes, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) + overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 + + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + return assign_result + + def assign_wrt_overlaps(self, overlaps, gt_labels=None): + """Assign w.r.t. the overlaps of bboxes with gts. + + Args: + overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes, + shape(k, n). + gt_labels (Tensor, optional): Labels of k gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result. + """ + num_gts, num_bboxes = overlaps.size(0), overlaps.size(1) + + # 1. 
assign -1 by default + assigned_gt_inds = overlaps.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = overlaps.new_zeros((num_bboxes, )) + if num_gts == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + if gt_labels is None: + assigned_labels = None + else: + assigned_labels = overlaps.new_zeros((num_bboxes, ), + dtype=torch.long) + return AssignResult( + num_gts, + assigned_gt_inds, + max_overlaps, + labels=assigned_labels) + + # for each anchor, which gt best overlaps with it + # for each anchor, the max iou of all gts + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + # for each gt, which anchor best overlaps with it + # for each gt, the max iou of all proposals + gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1) + # 2. assign negative: below + if isinstance(self.neg_iou_thr, float): + assigned_gt_inds[(max_overlaps >= 0) + & (max_overlaps < self.neg_iou_thr)] = 0 + elif isinstance(self.neg_iou_thr, tuple): + assert len(self.neg_iou_thr) == 2 + assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0]) + & (max_overlaps < self.neg_iou_thr[1])] = 0 + + # 3. assign positive: above positive IoU threshold + pos_inds = max_overlaps >= self.pos_iou_thr + assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1 + + # 4. assign fg: for each gt, proposals with highest IoU + for i in range(num_gts): + if gt_max_overlaps[i] >= self.min_pos_iou: + if self.gt_max_assign_all: + max_iou_inds = overlaps[i, :] == gt_max_overlaps[i] + assigned_gt_inds[max_iou_inds] = i + 1 + else: + assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1 + + if gt_labels is not None: + assigned_labels = assigned_gt_inds.new_zeros((num_bboxes, )) + pos_inds = torch.nonzero(assigned_gt_inds > 0).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[ + assigned_gt_inds[pos_inds] - 1] + else: + assigned_labels = None + + return AssignResult( + num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels) diff --git a/mmdet3d/core/bbox/box_np_ops.py b/mmdet3d/core/bbox/box_np_ops.py new file mode 100644 index 0000000000..966905e19c --- /dev/null +++ b/mmdet3d/core/bbox/box_np_ops.py @@ -0,0 +1,568 @@ +import numba +import numpy as np + + +def camera_to_lidar(points, r_rect, velo2cam): + points_shape = list(points.shape[0:-1]) + if points.shape[-1] == 3: + points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) + lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T) + return lidar_points[..., :3] + + +def box_camera_to_lidar(data, r_rect, velo2cam): + xyz = data[:, 0:3] + l, h, w = data[:, 3:4], data[:, 4:5], data[:, 5:6] + r = data[:, 6:7] + xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam) + return np.concatenate([xyz_lidar, w, l, h, r], axis=1) + + +def corners_nd(dims, origin=0.5): + """generate relative box corners based on length per dim and + origin point. + + Args: + dims (float array, shape=[N, ndim]): array of length per dim + origin (list or array or float): origin point relate to smallest point. + + Returns: + float array, shape=[N, 2 ** ndim, ndim]: returned corners. 
+ point layout example: (2d) x0y0, x0y1, x1y0, x1y1; + (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + where x0 < x1, y0 < y1, z0 < z1 + """ + ndim = int(dims.shape[1]) + corners_norm = np.stack( + np.unravel_index(np.arange(2**ndim), [2] * ndim), + axis=1).astype(dims.dtype) + # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 + # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + # so need to convert to a format which is convenient to do other computing. + # for 2d boxes, format is clockwise start with minimum point + # for 3d boxes, please draw lines by your hand. + if ndim == 2: + # generate clockwise box corners + corners_norm = corners_norm[[0, 1, 3, 2]] + elif ndim == 3: + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) + corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape( + [1, 2**ndim, ndim]) + return corners + + +def rotation_2d(points, angles): + """rotation 2d points based on origin point clockwise when angle positive. + + Args: + points (float array, shape=[N, point_size, 2]): points to be rotated. + angles (float array, shape=[N]): rotation angle. + + Returns: + float array: same shape as points + """ + rot_sin = np.sin(angles) + rot_cos = np.cos(angles) + rot_mat_T = np.stack([[rot_cos, -rot_sin], [rot_sin, rot_cos]]) + return np.einsum('aij,jka->aik', points, rot_mat_T) + + +def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): + """convert kitti locations, dimensions and angles to corners. + format: center(xy), dims(xy), angles(clockwise when positive) + + Args: + centers (float array, shape=[N, 2]): locations in kitti label file. + dims (float array, shape=[N, 2]): dimensions in kitti label file. + angles (float array, shape=[N]): rotation_y in kitti label file. + + Returns: + [type]: [description] + """ + # 'length' in kitti format is in x axis. + # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. 
+ corners = corners_nd(dims, origin=origin) + # corners: [N, 4, 2] + if angles is not None: + corners = rotation_2d(corners, angles) + corners += centers.reshape([-1, 1, 2]) + return corners + + +@numba.jit(nopython=True) +def depth_to_points(depth, trunc_pixel): + num_pts = np.sum(depth[trunc_pixel:, ] > 0.1) + points = np.zeros((num_pts, 3), dtype=depth.dtype) + x = np.array([0, 0, 1], dtype=depth.dtype) + k = 0 + for i in range(trunc_pixel, depth.shape[0]): + for j in range(depth.shape[1]): + if depth[i, j] > 0.1: + x = np.array([j, i, 1], dtype=depth.dtype) + points[k] = x * depth[i, j] + k += 1 + return points + + +def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam): + pts = depth_to_points(depth, trunc_pixel) + points_shape = list(pts.shape[0:-1]) + points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1) + points = points @ np.linalg.inv(P2.T) + lidar_points = camera_to_lidar(points, r_rect, velo2cam) + return lidar_points + + +def rotation_3d_in_axis(points, angles, axis=0): + # points: [N, point_size, 3] + rot_sin = np.sin(angles) + rot_cos = np.cos(angles) + ones = np.ones_like(rot_cos) + zeros = np.zeros_like(rot_cos) + if axis == 1: + rot_mat_T = np.stack([[rot_cos, zeros, -rot_sin], [zeros, ones, zeros], + [rot_sin, zeros, rot_cos]]) + elif axis == 2 or axis == -1: + rot_mat_T = np.stack([[rot_cos, -rot_sin, zeros], + [rot_sin, rot_cos, zeros], [zeros, zeros, ones]]) + elif axis == 0: + rot_mat_T = np.stack([[zeros, rot_cos, -rot_sin], + [zeros, rot_sin, rot_cos], [ones, zeros, zeros]]) + else: + raise ValueError('axis should in range') + + return np.einsum('aij,jka->aik', points, rot_mat_T) + + +def center_to_corner_box3d(centers, + dims, + angles=None, + origin=(0.5, 1.0, 0.5), + axis=1): + """convert kitti locations, dimensions and angles to corners + + Args: + centers (float array, shape=[N, 3]): locations in kitti label file. + dims (float array, shape=[N, 3]): dimensions in kitti label file. + angles (float array, shape=[N]): rotation_y in kitti label file. + origin (list or array or float): origin point relate to smallest point. + use [0.5, 1.0, 0.5] in camera and [0.5, 0.5, 0] in lidar. + axis (int): rotation axis. 1 for camera and 2 for lidar. + Returns: + [type]: [description] + """ + # 'length' in kitti format is in x axis. + # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. 
+ corners = corners_nd(dims, origin=origin) + # corners: [N, 8, 3] + if angles is not None: + corners = rotation_3d_in_axis(corners, angles, axis=axis) + corners += centers.reshape([-1, 1, 3]) + return corners + + +@numba.jit(nopython=True) +def box2d_to_corner_jit(boxes): + num_box = boxes.shape[0] + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape( + 1, 4, 2) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) + for i in range(num_box): + rot_sin = np.sin(boxes[i, -1]) + rot_cos = np.cos(boxes[i, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] + return box_corners + + +@numba.njit +def corner_to_standup_nd_jit(boxes_corner): + num_boxes = boxes_corner.shape[0] + ndim = boxes_corner.shape[-1] + result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) + for i in range(num_boxes): + for j in range(ndim): + result[i, j] = np.min(boxes_corner[i, :, j]) + for j in range(ndim): + result[i, j + ndim] = np.max(boxes_corner[i, :, j]) + return result + + +@numba.jit(nopython=True) +def corner_to_surfaces_3d_jit(corners): + """convert 3d box corners from corner function above + to surfaces that normal vectors all direct to internal. + + Args: + corners (float array, [N, 8, 3]): 3d box corners. + Returns: + surfaces (float array, [N, 6, 4, 3]): + """ + # box_corners: [N, 8, 3], must from corner functions in this module + num_boxes = corners.shape[0] + surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) + corner_idxes = np.array([ + 0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7 + ]).reshape(6, 4) + for i in range(num_boxes): + for j in range(6): + for k in range(4): + surfaces[i, j, k] = corners[i, corner_idxes[j, k]] + return surfaces + + +def rotation_points_single_angle(points, angle, axis=0): + # points: [N, 3] + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + if axis == 1: + rot_mat_T = np.array( + [[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]], + dtype=points.dtype) + elif axis == 2 or axis == -1: + rot_mat_T = np.array( + [[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]], + dtype=points.dtype) + elif axis == 0: + rot_mat_T = np.array( + [[1, 0, 0], [0, rot_cos, -rot_sin], [0, rot_sin, rot_cos]], + dtype=points.dtype) + else: + raise ValueError('axis should in range') + + return points @ rot_mat_T, rot_mat_T + + +def project_to_image(points_3d, proj_mat): + points_shape = list(points_3d.shape) + points_shape[-1] = 1 + points_4 = np.concatenate([points_3d, np.zeros(points_shape)], axis=-1) + point_2d = points_4 @ proj_mat.T + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] + return point_2d_res + + +def box3d_to_bbox(box3d, rect, Trv2c, P2): + box_corners = center_to_corner_box3d( + box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1) + box_corners_in_image = project_to_image(box_corners, P2) + # box_corners_in_image: [N, 8, 2] + minxy = np.min(box_corners_in_image, axis=1) + maxxy = np.max(box_corners_in_image, axis=1) + bbox = np.concatenate([minxy, maxxy], axis=1) + return bbox + + +def corner_to_surfaces_3d(corners): + """convert 3d box corners from corner function above + to surfaces that normal vectors all 
direct to internal. + + Args: + corners (float array, [N, 8, 3]): 3d box corners. + Returns: + surfaces (float array, [N, 6, 4, 3]): + """ + # box_corners: [N, 8, 3], must from corner functions in this module + surfaces = np.array([ + [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]], + [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]], + [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]], + [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]], + [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]], + [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]], + ]).transpose([2, 0, 1, 3]) + return surfaces + + +def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)): + # TODO: this function is different from PointCloud3D, be careful + # when start to use nuscene, check the input + rbbox_corners = center_to_corner_box3d( + rbbox[:, :3], rbbox[:, 3:6], rbbox[:, 6], origin=origin, axis=z_axis) + surfaces = corner_to_surfaces_3d(rbbox_corners) + indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + return indices + + +def minmax_to_corner_2d(minmax_box): + ndim = minmax_box.shape[-1] // 2 + center = minmax_box[..., :ndim] + dims = minmax_box[..., ndim:] - center + return center_to_corner_box2d(center, dims, origin=0.0) + + +def limit_period(val, offset=0.5, period=np.pi): + return val - np.floor(val / period + offset) * period + + +def create_anchors_3d_range(feature_size, + anchor_range, + sizes=((1.6, 3.9, 1.56), ), + rotations=(0, np.pi / 2), + dtype=np.float32): + """ + Args: + feature_size: list [D, H, W](zyx) + sizes: [N, 3] list of list or array, size of anchors, xyz + + Returns: + anchors: [*feature_size, num_sizes, num_rots, 7] tensor. + """ + anchor_range = np.array(anchor_range, dtype) + z_centers = np.linspace( + anchor_range[2], anchor_range[5], feature_size[0], dtype=dtype) + y_centers = np.linspace( + anchor_range[1], anchor_range[4], feature_size[1], dtype=dtype) + x_centers = np.linspace( + anchor_range[0], anchor_range[3], feature_size[2], dtype=dtype) + sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3]) + rotations = np.array(rotations, dtype=dtype) + rets = np.meshgrid( + x_centers, y_centers, z_centers, rotations, indexing='ij') + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape) + rets[i] = rets[i][..., np.newaxis] # for concat + sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = np.tile(sizes, tile_size_shape) + rets.insert(3, sizes) + ret = np.concatenate(rets, axis=-1) + return np.transpose(ret, [2, 1, 0, 3, 4, 5]) + + +def center_to_minmax_2d_0_5(centers, dims): + return np.concatenate([centers - dims / 2, centers + dims / 2], axis=-1) + + +def center_to_minmax_2d(centers, dims, origin=0.5): + if origin == 0.5: + return center_to_minmax_2d_0_5(centers, dims) + corners = center_to_corner_box2d(centers, dims, origin=origin) + return corners[:, [0, 2]].reshape([-1, 4]) + + +def rbbox2d_to_near_bbox(rbboxes): + """convert rotated bbox to nearest 'standing' or 'lying' bbox. 
+ Args: + rbboxes: [N, 5(x, y, xdim, ydim, rad)] rotated bboxes + Returns: + bboxes: [N, 4(xmin, ymin, xmax, ymax)] bboxes + """ + rots = rbboxes[..., -1] + rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) + cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] + bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) + bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) + return bboxes + + +@numba.jit(nopython=True) +def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): + """calculate box iou. note that jit version runs ~10x faster than the + box_overlaps function in mmdet3d.core.evaluation + Parameters + ---------- + boxes: (N, 4) ndarray of float + query_boxes: (K, 4) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) * + (query_boxes[k, 3] - query_boxes[k, 1] + eps)) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + eps) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + eps) + if ih > 0: + if mode == 'iou': + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps) + box_area - + iw * ih) + else: + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps)) + overlaps[n, k] = iw * ih / ua + return overlaps + + +def change_box3d_center_(box3d, src, dst): + dst = np.array(dst, dtype=box3d.dtype) + src = np.array(src, dtype=box3d.dtype) + box3d[..., :3] += box3d[..., 3:6] * (dst - src) + + +def projection_matrix_to_CRT_kitti(proj): + # P = C @ [R|T] + # C is upper triangular matrix, so we need to inverse CR and use QR + # stable for all kitti camera projection matrix + CR = proj[0:3, 0:3] + CT = proj[0:3, 3] + RinvCinv = np.linalg.inv(CR) + Rinv, Cinv = np.linalg.qr(RinvCinv) + C = np.linalg.inv(Cinv) + R = np.linalg.inv(Rinv) + T = Cinv @ CT + return C, R, T + + +def remove_outside_points(points, rect, Trv2c, P2, image_shape): + # 5x faster than remove_outside_points_v1(2ms vs 10ms) + C, R, T = projection_matrix_to_CRT_kitti(P2) + image_bbox = [0, 0, image_shape[1], image_shape[0]] + frustum = get_frustum(image_bbox, C) + frustum -= T + frustum = np.linalg.inv(R) @ frustum.T + frustum = camera_to_lidar(frustum.T, rect, Trv2c) + frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...]) + indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces) + points = points[indices.reshape([-1])] + return points + + +def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100): + fku = C[0, 0] + fkv = -C[1, 1] + u0v0 = C[0:2, 2] + z_points = np.array( + [near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis] + b = bbox_image + box_corners = np.array( + [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]], + dtype=C.dtype) + near_box_corners = (box_corners - u0v0) / np.array( + [fku / near_clip, -fkv / near_clip], dtype=C.dtype) + far_box_corners = (box_corners - u0v0) / np.array( + [fku / far_clip, -fkv / far_clip], dtype=C.dtype) + ret_xy = np.concatenate([near_box_corners, far_box_corners], + axis=0) # [8, 2] + ret_xyz = np.concatenate([ret_xy, z_points], axis=1) + return ret_xyz + + +def surface_equ_3d(polygon_surfaces): + # return [a, b, c], d in ax+by+cz+d=0 + # polygon_surfaces: [num_polygon, num_surfaces, 
num_points_of_polygon, 3] + surface_vec = polygon_surfaces[:, :, :2, :] - polygon_surfaces[:, :, + 1:3, :] + # normal_vec: [..., 3] + normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :]) + # print(normal_vec.shape, points[..., 0, :].shape) + # d = -np.inner(normal_vec, points[..., 0, :]) + d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :]) + return normal_vec, -d + + +@numba.njit +def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, + num_surfaces): + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + ret = np.ones((num_points, num_polygons), dtype=np.bool_) + sign = 0.0 + for i in range(num_points): + for j in range(num_polygons): + for k in range(max_num_surfaces): + if k > num_surfaces[j]: + break + sign = ( + points[i, 0] * normal_vec[j, k, 0] + + points[i, 1] * normal_vec[j, k, 1] + + points[i, 2] * normal_vec[j, k, 2] + d[j, k]) + if sign >= 0: + ret[i, j] = False + break + return ret + + +def points_in_convex_polygon_3d_jit(points, + polygon_surfaces, + num_surfaces=None): + """check points is in 3d convex polygons. + Args: + points: [num_points, 3] array. + polygon_surfaces: [num_polygon, max_num_surfaces, + max_num_points_of_surface, 3] + array. all surfaces' normal vector must direct to internal. + max_num_points_of_surface must at least 3. + num_surfaces: [num_polygon] array. indicate how many surfaces + a polygon contain + Returns: + [num_points, num_polygon] bool array. + """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + # num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + if num_surfaces is None: + num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64) + normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :]) + # normal_vec: [num_polygon, max_num_surfaces, 3] + # d: [num_polygon, max_num_surfaces] + return _points_in_convex_polygon_3d_jit(points, polygon_surfaces, + normal_vec, d, num_surfaces) + + +@numba.jit +def points_in_convex_polygon_jit(points, polygon, clockwise=True): + """check points is in 2d convex polygons. True when point in polygon + Args: + points: [num_points, 2] array. + polygon: [num_polygon, num_points_of_polygon, 2] array. + clockwise: bool. indicate polygon is clockwise. + Returns: + [num_points, num_polygon] bool array. 
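+
+    Example (editor's sketch, not part of the original patch; assumes a unit
+    square given in clockwise order):
+        points = np.array([[0.5, 0.5], [2.0, 2.0]])
+        polygon = np.array([[[0., 0.], [0., 1.], [1., 1.], [1., 0.]]])
+        points_in_convex_polygon_jit(points, polygon)
+        # -> [[True], [False]]: only the first point lies inside the polygon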
+ """ + # first convert polygon to directed lines + num_points_of_polygon = polygon.shape[1] + num_points = points.shape[0] + num_polygons = polygon.shape[0] + # if clockwise: + # vec1 = polygon - polygon[:, [num_points_of_polygon - 1] + + # list(range(num_points_of_polygon - 1)), :] + # else: + # vec1 = polygon[:, [num_points_of_polygon - 1] + + # list(range(num_points_of_polygon - 1)), :] - polygon + # vec1: [num_polygon, num_points_of_polygon, 2] + vec1 = np.zeros((2), dtype=polygon.dtype) + ret = np.zeros((num_points, num_polygons), dtype=np.bool_) + success = True + cross = 0.0 + for i in range(num_points): + for j in range(num_polygons): + success = True + for k in range(num_points_of_polygon): + if clockwise: + vec1 = polygon[j, k] - polygon[j, k - 1] + else: + vec1 = polygon[j, k - 1] - polygon[j, k] + cross = vec1[1] * (polygon[j, k, 0] - points[i, 0]) + cross -= vec1[0] * (polygon[j, k, 1] - points[i, 1]) + if cross >= 0: + success = False + break + ret[i, j] = success + return ret diff --git a/mmdet3d/core/bbox/box_torch_ops.py b/mmdet3d/core/bbox/box_torch_ops.py new file mode 100644 index 0000000000..b0d197f4b1 --- /dev/null +++ b/mmdet3d/core/bbox/box_torch_ops.py @@ -0,0 +1,192 @@ +import numpy as np +import torch + + +def limit_period(val, offset=0.5, period=np.pi): + return val - torch.floor(val / period + offset) * period + + +def corners_nd(dims, origin=0.5): + """generate relative box corners based on length per dim and + origin point. + + Args: + dims (float array, shape=[N, ndim]): array of length per dim + origin (list or array or float): origin point relate to smallest point. + + Returns: + float array, shape=[N, 2 ** ndim, ndim]: returned corners. + point layout example: (2d) x0y0, x0y1, x1y0, x1y1; + (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + where x0 < x1, y0 < y1, z0 < z1 + """ + ndim = int(dims.shape[1]) + corners_norm = np.stack( + np.unravel_index(np.arange(2**ndim), [2] * ndim), + axis=1).astype(dims.dtype) + # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 + # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + # so need to convert to a format which is convenient to do other computing. + # for 2d boxes, format is clockwise start with minimum point + # for 3d boxes, please draw lines by your hand. 
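+    # Editor's note: for ndim == 2 the unravel_index order above is
+    # x0y0, x0y1, x1y0, x1y1; the reindexing below rearranges it into the
+    # clockwise order x0y0, x0y1, x1y1, x1y0, starting from the minimum point.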
+ if ndim == 2: + # generate clockwise box corners + corners_norm = corners_norm[[0, 1, 3, 2]] + elif ndim == 3: + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) + corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape( + [1, 2**ndim, ndim]) + return corners + + +def rotation_3d_in_axis(points, angles, axis=0): + # points: [N, point_size, 3] + # angles: [N] + rot_sin = torch.sin(angles) + rot_cos = torch.cos(angles) + ones = torch.ones_like(rot_cos) + zeros = torch.zeros_like(rot_cos) + if axis == 1: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, zeros, -rot_sin]), + torch.stack([zeros, ones, zeros]), + torch.stack([rot_sin, zeros, rot_cos]) + ]) + elif axis == 2 or axis == -1: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, -rot_sin, zeros]), + torch.stack([rot_sin, rot_cos, zeros]), + torch.stack([zeros, zeros, ones]) + ]) + elif axis == 0: + rot_mat_T = torch.stack([ + torch.stack([zeros, rot_cos, -rot_sin]), + torch.stack([zeros, rot_sin, rot_cos]), + torch.stack([ones, zeros, zeros]) + ]) + else: + raise ValueError('axis should in range') + + return torch.einsum('aij,jka->aik', (points, rot_mat_T)) + + +def center_to_corner_box3d(centers, + dims, + angles, + origin=[0.5, 1.0, 0.5], + axis=1): + """convert kitti locations, dimensions and angles to corners + + Args: + centers (float array, shape=[N, 3]): locations in kitti label file. + dims (float array, shape=[N, 3]): dimensions in kitti label file. + angles (float array, shape=[N]): rotation_y in kitti label file. + origin (list or array or float): origin point relate to smallest point. + use [0.5, 1.0, 0.5] in camera and [0.5, 0.5, 0] in lidar. + axis (int): rotation axis. 1 for camera and 2 for lidar. + Returns: + [type]: [description] + """ + # 'length' in kitti format is in x axis. + # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 8, 3] + corners = rotation_3d_in_axis(corners, angles, axis=axis) + corners += centers.view(-1, 1, 3) + return corners + + +def lidar_to_camera(points, r_rect, velo2cam): + num_points = points.shape[0] + points = torch.cat( + [points, torch.ones(num_points, 1).type_as(points)], dim=-1) + camera_points = points @ (r_rect @ velo2cam).t() + return camera_points[..., :3] + + +def box_lidar_to_camera(data, r_rect, velo2cam): + xyz_lidar = data[..., 0:3] + w, l, h = data[..., 3:4], data[..., 4:5], data[..., 5:6] + r = data[..., 6:7] + xyz = lidar_to_camera(xyz_lidar, r_rect, velo2cam) + return torch.cat([xyz, l, h, w, r], dim=-1) + + +def project_to_image(points_3d, proj_mat): + points_num = list(points_3d.shape)[:-1] + points_shape = np.concatenate([points_num, [1]], axis=0).tolist() + # previous implementation use new_zeros, new_one yeilds better results + points_4 = torch.cat( + [points_3d, points_3d.new_ones(*points_shape)], dim=-1) + # point_2d = points_4 @ tf.transpose(proj_mat, [1, 0]) + point_2d = torch.matmul(points_4, proj_mat.t()) + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] + return point_2d_res + + +def rbbox2d_to_near_bbox(rbboxes): + """convert rotated bbox to nearest 'standing' or 'lying' bbox. 
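+
+    Torch counterpart of ``rbbox2d_to_near_bbox`` in ``box_np_ops``: boxes
+    rotated by more than pi/4 have their x/y extents swapped before the
+    min/max conversion (editor's note, inferred from the implementation).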
+ + Args: + rbboxes: [N, 5(x, y, xdim, ydim, rad)] rotated bboxes + Returns: + bboxes: [N, 4(xmin, ymin, xmax, ymax)] bboxes + """ + rots = rbboxes[..., -1] + rots_0_pi_div_2 = torch.abs(limit_period(rots, 0.5, np.pi)) + cond = (rots_0_pi_div_2 > np.pi / 4)[..., None] + bboxes_center = torch.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) + bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) + return bboxes + + +def center_to_minmax_2d_0_5(centers, dims): + return torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) + + +def center_to_minmax_2d(centers, dims, origin=0.5): + if origin == 0.5: + return center_to_minmax_2d_0_5(centers, dims) + corners = center_to_corner_box2d(centers, dims, origin=origin) + return corners[:, [0, 2]].reshape([-1, 4]) + + +def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): + """convert kitti locations, dimensions and angles to corners. + format: center(xy), dims(xy), angles(clockwise when positive) + + Args: + centers (float array, shape=[N, 2]): locations in kitti label file. + dims (float array, shape=[N, 2]): dimensions in kitti label file. + angles (float array, shape=[N]): rotation_y in kitti label file. + + Returns: + [type]: [description] + """ + # 'length' in kitti format is in x axis. + # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 4, 2] + if angles is not None: + corners = rotation_2d(corners, angles) + corners += centers.reshape([-1, 1, 2]) + return corners + + +def rotation_2d(points, angles): + """rotation 2d points based on origin point clockwise when angle positive. + + Args: + points (float array, shape=[N, point_size, 2]): points to be rotated. + angles (float array, shape=[N]): rotation angle. 
+ + Returns: + float array: same shape as points + """ + rot_sin = torch.sin(angles) + rot_cos = torch.cos(angles) + rot_mat_T = torch.stack([[rot_cos, -rot_sin], [rot_sin, rot_cos]]) + return torch.einsum('aij,jka->aik', points, rot_mat_T) diff --git a/mmdet3d/core/bbox/coders/__init__.py b/mmdet3d/core/bbox/coders/__init__.py new file mode 100644 index 0000000000..700a4963f8 --- /dev/null +++ b/mmdet3d/core/bbox/coders/__init__.py @@ -0,0 +1,3 @@ +from .box_coder import ResidualCoder + +__all__ = ['ResidualCoder'] diff --git a/mmdet3d/core/bbox/coders/box_coder.py b/mmdet3d/core/bbox/coders/box_coder.py new file mode 100644 index 0000000000..d936a3f1e1 --- /dev/null +++ b/mmdet3d/core/bbox/coders/box_coder.py @@ -0,0 +1,116 @@ +import numpy as np +import torch + + +class ResidualCoder(object): + + def __init__(self, code_size=7, mean=None, std=None): + super().__init__() + self.code_size = code_size + self.mean = mean + self.std = std + + @staticmethod + def encode_np(boxes, anchors): + """ + :param boxes: (N, 7) x, y, z, w, l, h, r + :param anchors: (N, 7) + :return: + """ + # need to convert boxes to z-center format + xa, ya, za, wa, la, ha, ra = np.split(anchors, 7, axis=-1) + xg, yg, zg, wg, lg, hg, rg = np.split(boxes, 7, axis=-1) + zg = zg + hg / 2 + za = za + ha / 2 + diagonal = np.sqrt(la**2 + wa**2) # 4.3 + xt = (xg - xa) / diagonal + yt = (yg - ya) / diagonal + zt = (zg - za) / ha # 1.6 + lt = np.log(lg / la) + wt = np.log(wg / wa) + ht = np.log(hg / ha) + rt = rg - ra + return np.concatenate([xt, yt, zt, wt, lt, ht, rt], axis=-1) + + @staticmethod + def decode_np(box_encodings, anchors): + """ + :param box_encodings: (N, 7) x, y, z, w, l, h, r + :param anchors: (N, 7) + :return: + """ + # need to convert box_encodings to z-bottom format + xa, ya, za, wa, la, ha, ra = np.split(anchors, 7, axis=-1) + xt, yt, zt, wt, lt, ht, rt = np.split(box_encodings, 7, axis=-1) + + za = za + ha / 2 + diagonal = np.sqrt(la**2 + wa**2) + xg = xt * diagonal + xa + yg = yt * diagonal + ya + zg = zt * ha + za + + lg = np.exp(lt) * la + wg = np.exp(wt) * wa + hg = np.exp(ht) * ha + rg = rt + ra + zg = zg - hg / 2 + return np.concatenate([xg, yg, zg, wg, lg, hg, rg], axis=-1) + + @staticmethod + def encode_torch(anchors, boxes, means, stds): + """ + :param boxes: (N, 7+n) x, y, z, w, l, h, r, velo* + :param anchors: (N, 7+n) + :return: + """ + box_ndim = anchors.shape[-1] + cas, cgs, cts = [], [], [] + if box_ndim > 7: + xa, ya, za, wa, la, ha, ra, *cas = torch.split(anchors, 1, dim=-1) + xg, yg, zg, wg, lg, hg, rg, *cgs = torch.split(boxes, 1, dim=-1) + cts = [g - a for g, a in zip(cgs, cas)] + else: + xa, ya, za, wa, la, ha, ra = torch.split(anchors, 1, dim=-1) + xg, yg, zg, wg, lg, hg, rg = torch.split(boxes, 1, dim=-1) + za = za + ha / 2 + zg = zg + hg / 2 + diagonal = torch.sqrt(la**2 + wa**2) + xt = (xg - xa) / diagonal + yt = (yg - ya) / diagonal + zt = (zg - za) / ha + lt = torch.log(lg / la) + wt = torch.log(wg / wa) + ht = torch.log(hg / ha) + rt = rg - ra + return torch.cat([xt, yt, zt, wt, lt, ht, rt, *cts], dim=-1) + + @staticmethod + def decode_torch(anchors, box_encodings, means, stds): + """ + :param box_encodings: (N, 7 + n) x, y, z, w, l, h, r + :param anchors: (N, 7) + :return: + """ + cas, cts = [], [] + box_ndim = anchors.shape[-1] + if box_ndim > 7: + xa, ya, za, wa, la, ha, ra, *cas = torch.split(anchors, 1, dim=-1) + xt, yt, zt, wt, lt, ht, rt, *cts = torch.split( + box_encodings, 1, dim=-1) + else: + xa, ya, za, wa, la, ha, ra = torch.split(anchors, 1, dim=-1) + xt, yt, zt, 
wt, lt, ht, rt = torch.split(box_encodings, 1, dim=-1) + + za = za + ha / 2 + diagonal = torch.sqrt(la**2 + wa**2) + xg = xt * diagonal + xa + yg = yt * diagonal + ya + zg = zt * ha + za + + lg = torch.exp(lt) * la + wg = torch.exp(wt) * wa + hg = torch.exp(ht) * ha + rg = rt + ra + zg = zg - hg / 2 + cgs = [t + a for t, a in zip(cts, cas)] + return torch.cat([xg, yg, zg, wg, lg, hg, rg, *cgs], dim=-1) diff --git a/mmdet3d/core/bbox/geometry.py b/mmdet3d/core/bbox/geometry.py new file mode 100644 index 0000000000..9d2b95b0b1 --- /dev/null +++ b/mmdet3d/core/bbox/geometry.py @@ -0,0 +1,131 @@ +import torch + +from mmdet3d.ops.iou3d import boxes_iou3d_gpu +from . import box_torch_ops + + +def bbox_overlaps_2d(bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate overlap between two set of bboxes. + + If ``is_aligned`` is ``False``, then calculate the ious between each bbox + of bboxes1 and bboxes2, otherwise the ious between each aligned pair of + bboxes1 and bboxes2. + + Args: + bboxes1 (Tensor): shape (m, 4) in format. + bboxes2 (Tensor): shape (n, 4) in format. + If is_aligned is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or iof (intersection over + foreground). + + Returns: + ious(Tensor): shape (m, n) if is_aligned == False else shape (m, 1) + + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 10, 10], + >>> [10, 10, 20, 20], + >>> [32, 32, 38, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 10, 20], + >>> [0, 10, 10, 19], + >>> [10, 10, 20, 20], + >>> ]) + >>> bbox_overlaps(bboxes1, bboxes2) + tensor([[0.5238, 0.0500, 0.0041], + [0.0323, 0.0452, 1.0000], + [0.0000, 0.0000, 0.0000]]) + + Example: + >>> empty = torch.FloatTensor([]) + >>> nonempty = torch.FloatTensor([ + >>> [0, 0, 10, 9], + >>> ]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + assert mode in ['iou', 'iof'] + + rows = bboxes1.size(0) + cols = bboxes2.size(0) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + return bboxes1.new(rows, 1) if is_aligned else bboxes1.new(rows, cols) + + if is_aligned: + lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2] + rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2] + + wh = (rb - lt).clamp(min=0) # [rows, 2] + overlap = wh[:, 0] * wh[:, 1] + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * ( + bboxes1[:, 3] - bboxes1[:, 1]) + + if mode == 'iou': + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * ( + bboxes2[:, 3] - bboxes2[:, 1]) + ious = overlap / (area1 + area2 - overlap) + else: + ious = overlap / area1 + else: + lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2] + rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2] + + wh = (rb - lt).clamp(min=0) # [rows, cols, 2] + overlap = wh[:, :, 0] * wh[:, :, 1] + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * ( + bboxes1[:, 3] - bboxes1[:, 1]) + + if mode == 'iou': + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * ( + bboxes2[:, 3] - bboxes2[:, 1]) + ious = overlap / (area1[:, None] + area2 - overlap) + else: + ious = overlap / (area1[:, None]) + + return ious + + +def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou'): + ''' + + :param bboxes1: Tensor, shape (N, 7) [x, y, z, h, w, l, ry] + :param bboxes2: Tensor, shape (M, 7) [x, y, z, h, w, l, ry] + :param mode: mode (str): "iou" (intersection over union) or + iof (intersection over foreground). 
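+    Editor's note: unlike ``bbox_overlaps_nearest_3d`` below, which falls
+    back to axis-aligned bird's-eye-view boxes, this computes rotated 3D IoU
+    on the GPU via ``boxes_iou3d_gpu``.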
+ :return: iou: (M, N) not support aligned mode currently + ''' + # TODO: check the input dimension meanings, + # this is inconsistent with that in bbox_overlaps_nearest_3d + return boxes_iou3d_gpu(bboxes1, bboxes2, mode) + + +def bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode='iou', is_aligned=False): + ''' + :param bboxes1: Tensor, shape (N, 7) [x, y, z, h, w, l, ry]? + :param bboxes2: Tensor, shape (M, 7) [x, y, z, h, w, l, ry]? + :param mode: mode (str): "iou" (intersection over union) or iof + (intersection over foreground). + :return: iou: (M, N) not support aligned mode currently + rbboxes: [N, 5(x, y, xdim, ydim, rad)] rotated bboxes + ''' + # TODO: check the input dimension meanings, + # this is inconsistent with that in bbox_overlaps_3d + rbboxes1_np = bboxes1.index_select( + dim=-1, index=bboxes1.new_tensor([0, 1, 3, 4, 6]).long()) + rbboxes2_np = bboxes2.index_select( + dim=-1, index=bboxes1.new_tensor([0, 1, 3, 4, 6]).long()) + + # Change the bboxes to bev + # box conversion and iou calculation in torch version on CUDA + # is 10x faster than that in numpy version + bboxes1_bv = box_torch_ops.rbbox2d_to_near_bbox(rbboxes1_np) + bboxes2_bv = box_torch_ops.rbbox2d_to_near_bbox(rbboxes2_np) + ret = bbox_overlaps_2d( + bboxes1_bv, bboxes2_bv, mode=mode, is_aligned=is_aligned) + return ret diff --git a/mmdet3d/core/bbox/samplers/__init__.py b/mmdet3d/core/bbox/samplers/__init__.py new file mode 100644 index 0000000000..d709d8ecb2 --- /dev/null +++ b/mmdet3d/core/bbox/samplers/__init__.py @@ -0,0 +1,14 @@ +from .base_sampler import BaseSampler +from .combined_sampler import CombinedSampler +from .instance_balanced_pos_sampler import InstanceBalancedPosSampler +from .iou_balanced_neg_sampler import IoUBalancedNegSampler +from .ohem_sampler import OHEMSampler +from .pseudo_sampler import PseudoSampler +from .random_sampler import RandomSampler +from .sampling_result import SamplingResult + +__all__ = [ + 'BaseSampler', 'PseudoSampler', 'RandomSampler', + 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', + 'OHEMSampler', 'SamplingResult' +] diff --git a/mmdet3d/core/bbox/samplers/base_sampler.py b/mmdet3d/core/bbox/samplers/base_sampler.py new file mode 100644 index 0000000000..12df01306f --- /dev/null +++ b/mmdet3d/core/bbox/samplers/base_sampler.py @@ -0,0 +1,78 @@ +from abc import ABCMeta, abstractmethod + +import torch + +from .sampling_result import SamplingResult + + +class BaseSampler(metaclass=ABCMeta): + + def __init__(self, + num, + pos_fraction, + neg_pos_ub=-1, + add_gt_as_proposals=True, + **kwargs): + self.num = num + self.pos_fraction = pos_fraction + self.neg_pos_ub = neg_pos_ub + self.add_gt_as_proposals = add_gt_as_proposals + self.pos_sampler = self + self.neg_sampler = self + + @abstractmethod + def _sample_pos(self, assign_result, num_expected, **kwargs): + pass + + @abstractmethod + def _sample_neg(self, assign_result, num_expected, **kwargs): + pass + + def sample(self, + assign_result, + bboxes, + gt_bboxes, + gt_labels=None, + **kwargs): + """Sample positive and negative bboxes. + + This is a simple implementation of bbox sampling given candidates, + assigning results and ground truth bboxes. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + bboxes (Tensor): Boxes to be sampled from. + gt_bboxes (Tensor): Ground truth bboxes. + gt_labels (Tensor, optional): Class labels of ground truth bboxes. + + Returns: + :obj:`SamplingResult`: Sampling result. 
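+
+        Example (editor's sketch; assumes an ``assign_result`` produced by an
+        assigner such as ``MaxIoUAssigner``):
+            sampler = RandomSampler(num=512, pos_fraction=0.25)
+            result = sampler.sample(assign_result, proposals, gt_bboxes,
+                                    gt_labels)
+            # result.pos_inds / result.neg_inds index into the concatenation
+            # of gt_bboxes and proposals when add_gt_as_proposals=True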
+ """ + bboxes = bboxes[:, :4] + + gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8) + if self.add_gt_as_proposals: + bboxes = torch.cat([gt_bboxes, bboxes], dim=0) + assign_result.add_gt_(gt_labels) + gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8) + gt_flags = torch.cat([gt_ones, gt_flags]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos( + assign_result, num_expected_pos, bboxes=bboxes, **kwargs) + # We found that sampled indices have duplicated items occasionally. + # (may be a bug of PyTorch) + pos_inds = pos_inds.unique() + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds = self.neg_sampler._sample_neg( + assign_result, num_expected_neg, bboxes=bboxes, **kwargs) + neg_inds = neg_inds.unique() + + return SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags) diff --git a/mmdet3d/core/bbox/samplers/combined_sampler.py b/mmdet3d/core/bbox/samplers/combined_sampler.py new file mode 100644 index 0000000000..351a097f67 --- /dev/null +++ b/mmdet3d/core/bbox/samplers/combined_sampler.py @@ -0,0 +1,16 @@ +from ..assign_sampling import build_sampler +from .base_sampler import BaseSampler + + +class CombinedSampler(BaseSampler): + + def __init__(self, pos_sampler, neg_sampler, **kwargs): + super(CombinedSampler, self).__init__(**kwargs) + self.pos_sampler = build_sampler(pos_sampler, **kwargs) + self.neg_sampler = build_sampler(neg_sampler, **kwargs) + + def _sample_pos(self, **kwargs): + raise NotImplementedError + + def _sample_neg(self, **kwargs): + raise NotImplementedError diff --git a/mmdet3d/core/bbox/samplers/instance_balanced_pos_sampler.py b/mmdet3d/core/bbox/samplers/instance_balanced_pos_sampler.py new file mode 100644 index 0000000000..bc829a236c --- /dev/null +++ b/mmdet3d/core/bbox/samplers/instance_balanced_pos_sampler.py @@ -0,0 +1,41 @@ +import numpy as np +import torch + +from .random_sampler import RandomSampler + + +class InstanceBalancedPosSampler(RandomSampler): + + def _sample_pos(self, assign_result, num_expected, **kwargs): + pos_inds = torch.nonzero(assign_result.gt_inds > 0) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + unique_gt_inds = assign_result.gt_inds[pos_inds].unique() + num_gts = len(unique_gt_inds) + num_per_gt = int(round(num_expected / float(num_gts)) + 1) + sampled_inds = [] + for i in unique_gt_inds: + inds = torch.nonzero(assign_result.gt_inds == i.item()) + if inds.numel() != 0: + inds = inds.squeeze(1) + else: + continue + if len(inds) > num_per_gt: + inds = self.random_choice(inds, num_per_gt) + sampled_inds.append(inds) + sampled_inds = torch.cat(sampled_inds) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array( + list(set(pos_inds.cpu()) - set(sampled_inds.cpu()))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + extra_inds = torch.from_numpy(extra_inds).to( + assign_result.gt_inds.device).long() + sampled_inds = torch.cat([sampled_inds, extra_inds]) + elif len(sampled_inds) > num_expected: + sampled_inds = self.random_choice(sampled_inds, num_expected) + return sampled_inds diff --git 
a/mmdet3d/core/bbox/samplers/iou_balanced_neg_sampler.py b/mmdet3d/core/bbox/samplers/iou_balanced_neg_sampler.py new file mode 100644 index 0000000000..62431d6a07 --- /dev/null +++ b/mmdet3d/core/bbox/samplers/iou_balanced_neg_sampler.py @@ -0,0 +1,133 @@ +import numpy as np +import torch + +from .random_sampler import RandomSampler + + +class IoUBalancedNegSampler(RandomSampler): + """IoU Balanced Sampling + + arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019) + + Sampling proposals according to their IoU. `floor_fraction` of needed RoIs + are sampled from proposals whose IoU are lower than `floor_thr` randomly. + The others are sampled from proposals whose IoU are higher than + `floor_thr`. These proposals are sampled from some bins evenly, which are + split by `num_bins` via IoU evenly. + + Args: + num (int): number of proposals. + pos_fraction (float): fraction of positive proposals. + floor_thr (float): threshold (minimum) IoU for IoU balanced sampling, + set to -1 if all using IoU balanced sampling. + floor_fraction (float): sampling fraction of proposals under floor_thr. + num_bins (int): number of bins in IoU balanced sampling. + """ + + def __init__(self, + num, + pos_fraction, + floor_thr=-1, + floor_fraction=0, + num_bins=3, + **kwargs): + super(IoUBalancedNegSampler, self).__init__(num, pos_fraction, + **kwargs) + assert floor_thr >= 0 or floor_thr == -1 + assert 0 <= floor_fraction <= 1 + assert num_bins >= 1 + + self.floor_thr = floor_thr + self.floor_fraction = floor_fraction + self.num_bins = num_bins + + def sample_via_interval(self, max_overlaps, full_set, num_expected): + max_iou = max_overlaps.max() + iou_interval = (max_iou - self.floor_thr) / self.num_bins + per_num_expected = int(num_expected / self.num_bins) + + sampled_inds = [] + for i in range(self.num_bins): + start_iou = self.floor_thr + i * iou_interval + end_iou = self.floor_thr + (i + 1) * iou_interval + tmp_set = set( + np.where( + np.logical_and(max_overlaps >= start_iou, + max_overlaps < end_iou))[0]) + tmp_inds = list(tmp_set & full_set) + if len(tmp_inds) > per_num_expected: + tmp_sampled_set = self.random_choice(tmp_inds, + per_num_expected) + else: + tmp_sampled_set = np.array(tmp_inds, dtype=np.int) + sampled_inds.append(tmp_sampled_set) + + sampled_inds = np.concatenate(sampled_inds) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(full_set - set(sampled_inds))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + sampled_inds = np.concatenate([sampled_inds, extra_inds]) + + return sampled_inds + + def _sample_neg(self, assign_result, num_expected, **kwargs): + neg_inds = torch.nonzero(assign_result.gt_inds == 0) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + max_overlaps = assign_result.max_overlaps.cpu().numpy() + # balance sampling for negative samples + neg_set = set(neg_inds.cpu().numpy()) + + if self.floor_thr > 0: + floor_set = set( + np.where( + np.logical_and(max_overlaps >= 0, + max_overlaps < self.floor_thr))[0]) + iou_sampling_set = set( + np.where(max_overlaps >= self.floor_thr)[0]) + elif self.floor_thr == 0: + floor_set = set(np.where(max_overlaps == 0)[0]) + iou_sampling_set = set( + np.where(max_overlaps > self.floor_thr)[0]) + else: + floor_set = set() + iou_sampling_set = set( + np.where(max_overlaps > self.floor_thr)[0]) + + floor_neg_inds = list(floor_set & neg_set) + iou_sampling_neg_inds = 
list(iou_sampling_set & neg_set) + num_expected_iou_sampling = int(num_expected * + (1 - self.floor_fraction)) + if len(iou_sampling_neg_inds) > num_expected_iou_sampling: + if self.num_bins >= 2: + iou_sampled_inds = self.sample_via_interval( + max_overlaps, set(iou_sampling_neg_inds), + num_expected_iou_sampling) + else: + iou_sampled_inds = self.random_choice( + iou_sampling_neg_inds, num_expected_iou_sampling) + else: + iou_sampled_inds = np.array( + iou_sampling_neg_inds, dtype=np.int) + num_expected_floor = num_expected - len(iou_sampled_inds) + if len(floor_neg_inds) > num_expected_floor: + sampled_floor_inds = self.random_choice( + floor_neg_inds, num_expected_floor) + else: + sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int) + sampled_inds = np.concatenate( + (sampled_floor_inds, iou_sampled_inds)) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(neg_set - set(sampled_inds))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + sampled_inds = np.concatenate((sampled_inds, extra_inds)) + sampled_inds = torch.from_numpy(sampled_inds).long().to( + assign_result.gt_inds.device) + return sampled_inds diff --git a/mmdet3d/core/bbox/samplers/ohem_sampler.py b/mmdet3d/core/bbox/samplers/ohem_sampler.py new file mode 100644 index 0000000000..2500f3113c --- /dev/null +++ b/mmdet3d/core/bbox/samplers/ohem_sampler.py @@ -0,0 +1,73 @@ +import torch + +from ..transforms import bbox2roi +from .base_sampler import BaseSampler + + +class OHEMSampler(BaseSampler): + + def __init__(self, + num, + pos_fraction, + context, + neg_pos_ub=-1, + add_gt_as_proposals=True, + **kwargs): + super(OHEMSampler, self).__init__(num, pos_fraction, neg_pos_ub, + add_gt_as_proposals) + if not hasattr(context, 'num_stages'): + self.bbox_roi_extractor = context.bbox_roi_extractor + self.bbox_head = context.bbox_head + else: + self.bbox_roi_extractor = context.bbox_roi_extractor[ + context.current_stage] + self.bbox_head = context.bbox_head[context.current_stage] + + def hard_mining(self, inds, num_expected, bboxes, labels, feats): + with torch.no_grad(): + rois = bbox2roi([bboxes]) + bbox_feats = self.bbox_roi_extractor( + feats[:self.bbox_roi_extractor.num_inputs], rois) + cls_score, _ = self.bbox_head(bbox_feats) + loss = self.bbox_head.loss( + cls_score=cls_score, + bbox_pred=None, + labels=labels, + label_weights=cls_score.new_ones(cls_score.size(0)), + bbox_targets=None, + bbox_weights=None, + reduction_override='none')['loss_cls'] + _, topk_loss_inds = loss.topk(num_expected) + return inds[topk_loss_inds] + + def _sample_pos(self, + assign_result, + num_expected, + bboxes=None, + feats=None, + **kwargs): + # Sample some hard positive samples + pos_inds = torch.nonzero(assign_result.gt_inds > 0) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.hard_mining(pos_inds, num_expected, bboxes[pos_inds], + assign_result.labels[pos_inds], feats) + + def _sample_neg(self, + assign_result, + num_expected, + bboxes=None, + feats=None, + **kwargs): + # Sample some hard negative samples + neg_inds = torch.nonzero(assign_result.gt_inds == 0) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + return self.hard_mining(neg_inds, num_expected, bboxes[neg_inds], + assign_result.labels[neg_inds], feats) diff --git a/mmdet3d/core/bbox/samplers/pseudo_sampler.py 
b/mmdet3d/core/bbox/samplers/pseudo_sampler.py new file mode 100644 index 0000000000..b4c2ea09b0 --- /dev/null +++ b/mmdet3d/core/bbox/samplers/pseudo_sampler.py @@ -0,0 +1,26 @@ +import torch + +from .base_sampler import BaseSampler +from .sampling_result import SamplingResult + + +class PseudoSampler(BaseSampler): + + def __init__(self, **kwargs): + pass + + def _sample_pos(self, **kwargs): + raise NotImplementedError + + def _sample_neg(self, **kwargs): + raise NotImplementedError + + def sample(self, assign_result, bboxes, gt_bboxes, **kwargs): + pos_inds = torch.nonzero( + assign_result.gt_inds > 0).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0).squeeze(-1).unique() + gt_flags = bboxes.new_zeros(bboxes.shape[0], dtype=torch.uint8) + sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags) + return sampling_result diff --git a/mmdet3d/core/bbox/samplers/random_sampler.py b/mmdet3d/core/bbox/samplers/random_sampler.py new file mode 100644 index 0000000000..0d02b2747f --- /dev/null +++ b/mmdet3d/core/bbox/samplers/random_sampler.py @@ -0,0 +1,53 @@ +import numpy as np +import torch + +from .base_sampler import BaseSampler + + +class RandomSampler(BaseSampler): + + def __init__(self, + num, + pos_fraction, + neg_pos_ub=-1, + add_gt_as_proposals=True, + **kwargs): + super(RandomSampler, self).__init__(num, pos_fraction, neg_pos_ub, + add_gt_as_proposals) + + @staticmethod + def random_choice(gallery, num): + """Random select some elements from the gallery. + + It seems that Pytorch's implementation is slower than numpy so we use + numpy to randperm the indices. + """ + assert len(gallery) >= num + if isinstance(gallery, list): + gallery = np.array(gallery) + cands = np.arange(len(gallery)) + np.random.shuffle(cands) + rand_inds = cands[:num] + if not isinstance(gallery, np.ndarray): + rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device) + return gallery[rand_inds] + + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Randomly sample some positive samples.""" + pos_inds = torch.nonzero(assign_result.gt_inds > 0) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result, num_expected, **kwargs): + """Randomly sample some negative samples.""" + neg_inds = torch.nonzero(assign_result.gt_inds == 0) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + return self.random_choice(neg_inds, num_expected) diff --git a/mmdet3d/core/bbox/samplers/sampling_result.py b/mmdet3d/core/bbox/samplers/sampling_result.py new file mode 100644 index 0000000000..696e650971 --- /dev/null +++ b/mmdet3d/core/bbox/samplers/sampling_result.py @@ -0,0 +1,24 @@ +import torch + + +class SamplingResult(object): + + def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, + gt_flags): + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.pos_bboxes = bboxes[pos_inds] + self.neg_bboxes = bboxes[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_bboxes.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds, :] + if assign_result.labels is not None: + self.pos_gt_labels = assign_result.labels[pos_inds] + else: + self.pos_gt_labels = None + + @property + def bboxes(self): + 
return torch.cat([self.pos_bboxes, self.neg_bboxes]) diff --git a/mmdet3d/core/bbox/transforms.py b/mmdet3d/core/bbox/transforms.py new file mode 100644 index 0000000000..3a213ca016 --- /dev/null +++ b/mmdet3d/core/bbox/transforms.py @@ -0,0 +1,269 @@ +import mmcv +import numpy as np +import torch + + +def bbox2delta(proposals, gt, means=[0, 0, 0, 0], stds=[1, 1, 1, 1]): + assert proposals.size() == gt.size() + + proposals = proposals.float() + gt = gt.float() + px = (proposals[..., 0] + proposals[..., 2]) * 0.5 + py = (proposals[..., 1] + proposals[..., 3]) * 0.5 + pw = proposals[..., 2] - proposals[..., 0] + ph = proposals[..., 3] - proposals[..., 1] + + gx = (gt[..., 0] + gt[..., 2]) * 0.5 + gy = (gt[..., 1] + gt[..., 3]) * 0.5 + gw = gt[..., 2] - gt[..., 0] + gh = gt[..., 3] - gt[..., 1] + + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw / pw) + dh = torch.log(gh / ph) + deltas = torch.stack([dx, dy, dw, dh], dim=-1) + + means = deltas.new_tensor(means).unsqueeze(0) + stds = deltas.new_tensor(stds).unsqueeze(0) + deltas = deltas.sub_(means).div_(stds) + + return deltas + + +def delta2bbox(rois, + deltas, + means=[0, 0, 0, 0], + stds=[1, 1, 1, 1], + max_shape=None, + wh_ratio_clip=16 / 1000): + """ + Apply deltas to shift/scale base boxes. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + + Args: + rois (Tensor): boxes to be transformed. Has shape (N, 4) + deltas (Tensor): encoded offsets with respect to each roi. + Has shape (N, 4). Note N = num_anchors * W * H when rois is a grid + of anchors. Offset encoding follows [1]_. + means (list): denormalizing means for delta coordinates + stds (list): denormalizing standard deviation for delta coordinates + max_shape (tuple[int, int]): maximum bounds for boxes. specifies (H, W) + wh_ratio_clip (float): maximum aspect ratio for boxes. + + Returns: + Tensor: boxes with shape (N, 4), where columns represent + tl_x, tl_y, br_x, br_y. + + References: + .. 
[1] https://arxiv.org/abs/1311.2524 + + Example: + >>> rois = torch.Tensor([[ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 5., 5., 5., 5.]]) + >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], + >>> [ 1., 1., 1., 1.], + >>> [ 0., 0., 2., -1.], + >>> [ 0.7, -1.9, -0.5, 0.3]]) + >>> delta2bbox(rois, deltas, max_shape=(32, 32)) + tensor([[0.0000, 0.0000, 1.0000, 1.0000], + [0.2817, 0.2817, 4.7183, 4.7183], + [0.0000, 0.6321, 7.3891, 0.3679], + [5.8967, 2.9251, 5.5033, 3.2749]]) + """ + means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4) + stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4) + denorm_deltas = deltas * stds + means + dx = denorm_deltas[:, 0::4] + dy = denorm_deltas[:, 1::4] + dw = denorm_deltas[:, 2::4] + dh = denorm_deltas[:, 3::4] + max_ratio = np.abs(np.log(wh_ratio_clip)) + dw = dw.clamp(min=-max_ratio, max=max_ratio) + dh = dh.clamp(min=-max_ratio, max=max_ratio) + # Compute center of each roi + px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx) + py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy) + # Compute width/height of each roi + pw = (rois[:, 2] - rois[:, 0]).unsqueeze(1).expand_as(dw) + ph = (rois[:, 3] - rois[:, 1]).unsqueeze(1).expand_as(dh) + # Use exp(network energy) to enlarge/shrink each roi + gw = pw * dw.exp() + gh = ph * dh.exp() + # Use network energy to shift the center of each roi + gx = torch.addcmul(px, 1, pw, dx) # gx = px + pw * dx + gy = torch.addcmul(py, 1, ph, dy) # gy = py + ph * dy + # Convert center-xy/width/height to top-left, bottom-right + x1 = gx - gw * 0.5 + y1 = gy - gh * 0.5 + x2 = gx + gw * 0.5 + y2 = gy + gh * 0.5 + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas) + return bboxes + + +def bbox_flip(bboxes, img_shape): + """Flip bboxes horizontally. + + Args: + bboxes(Tensor or ndarray): Shape (..., 4*k) + img_shape(tuple): Image shape. + + Returns: + Same type as `bboxes`: Flipped bboxes. + """ + if isinstance(bboxes, torch.Tensor): + assert bboxes.shape[-1] % 4 == 0 + flipped = bboxes.clone() + flipped[:, 0::4] = img_shape[1] - bboxes[:, 2::4] + flipped[:, 2::4] = img_shape[1] - bboxes[:, 0::4] + return flipped + elif isinstance(bboxes, np.ndarray): + return mmcv.bbox_flip(bboxes, img_shape) + + +def bbox_mapping(bboxes, img_shape, scale_factor, flip): + """Map bboxes from the original image scale to testing scale""" + new_bboxes = bboxes * scale_factor + if flip: + new_bboxes = bbox_flip(new_bboxes, img_shape) + return new_bboxes + + +def bbox_mapping_back(bboxes, img_shape, scale_factor, flip): + """Map bboxes from testing scale to original image scale""" + new_bboxes = bbox_flip(bboxes, img_shape) if flip else bboxes + new_bboxes = new_bboxes / scale_factor + return new_bboxes + + +def bbox2roi(bbox_list): + """Convert a list of bboxes to roi format. + + Args: + bbox_list (list[Tensor]): a list of bboxes corresponding to a batch + of images. 
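+
+    Note (editor): each image's batch index is prepended to its boxes, so
+    downstream RoI ops can tell which image a given RoI belongs to.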
+ + Returns: + Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2] + """ + rois_list = [] + for img_id, bboxes in enumerate(bbox_list): + if bboxes.size(0) > 0: + img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) + rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1) + else: + rois = bboxes.new_zeros((0, 5)) + rois_list.append(rois) + rois = torch.cat(rois_list, 0) + return rois + + +def roi2bbox(rois): + bbox_list = [] + img_ids = torch.unique(rois[:, 0].cpu(), sorted=True) + for img_id in img_ids: + inds = (rois[:, 0] == img_id.item()) + bbox = rois[inds, 1:] + bbox_list.append(bbox) + return bbox_list + + +def bbox2result_coco(bboxes, labels, num_classes): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (Tensor): shape (n, 5) + labels (Tensor): shape (n, ) + num_classes (int): class number, including background class + + Returns: + list(ndarray): bbox results of each class + """ + if bboxes.shape[0] == 0: + return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)] + else: + bboxes = bboxes.cpu().numpy() + labels = labels.cpu().numpy() + return [bboxes[labels == i, :] for i in range(num_classes)] + + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. + """ + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return torch.stack([x1, y1, x2, y2], -1) + + +def transform_lidar_to_cam(boxes_lidar): + """ + Only transform format, not exactly in camera coords + :param boxes_lidar: (N, 3 or 7) [x, y, z, w, l, h, ry] in LiDAR coords + :return: boxes_cam: (N, 3 or 7) [x, y, z, h, w, l, ry] in camera coords + """ + # boxes_cam = boxes_lidar.new_tensor(boxes_lidar.data) + boxes_cam = boxes_lidar.clone().detach() + boxes_cam[:, 0] = -boxes_lidar[:, 1] + boxes_cam[:, 1] = -boxes_lidar[:, 2] + boxes_cam[:, 2] = boxes_lidar[:, 0] + if boxes_cam.shape[1] > 3: + boxes_cam[:, [3, 4, 5]] = boxes_lidar[:, [5, 3, 4]] + return boxes_cam + + +def boxes3d_to_bev_torch(boxes3d): + """ + :param boxes3d: (N, 7) [x, y, z, h, w, l, ry] in camera coords + :return: + boxes_bev: (N, 5) [x1, y1, x2, y2, ry] + """ + boxes_bev = boxes3d.new(torch.Size((boxes3d.shape[0], 5))) + + cu, cv = boxes3d[:, 0], boxes3d[:, 2] + half_l, half_w = boxes3d[:, 5] / 2, boxes3d[:, 4] / 2 + boxes_bev[:, 0], boxes_bev[:, 1] = cu - half_l, cv - half_w + boxes_bev[:, 2], boxes_bev[:, 3] = cu + half_l, cv + half_w + boxes_bev[:, 4] = boxes3d[:, 6] + return boxes_bev + + +def boxes3d_to_bev_torch_lidar(boxes3d): + """ + :param boxes3d: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coords + :return: + boxes_bev: (N, 5) [x1, y1, x2, y2, ry] + """ + boxes_bev = boxes3d.new(torch.Size((boxes3d.shape[0], 5))) + + cu, cv = boxes3d[:, 0], boxes3d[:, 1] + half_l, half_w = boxes3d[:, 4] / 2, boxes3d[:, 3] / 2 + boxes_bev[:, 0], boxes_bev[:, 1] = cu - half_w, cv - half_l + boxes_bev[:, 2], boxes_bev[:, 3] = cu + half_w, cv + half_l + boxes_bev[:, 4] = boxes3d[:, 6] + return boxes_bev diff --git a/mmdet3d/core/evaluation/__init__.py 
b/mmdet3d/core/evaluation/__init__.py new file mode 100644 index 0000000000..6d424903ea --- /dev/null +++ b/mmdet3d/core/evaluation/__init__.py @@ -0,0 +1,14 @@ +from .class_names import (coco_classes, dataset_aliases, get_classes, + imagenet_det_classes, imagenet_vid_classes, + kitti_classes, voc_classes) +from .eval_hooks import (CocoDistEvalmAPHook, CocoDistEvalRecallHook, + DistEvalHook, DistEvalmAPHook, KittiDistEvalmAPHook) +from .kitti_utils import kitti_eval, kitti_eval_coco_style + +__all__ = [ + 'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes', + 'coco_classes', 'dataset_aliases', 'get_classes', 'kitti_classes', + 'kitti_eval_coco_style', 'kitti_eval', 'CocoDistEvalmAPHook', + 'KittiDistEvalmAPHook', 'CocoDistEvalRecallHook', 'DistEvalHook', + 'DistEvalmAPHook' +] diff --git a/mmdet3d/core/evaluation/bbox_overlaps.py b/mmdet3d/core/evaluation/bbox_overlaps.py new file mode 100644 index 0000000000..5507e88c00 --- /dev/null +++ b/mmdet3d/core/evaluation/bbox_overlaps.py @@ -0,0 +1,47 @@ +import numpy as np + + +def bbox_overlaps(bboxes1, bboxes2, mode='iou'): + """Calculate the ious between each bbox of bboxes1 and bboxes2. + + Args: + bboxes1(ndarray): shape (n, 4) + bboxes2(ndarray): shape (k, 4) + mode(str): iou (intersection over union) or iof (intersection + over foreground) + + Returns: + ious(ndarray): shape (n, k) + """ + + assert mode in ['iou', 'iof'] + + bboxes1 = bboxes1.astype(np.float32) + bboxes2 = bboxes2.astype(np.float32) + rows = bboxes1.shape[0] + cols = bboxes2.shape[0] + ious = np.zeros((rows, cols), dtype=np.float32) + if rows * cols == 0: + return ious + exchange = False + if bboxes1.shape[0] > bboxes2.shape[0]: + bboxes1, bboxes2 = bboxes2, bboxes1 + ious = np.zeros((cols, rows), dtype=np.float32) + exchange = True + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1]) + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1]) + for i in range(bboxes1.shape[0]): + x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) + y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) + x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) + y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) + overlap = np.maximum(x_end - x_start, 0) * np.maximum( + y_end - y_start, 0) + if mode == 'iou': + union = area1[i] + area2 - overlap + else: + union = area1[i] if not exchange else area2 + ious[i, :] = overlap / union + if exchange: + ious = ious.T + return ious diff --git a/mmdet3d/core/evaluation/class_names.py b/mmdet3d/core/evaluation/class_names.py new file mode 100644 index 0000000000..216c2f5691 --- /dev/null +++ b/mmdet3d/core/evaluation/class_names.py @@ -0,0 +1,127 @@ +import mmcv + + +def wider_face_classes(): + return ['face'] + + +def voc_classes(): + return [ + 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', + 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor' + ] + + +def imagenet_det_classes(): + return [ + 'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo', + 'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam', + 'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap', + 'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder', + 'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito', + 'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle', + 'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker', + 'coffee_maker', 
'computer_keyboard', 'computer_mouse', 'corkscrew', + 'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper', + 'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly', + 'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig', + 'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog', + 'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart', + 'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger', + 'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim', + 'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse', + 'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle', + 'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard', + 'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can', + 'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace', + 'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume', + 'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza', + 'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine', + 'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse', + 'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator', + 'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler', + 'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver', + 'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile', + 'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula', + 'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer', + 'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine', + 'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie', + 'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet', + 'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin', + 'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft', + 'whale', 'wine_bottle', 'zebra' + ] + + +def imagenet_vid_classes(): + return [ + 'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car', + 'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda', + 'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit', + 'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle', + 'watercraft', 'whale', 'zebra' + ] + + +def coco_classes(): + return [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign', + 'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard', + 'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush' + ] + + +def cityscapes_classes(): + return [ + 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle' + ] + + +def kitti_classes(): + return [ + 'Car', + 'Pedestrian', + 'Cyclist', + 'Van', + 'Person_sitting', + ] + + +dataset_aliases = { + 'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'], + 
'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'], + 'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'], + 'coco': ['coco', 'mscoco', 'ms_coco'], + 'wider_face': ['WIDERFaceDataset', 'wider_face', 'WDIERFace'], + 'cityscapes': ['cityscapes'], + 'kitti': ['KITTI', 'kitti'] +} + + +def get_classes(dataset): + """Get class names of a dataset.""" + alias2name = {} + for name, aliases in dataset_aliases.items(): + for alias in aliases: + alias2name[alias] = name + + if mmcv.is_str(dataset): + if dataset in alias2name: + labels = eval(alias2name[dataset] + '_classes()') + else: + raise ValueError('Unrecognized dataset: {}'.format(dataset)) + else: + raise TypeError('dataset must a str, but got {}'.format(type(dataset))) + return labels diff --git a/mmdet3d/core/evaluation/coco_utils.py b/mmdet3d/core/evaluation/coco_utils.py new file mode 100644 index 0000000000..7fbb6d2ac1 --- /dev/null +++ b/mmdet3d/core/evaluation/coco_utils.py @@ -0,0 +1,251 @@ +import itertools + +import mmcv +import numpy as np +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval +from terminaltables import AsciiTable + +from .recall import eval_recalls + + +def coco_eval(result_files, + result_types, + coco, + max_dets=(100, 300, 1000), + cat_ids=[], + classwise=False): + for res_type in result_types: + assert res_type in [ + 'proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints' + ] + + if mmcv.is_str(coco): + coco = COCO(coco) + assert isinstance(coco, COCO) + + if result_types == ['proposal_fast']: + ar = fast_eval_recall(result_files, coco, np.array(max_dets)) + for i, num in enumerate(max_dets): + print('AR@{}\t= {:.4f}'.format(num, ar[i])) + return + + for res_type in result_types: + if isinstance(result_files, str): + result_file = result_files + elif isinstance(result_files, dict): + result_file = result_files[res_type] + else: + assert TypeError('result_files must be a str or dict') + assert result_file.endswith('.json') + + coco_dets = coco.loadRes(result_file) + # it will load all images if cat_ids is [] + # img_ids = getImgIds(coco, catIds=cat_ids) + if len(cat_ids) < 80: + img_ids = getImgIds(coco, catIds=cat_ids) + else: + img_ids = coco.getImgIds() + iou_type = 'bbox' if res_type == 'proposal' else res_type + cocoEval = COCOeval(coco, coco_dets, iou_type) + if cat_ids: + # cat_ids is not None means it is set + cocoEval.params.catIds = cat_ids + cocoEval.params.imgIds = img_ids + if res_type == 'proposal': + cocoEval.params.useCats = 0 + cocoEval.params.maxDets = list(max_dets) + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + + if classwise: + # Compute per-category AP + # from https://github.com/facebookresearch/detectron2/blob/03064eb5bafe4a3e5750cc7a16672daf5afe8435/detectron2/evaluation/coco_evaluation.py#L259-L283 # noqa + precisions = cocoEval.eval['precision'] + catIds = cat_ids if cat_ids else coco.getCatIds() + # precision has dims (iou, recall, cls, area range, max dets) + assert len(catIds) == precisions.shape[2] + + results_per_category = [] + for idx, catId in enumerate(catIds): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + nm = coco.loadCats(catId)[0] + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + ap = np.mean(precision) if precision.size else float('nan') + results_per_category.append( + ('{}'.format(nm['name']), + '{:0.3f}'.format(float(ap * 100)))) + + N_COLS = min(6, len(results_per_category) * 2) + results_flatten = 
list(itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (N_COLS // 2) + results_2d = itertools.zip_longest( + *[results_flatten[i::N_COLS] for i in range(N_COLS)]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + print(table.table) + + +def fast_eval_recall(results, + coco, + max_dets, + iou_thrs=np.arange(0.5, 0.96, 0.05)): + if mmcv.is_str(results): + assert results.endswith('.pkl') + results = mmcv.load(results) + elif not isinstance(results, list): + raise TypeError( + 'results must be a list of numpy arrays or a filename, not {}'. + format(type(results))) + + gt_bboxes = [] + img_ids = coco.getImgIds() + for i in range(len(img_ids)): + ann_ids = coco.getAnnIds(imgIds=img_ids[i]) + ann_info = coco.loadAnns(ann_ids) + if len(ann_info) == 0: + gt_bboxes.append(np.zeros((0, 4))) + continue + bboxes = [] + for ann in ann_info: + if ann.get('ignore', False) or ann['iscrowd']: + continue + x1, y1, w, h = ann['bbox'] + bboxes.append([x1, y1, x1 + w, y1 + h]) + bboxes = np.array(bboxes, dtype=np.float32) + if bboxes.shape[0] == 0: + bboxes = np.zeros((0, 4)) + gt_bboxes.append(bboxes) + + recalls = eval_recalls( + gt_bboxes, results, max_dets, iou_thrs, print_summary=False) + ar = recalls.mean(axis=1) + return ar + + +def xyxy2xywh(bbox): + _bbox = bbox.tolist() + return [ + _bbox[0], + _bbox[1], + _bbox[2] - _bbox[0], + _bbox[3] - _bbox[1], + ] + + +def proposal2json(dataset, results): + json_results = [] + for idx in range(len(dataset)): + img_id = dataset.img_ids[idx] + bboxes = results[idx] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = 1 + json_results.append(data) + return json_results + + +def det2json(dataset, results): + json_results = [] + for idx in range(len(dataset)): + img_id = dataset.img_ids[idx] + result = results[idx] + for label in range(len(result)): + bboxes = result[label] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = dataset.cat_ids[label] + json_results.append(data) + return json_results + + +def segm2json(dataset, results): + bbox_json_results = [] + segm_json_results = [] + for idx in range(len(dataset)): + img_id = dataset.img_ids[idx] + det, seg = results[idx] + for label in range(len(det)): + # bbox results + bboxes = det[label] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = dataset.cat_ids[label] + bbox_json_results.append(data) + + # segm results + # some detectors use different score for det and segm + if isinstance(seg, tuple): + segms = seg[0][label] + mask_score = seg[1][label] + else: + segms = seg[label] + mask_score = [bbox[4] for bbox in bboxes] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = xyxy2xywh(bboxes[i]) + data['score'] = float(mask_score[i]) + data['category_id'] = dataset.cat_ids[label] + if isinstance(segms[i]['counts'], bytes): + segms[i]['counts'] = segms[i]['counts'].decode() + data['segmentation'] = segms[i] + segm_json_results.append(data) + return bbox_json_results, segm_json_results + + +def results2json(dataset, results, out_file): + result_files = dict() + if isinstance(results[0], list): + json_results = 
det2json(dataset, results) + result_files['bbox'] = '{}.{}.json'.format(out_file, 'bbox') + result_files['proposal'] = '{}.{}.json'.format(out_file, 'bbox') + mmcv.dump(json_results, result_files['bbox']) + elif isinstance(results[0], tuple): + json_results = segm2json(dataset, results) + result_files['bbox'] = '{}.{}.json'.format(out_file, 'bbox') + result_files['proposal'] = '{}.{}.json'.format(out_file, 'bbox') + result_files['segm'] = '{}.{}.json'.format(out_file, 'segm') + mmcv.dump(json_results[0], result_files['bbox']) + mmcv.dump(json_results[1], result_files['segm']) + elif isinstance(results[0], np.ndarray): + json_results = proposal2json(dataset, results) + result_files['proposal'] = '{}.{}.json'.format(out_file, 'proposal') + mmcv.dump(json_results, result_files['proposal']) + else: + raise TypeError('invalid type of results') + return result_files + + +def getImgIds(coco, imgIds=[], catIds=[]): + ''' + Get img ids that satisfy given filter conditions. + Different from the coco.getImgIds, this function returns the id if + the img contains one of the cat rather than all. + :param imgIds (int array) : get imgs for given ids + :param catIds (int array) : get imgs with all given cats + :return: ids (int array) : integer array of img ids + ''' + if len(imgIds) == len(catIds) == 0: + ids = coco.imgs.keys() + else: + ids = set(imgIds) + for i, catId in enumerate(catIds): + if i == 0 and len(ids) == 0: + ids = set(coco.catToImgs[catId]) + else: + ids |= set(coco.catToImgs[catId]) + return list(ids) diff --git a/mmdet3d/core/evaluation/eval_hooks.py b/mmdet3d/core/evaluation/eval_hooks.py new file mode 100644 index 0000000000..ccb3cbd73a --- /dev/null +++ b/mmdet3d/core/evaluation/eval_hooks.py @@ -0,0 +1,204 @@ +import os +import os.path as osp + +import mmcv +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import collate, scatter +from mmcv.runner import Hook +from pycocotools.cocoeval import COCOeval +from torch.utils.data import Dataset + +from mmdet3d import datasets +from .coco_utils import fast_eval_recall, results2json +from .mean_ap import eval_map + + +class DistEvalHook(Hook): + + def __init__(self, dataset, interval=1): + if isinstance(dataset, Dataset): + self.dataset = dataset + elif isinstance(dataset, dict): + self.dataset = datasets.build_dataset(dataset, {'test_mode': True}) + else: + raise TypeError( + 'dataset must be a Dataset object or a dict, not {}'.format( + type(dataset))) + self.interval = interval + + def after_train_epoch(self, runner): + if not self.every_n_epochs(runner, self.interval): + return + runner.model.eval() + results = [None for _ in range(len(self.dataset))] + if runner.rank == 0: + prog_bar = mmcv.ProgressBar(len(self.dataset)) + for idx in range(runner.rank, len(self.dataset), runner.world_size): + data = self.dataset[idx] + data_gpu = scatter( + collate([data], samples_per_gpu=1), + [torch.cuda.current_device()])[0] + + # compute output + with torch.no_grad(): + result = runner.model( + return_loss=False, rescale=True, **data_gpu) + results[idx] = result + + batch_size = runner.world_size + if runner.rank == 0: + for _ in range(batch_size): + prog_bar.update() + + if runner.rank == 0: + print('\n') + dist.barrier() + for i in range(1, runner.world_size): + tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i)) + tmp_results = mmcv.load(tmp_file) + for idx in range(i, len(results), runner.world_size): + results[idx] = tmp_results[idx] + os.remove(tmp_file) + self.evaluate(runner, results) + else: + 
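+            # Non-zero ranks dump their partial results to a temporary
+            # pickle in the work dir; rank 0 loads, merges and evaluates
+            # them in the branch above.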
tmp_file = osp.join(runner.work_dir, + 'temp_{}.pkl'.format(runner.rank)) + mmcv.dump(results, tmp_file) + dist.barrier() + dist.barrier() + + def evaluate(self): + raise NotImplementedError + + +class DistEvalmAPHook(DistEvalHook): + + def evaluate(self, runner, results): + gt_bboxes = [] + gt_labels = [] + gt_ignore = [] + for i in range(len(self.dataset)): + ann = self.dataset.get_ann_info(i) + bboxes = ann['bboxes'] + labels = ann['labels'] + if 'bboxes_ignore' in ann: + ignore = np.concatenate([ + np.zeros(bboxes.shape[0], dtype=np.bool), + np.ones(ann['bboxes_ignore'].shape[0], dtype=np.bool) + ]) + gt_ignore.append(ignore) + bboxes = np.vstack([bboxes, ann['bboxes_ignore']]) + labels = np.concatenate([labels, ann['labels_ignore']]) + gt_bboxes.append(bboxes) + gt_labels.append(labels) + if not gt_ignore: + gt_ignore = None + # If the dataset is VOC2007, then use 11 points mAP evaluation. + if hasattr(self.dataset, 'year') and self.dataset.year == 2007: + ds_name = 'voc07' + else: + ds_name = self.dataset.CLASSES + mean_ap, eval_results = eval_map( + results, + gt_bboxes, + gt_labels, + gt_ignore=gt_ignore, + scale_ranges=None, + iou_thr=0.5, + dataset=ds_name, + print_summary=True) + runner.log_buffer.output['mAP'] = mean_ap + runner.log_buffer.ready = True + + +class KittiDistEvalmAPHook(DistEvalHook): + + def evaluate(self, runner, results): + tmp_file = osp.join(runner.work_dir, 'temp_0') + if not isinstance(results[0], dict): + result_files = self.dataset.reformat_bbox(results, tmp_file) + paste_result, ret_dict = self.dataset.evaluate(result_files) + for ap_cls, ap_result in ret_dict.items(): + for ap_type, ap in ap_result.items(): + key = f'{ap_cls}_{ap_type}' + val = float('{:.4f}'.format(ap)) + runner.log_buffer.output[key] = val + else: + for name in results[0]: + print('\nEvaluating {}'.format(name)) + results_ = [out[name] for out in results] + tmp_file_ = osp.join(tmp_file, name) + result_files = self.dataset.reformat_bbox(results_, tmp_file_) + paste_result, ret_dict = self.dataset.evaluate( + result_files, name) + for ap_cls, ap_result in ret_dict.items(): + for ap_type, ap in ap_result.items(): + key = f'{name}/{ap_cls}_{ap_type}' + val = float('{:.4f}'.format(ap)) + runner.log_buffer.output[key] = val + runner.log_buffer.ready = True + + +class CocoDistEvalRecallHook(DistEvalHook): + + def __init__(self, + dataset, + interval=1, + proposal_nums=(100, 300, 1000), + iou_thrs=np.arange(0.5, 0.96, 0.05)): + super(CocoDistEvalRecallHook, self).__init__( + dataset, interval=interval) + self.proposal_nums = np.array(proposal_nums, dtype=np.int32) + self.iou_thrs = np.array(iou_thrs, dtype=np.float32) + + def evaluate(self, runner, results): + # the official coco evaluation is too slow, here we use our own + # implementation instead, which may get slightly different results + ar = fast_eval_recall(results, self.dataset.coco, self.proposal_nums, + self.iou_thrs) + for i, num in enumerate(self.proposal_nums): + runner.log_buffer.output['AR@{}'.format(num)] = ar[i] + runner.log_buffer.ready = True + + +class CocoDistEvalmAPHook(DistEvalHook): + + def evaluate(self, runner, results): + tmp_file = osp.join(runner.work_dir, 'temp_0') + result_files = results2json(self.dataset, results, tmp_file) + + res_types = ['bbox', 'segm' + ] if runner.model.module.with_mask else ['bbox'] + cocoGt = self.dataset.coco + # load image based on cat_ids + if len(self.dataset.cat_ids) < len(self.dataset.CLASSES): + from .coco_utils import getImgIds + imgIds = getImgIds(cocoGt, 
catIds=self.dataset.cat_ids) + else: + imgIds = cocoGt.getImgIds() + for res_type in res_types: + try: + cocoDt = cocoGt.loadRes(result_files[res_type]) + except IndexError: + print('No prediction found.') + break + iou_type = res_type + cocoEval = COCOeval(cocoGt, cocoDt, iou_type) + cocoEval.params.catIds = self.dataset.cat_ids + cocoEval.params.imgIds = imgIds + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + metrics = ['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'] + for i in range(len(metrics)): + key = '{}_{}'.format(res_type, metrics[i]) + val = float('{:.3f}'.format(cocoEval.stats[i])) + runner.log_buffer.output[key] = val + runner.log_buffer.output['{}_mAP_copypaste'.format(res_type)] = ( + '{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' + '{ap[4]:.3f} {ap[5]:.3f}').format(ap=cocoEval.stats[:6]) + runner.log_buffer.ready = True + for res_type in res_types: + os.remove(result_files[res_type]) diff --git a/mmdet3d/core/evaluation/kitti_utils/__init__.py b/mmdet3d/core/evaluation/kitti_utils/__init__.py new file mode 100644 index 0000000000..b1fc7bc3dc --- /dev/null +++ b/mmdet3d/core/evaluation/kitti_utils/__init__.py @@ -0,0 +1,3 @@ +from .eval import kitti_eval, kitti_eval_coco_style + +__all__ = ['kitti_eval', 'kitti_eval_coco_style'] diff --git a/mmdet3d/core/evaluation/kitti_utils/eval.py b/mmdet3d/core/evaluation/kitti_utils/eval.py new file mode 100644 index 0000000000..d9591892dd --- /dev/null +++ b/mmdet3d/core/evaluation/kitti_utils/eval.py @@ -0,0 +1,814 @@ +import gc +import io as sysio + +import numba +import numpy as np + + +@numba.jit +def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41): + scores.sort() + scores = scores[::-1] + current_recall = 0 + thresholds = [] + for i, score in enumerate(scores): + l_recall = (i + 1) / num_gt + if i < (len(scores) - 1): + r_recall = (i + 2) / num_gt + else: + r_recall = l_recall + if (((r_recall - current_recall) < (current_recall - l_recall)) + and (i < (len(scores) - 1))): + continue + # recall = l_recall + thresholds.append(score) + current_recall += 1 / (num_sample_pts - 1.0) + return thresholds + + +def clean_data(gt_anno, dt_anno, current_class, difficulty): + CLASS_NAMES = ['car', 'pedestrian', 'cyclist'] + MIN_HEIGHT = [40, 25, 25] + MAX_OCCLUSION = [0, 1, 2] + MAX_TRUNCATION = [0.15, 0.3, 0.5] + dc_bboxes, ignored_gt, ignored_dt = [], [], [] + current_cls_name = CLASS_NAMES[current_class].lower() + num_gt = len(gt_anno['name']) + num_dt = len(dt_anno['name']) + num_valid_gt = 0 + for i in range(num_gt): + bbox = gt_anno['bbox'][i] + gt_name = gt_anno['name'][i].lower() + height = bbox[3] - bbox[1] + valid_class = -1 + if (gt_name == current_cls_name): + valid_class = 1 + elif (current_cls_name == 'Pedestrian'.lower() + and 'Person_sitting'.lower() == gt_name): + valid_class = 0 + elif (current_cls_name == 'Car'.lower() and 'Van'.lower() == gt_name): + valid_class = 0 + else: + valid_class = -1 + ignore = False + if ((gt_anno['occluded'][i] > MAX_OCCLUSION[difficulty]) + or (gt_anno['truncated'][i] > MAX_TRUNCATION[difficulty]) + or (height <= MIN_HEIGHT[difficulty])): + ignore = True + if valid_class == 1 and not ignore: + ignored_gt.append(0) + num_valid_gt += 1 + elif (valid_class == 0 or (ignore and (valid_class == 1))): + ignored_gt.append(1) + else: + ignored_gt.append(-1) + # for i in range(num_gt): + if gt_anno['name'][i] == 'DontCare': + dc_bboxes.append(gt_anno['bbox'][i]) + for i in range(num_dt): + if (dt_anno['name'][i].lower() == current_cls_name): + valid_class = 
1 + else: + valid_class = -1 + height = abs(dt_anno['bbox'][i, 3] - dt_anno['bbox'][i, 1]) + if height < MIN_HEIGHT[difficulty]: + ignored_dt.append(1) + elif valid_class == 1: + ignored_dt.append(0) + else: + ignored_dt.append(-1) + + return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes + + +@numba.jit(nopython=True) +def image_box_overlap(boxes, query_boxes, criterion=-1): + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + qbox_area = ((query_boxes[k, 2] - query_boxes[k, 0]) * + (query_boxes[k, 3] - query_boxes[k, 1])) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0])) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1])) + if ih > 0: + if criterion == -1: + ua = ((boxes[n, 2] - boxes[n, 0]) * + (boxes[n, 3] - boxes[n, 1]) + qbox_area - + iw * ih) + elif criterion == 0: + ua = ((boxes[n, 2] - boxes[n, 0]) * + (boxes[n, 3] - boxes[n, 1])) + elif criterion == 1: + ua = qbox_area + else: + ua = 1.0 + overlaps[n, k] = iw * ih / ua + return overlaps + + +def bev_box_overlap(boxes, qboxes, criterion=-1): + from .rotate_iou import rotate_iou_gpu_eval + riou = rotate_iou_gpu_eval(boxes, qboxes, criterion) + return riou + + +@numba.jit(nopython=True, parallel=True) +def d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1): + # ONLY support overlap in CAMERA, not lidar. + # TODO: change to use prange for parallel mode, should check the difference + N, K = boxes.shape[0], qboxes.shape[0] + for i in numba.prange(N): + for j in numba.prange(K): + if rinc[i, j] > 0: + # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] + + # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1])) + iw = ( + min(boxes[i, 1], qboxes[j, 1]) - + max(boxes[i, 1] - boxes[i, 4], + qboxes[j, 1] - qboxes[j, 4])) + + if iw > 0: + area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5] + area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5] + inc = iw * rinc[i, j] + if criterion == -1: + ua = (area1 + area2 - inc) + elif criterion == 0: + ua = area1 + elif criterion == 1: + ua = area2 + else: + ua = inc + rinc[i, j] = inc / ua + else: + rinc[i, j] = 0.0 + + +def d3_box_overlap(boxes, qboxes, criterion=-1): + from .rotate_iou import rotate_iou_gpu_eval + rinc = rotate_iou_gpu_eval(boxes[:, [0, 2, 3, 5, 6]], + qboxes[:, [0, 2, 3, 5, 6]], 2) + d3_box_overlap_kernel(boxes, qboxes, rinc, criterion) + return rinc + + +@numba.jit(nopython=True) +def compute_statistics_jit(overlaps, + gt_datas, + dt_datas, + ignored_gt, + ignored_det, + dc_bboxes, + metric, + min_overlap, + thresh=0, + compute_fp=False, + compute_aos=False): + + det_size = dt_datas.shape[0] + gt_size = gt_datas.shape[0] + dt_scores = dt_datas[:, -1] + dt_alphas = dt_datas[:, 4] + gt_alphas = gt_datas[:, 4] + dt_bboxes = dt_datas[:, :4] + # gt_bboxes = gt_datas[:, :4] + + assigned_detection = [False] * det_size + ignored_threshold = [False] * det_size + if compute_fp: + for i in range(det_size): + if (dt_scores[i] < thresh): + ignored_threshold[i] = True + NO_DETECTION = -10000000 + tp, fp, fn, similarity = 0, 0, 0, 0 + # thresholds = [0.0] + # delta = [0.0] + thresholds = np.zeros((gt_size, )) + thresh_idx = 0 + delta = np.zeros((gt_size, )) + delta_idx = 0 + for i in range(gt_size): + if ignored_gt[i] == -1: + continue + det_idx = -1 + valid_detection = NO_DETECTION + max_overlap = 0 + assigned_ignored_det = False + + for j in range(det_size): + if (ignored_det[j] == -1): + continue + if 
(assigned_detection[j]): + continue + if (ignored_threshold[j]): + continue + overlap = overlaps[j, i] + dt_score = dt_scores[j] + if (not compute_fp and (overlap > min_overlap) + and dt_score > valid_detection): + det_idx = j + valid_detection = dt_score + elif (compute_fp and (overlap > min_overlap) + and (overlap > max_overlap or assigned_ignored_det) + and ignored_det[j] == 0): + max_overlap = overlap + det_idx = j + valid_detection = 1 + assigned_ignored_det = False + elif (compute_fp and (overlap > min_overlap) + and (valid_detection == NO_DETECTION) + and ignored_det[j] == 1): + det_idx = j + valid_detection = 1 + assigned_ignored_det = True + + if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: + fn += 1 + elif ((valid_detection != NO_DETECTION) + and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)): + assigned_detection[det_idx] = True + elif valid_detection != NO_DETECTION: + tp += 1 + # thresholds.append(dt_scores[det_idx]) + thresholds[thresh_idx] = dt_scores[det_idx] + thresh_idx += 1 + if compute_aos: + # delta.append(gt_alphas[i] - dt_alphas[det_idx]) + delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx] + delta_idx += 1 + + assigned_detection[det_idx] = True + if compute_fp: + for i in range(det_size): + if (not (assigned_detection[i] or ignored_det[i] == -1 + or ignored_det[i] == 1 or ignored_threshold[i])): + fp += 1 + nstuff = 0 + if metric == 0: + overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) + for i in range(dc_bboxes.shape[0]): + for j in range(det_size): + if (assigned_detection[j]): + continue + if (ignored_det[j] == -1 or ignored_det[j] == 1): + continue + if (ignored_threshold[j]): + continue + if overlaps_dt_dc[j, i] > min_overlap: + assigned_detection[j] = True + nstuff += 1 + fp -= nstuff + if compute_aos: + tmp = np.zeros((fp + delta_idx, )) + # tmp = [0] * fp + for i in range(delta_idx): + tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0 + # tmp.append((1.0 + np.cos(delta[i])) / 2.0) + # assert len(tmp) == fp + tp + # assert len(delta) == tp + if tp > 0 or fp > 0: + similarity = np.sum(tmp) + else: + similarity = -1 + return tp, fp, fn, similarity, thresholds[:thresh_idx] + + +def get_split_parts(num, num_part): + same_part = num // num_part + remain_num = num % num_part + if remain_num == 0: + return [same_part] * num_part + else: + return [same_part] * num_part + [remain_num] + + +@numba.jit(nopython=True) +def fused_compute_statistics(overlaps, + pr, + gt_nums, + dt_nums, + dc_nums, + gt_datas, + dt_datas, + dontcares, + ignored_gts, + ignored_dets, + metric, + min_overlap, + thresholds, + compute_aos=False): + gt_num = 0 + dt_num = 0 + dc_num = 0 + for i in range(gt_nums.shape[0]): + for t, thresh in enumerate(thresholds): + overlap = overlaps[dt_num:dt_num + dt_nums[i], + gt_num:gt_num + gt_nums[i]] + + gt_data = gt_datas[gt_num:gt_num + gt_nums[i]] + dt_data = dt_datas[dt_num:dt_num + dt_nums[i]] + ignored_gt = ignored_gts[gt_num:gt_num + gt_nums[i]] + ignored_det = ignored_dets[dt_num:dt_num + dt_nums[i]] + dontcare = dontcares[dc_num:dc_num + dc_nums[i]] + tp, fp, fn, similarity, _ = compute_statistics_jit( + overlap, + gt_data, + dt_data, + ignored_gt, + ignored_det, + dontcare, + metric, + min_overlap=min_overlap, + thresh=thresh, + compute_fp=True, + compute_aos=compute_aos) + pr[t, 0] += tp + pr[t, 1] += fp + pr[t, 2] += fn + if similarity != -1: + pr[t, 3] += similarity + gt_num += gt_nums[i] + dt_num += dt_nums[i] + dc_num += dc_nums[i] + + +def calculate_iou_partly(gt_annos, dt_annos, metric, num_parts=50): + """fast iou 
algorithm. this function can be used independently to + do result analysis. Must be used in CAMERA coordinate system. + Args: + gt_annos: dict, must from get_label_annos() in kitti_common.py + dt_annos: dict, must from get_label_annos() in kitti_common.py + metric: eval type. 0: bbox, 1: bev, 2: 3d + num_parts: int. a parameter for fast calculate algorithm + """ + assert len(gt_annos) == len(dt_annos) + total_dt_num = np.stack([len(a['name']) for a in dt_annos], 0) + total_gt_num = np.stack([len(a['name']) for a in gt_annos], 0) + num_examples = len(gt_annos) + split_parts = get_split_parts(num_examples, num_parts) + parted_overlaps = [] + example_idx = 0 + + for num_part in split_parts: + gt_annos_part = gt_annos[example_idx:example_idx + num_part] + dt_annos_part = dt_annos[example_idx:example_idx + num_part] + if metric == 0: + gt_boxes = np.concatenate([a['bbox'] for a in gt_annos_part], 0) + dt_boxes = np.concatenate([a['bbox'] for a in dt_annos_part], 0) + overlap_part = image_box_overlap(gt_boxes, dt_boxes) + elif metric == 1: + loc = np.concatenate( + [a['location'][:, [0, 2]] for a in gt_annos_part], 0) + dims = np.concatenate( + [a['dimensions'][:, [0, 2]] for a in gt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + loc = np.concatenate( + [a['location'][:, [0, 2]] for a in dt_annos_part], 0) + dims = np.concatenate( + [a['dimensions'][:, [0, 2]] for a in dt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + overlap_part = bev_box_overlap(gt_boxes, + dt_boxes).astype(np.float64) + elif metric == 2: + loc = np.concatenate([a['location'] for a in gt_annos_part], 0) + dims = np.concatenate([a['dimensions'] for a in gt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + loc = np.concatenate([a['location'] for a in dt_annos_part], 0) + dims = np.concatenate([a['dimensions'] for a in dt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + overlap_part = d3_box_overlap(gt_boxes, + dt_boxes).astype(np.float64) + else: + raise ValueError('unknown metric') + parted_overlaps.append(overlap_part) + example_idx += num_part + overlaps = [] + example_idx = 0 + for j, num_part in enumerate(split_parts): + gt_annos_part = gt_annos[example_idx:example_idx + num_part] + dt_annos_part = dt_annos[example_idx:example_idx + num_part] + gt_num_idx, dt_num_idx = 0, 0 + for i in range(num_part): + gt_box_num = total_gt_num[example_idx + i] + dt_box_num = total_dt_num[example_idx + i] + overlaps.append( + parted_overlaps[j][gt_num_idx:gt_num_idx + gt_box_num, + dt_num_idx:dt_num_idx + dt_box_num]) + gt_num_idx += gt_box_num + dt_num_idx += dt_box_num + example_idx += num_part + + return overlaps, parted_overlaps, total_gt_num, total_dt_num + + +def _prepare_data(gt_annos, dt_annos, current_class, difficulty): + gt_datas_list = [] + dt_datas_list = [] + total_dc_num = [] + ignored_gts, ignored_dets, dontcares = [], [], [] + total_num_valid_gt = 0 + for i in range(len(gt_annos)): + rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty) + num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets + ignored_gts.append(np.array(ignored_gt, dtype=np.int64)) + 
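+        # ignore flags come from clean_data(): 0 = evaluate, 1 = ignore,
+        # -1 = not the current class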
ignored_dets.append(np.array(ignored_det, dtype=np.int64)) + if len(dc_bboxes) == 0: + dc_bboxes = np.zeros((0, 4)).astype(np.float64) + else: + dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64) + total_dc_num.append(dc_bboxes.shape[0]) + dontcares.append(dc_bboxes) + total_num_valid_gt += num_valid_gt + gt_datas = np.concatenate( + [gt_annos[i]['bbox'], gt_annos[i]['alpha'][..., np.newaxis]], 1) + dt_datas = np.concatenate([ + dt_annos[i]['bbox'], dt_annos[i]['alpha'][..., np.newaxis], + dt_annos[i]['score'][..., np.newaxis] + ], 1) + gt_datas_list.append(gt_datas) + dt_datas_list.append(dt_datas) + total_dc_num = np.stack(total_dc_num, axis=0) + return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, + total_dc_num, total_num_valid_gt) + + +def eval_class(gt_annos, + dt_annos, + current_classes, + difficultys, + metric, + min_overlaps, + compute_aos=False, + num_parts=200): + """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. + Args: + gt_annos: dict, must from get_label_annos() in kitti_common.py + dt_annos: dict, must from get_label_annos() in kitti_common.py + current_classes: list of int, 0: car, 1: pedestrian, 2: cyclist + difficultys: list of int. eval difficulty, 0: easy, 1: normal, 2: hard + metric: eval type. 0: bbox, 1: bev, 2: 3d + min_overlaps: float, min overlap. format: [num_overlap, metric, class]. + num_parts: int. a parameter for fast calculate algorithm + + Returns: + dict of recall, precision and aos + """ + assert len(gt_annos) == len(dt_annos) + num_examples = len(gt_annos) + split_parts = get_split_parts(num_examples, num_parts) + + rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) + overlaps, parted_overlaps, total_dt_num, total_gt_num = rets + N_SAMPLE_PTS = 41 + num_minoverlap = len(min_overlaps) + num_class = len(current_classes) + num_difficulty = len(difficultys) + precision = np.zeros( + [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + recall = np.zeros( + [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + for m, current_class in enumerate(current_classes): + for l, difficulty in enumerate(difficultys): + rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) + (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, + dontcares, total_dc_num, total_num_valid_gt) = rets + for k, min_overlap in enumerate(min_overlaps[:, metric, m]): + thresholdss = [] + for i in range(len(gt_annos)): + rets = compute_statistics_jit( + overlaps[i], + gt_datas_list[i], + dt_datas_list[i], + ignored_gts[i], + ignored_dets[i], + dontcares[i], + metric, + min_overlap=min_overlap, + thresh=0.0, + compute_fp=False) + tp, fp, fn, similarity, thresholds = rets + thresholdss += thresholds.tolist() + thresholdss = np.array(thresholdss) + thresholds = get_thresholds(thresholdss, total_num_valid_gt) + thresholds = np.array(thresholds) + pr = np.zeros([len(thresholds), 4]) + idx = 0 + for j, num_part in enumerate(split_parts): + gt_datas_part = np.concatenate( + gt_datas_list[idx:idx + num_part], 0) + dt_datas_part = np.concatenate( + dt_datas_list[idx:idx + num_part], 0) + dc_datas_part = np.concatenate( + dontcares[idx:idx + num_part], 0) + ignored_dets_part = np.concatenate( + ignored_dets[idx:idx + num_part], 0) + ignored_gts_part = np.concatenate( + ignored_gts[idx:idx + num_part], 0) + fused_compute_statistics( + parted_overlaps[j], + pr, + total_gt_num[idx:idx + num_part], + total_dt_num[idx:idx + num_part], + 
total_dc_num[idx:idx + num_part], + gt_datas_part, + dt_datas_part, + dc_datas_part, + ignored_gts_part, + ignored_dets_part, + metric, + min_overlap=min_overlap, + thresholds=thresholds, + compute_aos=compute_aos) + idx += num_part + for i in range(len(thresholds)): + recall[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) + precision[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1]) + if compute_aos: + aos[m, l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1]) + for i in range(len(thresholds)): + precision[m, l, k, i] = np.max( + precision[m, l, k, i:], axis=-1) + recall[m, l, k, i] = np.max(recall[m, l, k, i:], axis=-1) + if compute_aos: + aos[m, l, k, i] = np.max(aos[m, l, k, i:], axis=-1) + ret_dict = { + 'recall': recall, + 'precision': precision, + 'orientation': aos, + } + + # clean temp variables + del overlaps + del parted_overlaps + + gc.collect() + return ret_dict + + +def get_mAP(prec): + sums = 0 + for i in range(0, prec.shape[-1], 4): + sums = sums + prec[..., i] + return sums / 11 * 100 + + +def print_str(value, *arg, sstream=None): + if sstream is None: + sstream = sysio.StringIO() + sstream.truncate(0) + sstream.seek(0) + print(value, *arg, file=sstream) + return sstream.getvalue() + + +def do_eval(gt_annos, + dt_annos, + current_classes, + min_overlaps, + eval_types=['bbox', 'bev', '3d']): + # min_overlaps: [num_minoverlap, metric, num_class] + difficultys = [0, 1, 2] + ret = eval_class( + gt_annos, + dt_annos, + current_classes, + difficultys, + 0, + min_overlaps, + compute_aos=('aos' in eval_types)) + # ret: [num_class, num_diff, num_minoverlap, num_sample_points] + mAP_bbox = get_mAP(ret['precision']) + mAP_aos = None + if 'aos' in eval_types: + mAP_aos = get_mAP(ret['orientation']) + + mAP_bev = None + if 'bev' in eval_types: + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 1, + min_overlaps) + mAP_bev = get_mAP(ret['precision']) + + mAP_3d = None + if '3d' in eval_types: + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, + min_overlaps) + mAP_3d = get_mAP(ret['precision']) + return mAP_bbox, mAP_bev, mAP_3d, mAP_aos + + +def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, + compute_aos): + # overlap_ranges: [range, metric, num_class] + min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]]) + for i in range(overlap_ranges.shape[1]): + for j in range(overlap_ranges.shape[2]): + min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j]) + mAP_bbox, mAP_bev, mAP_3d, mAP_aos = do_eval(gt_annos, dt_annos, + current_classes, min_overlaps, + compute_aos) + # ret: [num_class, num_diff, num_minoverlap] + mAP_bbox = mAP_bbox.mean(-1) + mAP_bev = mAP_bev.mean(-1) + mAP_3d = mAP_3d.mean(-1) + if mAP_aos is not None: + mAP_aos = mAP_aos.mean(-1) + return mAP_bbox, mAP_bev, mAP_3d, mAP_aos + + +def kitti_eval(gt_annos, + dt_annos, + current_classes, + eval_types=['bbox', 'bev', '3d']): + assert 'bbox' in eval_types, 'must evaluate bbox at least' + overlap_0_7 = np.array([[0.7, 0.5, 0.5, 0.7, + 0.5], [0.7, 0.5, 0.5, 0.7, 0.5], + [0.7, 0.5, 0.5, 0.7, 0.5]]) + overlap_0_5 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5], + [0.5, 0.25, 0.25, 0.5, 0.25], + [0.5, 0.25, 0.25, 0.5, 0.25]]) + min_overlaps = np.stack([overlap_0_7, overlap_0_5], axis=0) # [2, 3, 5] + class_to_name = { + 0: 'Car', + 1: 'Pedestrian', + 2: 'Cyclist', + 3: 'Van', + 4: 'Person_sitting', + } + name_to_class = {v: n for n, v in class_to_name.items()} + if not isinstance(current_classes, (list, tuple)): + current_classes = [current_classes] + current_classes_int = [] + 
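+    # current_classes may mix name strings (e.g. 'Car') and integer ids;
+    # normalize everything to the integer ids defined in class_to_name.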
for curcls in current_classes: + if isinstance(curcls, str): + current_classes_int.append(name_to_class[curcls]) + else: + current_classes_int.append(curcls) + current_classes = current_classes_int + min_overlaps = min_overlaps[:, :, current_classes] + result = '' + # check whether alpha is valid + compute_aos = False + for anno in dt_annos: + if anno['alpha'].shape[0] != 0: + if anno['alpha'][0] != -10: + compute_aos = True + eval_types.append('aos') + break + + mAPbbox, mAPbev, mAP3d, mAPaos = do_eval(gt_annos, dt_annos, + current_classes, min_overlaps, + eval_types) + + ret_dict = {} + difficulty = ['easy', 'moderate', 'hard'] + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + curcls_name = class_to_name[curcls] + ret_dict[curcls_name] = {} + for i in range(min_overlaps.shape[0]): + # prepare results for print + result += ('{} AP@{:.2f}, {:.2f}, {:.2f}:\n'.format( + curcls_name, *min_overlaps[i, :, j])) + if mAPbbox is not None: + result += ( + 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbbox[j, :, + i])) + if mAPbev is not None: + result += ( + 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbev[j, :, + i])) + if mAP3d is not None: + result += ( + '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP3d[j, :, i])) + + if compute_aos: + result += ( + 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(*mAPaos[j, :, + i])) + + # prepare results for logger + for idx in range(3): + postfix = '{}_{}'.format(difficulty[idx], min_overlaps[i, idx, + j]) + if mAP3d is not None: + ret_dict[curcls_name]['3D_{}'.format(postfix)] = mAP3d[j, + idx, + i] + if mAPbev is not None: + ret_dict[curcls_name]['BEV_{}'.format(postfix)] = mAPbev[ + j, idx, i] + if mAPbbox is not None: + ret_dict[curcls_name]['2D_{}'.format(postfix)] = mAPbbox[ + j, idx, i] + + # calculate mAP over all classes if there are multiple classes + if len(current_classes) > 1: + # prepare results for print + result += ('\nOverall AP@{}, {}, {}:\n'.format(*difficulty)) + if mAPbbox is not None: + mAPbbox = mAPbbox.mean(axis=0) + result += ('bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbbox[:, + 0])) + if mAPbev is not None: + mAPbev = mAPbev.mean(axis=0) + result += ('bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbev[:, + 0])) + if mAP3d is not None: + mAP3d = mAP3d.mean(axis=0) + result += ('3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP3d[:, 0])) + if compute_aos: + mAPaos = mAPaos.mean(axis=0) + result += ('aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(*mAPaos[:, + 0])) + + # prepare results for logger + ret_dict['Overall'] = dict() + for idx in range(3): + postfix = '{}'.format(difficulty[idx]) + if mAP3d is not None: + ret_dict['Overall']['3D_{}'.format(postfix)] = mAP3d[idx, 0] + if mAPbev is not None: + ret_dict['Overall']['BEV_{}'.format(postfix)] = mAPbev[idx, 0] + if mAPbbox is not None: + ret_dict['Overall']['2D_{}'.format(postfix)] = mAPbbox[idx, 0] + print(result) + return result, ret_dict + + +def kitti_eval_coco_style(gt_annos, dt_annos, current_classes): + class_to_name = { + 0: 'Car', + 1: 'Pedestrian', + 2: 'Cyclist', + 3: 'Van', + 4: 'Person_sitting', + } + class_to_range = { + 0: [0.5, 0.95, 10], + 1: [0.25, 0.7, 10], + 2: [0.25, 0.7, 10], + 3: [0.5, 0.95, 10], + 4: [0.25, 0.7, 10], + } + name_to_class = {v: n for n, v in class_to_name.items()} + if not isinstance(current_classes, (list, tuple)): + current_classes = [current_classes] + current_classes_int = [] + for curcls in current_classes: + if isinstance(curcls, str): + 
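+            # translate a class name string such as 'Car' to its integer id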
current_classes_int.append(name_to_class[curcls]) + else: + current_classes_int.append(curcls) + current_classes = current_classes_int + overlap_ranges = np.zeros([3, 3, len(current_classes)]) + for i, curcls in enumerate(current_classes): + overlap_ranges[:, :, i] = np.array(class_to_range[curcls])[:, + np.newaxis] + result = '' + # check whether alpha is valid + compute_aos = False + for anno in dt_annos: + if anno['alpha'].shape[0] != 0: + if anno['alpha'][0] != -10: + compute_aos = True + break + mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval( + gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos) + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + o_range = np.array(class_to_range[curcls])[[0, 2, 1]] + o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) + result += print_str((f'{class_to_name[curcls]} ' + 'coco AP@{:.2f}:{:.2f}:{:.2f}:'.format(*o_range))) + result += print_str((f'bbox AP:{mAPbbox[j, 0]:.2f}, ' + f'{mAPbbox[j, 1]:.2f}, ' + f'{mAPbbox[j, 2]:.2f}')) + result += print_str((f'bev AP:{mAPbev[j, 0]:.2f}, ' + f'{mAPbev[j, 1]:.2f}, ' + f'{mAPbev[j, 2]:.2f}')) + result += print_str((f'3d AP:{mAP3d[j, 0]:.2f}, ' + f'{mAP3d[j, 1]:.2f}, ' + f'{mAP3d[j, 2]:.2f}')) + if compute_aos: + result += print_str((f'aos AP:{mAPaos[j, 0]:.2f}, ' + f'{mAPaos[j, 1]:.2f}, ' + f'{mAPaos[j, 2]:.2f}')) + return result diff --git a/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py b/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py new file mode 100644 index 0000000000..735386943c --- /dev/null +++ b/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py @@ -0,0 +1,341 @@ +##################### +# Based on https://github.com/hongzhenwang/RRPN-revise +# Licensed under The MIT License +# Author: yanyan, scrin@foxmail.com +##################### +import math + +import numba +import numpy as np +from numba import cuda + + +@numba.jit(nopython=True) +def div_up(m, n): + return m // n + (m % n > 0) + + +@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True) +def trangle_area(a, b, c): + return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * + (b[0] - c[0])) / 2.0 + + +@cuda.jit('(float32[:], int32)', device=True, inline=True) +def area(int_pts, num_of_inter): + area_val = 0.0 + for i in range(num_of_inter - 2): + area_val += abs( + trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4], + int_pts[2 * i + 4:2 * i + 6])) + return area_val + + +@cuda.jit('(float32[:], int32)', device=True, inline=True) +def sort_vertex_in_convex_polygon(int_pts, num_of_inter): + if num_of_inter > 0: + center = cuda.local.array((2, ), dtype=numba.float32) + center[:] = 0.0 + for i in range(num_of_inter): + center[0] += int_pts[2 * i] + center[1] += int_pts[2 * i + 1] + center[0] /= num_of_inter + center[1] /= num_of_inter + v = cuda.local.array((2, ), dtype=numba.float32) + vs = cuda.local.array((16, ), dtype=numba.float32) + for i in range(num_of_inter): + v[0] = int_pts[2 * i] - center[0] + v[1] = int_pts[2 * i + 1] - center[1] + d = math.sqrt(v[0] * v[0] + v[1] * v[1]) + v[0] = v[0] / d + v[1] = v[1] / d + if v[1] < 0: + v[0] = -2 - v[0] + vs[i] = v[0] + j = 0 + temp = 0 + for i in range(1, num_of_inter): + if vs[i - 1] > vs[i]: + temp = vs[i] + tx = int_pts[2 * i] + ty = int_pts[2 * i + 1] + j = i + while j > 0 and vs[j - 1] > temp: + vs[j] = vs[j - 1] + int_pts[j * 2] = int_pts[j * 2 - 2] + int_pts[j * 2 + 1] = int_pts[j * 2 - 1] + j -= 1 + + vs[j] = temp + int_pts[j * 2] = 
tx + int_pts[j * 2 + 1] = ty + + +@cuda.jit( + '(float32[:], float32[:], int32, int32, float32[:])', + device=True, + inline=True) +def line_segment_intersection(pts1, pts2, i, j, temp_pts): + A = cuda.local.array((2, ), dtype=numba.float32) + B = cuda.local.array((2, ), dtype=numba.float32) + C = cuda.local.array((2, ), dtype=numba.float32) + D = cuda.local.array((2, ), dtype=numba.float32) + + A[0] = pts1[2 * i] + A[1] = pts1[2 * i + 1] + + B[0] = pts1[2 * ((i + 1) % 4)] + B[1] = pts1[2 * ((i + 1) % 4) + 1] + + C[0] = pts2[2 * j] + C[1] = pts2[2 * j + 1] + + D[0] = pts2[2 * ((j + 1) % 4)] + D[1] = pts2[2 * ((j + 1) % 4) + 1] + BA0 = B[0] - A[0] + BA1 = B[1] - A[1] + DA0 = D[0] - A[0] + CA0 = C[0] - A[0] + DA1 = D[1] - A[1] + CA1 = C[1] - A[1] + acd = DA1 * CA0 > CA1 * DA0 + bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0]) + if acd != bcd: + abc = CA1 * BA0 > BA1 * CA0 + abd = DA1 * BA0 > BA1 * DA0 + if abc != abd: + DC0 = D[0] - C[0] + DC1 = D[1] - C[1] + ABBA = A[0] * B[1] - B[0] * A[1] + CDDC = C[0] * D[1] - D[0] * C[1] + DH = BA1 * DC0 - BA0 * DC1 + Dx = ABBA * DC0 - BA0 * CDDC + Dy = ABBA * DC1 - BA1 * CDDC + temp_pts[0] = Dx / DH + temp_pts[1] = Dy / DH + return True + return False + + +@cuda.jit( + '(float32[:], float32[:], int32, int32, float32[:])', + device=True, + inline=True) +def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts): + a = cuda.local.array((2, ), dtype=numba.float32) + b = cuda.local.array((2, ), dtype=numba.float32) + c = cuda.local.array((2, ), dtype=numba.float32) + d = cuda.local.array((2, ), dtype=numba.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + + area_abc = trangle_area(a, b, c) + area_abd = trangle_area(a, b, d) + + if area_abc * area_abd >= 0: + return False + + area_cda = trangle_area(c, d, a) + area_cdb = area_cda + area_abc - area_abd + + if area_cda * area_cdb >= 0: + return False + t = area_cda / (area_abd - area_abc) + + dx = t * (b[0] - a[0]) + dy = t * (b[1] - a[1]) + temp_pts[0] = a[0] + dx + temp_pts[1] = a[1] + dy + return True + + +@cuda.jit('(float32, float32, float32[:])', device=True, inline=True) +def point_in_quadrilateral(pt_x, pt_y, corners): + ab0 = corners[2] - corners[0] + ab1 = corners[3] - corners[1] + + ad0 = corners[6] - corners[0] + ad1 = corners[7] - corners[1] + + ap0 = pt_x - corners[0] + ap1 = pt_y - corners[1] + + abab = ab0 * ab0 + ab1 * ab1 + abap = ab0 * ap0 + ab1 * ap1 + adad = ad0 * ad0 + ad1 * ad1 + adap = ad0 * ap0 + ad1 * ap1 + + return abab >= abap and abap >= 0 and adad >= adap and adap >= 0 + + +@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True) +def quadrilateral_intersection(pts1, pts2, int_pts): + num_of_inter = 0 + for i in range(4): + if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2): + int_pts[num_of_inter * 2] = pts1[2 * i] + int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1] + num_of_inter += 1 + if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1): + int_pts[num_of_inter * 2] = pts2[2 * i] + int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1] + num_of_inter += 1 + temp_pts = cuda.local.array((2, ), dtype=numba.float32) + for i in range(4): + for j in range(4): + has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts) + if has_pts: + int_pts[num_of_inter * 2] = temp_pts[0] + int_pts[num_of_inter * 2 + 1] = temp_pts[1] + 
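+                # the stored edge-edge crossing becomes one more vertex of
+                # the intersection polygon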
num_of_inter += 1 + + return num_of_inter + + +@cuda.jit('(float32[:], float32[:])', device=True, inline=True) +def rbbox_to_corners(corners, rbbox): + # generate clockwise corners and rotate it clockwise + angle = rbbox[4] + a_cos = math.cos(angle) + a_sin = math.sin(angle) + center_x = rbbox[0] + center_y = rbbox[1] + x_d = rbbox[2] + y_d = rbbox[3] + corners_x = cuda.local.array((4, ), dtype=numba.float32) + corners_y = cuda.local.array((4, ), dtype=numba.float32) + corners_x[0] = -x_d / 2 + corners_x[1] = -x_d / 2 + corners_x[2] = x_d / 2 + corners_x[3] = x_d / 2 + corners_y[0] = -y_d / 2 + corners_y[1] = y_d / 2 + corners_y[2] = y_d / 2 + corners_y[3] = -y_d / 2 + for i in range(4): + corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x + corners[2 * i + + 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y + + +@cuda.jit('(float32[:], float32[:])', device=True, inline=True) +def inter(rbbox1, rbbox2): + corners1 = cuda.local.array((8, ), dtype=numba.float32) + corners2 = cuda.local.array((8, ), dtype=numba.float32) + intersection_corners = cuda.local.array((16, ), dtype=numba.float32) + + rbbox_to_corners(corners1, rbbox1) + rbbox_to_corners(corners2, rbbox2) + + num_intersection = quadrilateral_intersection(corners1, corners2, + intersection_corners) + sort_vertex_in_convex_polygon(intersection_corners, num_intersection) + # print(intersection_corners.reshape([-1, 2])[:num_intersection]) + + return area(intersection_corners, num_intersection) + + +@cuda.jit('(float32[:], float32[:], int32)', device=True, inline=True) +def devRotateIoUEval(rbox1, rbox2, criterion=-1): + area1 = rbox1[2] * rbox1[3] + area2 = rbox2[2] * rbox2[3] + area_inter = inter(rbox1, rbox2) + if criterion == -1: + return area_inter / (area1 + area2 - area_inter) + elif criterion == 0: + return area_inter / area1 + elif criterion == 1: + return area_inter / area2 + else: + return area_inter + + +@cuda.jit( + '(int64, int64, float32[:], float32[:], float32[:], int32)', + fastmath=False) +def rotate_iou_kernel_eval(N, + K, + dev_boxes, + dev_query_boxes, + dev_iou, + criterion=-1): + threadsPerBlock = 8 * 8 + row_start = cuda.blockIdx.x + col_start = cuda.blockIdx.y + tx = cuda.threadIdx.x + row_size = min(N - row_start * threadsPerBlock, threadsPerBlock) + col_size = min(K - col_start * threadsPerBlock, threadsPerBlock) + block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) + block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) + + dev_query_box_idx = threadsPerBlock * col_start + tx + dev_box_idx = threadsPerBlock * row_start + tx + if (tx < col_size): + block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0] + block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1] + block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2] + block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3] + block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4] + if (tx < row_size): + block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0] + block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1] + block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2] + block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3] + block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4] + cuda.syncthreads() + if tx < row_size: + for i in range(col_size): + offset = ( + row_start * threadsPerBlock * K + col_start * threadsPerBlock + + tx * K + i) + dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5], + 
block_boxes[tx * 5:tx * 5 + 5], + criterion) + + +def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0): + """rotated box iou running in gpu. 500x faster than cpu version + (take 5ms in one example with numba.cuda code). + convert from [this project]( + https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation). + + Args: + boxes (float tensor: [N, 5]): rbboxes. format: centers, dims, + angles(clockwise when positive) + query_boxes (float tensor: [K, 5]): [description] + device_id (int, optional): Defaults to 0. [description] + + Returns: + [type]: [description] + """ + boxes = boxes.astype(np.float32) + query_boxes = query_boxes.astype(np.float32) + N = boxes.shape[0] + K = query_boxes.shape[0] + iou = np.zeros((N, K), dtype=np.float32) + if N == 0 or K == 0: + return iou + threadsPerBlock = 8 * 8 + cuda.select_device(device_id) + blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock)) + + stream = cuda.stream() + with stream.auto_synchronize(): + boxes_dev = cuda.to_device(boxes.reshape([-1]), stream) + query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream) + iou_dev = cuda.to_device(iou.reshape([-1]), stream) + rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, + stream](N, K, boxes_dev, query_boxes_dev, + iou_dev, criterion) + iou_dev.copy_to_host(iou.reshape([-1]), stream=stream) + return iou.astype(boxes.dtype) diff --git a/mmdet3d/core/evaluation/mean_ap.py b/mmdet3d/core/evaluation/mean_ap.py new file mode 100644 index 0000000000..f1d185a697 --- /dev/null +++ b/mmdet3d/core/evaluation/mean_ap.py @@ -0,0 +1,385 @@ +import mmcv +import numpy as np +from terminaltables import AsciiTable + +from .bbox_overlaps import bbox_overlaps +from .class_names import get_classes + + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). 
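+
+    For example, in 'area' mode recalls [0.5, 1.0] with precisions
+    [1.0, 0.5] give an AP of 0.75.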
+ + Args: + recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) + precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or ndarray: calculated average precision + """ + no_scale = False + if recalls.ndim == 1: + no_scale = True + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + assert recalls.shape == precisions.shape and recalls.ndim == 2 + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + if no_scale: + ap = ap[0] + return ap + + +def tpfp_imagenet(det_bboxes, + gt_bboxes, + gt_ignore, + default_iou_thr, + area_ranges=None): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): the detected bbox + gt_bboxes (ndarray): ground truth bboxes of this image + gt_ignore (ndarray): indicate if gts are ignored for evaluation or not + default_iou_thr (float): the iou thresholds for medium and large bboxes + area_ranges (list or None): gt bbox area ranges + + Returns: + tuple: two arrays (tp, fp) whose elements are 0 and 1 + """ + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp + # of a certain scale. + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] 
= 1 + else: + det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * ( + det_bboxes[:, 3] - det_bboxes[:, 1]) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + ious = bbox_overlaps(det_bboxes, gt_bboxes - 1) + gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)), + default_iou_thr) + # sort all detections by scores in descending order + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool) + else: + gt_areas = gt_w * gt_h + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + max_iou = -1 + matched_gt = -1 + # find best overlapped available gt + for j in range(num_gts): + # different from PASCAL VOC: allow finding other gts if the + # best overlaped ones are already matched by other det bboxes + if gt_covered[j]: + continue + elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou: + max_iou = ious[i, j] + matched_gt = j + # there are 4 cases for a det bbox: + # 1. it matches a gt, tp = 1, fp = 0 + # 2. it matches an ignored gt, tp = 0, fp = 0 + # 3. it matches no gt and within area range, tp = 0, fp = 1 + # 4. it matches no gt but is beyond area range, tp = 0, fp = 0 + if matched_gt >= 0: + gt_covered[matched_gt] = 1 + if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]): + tp[k, i] = 1 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def tpfp_default(det_bboxes, gt_bboxes, gt_ignore, iou_thr, area_ranges=None): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): the detected bbox + gt_bboxes (ndarray): ground truth bboxes of this image + gt_ignore (ndarray): indicate if gts are ignored for evaluation or not + iou_thr (float): the iou thresholds + + Returns: + tuple: (tp, fp), two arrays whose elements are 0 and 1 + """ + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of + # a certain scale + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] 
= 1 + else: + det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * ( + det_bboxes[:, 3] - det_bboxes[:, 1]) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + ious = bbox_overlaps(det_bboxes, gt_bboxes) + ious_max = ious.max(axis=1) + ious_argmax = ious.argmax(axis=1) + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool) + else: + gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + if ious_max[i] >= iou_thr: + matched_gt = ious_argmax[i] + if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]): + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[k, i] = 1 + else: + fp[k, i] = 1 + # otherwise ignore this detected bbox, tp = 0, fp = 0 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def get_cls_results(det_results, gt_bboxes, gt_labels, gt_ignore, class_id): + """Get det results and gt information of a certain class.""" + cls_dets = [det[class_id] + for det in det_results] # det bboxes of this class + cls_gts = [] # gt bboxes of this class + cls_gt_ignore = [] + for j in range(len(gt_bboxes)): + gt_bbox = gt_bboxes[j] + cls_inds = (gt_labels[j] == class_id) + cls_gt = gt_bbox[cls_inds, :] if gt_bbox.shape[0] > 0 else gt_bbox + cls_gts.append(cls_gt) + if gt_ignore is None: + cls_gt_ignore.append(np.zeros(cls_gt.shape[0], dtype=np.int32)) + else: + cls_gt_ignore.append(gt_ignore[j][cls_inds]) + return cls_dets, cls_gts, cls_gt_ignore + + +def eval_map(det_results, + gt_bboxes, + gt_labels, + gt_ignore=None, + scale_ranges=None, + iou_thr=0.5, + dataset=None, + print_summary=True): + """Evaluate mAP of a dataset. + + Args: + det_results (list): a list of list, [[cls1_det, cls2_det, ...], ...] + gt_bboxes (list): ground truth bboxes of each image, a list of K*4 + array. + gt_labels (list): ground truth labels of each image, a list of K array + gt_ignore (list): gt ignore indicators of each image, a list of K array + scale_ranges (list, optional): [(min1, max1), (min2, max2), ...] + iou_thr (float): IoU threshold + dataset (None or str or list): dataset name or dataset classes, there + are minor differences in metrics for different datsets, e.g. + "voc07", "imagenet_det", etc. 
+ print_summary (bool): whether to print the mAP summary + + Returns: + tuple: (mAP, [dict, dict, ...]) + """ + assert len(det_results) == len(gt_bboxes) == len(gt_labels) + if gt_ignore is not None: + assert len(gt_ignore) == len(gt_labels) + for i in range(len(gt_ignore)): + assert len(gt_labels[i]) == len(gt_ignore[i]) + area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges] + if scale_ranges is not None else None) + num_scales = len(scale_ranges) if scale_ranges is not None else 1 + eval_results = [] + num_classes = len(det_results[0]) # positive class num + gt_labels = [ + label if label.ndim == 1 else label[:, 0] for label in gt_labels + ] + for i in range(num_classes): + # get gt and det bboxes of this class + cls_dets, cls_gts, cls_gt_ignore = get_cls_results( + det_results, gt_bboxes, gt_labels, gt_ignore, i) + # calculate tp and fp for each image + tpfp_func = ( + tpfp_imagenet if dataset in ['det', 'vid'] else tpfp_default) + tpfp = [ + tpfp_func(cls_dets[j], cls_gts[j], cls_gt_ignore[j], iou_thr, + area_ranges) for j in range(len(cls_dets)) + ] + tp, fp = tuple(zip(*tpfp)) + # calculate gt number of each scale, gts ignored or beyond scale + # are not counted + num_gts = np.zeros(num_scales, dtype=int) + for j, bbox in enumerate(cls_gts): + if area_ranges is None: + num_gts[0] += np.sum(np.logical_not(cls_gt_ignore[j])) + else: + gt_areas = (bbox[:, 2] - bbox[:, 0]) * ( + bbox[:, 3] - bbox[:, 1]) + for k, (min_area, max_area) in enumerate(area_ranges): + num_gts[k] += np.sum( + np.logical_not(cls_gt_ignore[j]) + & (gt_areas >= min_area) & (gt_areas < max_area)) + # sort all det bboxes by score, also sort tp and fp + cls_dets = np.vstack(cls_dets) + num_dets = cls_dets.shape[0] + sort_inds = np.argsort(-cls_dets[:, -1]) + tp = np.hstack(tp)[:, sort_inds] + fp = np.hstack(fp)[:, sort_inds] + # calculate recall and precision with tp and fp + tp = np.cumsum(tp, axis=1) + fp = np.cumsum(fp, axis=1) + eps = np.finfo(np.float32).eps + recalls = tp / np.maximum(num_gts[:, np.newaxis], eps) + precisions = tp / np.maximum((tp + fp), eps) + # calculate AP + if scale_ranges is None: + recalls = recalls[0, :] + precisions = precisions[0, :] + num_gts = num_gts.item() + mode = 'area' if dataset != 'voc07' else '11points' + ap = average_precision(recalls, precisions, mode) + eval_results.append({ + 'num_gts': num_gts, + 'num_dets': num_dets, + 'recall': recalls, + 'precision': precisions, + 'ap': ap + }) + if scale_ranges is not None: + # shape (num_classes, num_scales) + all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results]) + all_num_gts = np.vstack( + [cls_result['num_gts'] for cls_result in eval_results]) + mean_ap = [] + for i in range(num_scales): + if np.any(all_num_gts[:, i] > 0): + mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean()) + else: + mean_ap.append(0.0) + else: + aps = [] + for cls_result in eval_results: + if cls_result['num_gts'] > 0: + aps.append(cls_result['ap']) + mean_ap = np.array(aps).mean().item() if aps else 0.0 + if print_summary: + print_map_summary(mean_ap, eval_results, dataset, area_ranges) + + return mean_ap, eval_results + + +def print_map_summary(mean_ap, results, dataset=None, ranges=None): + """Print mAP and results of each class. + + Args: + mean_ap(float): calculated from `eval_map` + results(list): calculated from `eval_map` + dataset(None or str or list): dataset name or dataset classes. 
+ ranges(list or Tuple): ranges of areas + """ + num_scales = len(results[0]['ap']) if isinstance(results[0]['ap'], + np.ndarray) else 1 + if ranges is not None: + assert len(ranges) == num_scales + + num_classes = len(results) + + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) + precisions = np.zeros((num_scales, num_classes), dtype=np.float32) + aps = np.zeros((num_scales, num_classes), dtype=np.float32) + num_gts = np.zeros((num_scales, num_classes), dtype=int) + for i, cls_result in enumerate(results): + if cls_result['recall'].size > 0: + recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] + precisions[:, i] = np.array( + cls_result['precision'], ndmin=2)[:, -1] + aps[:, i] = cls_result['ap'] + num_gts[:, i] = cls_result['num_gts'] + + if dataset is None: + label_names = [str(i) for i in range(0, num_classes)] + elif mmcv.is_str(dataset): + label_names = get_classes(dataset) + else: + label_names = dataset + + if not isinstance(mean_ap, list): + mean_ap = [mean_ap] + header = ['class', 'gts', 'dets', 'recall', 'precision', 'ap'] + for i in range(num_scales): + if ranges is not None: + print('Area range ', ranges[i]) + table_data = [header] + for j in range(num_classes): + row_data = [ + label_names[j], num_gts[i, j], results[j]['num_dets'], + '{:.3f}'.format(recalls[i, j]), + '{:.3f}'.format(precisions[i, j]), '{:.3f}'.format(aps[i, j]) + ] + table_data.append(row_data) + table_data.append(['mAP', '', '', '', '', '{:.3f}'.format(mean_ap[i])]) + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print(table.table) diff --git a/mmdet3d/core/evaluation/recall.py b/mmdet3d/core/evaluation/recall.py new file mode 100644 index 0000000000..45c2627c6b --- /dev/null +++ b/mmdet3d/core/evaluation/recall.py @@ -0,0 +1,185 @@ +import numpy as np +from terminaltables import AsciiTable + +from ..bbox import bbox_overlaps_2d + + +def _recalls(all_ious, proposal_nums, thrs): + + img_num = all_ious.shape[0] + total_gt_num = sum([ious.shape[0] for ious in all_ious]) + + _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32) + for k, proposal_num in enumerate(proposal_nums): + tmp_ious = np.zeros(0) + for i in range(img_num): + ious = all_ious[i][:, :proposal_num].copy() + gt_ious = np.zeros((ious.shape[0])) + if ious.size == 0: + tmp_ious = np.hstack((tmp_ious, gt_ious)) + continue + for j in range(ious.shape[0]): + gt_max_overlaps = ious.argmax(axis=1) + max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps] + gt_idx = max_ious.argmax() + gt_ious[j] = max_ious[gt_idx] + box_idx = gt_max_overlaps[gt_idx] + ious[gt_idx, :] = -1 + ious[:, box_idx] = -1 + tmp_ious = np.hstack((tmp_ious, gt_ious)) + _ious[k, :] = tmp_ious + + _ious = np.fliplr(np.sort(_ious, axis=1)) + recalls = np.zeros((proposal_nums.size, thrs.size)) + for i, thr in enumerate(thrs): + recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num) + + return recalls + + +def set_recall_param(proposal_nums, iou_thrs): + """Check proposal_nums and iou_thrs and set correct format. 
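+    Both are converted to numpy arrays; iou_thrs defaults to
+    np.array([0.5]) when it is None.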
+ """ + if isinstance(proposal_nums, list): + _proposal_nums = np.array(proposal_nums) + elif isinstance(proposal_nums, int): + _proposal_nums = np.array([proposal_nums]) + else: + _proposal_nums = proposal_nums + + if iou_thrs is None: + _iou_thrs = np.array([0.5]) + elif isinstance(iou_thrs, list): + _iou_thrs = np.array(iou_thrs) + elif isinstance(iou_thrs, float): + _iou_thrs = np.array([iou_thrs]) + else: + _iou_thrs = iou_thrs + + return _proposal_nums, _iou_thrs + + +def eval_recalls(gts, + proposals, + proposal_nums=None, + iou_thrs=None, + print_summary=True): + """Calculate recalls. + + Args: + gts(list or ndarray): a list of arrays of shape (n, 4) + proposals(list or ndarray): a list of arrays of shape (k, 4) or (k, 5) + proposal_nums(int or list of int or ndarray): top N proposals + thrs(float or list or ndarray): iou thresholds + + Returns: + ndarray: recalls of different ious and proposal nums + """ + + img_num = len(gts) + assert img_num == len(proposals) + + proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs) + + all_ious = [] + for i in range(img_num): + if proposals[i].ndim == 2 and proposals[i].shape[1] == 5: + scores = proposals[i][:, 4] + sort_idx = np.argsort(scores)[::-1] + img_proposal = proposals[i][sort_idx, :] + else: + img_proposal = proposals[i] + prop_num = min(img_proposal.shape[0], proposal_nums[-1]) + if gts[i] is None or gts[i].shape[0] == 0: + ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32) + else: + ious = bbox_overlaps_2d(gts[i], img_proposal[:prop_num, :4]) + all_ious.append(ious) + all_ious = np.array(all_ious) + recalls = _recalls(all_ious, proposal_nums, iou_thrs) + if print_summary: + print_recall_summary(recalls, proposal_nums, iou_thrs) + return recalls + + +def print_recall_summary(recalls, + proposal_nums, + iou_thrs, + row_idxs=None, + col_idxs=None): + """Print recalls in a table. + + Args: + recalls(ndarray): calculated from `bbox_recalls` + proposal_nums(ndarray or list): top N proposals + iou_thrs(ndarray or list): iou thresholds + row_idxs(ndarray): which rows(proposal nums) to print + col_idxs(ndarray): which cols(iou thresholds) to print + """ + proposal_nums = np.array(proposal_nums, dtype=np.int32) + iou_thrs = np.array(iou_thrs) + if row_idxs is None: + row_idxs = np.arange(proposal_nums.size) + if col_idxs is None: + col_idxs = np.arange(iou_thrs.size) + row_header = [''] + iou_thrs[col_idxs].tolist() + table_data = [row_header] + for i, num in enumerate(proposal_nums[row_idxs]): + row = [ + '{:.3f}'.format(val) + for val in recalls[row_idxs[i], col_idxs].tolist() + ] + row.insert(0, num) + table_data.append(row) + table = AsciiTable(table_data) + print(table.table) + + +def plot_num_recall(recalls, proposal_nums): + """Plot Proposal_num-Recalls curve. + + Args: + recalls(ndarray or list): shape (k,) + proposal_nums(ndarray or list): same shape as `recalls` + """ + if isinstance(proposal_nums, np.ndarray): + _proposal_nums = proposal_nums.tolist() + else: + _proposal_nums = proposal_nums + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot([0] + _proposal_nums, [0] + _recalls) + plt.xlabel('Proposal num') + plt.ylabel('Recall') + plt.axis([0, proposal_nums.max(), 0, 1]) + f.show() + + +def plot_iou_recall(recalls, iou_thrs): + """Plot IoU-Recalls curve. 
+ + Args: + recalls(ndarray or list): shape (k,) + iou_thrs(ndarray or list): same shape as `recalls` + """ + if isinstance(iou_thrs, np.ndarray): + _iou_thrs = iou_thrs.tolist() + else: + _iou_thrs = iou_thrs + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot(_iou_thrs + [1.0], _recalls + [0.]) + plt.xlabel('IoU') + plt.ylabel('Recall') + plt.axis([iou_thrs.min(), 1, 0, 1]) + f.show() diff --git a/mmdet3d/core/optimizer/__init__.py b/mmdet3d/core/optimizer/__init__.py new file mode 100644 index 0000000000..1643fe10e9 --- /dev/null +++ b/mmdet3d/core/optimizer/__init__.py @@ -0,0 +1,5 @@ +from .builder import build_optimizer +from .mix_optimizer import MixedOptimizer +from .registry import OPTIMIZERS + +__all__ = ['OPTIMIZERS', 'build_optimizer', 'MixedOptimizer'] diff --git a/mmdet3d/core/optimizer/builder.py b/mmdet3d/core/optimizer/builder.py new file mode 100644 index 0000000000..c6ae7f6222 --- /dev/null +++ b/mmdet3d/core/optimizer/builder.py @@ -0,0 +1,135 @@ +import re + +import torch + +from mmdet.utils import build_from_cfg, get_root_logger +from .registry import OPTIMIZERS + + +def build_optimizer(model, optimizer_cfg): + """Build optimizer from configs. + + Args: + model (:obj:`nn.Module`): The model with parameters to be optimized. + optimizer_cfg (dict): The config dict of the optimizer. + Positional fields are: + - type: class name of the optimizer. + - lr: base learning rate. + Optional fields are: + - any arguments of the corresponding optimizer type, e.g., + weight_decay, momentum, etc. + - paramwise_options: a dict with 4 accepted fileds + (bias_lr_mult, bias_decay_mult, norm_decay_mult, + dwconv_decay_mult). + `bias_lr_mult` and `bias_decay_mult` will be multiplied to + the lr and weight decay respectively for all bias parameters + (except for the normalization layers), and + `norm_decay_mult` will be multiplied to the weight decay + for all weight and bias parameters of normalization layers. + `dwconv_decay_mult` will be multiplied to the weight decay + for all weight and bias parameters of depthwise conv layers. + + Returns: + torch.optim.Optimizer: The initialized optimizer. 
+ + Example: + >>> import torch + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9, + >>> weight_decay=0.0001) + >>> optimizer = build_optimizer(model, optimizer_cfg) + """ + if hasattr(model, 'module'): + model = model.module + + optimizer_cfg = optimizer_cfg.copy() + + if isinstance(optimizer_cfg, list): + # Assume paramwise_options is None if optimizer_cfg is list + from .mix_optimizer import MixedOptimizer + logger = get_root_logger() + keys = [optimizer.pop('key') for optimizer in optimizer_cfg] + keys_params = {key: [] for key in keys} + keys_params_name = {key: [] for key in keys} + keys_optimizer = [] + for name, param in model.named_parameters(): + param_group = {'params': [param]} + find_flag = False + for key in keys: + if key in name: + keys_params[key].append(param_group) + keys_params_name[key].append(name) + find_flag = True + break + assert find_flag, 'key {} is not matched to any optimizer'.format( + name) + + step_intervals = [] + for key, single_cfg in zip(keys, optimizer_cfg): + optimizer_cls = getattr(torch.optim, single_cfg.pop('type')) + step_intervals.append(single_cfg.pop('step_interval', 1)) + single_optim = optimizer_cls(keys_params[key], **single_cfg) + keys_optimizer.append(single_optim) + logger.info('{} optimizes key:\n {}\n'.format( + optimizer_cls.__name__, keys_params_name[key])) + + mix_optimizer = MixedOptimizer(keys_optimizer, step_intervals) + return mix_optimizer + else: + paramwise_options = optimizer_cfg.pop('paramwise_options', None) + + # if no paramwise option is specified, just use the global setting + if paramwise_options is None: + params = model.parameters() + else: + assert isinstance(paramwise_options, dict) + # get base lr and weight decay + base_lr = optimizer_cfg['lr'] + base_wd = optimizer_cfg.get('weight_decay', None) + # weight_decay must be explicitly specified if mult is specified + if ('bias_decay_mult' in paramwise_options + or 'norm_decay_mult' in paramwise_options + or 'dwconv_decay_mult' in paramwise_options): + assert base_wd is not None + # get param-wise options + bias_lr_mult = paramwise_options.get('bias_lr_mult', 1.) + bias_decay_mult = paramwise_options.get('bias_decay_mult', 1.) + norm_decay_mult = paramwise_options.get('norm_decay_mult', 1.) + dwconv_decay_mult = paramwise_options.get('dwconv_decay_mult', 1.) 
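+            # Illustrative effect of these multipliers (example values, not
+            # taken from any shipped config): with
+            # paramwise_options=dict(bias_lr_mult=2., norm_decay_mult=0.),
+            # every non-norm '.bias' parameter gets lr = 2 * base_lr and
+            # every bn/gn weight or bias gets weight_decay = 0.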
+ named_modules = dict(model.named_modules()) + # set param-wise lr and weight decay + params = [] + for name, param in model.named_parameters(): + param_group = {'params': [param]} + if not param.requires_grad: + # FP16 training needs to copy gradient/weight between master + # weight copy and model weight, it is convenient to keep all + # parameters here to align with model.parameters() + params.append(param_group) + continue + + # for norm layers, overwrite the weight decay of weight and bias + # TODO: obtain the norm layer prefixes dynamically + if re.search(r'(bn|gn)(\d+)?.(weight|bias)', name): + if base_wd is not None: + param_group['weight_decay'] = base_wd * norm_decay_mult + # for other layers, overwrite both lr and weight decay of bias + elif name.endswith('.bias'): + param_group['lr'] = base_lr * bias_lr_mult + if base_wd is not None: + param_group['weight_decay'] = base_wd * bias_decay_mult + + module_name = name.replace('.weight', '').replace('.bias', '') + if module_name in named_modules and base_wd is not None: + module = named_modules[module_name] + # if this Conv2d is depthwise Conv2d + if isinstance(module, torch.nn.Conv2d) and \ + module.in_channels == module.groups: + param_group['weight_decay'] = base_wd * dwconv_decay_mult + # otherwise use the global settings + + params.append(param_group) + + optimizer_cfg['params'] = params + + return build_from_cfg(optimizer_cfg, OPTIMIZERS) diff --git a/mmdet3d/core/optimizer/mix_optimizer.py b/mmdet3d/core/optimizer/mix_optimizer.py new file mode 100644 index 0000000000..250ce1c767 --- /dev/null +++ b/mmdet3d/core/optimizer/mix_optimizer.py @@ -0,0 +1,99 @@ +from torch.optim import Optimizer + +from .registry import OPTIMIZERS + + +@OPTIMIZERS.register_module +class MixedOptimizer(Optimizer): + """Mixed Optimizer that contains multiple optimizers + + This optimizer applies the cocktail optimzation for multi-modality models. + + """ + + def __init__(self, optimizers, step_intervals=None): + self.optimizers = optimizers + self.param_groups = [] + for optimizer in self.optimizers: + self.param_groups += optimizer.param_groups + if not isinstance(step_intervals, list): + step_intervals = [1] * len(self.optimizers) + self.step_intervals = step_intervals + self.num_step_updated = 0 + + def __getstate__(self): + return { + 'num_step_updated': + self.num_step_updated, + 'defaults': [optimizer.defaults for optimizer in self.optimizers], + 'state': [optimizer.state for optimizer in self.optimizers], + 'param_groups': + [optimizer.param_groups for optimizer in self.optimizers], + } + + def __setstate__(self, state): + self.__dict__.update(state) + + def __repr__(self): + format_string = self.__class__.__name__ + ' (\n' + for optimizer in self.optimizers: + format_string += '\t' + optimizer.__repr__ + ',\n' + format_string += ')' + return format_string + + def state_dict(self): + state_dicts = [optimizer.state_dict() for optimizer in self.optimizers] + return { + 'num_step_updated': + self.num_step_updated, + 'state': [state_dict['state'] for state_dict in state_dicts], + 'param_groups': + [state_dict['param_groups'] for state_dict in state_dicts], + } + + def load_state_dict(self, state_dict): + r"""Loads the optimizer state. + + Arguments: + state_dict (dict): optimizer state. Should be an object returned + from a call to :meth:`state_dict`. 
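+
+        Example:
+            >>> # illustrative round-trip; assumes the wrapped optimizers
+            >>> # are constructed in the same order as when the state was saved
+            >>> state = mixed_optimizer.state_dict()
+            >>> mixed_optimizer.load_state_dict(state)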
+ """ + assert len(state_dict['state']) == len(self.optimizers) + assert len(state_dict['param_groups']) == len(self.optimizers) + for i, (single_state, single_param_groups) in enumerate( + zip(state_dict['state'], state_dict['param_groups'])): + single_state_dict = dict( + state=single_state, param_groups=single_param_groups) + self.optimizers[i].load_state_dict(single_state_dict) + + self.param_groups = [] + for optimizer in self.optimizers: + self.param_groups += optimizer.param_groups + self.num_step_updated = state_dict['num_step_updated'] + + def zero_grad(self): + r"""Clears the gradients of all optimized :class:`torch.Tensor` s.""" + for optimizer in self.optimizers: + optimizer.zero_grad() + + def step(self, closure=None): + r"""Performs a single optimization step (parameter update). + + Arguments: + closure (callable): A closure that reevaluates the model and + returns the loss. Optional for most optimizers. + """ + loss = None + if closure is not None: + loss = closure() + + self.num_step_updated += 1 + for step_interval, optimizer in zip(self.step_intervals, + self.optimizers): + if self.num_step_updated % step_interval == 0: + optimizer.step() + + return loss + + def add_param_group(self, param_group): + raise NotImplementedError diff --git a/mmdet3d/core/optimizer/registry.py b/mmdet3d/core/optimizer/registry.py new file mode 100644 index 0000000000..de9b738989 --- /dev/null +++ b/mmdet3d/core/optimizer/registry.py @@ -0,0 +1,23 @@ +import inspect + +import torch + +from mmdet.utils import Registry + +OPTIMIZERS = Registry('optimizer') + + +def register_torch_optimizers(): + torch_optimizers = [] + for module_name in dir(torch.optim): + if module_name.startswith('__'): + continue + _optim = getattr(torch.optim, module_name) + if inspect.isclass(_optim) and issubclass(_optim, + torch.optim.Optimizer): + OPTIMIZERS.register_module(_optim) + torch_optimizers.append(module_name) + return torch_optimizers + + +TORCH_OPTIMIZERS = register_torch_optimizers() diff --git a/mmdet3d/core/post_processing/__init__.py b/mmdet3d/core/post_processing/__init__.py new file mode 100644 index 0000000000..11c3d30adf --- /dev/null +++ b/mmdet3d/core/post_processing/__init__.py @@ -0,0 +1,8 @@ +from .bbox_nms import multiclass_nms +from .merge_augs import (merge_aug_bboxes, merge_aug_masks, + merge_aug_proposals, merge_aug_scores) + +__all__ = [ + 'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes', + 'merge_aug_scores', 'merge_aug_masks' +] diff --git a/mmdet3d/core/post_processing/bbox_nms.py b/mmdet3d/core/post_processing/bbox_nms.py new file mode 100644 index 0000000000..76f0d6bb62 --- /dev/null +++ b/mmdet3d/core/post_processing/bbox_nms.py @@ -0,0 +1,68 @@ +import torch + +from mmdet.ops.nms import nms_wrapper + + +def multiclass_nms(multi_bboxes, + multi_scores, + score_thr, + nms_cfg, + max_num=-1, + score_factors=None): + """NMS for multi-class bboxes. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class+1), where the last column + contains scores of the background class, but this will be ignored. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + nms_thr (float): NMS IoU threshold + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. + score_factors (Tensor): The factors multiplied to scores before + applying NMS + + Returns: + tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels + are 0-based. 
+ """ + # scores has num_classes + 1 (last one is BG) + num_classes = multi_scores.shape[1] - 1 + bboxes, labels = [], [] + nms_cfg_ = nms_cfg.copy() + nms_type = nms_cfg_.pop('type', 'nms') + nms_op = getattr(nms_wrapper, nms_type) + # the fg class id range: [0, num_classes-1] + for i in range(0, num_classes): + cls_inds = multi_scores[:, i] > score_thr + if not cls_inds.any(): + continue + # get bboxes and scores of this class + if multi_bboxes.shape[1] == 4: + _bboxes = multi_bboxes[cls_inds, :] + else: + _bboxes = multi_bboxes[cls_inds, i * 4:(i + 1) * 4] + _scores = multi_scores[cls_inds, i] + if score_factors is not None: + _scores *= score_factors[cls_inds] + cls_dets = torch.cat([_bboxes, _scores[:, None]], dim=1) + cls_dets, _ = nms_op(cls_dets, **nms_cfg_) + cls_labels = multi_bboxes.new_full((cls_dets.shape[0], ), + i, + dtype=torch.long) + bboxes.append(cls_dets) + labels.append(cls_labels) + if bboxes: + bboxes = torch.cat(bboxes) + labels = torch.cat(labels) + if bboxes.shape[0] > max_num: + _, inds = bboxes[:, -1].sort(descending=True) + inds = inds[:max_num] + bboxes = bboxes[inds] + labels = labels[inds] + else: + bboxes = multi_bboxes.new_zeros((0, 5)) + labels = multi_bboxes.new_zeros((0, ), dtype=torch.long) + + return bboxes, labels diff --git a/mmdet3d/core/post_processing/merge_augs.py b/mmdet3d/core/post_processing/merge_augs.py new file mode 100644 index 0000000000..2dca68654a --- /dev/null +++ b/mmdet3d/core/post_processing/merge_augs.py @@ -0,0 +1,101 @@ +import numpy as np +import torch + +from mmdet3d.ops import nms +from ..bbox import bbox_mapping_back + + +def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg): + """Merge augmented proposals (multiscale, flip, etc.) + + Args: + aug_proposals (list[Tensor]): proposals from different testing + schemes, shape (n, 5). Note that they are not rescaled to the + original image size. + + img_metas (list[dict]): list of image info dict where each dict has: + 'img_shape', 'scale_factor', 'flip', and my also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + + rpn_test_cfg (dict): rpn test config. + + Returns: + Tensor: shape (n, 4), proposals corresponding to original image scale. + """ + recovered_proposals = [] + for proposals, img_info in zip(aug_proposals, img_metas): + img_shape = img_info['img_shape'] + scale_factor = img_info['scale_factor'] + flip = img_info['flip'] + _proposals = proposals.clone() + _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape, + scale_factor, flip) + recovered_proposals.append(_proposals) + aug_proposals = torch.cat(recovered_proposals, dim=0) + merged_proposals, _ = nms(aug_proposals, rpn_test_cfg.nms_thr) + scores = merged_proposals[:, 4] + _, order = scores.sort(0, descending=True) + num = min(rpn_test_cfg.max_num, merged_proposals.shape[0]) + order = order[:num] + merged_proposals = merged_proposals[order, :] + return merged_proposals + + +def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg): + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + rcnn_test_cfg (dict): rcnn test config. 
+ + Returns: + tuple: (bboxes, scores) + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]['img_shape'] + scale_factor = img_info[0]['scale_factor'] + flip = img_info[0]['flip'] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip) + recovered_bboxes.append(bboxes) + bboxes = torch.stack(recovered_bboxes).mean(dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.stack(aug_scores).mean(dim=0) + return bboxes, scores + + +def merge_aug_scores(aug_scores): + """Merge augmented bbox scores.""" + if isinstance(aug_scores[0], torch.Tensor): + return torch.mean(torch.stack(aug_scores), dim=0) + else: + return np.mean(aug_scores, axis=0) + + +def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None): + """Merge augmented mask prediction. + + Args: + aug_masks (list[ndarray]): shape (n, #class, h, w) + img_shapes (list[ndarray]): shape (3, ). + rcnn_test_cfg (dict): rcnn test config. + + Returns: + tuple: (bboxes, scores) + """ + recovered_masks = [ + mask if not img_info[0]['flip'] else mask[..., ::-1] + for mask, img_info in zip(aug_masks, img_metas) + ] + if weights is None: + merged_masks = np.mean(recovered_masks, axis=0) + else: + merged_masks = np.average( + np.array(recovered_masks), axis=0, weights=np.array(weights)) + return merged_masks diff --git a/mmdet3d/core/utils/__init__.py b/mmdet3d/core/utils/__init__.py new file mode 100644 index 0000000000..47c0a9d933 --- /dev/null +++ b/mmdet3d/core/utils/__init__.py @@ -0,0 +1,11 @@ +from .dist_utils import DistOptimizerHook, allreduce_grads +from .misc import tensor2imgs # merge_batch, merge_hook_batch +from .misc import multi_apply, unmap + +__all__ = [ + 'allreduce_grads', + 'DistOptimizerHook', + 'multi_apply', + 'tensor2imgs', + 'unmap', # 'merge_batch', 'merge_hook_batch' +] diff --git a/mmdet3d/core/utils/contextmanagers.py b/mmdet3d/core/utils/contextmanagers.py new file mode 100644 index 0000000000..5705338c51 --- /dev/null +++ b/mmdet3d/core/utils/contextmanagers.py @@ -0,0 +1,121 @@ +import asyncio +import contextlib +import logging +import os +import time +from typing import List + +import torch + +logger = logging.getLogger(__name__) + +DEBUG_COMPLETED_TIME = bool(os.environ.get('DEBUG_COMPLETED_TIME', False)) + + +@contextlib.asynccontextmanager +async def completed(trace_name='', + name='', + sleep_interval=0.05, + streams: List[torch.cuda.Stream] = None): + """ + Async context manager that waits for work to complete on + given CUDA streams. 
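+
+    Args:
+        trace_name, name (str): labels used in debug log messages.
+        sleep_interval (float): polling interval in seconds while waiting
+            for the recorded end events to complete.
+        streams (List[torch.cuda.Stream]): streams to synchronize on; when
+            empty or None the current stream is used, and None entries are
+            replaced by the current stream.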
+ + """ + if not torch.cuda.is_available(): + yield + return + + stream_before_context_switch = torch.cuda.current_stream() + if not streams: + streams = [stream_before_context_switch] + else: + streams = [s if s else stream_before_context_switch for s in streams] + + end_events = [ + torch.cuda.Event(enable_timing=DEBUG_COMPLETED_TIME) for _ in streams + ] + + if DEBUG_COMPLETED_TIME: + start = torch.cuda.Event(enable_timing=True) + stream_before_context_switch.record_event(start) + + cpu_start = time.monotonic() + logger.debug('%s %s starting, streams: %s', trace_name, name, streams) + grad_enabled_before = torch.is_grad_enabled() + try: + yield + finally: + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_end = time.monotonic() + for i, stream in enumerate(streams): + event = end_events[i] + stream.record_event(event) + + grad_enabled_after = torch.is_grad_enabled() + + # observed change of torch.is_grad_enabled() during concurrent run of + # async_test_bboxes code + assert grad_enabled_before == grad_enabled_after, \ + 'Unexpected is_grad_enabled() value change' + + are_done = [e.query() for e in end_events] + logger.debug('%s %s completed: %s streams: %s', trace_name, name, + are_done, streams) + with torch.cuda.stream(stream_before_context_switch): + while not all(are_done): + await asyncio.sleep(sleep_interval) + are_done = [e.query() for e in end_events] + logger.debug('%s %s completed: %s streams: %s', trace_name, + name, are_done, streams) + + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_time = (cpu_end - cpu_start) * 1000 + stream_times_ms = '' + for i, stream in enumerate(streams): + elapsed_time = start.elapsed_time(end_events[i]) + stream_times_ms += ' {stream} {elapsed_time:.2f} ms'.format( + stream, elapsed_time) + logger.info('{trace_name} {name} cpu_time {cpu_time:.2f} ms', + trace_name, name, cpu_time, stream_times_ms) + + +@contextlib.asynccontextmanager +async def concurrent(streamqueue: asyncio.Queue, + trace_name='concurrent', + name='stream'): + """Run code concurrently in different streams. + + :param streamqueue: asyncio.Queue instance. + + Queue tasks define the pool of streams used for concurrent execution. 
+ + """ + if not torch.cuda.is_available(): + yield + return + + initial_stream = torch.cuda.current_stream() + + with torch.cuda.stream(initial_stream): + stream = await streamqueue.get() + assert isinstance(stream, torch.cuda.Stream) + + try: + with torch.cuda.stream(stream): + logger.debug('%s %s is starting, stream: %s', trace_name, name, + stream) + yield + current = torch.cuda.current_stream() + assert current == stream + logger.debug('%s %s has finished, stream: %s', trace_name, + name, stream) + finally: + streamqueue.task_done() + streamqueue.put_nowait(stream) diff --git a/mmdet3d/core/utils/dist_utils.py b/mmdet3d/core/utils/dist_utils.py new file mode 100644 index 0000000000..249f71b342 --- /dev/null +++ b/mmdet3d/core/utils/dist_utils.py @@ -0,0 +1,58 @@ +from collections import OrderedDict + +import torch.distributed as dist +from mmcv.runner import OptimizerHook +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + + +def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): + if bucket_size_mb > 0: + bucket_size_bytes = bucket_size_mb * 1024 * 1024 + buckets = _take_tensors(tensors, bucket_size_bytes) + else: + buckets = OrderedDict() + for tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + buckets = buckets.values() + + for bucket in buckets: + flat_tensors = _flatten_dense_tensors(bucket) + dist.all_reduce(flat_tensors) + flat_tensors.div_(world_size) + for tensor, synced in zip( + bucket, _unflatten_dense_tensors(flat_tensors, bucket)): + tensor.copy_(synced) + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + grads = [ + param.grad.data for param in params + if param.requires_grad and param.grad is not None + ] + world_size = dist.get_world_size() + if coalesce: + _allreduce_coalesced(grads, world_size, bucket_size_mb) + else: + for tensor in grads: + dist.all_reduce(tensor.div_(world_size)) + + +class DistOptimizerHook(OptimizerHook): + + def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + + def after_train_iter(self, runner): + runner.optimizer.zero_grad() + runner.outputs['loss'].backward() + # allreduce_grads(runner.model.parameters(), self.coalesce, + # self.bucket_size_mb) + if self.grad_clip is not None: + self.clip_grads(runner.model.parameters()) + runner.optimizer.step() diff --git a/mmdet3d/core/utils/kitti_utils.py b/mmdet3d/core/utils/kitti_utils.py new file mode 100644 index 0000000000..53c8800f85 --- /dev/null +++ b/mmdet3d/core/utils/kitti_utils.py @@ -0,0 +1,69 @@ +import numpy as np +import scipy +import torch +from scipy.spatial import Delaunay + + +def in_hull(p, hull): + """ + :param p: (N, K) test points + :param hull: (M, K) M corners of a box + :return (N) bool + """ + try: + if not isinstance(hull, Delaunay): + hull = Delaunay(hull) + flag = hull.find_simplex(p) >= 0 + except scipy.spatial.qhull.QhullError: + print('Warning: not a hull %s' % str(hull)) + flag = np.zeros(p.shape[0], dtype=np.bool) + + return flag + + +def enlarge_box3d(boxes3d, extra_width): + """ + :param boxes3d: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coords + """ + if isinstance(boxes3d, np.ndarray): + large_boxes3d = boxes3d.copy() + else: + large_boxes3d = boxes3d.clone() + large_boxes3d[:, 3:6] += extra_width * 2 + # bugfixed: here should be minus, not addion in LiDAR, 20190508 + large_boxes3d[:, 2] -= extra_width + return 
large_boxes3d + + +def rotate_pc_along_z(pc, rot_angle): + """ + params pc: (N, 3+C), (N, 3) is in the LiDAR coordinate + params rot_angle: rad scalar + Output pc: updated pc with XYZ rotated + """ + cosval = np.cos(rot_angle) + sinval = np.sin(rot_angle) + rotmat = np.array([[cosval, -sinval], [sinval, cosval]]) + pc[:, 0:2] = np.dot(pc[:, 0:2], rotmat) + return pc + + +def rotate_pc_along_z_torch(pc, rot_angle): + """ + :param pc: (N, 512, 3 + C) in the LiDAR coordinate + :param rot_angle: (N) + :return: + TODO: merge with rotate_pc_along_y_torch in bbox_transform.py + """ + cosa = torch.cos(rot_angle).view(-1, 1) # (N, 1) + sina = torch.sin(rot_angle).view(-1, 1) # (N, 1) + + raw_1 = torch.cat([cosa, -sina], dim=1) # (N, 2) + raw_2 = torch.cat([sina, cosa], dim=1) # (N, 2) + R = torch.cat((raw_1.unsqueeze(dim=1), raw_2.unsqueeze(dim=1)), + dim=1) # (N, 2, 2) + + pc_temp = pc[:, :, 0:2] # (N, 512, 2) + + pc[:, :, 0:2] = torch.matmul(pc_temp, R) # (N, 512, 2) + return pc diff --git a/mmdet3d/core/utils/misc.py b/mmdet3d/core/utils/misc.py new file mode 100644 index 0000000000..a63170636e --- /dev/null +++ b/mmdet3d/core/utils/misc.py @@ -0,0 +1,65 @@ +from functools import partial + +import mmcv +import numpy as np +import torch +import torch.nn.functional as F +from six.moves import map, zip + + +def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): + num_imgs = tensor.size(0) + mean = np.array(mean, dtype=np.float32) + std = np.array(std, dtype=np.float32) + imgs = [] + for img_id in range(num_imgs): + img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0) + img = mmcv.imdenormalize( + img, mean, std, to_bgr=to_rgb).astype(np.uint8) + imgs.append(np.ascontiguousarray(img)) + return imgs + + +def multi_apply(func, *args, **kwargs): + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def unmap(data, count, inds, fill=0): + """ Unmap a subset of item (data) back to the original set of items (of + size count) """ + if data.dim() == 1: + ret = data.new_full((count, ), fill) + ret[inds] = data + else: + new_size = (count, ) + data.size()[1:] + ret = data.new_full(new_size, fill) + ret[inds, :] = data + return ret + + +def merge_batch(data): + for key, elems in data.items(): + if key in ['voxels', 'num_points', 'voxel_labels', 'voxel_centers']: + data[key]._data[0] = torch.cat(elems._data[0], dim=0) + elif key == 'coors': + coors = [] + for i, coor in enumerate(elems._data[0]): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors.append(coor_pad) + data[key]._data[0] = torch.cat(coors, dim=0) + return data + + +def merge_hook_batch(data): + for key, elems in data.items(): + if key in ['voxels', 'num_points', 'voxel_labels', 'voxel_centers']: + data[key] = torch.cat(elems, dim=0) + elif key == 'coors': + coors = [] + for i, coor in enumerate(elems): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors.append(coor_pad) + data[key] = torch.cat(coors, dim=0) + return data diff --git a/mmdet3d/core/voxel/__init__.py b/mmdet3d/core/voxel/__init__.py new file mode 100644 index 0000000000..7324f2521a --- /dev/null +++ b/mmdet3d/core/voxel/__init__.py @@ -0,0 +1,4 @@ +from .builder import build_voxel_generator +from .voxel_generator import VoxelGenerator + +__all__ = ['build_voxel_generator', 'VoxelGenerator'] diff --git a/mmdet3d/core/voxel/builder.py b/mmdet3d/core/voxel/builder.py new file mode 100644 index 0000000000..cc311a3fe2 --- /dev/null +++ 
b/mmdet3d/core/voxel/builder.py @@ -0,0 +1,14 @@ +import mmcv + +from . import voxel_generator + + +def build_voxel_generator(cfg, **kwargs): + if isinstance(cfg, voxel_generator.VoxelGenerator): + return cfg + elif isinstance(cfg, dict): + return mmcv.runner.obj_from_dict( + cfg, voxel_generator, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) diff --git a/mmdet3d/core/voxel/voxel_generator.py b/mmdet3d/core/voxel/voxel_generator.py new file mode 100644 index 0000000000..c21afd9be7 --- /dev/null +++ b/mmdet3d/core/voxel/voxel_generator.py @@ -0,0 +1,207 @@ +import numba +import numpy as np + + +class VoxelGenerator(object): + + def __init__(self, + voxel_size, + point_cloud_range, + max_num_points, + max_voxels=20000): + point_cloud_range = np.array(point_cloud_range, dtype=np.float32) + # [0, -40, -3, 70.4, 40, 1] + voxel_size = np.array(voxel_size, dtype=np.float32) + grid_size = (point_cloud_range[3:] - + point_cloud_range[:3]) / voxel_size + grid_size = np.round(grid_size).astype(np.int64) + + self._voxel_size = voxel_size + self._point_cloud_range = point_cloud_range + self._max_num_points = max_num_points + self._max_voxels = max_voxels + self._grid_size = grid_size + + def generate(self, points): + return points_to_voxel(points, self._voxel_size, + self._point_cloud_range, self._max_num_points, + True, self._max_voxels) + + @property + def voxel_size(self): + return self._voxel_size + + @property + def max_num_points_per_voxel(self): + return self._max_num_points + + @property + def point_cloud_range(self): + return self._point_cloud_range + + @property + def grid_size(self): + return self._grid_size + + +def points_to_voxel(points, + voxel_size, + coors_range, + max_points=35, + reverse_index=True, + max_voxels=20000): + """convert kitti points(N, >=3) to voxels. This version calculate + everything in one loop. now it takes only 4.2ms(complete point cloud) + with jit and 3.2ghz cpu.(don't calculate other features) + + Args: + points: [N, ndim] float tensor. points[:, :3] contain xyz points and + points[:, 3:] contain other information such as reflectivity. + voxel_size: [3] list/tuple or array, float. xyz, indicate voxel size + coors_range: [6] list/tuple or array, float. indicate voxel range. + format: xyzxyz, minmax + max_points: int. indicate maximum points contained in a voxel. + reverse_index: boolean. indicate whether return reversed coordinates. + if points has xyz format and reverse_index is True, output + coordinates will be zyx format, but points in features always + xyz format. + max_voxels: int. indicate maximum voxels this function create. + for second, 20000 is a good choice. you should shuffle points + before call this function because max_voxels may drop some points. + + Returns: + voxels: [M, max_points, ndim] float tensor. only contain points. + coordinates: [M, 3] int32 tensor. + num_points_per_voxel: [M] int32 tensor. + """ + if not isinstance(voxel_size, np.ndarray): + voxel_size = np.array(voxel_size, dtype=points.dtype) + if not isinstance(coors_range, np.ndarray): + coors_range = np.array(coors_range, dtype=points.dtype) + voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size + voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist()) + if reverse_index: + voxelmap_shape = voxelmap_shape[::-1] + # don't create large array in jit(nopython=True) code. 
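+    # The output buffers below are allocated once here in plain Python and
+    # filled in place by the numba kernels; only the first `voxel_num`
+    # entries are valid and are sliced out before returning.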
+ num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32) + coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32) + voxels = np.zeros( + shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype) + coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32) + if reverse_index: + voxel_num = _points_to_voxel_reverse_kernel( + points, voxel_size, coors_range, num_points_per_voxel, + coor_to_voxelidx, voxels, coors, max_points, max_voxels) + + else: + voxel_num = _points_to_voxel_kernel(points, voxel_size, coors_range, + num_points_per_voxel, + coor_to_voxelidx, voxels, coors, + max_points, max_voxels) + + coors = coors[:voxel_num] + voxels = voxels[:voxel_num] + num_points_per_voxel = num_points_per_voxel[:voxel_num] + + return voxels, coors, num_points_per_voxel + + +@numba.jit(nopython=True) +def _points_to_voxel_reverse_kernel(points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000): + # put all computations to one loop. + # we shouldn't create large array in main jit code, otherwise + # reduce performance + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + ndim_minus_1 = ndim - 1 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # np.round(grid_size) + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + coor = np.zeros(shape=(3, ), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[ndim_minus_1 - j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + break + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num + + +@numba.jit(nopython=True) +def _points_to_voxel_kernel(points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000): + # need mutex if write in cuda, but numba.cuda don't support mutex. + # in addition, pytorch don't support cuda in dataloader. + # put all computations to one loop. 
+ # we shouldn't create large array in main jit code, otherwise + # decrease performance + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + + # lower_bound = coors_range[:3] + # upper_bound = coors_range[3:] + coor = np.zeros(shape=(3, ), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + break + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num diff --git a/mmdet3d/datasets/__init__.py b/mmdet3d/datasets/__init__.py new file mode 100644 index 0000000000..55c93a06b5 --- /dev/null +++ b/mmdet3d/datasets/__init__.py @@ -0,0 +1,16 @@ +from mmdet.datasets.registry import DATASETS +from .builder import build_dataset +from .coco import CocoDataset +from .dataset_wrappers import ConcatDataset, RepeatDataset +from .kitti2d_dataset import Kitti2DDataset +from .kitti_dataset import KittiDataset +from .loader import DistributedGroupSampler, GroupSampler, build_dataloader +from .nuscenes2d_dataset import NuScenes2DDataset +from .nuscenes_dataset import NuScenesDataset + +__all__ = [ + 'KittiDataset', 'GroupSampler', 'DistributedGroupSampler', + 'build_dataloader', 'ConcatDataset', 'RepeatDataset', 'DATASETS', + 'build_dataset', 'CocoDataset', 'Kitti2DDataset', 'NuScenesDataset', + 'NuScenes2DDataset' +] diff --git a/mmdet3d/datasets/builder.py b/mmdet3d/datasets/builder.py new file mode 100644 index 0000000000..e9ef97abc1 --- /dev/null +++ b/mmdet3d/datasets/builder.py @@ -0,0 +1,45 @@ +import copy + +from mmdet.datasets import ConcatDataset, RepeatDataset +from mmdet.utils import build_from_cfg +from .dataset_wrappers import RepeatFactorDataset +from .registry import DATASETS + + +def _concat_dataset(cfg, default_args=None): + ann_files = cfg['ann_file'] + img_prefixes = cfg.get('img_prefix', None) + seg_prefixes = cfg.get('seg_prefix', None) + proposal_files = cfg.get('proposal_file', None) + + datasets = [] + num_dset = len(ann_files) + for i in range(num_dset): + data_cfg = copy.deepcopy(cfg) + data_cfg['ann_file'] = ann_files[i] + if isinstance(img_prefixes, (list, tuple)): + data_cfg['img_prefix'] = img_prefixes[i] + if isinstance(seg_prefixes, (list, tuple)): + data_cfg['seg_prefix'] = seg_prefixes[i] + if isinstance(proposal_files, (list, tuple)): + data_cfg['proposal_file'] = proposal_files[i] + datasets.append(build_dataset(data_cfg, default_args)) + + return ConcatDataset(datasets) + + +def build_dataset(cfg, default_args=None): + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + build_dataset(cfg['dataset'], default_args), cfg['times']) + elif cfg['type'] == 'RepeatFactorDataset': + dataset = RepeatFactorDataset( + build_dataset(cfg['dataset'], default_args), cfg['repeat_thr']) + elif isinstance(cfg.get('ann_file'), 
(list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset diff --git a/mmdet3d/datasets/dataset_wrappers.py b/mmdet3d/datasets/dataset_wrappers.py new file mode 100644 index 0000000000..ca99192583 --- /dev/null +++ b/mmdet3d/datasets/dataset_wrappers.py @@ -0,0 +1,103 @@ +import math +from collections import defaultdict + +import numpy as np + +from mmdet.datasets import DATASETS + + +# Modified from https://github.com/facebookresearch/detectron2/blob/41d475b75a230221e21d9cac5d69655e3415e3a4/detectron2/data/samplers/distributed_sampler.py#L57 # noqa +@DATASETS.register_module +class RepeatFactorDataset(object): + """A wrapper of repeated dataset with repeat factor. + + Suitable for training on class imbalanced datasets like LVIS. In each + epoch, an image may appear multiple times based on its "repeat factor". + The repeat factor for an image is a function of the frequency the rarest + category labeled in that image. The "frequency of category c" in [0, 1] + is defined as the fraction of images in the training set (without repeats) + in which category c appears. + This wrapper will finally be merged into LVIS dataset. + + See https://arxiv.org/abs/1908.03195 (>= v2) Appendix B.2. + Args: + dataset (:obj:`Dataset`): The dataset to be repeated. + repeat_thr (float): frequency threshold below which data is repeated. + """ + + def __init__(self, dataset, repeat_thr): + self.dataset = dataset + self.repeat_thr = repeat_thr + self.CLASSES = dataset.CLASSES + + repeat_factors = self._get_repeat_factors(dataset, repeat_thr) + repeat_indices = [] + for dataset_index, repeat_factor in enumerate(repeat_factors): + repeat_indices.extend([dataset_index] * math.ceil(repeat_factor)) + self.repeat_indices = repeat_indices + + flags = [] + if hasattr(self.dataset, 'flag'): + for flag, repeat_factor in zip(self.dataset.flag, repeat_factors): + flags.extend([flag] * int(math.ceil(repeat_factor))) + assert len(flags) == len(repeat_indices) + self.flag = np.asarray(flags, dtype=np.uint8) + + def _get_repeat_factors(self, dataset, repeat_thr): + # 1. For each category c, compute the fraction # of images + # that contain it: f(c) + category_freq = defaultdict(int) + for idx, img_info in enumerate(dataset.data_infos): + if 'category_ids' in img_info: + cat_ids = set(img_info['category_ids']) + elif 'gt_names' in img_info: + cat_ids = set([ + gt for gt in img_info['gt_names'] + if gt in dataset.class_names + ]) + else: + labels = dataset.get_ann_info(idx)['labels'] + cat_ids = set([label for label in labels]) + for cat_id in cat_ids: + category_freq[cat_id] += 1 + num_images = len(dataset) + for k, v in category_freq.items(): + category_freq[k] = v / num_images + + # 2. For each category c, compute the category-level repeat factor: + # r(c) = max(1, sqrt(t / f(c))) + category_repeat = { + cat_id: max(1.0, math.sqrt(repeat_thr / cat_freq)) + for cat_id, cat_freq in category_freq.items() + } + + # 3. 
For each image I, compute the image-level repeat factor: + # r(I) = max_{c in I} r(c) + repeat_factors = [] + for idx, img_info in enumerate(dataset.data_infos): + if 'category_ids' in img_info: + cat_ids = set(img_info['category_ids']) + elif 'gt_names' in img_info: + cat_ids = set([ + gt for gt in img_info['gt_names'] + if gt in dataset.class_names + ]) + else: + labels = dataset.get_ann_info(idx)['labels'] + cat_ids = set([label for label in labels]) + + if len(cat_ids) == 0: + repeat_factor = 1 + else: + repeat_factor = max( + {category_repeat[cat_id] + for cat_id in cat_ids}) + repeat_factors.append(repeat_factor) + return repeat_factors + + def __getitem__(self, idx): + ori_index = self.repeat_indices[idx] + return self.dataset[ori_index] + + def __len__(self): + return len(self.repeat_indices) diff --git a/mmdet3d/datasets/kitti2d_dataset.py b/mmdet3d/datasets/kitti2d_dataset.py new file mode 100644 index 0000000000..ffcad3c413 --- /dev/null +++ b/mmdet3d/datasets/kitti2d_dataset.py @@ -0,0 +1,143 @@ +import mmcv +import numpy as np + +from mmdet.datasets import DATASETS, CustomDataset + + +@DATASETS.register_module +class Kitti2DDataset(CustomDataset): + + CLASSES = ('car', 'pedestrian', 'cyclist') + """ + Annotation format: + [ + { + 'image': { + 'image_idx': 0, + 'image_path': 'training/image_2/000000.png', + 'image_shape': array([ 370, 1224], dtype=int32) + }, + 'point_cloud': { + 'num_features': 4, + 'velodyne_path': 'training/velodyne/000000.bin' + }, + 'calib': { + 'P0': (4, 4), + 'P1': (4, 4), + 'P2': (4, 4), + 'P3': (4, 4), + 'R0_rect':4x4 np.array, + 'Tr_velo_to_cam': 4x4 np.array, + 'Tr_imu_to_velo': 4x4 np.array + }, + 'annos': { + 'name': (n), + 'truncated': (n), + 'occluded': (n), + 'alpha': (n), + 'bbox': (n, 4), + 'dimensions': (n, 3), + 'location': (n, 3), + 'rotation_y': (n), + 'score': (n), + 'index': array([0], dtype=int32), + 'group_ids': array([0], dtype=int32), + 'difficulty': array([0], dtype=int32), + 'num_points_in_gt': (n), + } + } + ] + """ + + def load_annotations(self, ann_file): + self.data_infos = mmcv.load(ann_file) + self.cat2label = { + cat_name: i + for i, cat_name in enumerate(self.class_names) + } + return self.data_infos + + def _filter_imgs(self, min_size=32): + """Filter images without ground truths.""" + valid_inds = [] + for i, img_info in enumerate(self.data_infos): + if len(img_info['annos']['name']) > 0: + valid_inds.append(i) + return valid_inds + + def get_ann_info(self, index): + # Use index to get the annos, thus the evalhook could also use this api + info = self.data_infos[index] + annos = info['annos'] + gt_names = annos['name'] + gt_bboxes = annos['bbox'] + difficulty = annos['difficulty'] + + # remove classes that is not needed + selected = self.keep_arrays_by_name(gt_names, self.CLASSES) + gt_bboxes = gt_bboxes[selected] + gt_names = gt_names[selected] + difficulty = difficulty[selected] + gt_labels = np.array([self.cat2label[n] for n in gt_names]) + + anns_results = dict( + bboxes=gt_bboxes.astype(np.float32), + labels=gt_labels, + ) + return anns_results + + def prepare_train_img(self, idx): + img_raw_info = self.data_infos[idx]['image'] + img_info = dict(filename=img_raw_info['image_path']) + ann_info = self.get_ann_info(idx) + if len(ann_info['bboxes']) == 0: + return None + results = dict(img_info=img_info, ann_info=ann_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + def prepare_test_img(self, idx): + img_raw_info = 
self.data_infos[idx]['image'] + img_info = dict(filename=img_raw_info['image_path']) + results = dict(img_info=img_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. + In kitti's pcd, they are all the same, thus are all zeros + """ + self.flag = np.zeros(len(self), dtype=np.uint8) + + def drop_arrays_by_name(self, gt_names, used_classes): + inds = [i for i, x in enumerate(gt_names) if x not in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def keep_arrays_by_name(self, gt_names, used_classes): + inds = [i for i, x in enumerate(gt_names) if x in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def reformat_bbox(self, outputs, out=None): + from mmdet3d.core.bbox.transforms import bbox2result_kitti2d + sample_idx = [info['image']['image_idx'] for info in self.data_infos] + result_files = bbox2result_kitti2d(outputs, self.CLASSES, sample_idx, + out) + return result_files + + def evaluate(self, result_files, eval_types=None): + from mmdet3d.core.evaluation import kitti_eval + eval_types = ['bbox'] if not eval_types else eval_types + assert eval_types in ('bbox', ['bbox' + ]), 'KITTI data set only evaluate bbox' + gt_annos = [info['annos'] for info in self.data_infos] + ap_result_str, ap_dict = kitti_eval( + gt_annos, result_files, self.CLASSES, eval_types=['bbox']) + return ap_result_str, ap_dict diff --git a/mmdet3d/datasets/kitti_dataset.py b/mmdet3d/datasets/kitti_dataset.py new file mode 100644 index 0000000000..796eea7b80 --- /dev/null +++ b/mmdet3d/datasets/kitti_dataset.py @@ -0,0 +1,579 @@ +import copy +import os +import pickle + +import mmcv +import numpy as np +import torch +import torch.utils.data as torch_data + +from mmdet.datasets.registry import DATASETS +from ..core.bbox import box_np_ops +from .pipelines import Compose +from .utils import remove_dontcare + + +@DATASETS.register_module +class KittiDataset(torch_data.Dataset): + + CLASSES = ('car', 'pedestrian', 'cyclist') + + def __init__(self, + root_path, + ann_file, + split, + pipeline=None, + training=False, + class_names=None, + modality=None, + with_label=True, + test_mode=False): + """ + :param root_path: KITTI data path + :param split: + """ + super().__init__() + self.root_path = root_path + self.root_split_path = os.path.join( + self.root_path, 'training' if split != 'test' else 'testing') + self.class_names = class_names if class_names else self.CLASSES + self.modality = modality + self.with_label = with_label + assert self.modality is not None + self.modality = modality + self.test_mode = test_mode + # TODO: rm the key training if it is not needed + self.training = training + self.pcd_limit_range = [0, -40, -3, 70.4, 40, 0.0] + + self.ann_file = ann_file + with open(ann_file, 'rb') as f: + self.kitti_infos = pickle.load(f) + + # set group flag for the sampler + if not self.test_mode: + self._set_group_flag() + + # processing pipeline + if pipeline is not None: + self.pipeline = Compose(pipeline) + + def __getitem__(self, idx): + if self.test_mode: + return self.prepare_test_data(idx) + while True: + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def prepare_train_data(self, index): + input_dict = self.get_sensor_data(index) + input_dict = 
self.train_pre_pipeline(input_dict) + if input_dict is None: + return None + example = self.pipeline(input_dict) + if example is None or len(example['gt_bboxes_3d']._data) == 0: + return None + return example + + def train_pre_pipeline(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_bboxes = input_dict['gt_bboxes'] + gt_names = input_dict['gt_names'] + difficulty = input_dict['difficulty'] + input_dict['bbox_fields'] = [] + + selected = self.drop_arrays_by_name(gt_names, ['DontCare']) + # selected = self.keep_arrays_by_name(gt_names, self.class_names) + gt_bboxes_3d = gt_bboxes_3d[selected] + gt_bboxes = gt_bboxes[selected] + gt_names = gt_names[selected] + difficulty = difficulty[selected] + gt_bboxes_mask = np.array([n in self.class_names for n in gt_names], + dtype=np.bool_) + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d.astype('float32') + input_dict['gt_bboxes'] = gt_bboxes.astype('float32') + input_dict['gt_names'] = gt_names + input_dict['gt_names_3d'] = copy.deepcopy(gt_names) + input_dict['difficulty'] = difficulty + input_dict['gt_bboxes_mask'] = gt_bboxes_mask + input_dict['gt_bboxes_3d_mask'] = copy.deepcopy(gt_bboxes_mask) + input_dict['bbox_fields'].append('gt_bboxes') + if len(gt_bboxes) == 0: + return None + return input_dict + + def prepare_test_data(self, index): + input_dict = self.get_sensor_data(index) + # input_dict = self.test_pre_pipeline(input_dict) + example = self.pipeline(input_dict) + return example + + def test_pre_pipeline(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_bboxes = input_dict['gt_bboxes'] + gt_names = input_dict['gt_names'] + + if gt_bboxes_3d is not None: + selected = self.keep_arrays_by_name(gt_names, self.class_names) + gt_bboxes_3d = gt_bboxes_3d[selected] + gt_bboxes = gt_bboxes[selected] + gt_names = gt_names[selected] + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_bboxes'] = gt_bboxes + input_dict['gt_names'] = gt_names + input_dict['gt_names_3d'] = copy.deepcopy(gt_names) + return input_dict + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. 
+ In kitti's pcd, they are all the same, thus are all zeros + """ + self.flag = np.zeros(len(self), dtype=np.uint8) + + def _rand_another(self, idx): + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def get_lidar(self, idx): + lidar_file = os.path.join(self.root_split_path, 'velodyne', + '%06d.bin' % idx) + assert os.path.exists(lidar_file) + return np.fromfile(lidar_file, dtype=np.float32).reshape(-1, 4) + + def get_lidar_reduced(self, idx): + lidar_file = os.path.join(self.root_split_path, 'velodyne_reduced', + '%06d.bin' % idx) + assert os.path.exists(lidar_file) + return np.fromfile(lidar_file, dtype=np.float32).reshape(-1, 4) + + def get_lidar_depth_reduced(self, idx): + lidar_file = os.path.join(self.root_split_path, + 'velodyne_depth_reduced', '%06d.bin' % idx) + assert os.path.exists(lidar_file) + return np.fromfile(lidar_file, dtype=np.float32).reshape(-1, 4) + + def get_pure_depth_reduced(self, idx): + lidar_file = os.path.join(self.root_split_path, 'depth_reduced', + '%06d.bin' % idx) + assert os.path.exists(lidar_file) + return np.fromfile(lidar_file, dtype=np.float32).reshape(-1, 4) + + def get_depth(self, idx): + depth_file = os.path.join(self.root_split_path, 'depth_completion', + '%06d.png' % idx) + assert os.path.exists(depth_file) + depth_img = mmcv.imread(depth_file, -1) / 256.0 + return depth_img + + def __len__(self): + return len(self.kitti_infos) + + def get_sensor_data(self, index): + info = self.kitti_infos[index] + sample_idx = info['image']['image_idx'] + # TODO: consider use torch.Tensor only + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + P2 = info['calib']['P2'].astype(np.float32) + lidar2img = P2 @ rect @ Trv2c + + if self.modality['use_depth'] and self.modality['use_lidar']: + points = self.get_lidar_depth_reduced(sample_idx) + elif self.modality['use_lidar']: + points = self.get_lidar_reduced(sample_idx) + elif self.modality['use_depth']: + points = self.get_pure_depth_reduced(sample_idx) + else: + assert (self.modality['use_depth'] or self.modality['use_lidar']) + + if not self.modality['use_lidar_intensity']: + points = points[:, :3] + + input_dict = dict( + sample_idx=sample_idx, + points=points, + lidar2img=lidar2img, + ) + + # TODO: support image input + if self.modality['use_camera']: + image_info = info['image'] + image_path = image_info['image_path'] + image_path = os.path.join(self.root_path, image_path) + img = mmcv.imread(image_path) + input_dict.update( + dict( + img=img, + img_shape=img.shape, + ori_shape=img.shape, + filename=image_path)) + else: + input_dict.update(dict(img_shape=info['image']['image_shape'])) + if self.with_label: + annos = self.get_ann_info(index) + input_dict.update(annos) + + return input_dict + + def get_ann_info(self, index): + # Use index to get the annos, thus the evalhook could also use this api + info = self.kitti_infos[index] + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + # P2 = info['calib']['P2'].astype(np.float32) + + annos = info['annos'] + # we need other objects to avoid collision when sample + annos = remove_dontcare(annos) + loc = annos['location'] + dims = annos['dimensions'] + rots = annos['rotation_y'] + gt_names = annos['name'] + # print(gt_names, len(loc)) + gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1).astype(np.float32) + difficulty = annos['difficulty'] + # this change gt_bboxes_3d to velodyne 
coordinates + gt_bboxes_3d = box_np_ops.box_camera_to_lidar(gt_bboxes_3d, rect, + Trv2c) + # only center format is allowed. so we need to convert + # kitti [0.5, 0.5, 0] center to [0.5, 0.5, 0.5] + # box_np_ops.change_box3d_center_(gt_bboxes, [0.5, 0.5, 0], + # [0.5, 0.5, 0.5]) + + # For simplicity gt_bboxes means 2D gt bboxes + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_bboxes=annos['bbox'], + gt_names=gt_names, + difficulty=difficulty) + return anns_results + + def drop_arrays_by_name(self, gt_names, used_classes): + inds = [i for i, x in enumerate(gt_names) if x not in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def keep_arrays_by_name(self, gt_names, used_classes): + inds = [i for i, x in enumerate(gt_names) if x in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def reformat_bbox(self, outputs, out=None): + if not isinstance(outputs[0][0], dict): + sample_idx = [ + info['image']['image_idx'] for info in self.kitti_infos + ] + result_files = self.bbox2result_kitti2d(outputs, self.class_names, + sample_idx, out) + else: + result_files = self.bbox2result_kitti(outputs, self.class_names, + out) + return result_files + + def evaluate(self, result_files, eval_types=None): + from mmdet3d.core.evaluation import kitti_eval + gt_annos = [info['annos'] for info in self.kitti_infos] + if eval_types == 'img_bbox': + ap_result_str, ap_dict = kitti_eval( + gt_annos, result_files, self.class_names, eval_types=['bbox']) + else: + ap_result_str, ap_dict = kitti_eval(gt_annos, result_files, + self.class_names) + return ap_result_str, ap_dict + + def bbox2result_kitti(self, net_outputs, class_names, out=None): + if out: + output_dir = out[:-4] if out.endswith(('.pkl', '.pickle')) else out + result_dir = output_dir + '/data' + mmcv.mkdir_or_exist(result_dir) + + det_annos = [] + print('Converting prediction to KITTI format') + for idx, pred_dicts in enumerate( + mmcv.track_iter_progress(net_outputs)): + annos = [] + info = self.kitti_infos[idx] + image_shape = info['image']['image_shape'][:2] + for i, box_dict in enumerate(pred_dicts): + num_example = 0 + sample_idx = box_dict['sample_idx'] + box_dict = self.convert_valid_bboxes(box_dict, info) + if box_dict['bbox'] is not None or box_dict['bbox'].size.numel( + ) != 0: + box_2d_preds = box_dict['bbox'] + box_preds = box_dict['box3d_camera'] + scores = box_dict['scores'] + box_preds_lidar = box_dict['box3d_lidar'] + label_preds = box_dict['label_preds'] + + anno = { + 'name': [], + 'truncated': [], + 'occluded': [], + 'alpha': [], + 'bbox': [], + 'dimensions': [], + 'location': [], + 'rotation_y': [], + 'score': [] + } + gt_iou = scores * 0 + + for box, box_lidar, bbox, score, label, cur_gt_iou in zip( + box_preds, box_preds_lidar, box_2d_preds, scores, + label_preds, gt_iou): + bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) + bbox[:2] = np.maximum(bbox[:2], [0, 0]) + anno['name'].append(class_names[int(label)]) + anno['truncated'].append(0.0) + anno['occluded'].append(0) + anno['alpha'].append( + -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) + anno['bbox'].append(bbox) + anno['dimensions'].append(box[3:6]) + anno['location'].append(box[:3]) + anno['rotation_y'].append(box[6]) + # anno["gt_iou"].append(cur_gt_iou) + anno['score'].append(score) + + num_example += 1 + + if num_example != 0: + anno = {k: np.stack(v) for k, v in anno.items()} + annos.append(anno) + + if out: + cur_det_file = result_dir + '/%06d.txt' % sample_idx + with open(cur_det_file, 'w') as f: + bbox = anno['bbox'] + 
loc = anno['location'] + dims = anno['dimensions'] # lhw -> hwl + + for idx in range(len(bbox)): + print( + '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' + '{:.4f} {:.4f} {:.4f} ' + '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}' + .format(anno['name'][idx], + anno['alpha'][idx], bbox[idx][0], + bbox[idx][1], bbox[idx][2], + bbox[idx][3], dims[idx][1], + dims[idx][2], dims[idx][0], + loc[idx][0], loc[idx][1], + loc[idx][2], + anno['rotation_y'][idx], + anno['score'][idx]), + file=f) + + if num_example == 0: + annos.append({ + 'name': np.array([]), + 'truncated': np.array([]), + 'occluded': np.array([]), + 'alpha': np.array([]), + 'bbox': np.zeros([0, 4]), + 'dimensions': np.zeros([0, 3]), + 'location': np.zeros([0, 3]), + 'rotation_y': np.array([]), + 'score': np.array([]), + }) + annos[-1]['sample_idx'] = np.array( + [sample_idx] * num_example, dtype=np.int64) + + det_annos += annos + + if out: + if not out.endswith(('.pkl', '.pickle')): + out = '{}.pkl'.format(out) + mmcv.dump(det_annos, out) + print('Result is saved to %s' % out) + + return det_annos + + def bbox2result_kitti2d(self, + net_outputs, + class_names, + sample_ids, + out=None): + """Convert results to kitti format for evaluation and test submission + + Args: + net_outputs (List[array]): list of array storing the bbox and score + class_nanes (List[String]): A list of class names + sample_idx (List[Int]): A list of samples' index, + should have the same length as net_outputs. + + Return: + List([dict]): A list of dict have the kitti format + """ + assert len(net_outputs) == len(sample_ids) + + det_annos = [] + print('Converting prediction to KITTI format') + for i, bboxes_per_sample in enumerate( + mmcv.track_iter_progress(net_outputs)): + annos = [] + anno = dict( + name=[], + truncated=[], + occluded=[], + alpha=[], + bbox=[], + dimensions=[], + location=[], + rotation_y=[], + score=[]) + sample_idx = sample_ids[i] + + num_example = 0 + for label in range(len(bboxes_per_sample)): + bbox = bboxes_per_sample[label] + for i in range(bbox.shape[0]): + anno['name'].append(class_names[int(label)]) + anno['truncated'].append(0.0) + anno['occluded'].append(0) + anno['alpha'].append(0.0) + anno['bbox'].append(bbox[i, :4]) + # set dimensions (height, width, length) to zero + anno['dimensions'].append( + np.zeros(shape=[3], dtype=np.float32)) + # set the 3D translation to (-1000, -1000, -1000) + anno['location'].append( + np.ones(shape=[3], dtype=np.float32) * (-1000.0)) + anno['rotation_y'].append(0.0) + anno['score'].append(bbox[i, 4]) + num_example += 1 + + if num_example == 0: + annos.append( + dict( + name=np.array([]), + truncated=np.array([]), + occluded=np.array([]), + alpha=np.array([]), + bbox=np.zeros([0, 4]), + dimensions=np.zeros([0, 3]), + location=np.zeros([0, 3]), + rotation_y=np.array([]), + score=np.array([]), + )) + else: + anno = {k: np.stack(v) for k, v in anno.items()} + annos.append(anno) + + annos[-1]['sample_idx'] = np.array( + [sample_idx] * num_example, dtype=np.int64) + det_annos += annos + + if out: + # save file in submission format + output_dir = out[:-4] if out.endswith(('.pkl', '.pickle')) else out + result_dir = output_dir + '/data' + mmcv.mkdir_or_exist(result_dir) + out = '{}.pkl'.format(result_dir) + mmcv.dump(det_annos, out) + print('Result is saved to {}'.format(out)) + for i, anno in enumerate(det_annos): + sample_idx = sample_ids[i] + cur_det_file = result_dir + '/%06d.txt' % sample_idx + with open(cur_det_file, 'w') as f: + bbox = anno['bbox'] + loc = anno['location'] + dims = anno['dimensions'][::-1] # 
lhw -> hwl + for idx in range(len(bbox)): + print( + '{} -1 -1 {:4f} {:4f} {:4f} {:4f} {:4f} {:4f} ' + '{:4f} {:4f} {:4f} {:4f} {:4f} {:4f} {:4f}'.format( + anno['name'][idx], + anno['alpha'][idx], + *bbox[idx], # 4 float + *dims[idx], # 3 float + *loc[idx], # 3 float + anno['rotation_y'][idx], + anno['score'][idx]), + file=f, + ) + print('Result is saved to {}'.format(result_dir)) + + return det_annos + + def convert_valid_bboxes(self, box_dict, info): + # TODO: refactor this function + final_box_preds = box_dict['box3d_lidar'] + final_scores = box_dict['scores'] + final_labels = box_dict['label_preds'] + sample_idx = info['image']['image_idx'] + final_box_preds[:, -1] = box_np_ops.limit_period( + final_box_preds[:, -1] - np.pi, offset=0.5, period=np.pi * 2) + + if final_box_preds.shape[0] == 0: + return dict( + bbox=final_box_preds.new_zeros([0, 4]).numpy(), + box3d_camera=final_box_preds.new_zeros([0, 7]).numpy(), + box3d_lidar=final_box_preds.new_zeros([0, 7]).numpy(), + scores=final_box_preds.new_zeros([0]).numpy(), + label_preds=final_box_preds.new_zeros([0, 4]).numpy(), + sample_idx=sample_idx, + ) + + from mmdet3d.core.bbox import box_torch_ops + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + P2 = info['calib']['P2'].astype(np.float32) + img_shape = info['image']['image_shape'] + rect = final_box_preds.new_tensor(rect) + Trv2c = final_box_preds.new_tensor(Trv2c) + P2 = final_box_preds.new_tensor(P2) + + final_box_preds_camera = box_torch_ops.box_lidar_to_camera( + final_box_preds, rect, Trv2c) + locs = final_box_preds_camera[:, :3] + dims = final_box_preds_camera[:, 3:6] + angles = final_box_preds_camera[:, 6] + camera_box_origin = [0.5, 1.0, 0.5] + box_corners = box_torch_ops.center_to_corner_box3d( + locs, dims, angles, camera_box_origin, axis=1) + box_corners_in_image = box_torch_ops.project_to_image(box_corners, P2) + # box_corners_in_image: [N, 8, 2] + minxy = torch.min(box_corners_in_image, dim=1)[0] + maxxy = torch.max(box_corners_in_image, dim=1)[0] + box_2d_preds = torch.cat([minxy, maxxy], dim=1) + # Post-processing + # check final_box_preds_camera + image_shape = final_box_preds.new_tensor(img_shape) + valid_cam_inds = ((final_box_preds_camera[:, 0] < image_shape[1]) & + (final_box_preds_camera[:, 1] < image_shape[0]) & + (final_box_preds_camera[:, 2] > 0) & + (final_box_preds_camera[:, 3] > 0)) + # check final_box_preds + limit_range = final_box_preds.new_tensor(self.pcd_limit_range) + valid_pcd_inds = ((final_box_preds[:, :3] > limit_range[:3]) & + (final_box_preds[:, :3] < limit_range[3:])) + valid_inds = valid_cam_inds & valid_pcd_inds.all(-1) + + if valid_inds.sum() > 0: + return dict( + bbox=box_2d_preds[valid_inds, :].numpy(), + box3d_camera=final_box_preds_camera[valid_inds, :].numpy(), + box3d_lidar=final_box_preds[valid_inds, :].numpy(), + scores=final_scores[valid_inds].numpy(), + label_preds=final_labels[valid_inds].numpy(), + sample_idx=sample_idx, + ) + else: + return dict( + bbox=final_box_preds.new_zeros([0, 4]).numpy(), + box3d_camera=final_box_preds.new_zeros([0, 7]).numpy(), + box3d_lidar=final_box_preds.new_zeros([0, 7]).numpy(), + scores=final_box_preds.new_zeros([0]).numpy(), + label_preds=final_box_preds.new_zeros([0, 4]).numpy(), + sample_idx=sample_idx, + ) diff --git a/mmdet3d/datasets/loader/__init__.py b/mmdet3d/datasets/loader/__init__.py new file mode 100644 index 0000000000..4404615be6 --- /dev/null +++ b/mmdet3d/datasets/loader/__init__.py @@ -0,0 +1,4 @@ +from 
.build_loader import build_dataloader +from .sampler import DistributedGroupSampler, GroupSampler + +__all__ = ['GroupSampler', 'DistributedGroupSampler', 'build_dataloader'] diff --git a/mmdet3d/datasets/loader/build_loader.py b/mmdet3d/datasets/loader/build_loader.py new file mode 100644 index 0000000000..14ff9b1375 --- /dev/null +++ b/mmdet3d/datasets/loader/build_loader.py @@ -0,0 +1,57 @@ +import platform +import random +from functools import partial + +import numpy as np +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from torch.utils.data import DataLoader + +from .sampler import DistributedGroupSampler, DistributedSampler, GroupSampler + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) + + +def build_dataloader(dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + seed=None, + **kwargs): + shuffle = kwargs.get('shuffle', True) + if dist: + rank, world_size = get_dist_info() + if shuffle: + sampler = DistributedGroupSampler(dataset, samples_per_gpu, + world_size, rank) + else: + sampler = DistributedSampler( + dataset, world_size, rank, shuffle=False) + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + pin_memory=False, + worker_init_fn=worker_init_fn if seed is not None else None, + **kwargs) + + return data_loader + + +def worker_init_fn(seed): + np.random.seed(seed) + random.seed(seed) diff --git a/mmdet3d/datasets/loader/sampler.py b/mmdet3d/datasets/loader/sampler.py new file mode 100644 index 0000000000..f3dd996207 --- /dev/null +++ b/mmdet3d/datasets/loader/sampler.py @@ -0,0 +1,164 @@ +from __future__ import division +import math + +import numpy as np +import torch +from mmcv.runner import get_dist_info +from torch.utils.data import DistributedSampler as _DistributedSampler +from torch.utils.data import Sampler + + +class DistributedSampler(_DistributedSampler): + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + super().__init__(dataset, num_replicas=num_replicas, rank=rank) + self.shuffle = shuffle + + def __iter__(self): + # deterministically shuffle based on epoch + if self.shuffle: + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) + + +class GroupSampler(Sampler): + + def __init__(self, dataset, samples_per_gpu=1): + assert hasattr(dataset, 'flag') + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.flag = dataset.flag.astype(np.int64) + self.group_sizes = np.bincount(self.flag) + self.num_samples = 0 + for i, size in enumerate(self.group_sizes): + self.num_samples += int(np.ceil( + size / self.samples_per_gpu)) * self.samples_per_gpu + + 
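+ # num_samples rounds every aspect-ratio group up to a multiple of
+ # samples_per_gpu, so each chunk yielded by __iter__ below only contains
+ # indices from a single group (padding indices are re-drawn from the same
+ # group with replacement)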
def __iter__(self): + indices = [] + for i, size in enumerate(self.group_sizes): + if size == 0: + continue + indice = np.where(self.flag == i)[0] + assert len(indice) == size + np.random.shuffle(indice) + num_extra = int(np.ceil(size / self.samples_per_gpu) + ) * self.samples_per_gpu - len(indice) + indice = np.concatenate( + [indice, np.random.choice(indice, num_extra)]) + indices.append(indice) + indices = np.concatenate(indices) + indices = [ + indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu] + for i in np.random.permutation( + range(len(indices) // self.samples_per_gpu)) + ] + indices = np.concatenate(indices) + indices = indices.astype(np.int64).tolist() + assert len(indices) == self.num_samples + return iter(indices) + + def __len__(self): + return self.num_samples + + +class DistributedGroupSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + .. note:: + Dataset is assumed to be of constant size. + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + """ + + def __init__(self, + dataset, + samples_per_gpu=1, + num_replicas=None, + rank=None): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + + assert hasattr(self.dataset, 'flag') + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + + self.num_samples = 0 + for i, j in enumerate(self.group_sizes): + self.num_samples += int( + math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / + self.num_replicas)) * self.samples_per_gpu + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + for i, size in enumerate(self.group_sizes): + if size > 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + indice = indice[list(torch.randperm(int(size), + generator=g))].tolist() + extra = int( + math.ceil( + size * 1.0 / self.samples_per_gpu / self.num_replicas) + ) * self.samples_per_gpu * self.num_replicas - len(indice) + # pad indice + tmp = indice.copy() + for _ in range(extra // size): + indice.extend(tmp) + indice.extend(tmp[:extra % size]) + indices.extend(indice) + + assert len(indices) == self.total_size + + indices = [ + indices[j] for i in list( + torch.randperm( + len(indices) // self.samples_per_gpu, generator=g)) + for j in range(i * self.samples_per_gpu, (i + 1) * + self.samples_per_gpu) + ] + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset:offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/mmdet3d/datasets/nuscenes2d_dataset.py b/mmdet3d/datasets/nuscenes2d_dataset.py new file mode 100644 index 0000000000..636a55e7ee --- /dev/null +++ b/mmdet3d/datasets/nuscenes2d_dataset.py @@ -0,0 
+1,38 @@ +from pycocotools.coco import COCO + +from mmdet3d.core.evaluation.coco_utils import getImgIds +from mmdet.datasets import DATASETS, CocoDataset + + +@DATASETS.register_module +class NuScenes2DDataset(CocoDataset): + + CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + + def load_annotations(self, ann_file): + if not self.class_names: + self.class_names = self.CLASSES + self.coco = COCO(ann_file) + # send class_names into the get id + # in case we only need to train on several classes + # by default self.class_names = CLASSES + self.cat_ids = self.coco.getCatIds(catNms=self.class_names) + + self.cat2label = { + cat_id: i # + 1 rm +1 here thus the 0-79 are fg, 80 is bg + for i, cat_id in enumerate(self.cat_ids) + } + # send cat ids to the get img id + # in case we only need to train on several classes + if len(self.cat_ids) < len(self.CLASSES): + self.img_ids = getImgIds(self.coco, catIds=self.cat_ids) + else: + self.img_ids = self.coco.getImgIds() + img_infos = [] + for i in self.img_ids: + info = self.coco.loadImgs([i])[0] + info['filename'] = info['file_name'] + img_infos.append(info) + return img_infos diff --git a/mmdet3d/datasets/nuscenes_dataset.py b/mmdet3d/datasets/nuscenes_dataset.py new file mode 100644 index 0000000000..b46f687f62 --- /dev/null +++ b/mmdet3d/datasets/nuscenes_dataset.py @@ -0,0 +1,495 @@ +import copy +import os.path as osp +import tempfile + +import mmcv +import numpy as np +import pyquaternion +import torch.utils.data as torch_data +from nuscenes.utils.data_classes import Box as NuScenesBox + +from mmdet.datasets import DATASETS +from ..core.bbox import box_np_ops +from .pipelines import Compose + + +@DATASETS.register_module +class NuScenesDataset(torch_data.Dataset): + NumPointFeatures = 4 # xyz, timestamp. 
set 4 to use kitti pretrain + NameMapping = { + 'movable_object.barrier': 'barrier', + 'vehicle.bicycle': 'bicycle', + 'vehicle.bus.bendy': 'bus', + 'vehicle.bus.rigid': 'bus', + 'vehicle.car': 'car', + 'vehicle.construction': 'construction_vehicle', + 'vehicle.motorcycle': 'motorcycle', + 'human.pedestrian.adult': 'pedestrian', + 'human.pedestrian.child': 'pedestrian', + 'human.pedestrian.construction_worker': 'pedestrian', + 'human.pedestrian.police_officer': 'pedestrian', + 'movable_object.trafficcone': 'traffic_cone', + 'vehicle.trailer': 'trailer', + 'vehicle.truck': 'truck' + } + DefaultAttribute = { + 'car': 'vehicle.parked', + 'pedestrian': 'pedestrian.moving', + 'trailer': 'vehicle.parked', + 'truck': 'vehicle.parked', + 'bus': 'vehicle.moving', + 'motorcycle': 'cycle.without_rider', + 'construction_vehicle': 'vehicle.parked', + 'bicycle': 'cycle.without_rider', + 'barrier': '', + 'traffic_cone': '', + } + AttrMapping = { + 'cycle.with_rider': 0, + 'cycle.without_rider': 1, + 'pedestrian.moving': 2, + 'pedestrian.standing': 3, + 'pedestrian.sitting_lying_down': 4, + 'vehicle.moving': 5, + 'vehicle.parked': 6, + 'vehicle.stopped': 7, + } + AttrMapping_rev = [ + 'cycle.with_rider', + 'cycle.without_rider', + 'pedestrian.moving', + 'pedestrian.standing', + 'pedestrian.sitting_lying_down', + 'vehicle.moving', + 'vehicle.parked', + 'vehicle.stopped', + ] + CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + + def __init__(self, + ann_file, + pipeline=None, + root_path=None, + class_names=None, + load_interval=1, + with_velocity=True, + test_mode=False, + modality=None, + eval_version='detection_cvpr_2019', + with_label=True, + max_sweeps=10, + filter_empty_gt=True): + super().__init__() + self.data_root = root_path + self.class_names = class_names if class_names else self.CLASSES + self.test_mode = test_mode + self.load_interval = load_interval + self.with_label = with_label + self.max_sweeps = max_sweeps + + self.ann_file = ann_file + data = mmcv.load(ann_file) + self.infos = list(sorted(data['infos'], key=lambda e: e['timestamp'])) + self.infos = self.infos[::load_interval] + self.metadata = data['metadata'] + self.version = self.metadata['version'] + self.with_velocity = with_velocity + self.eval_version = eval_version + from nuscenes.eval.detection.config import config_factory + self.eval_detection_configs = config_factory(self.eval_version) + + if modality is None: + modality = dict( + use_camera=False, + use_lidar=True, + use_radar=False, + use_map=False, + use_external=False, + ) + self.modality = modality + # set group flag for the sampler + if not self.test_mode: + self._set_group_flag() + + # processing pipeline + if pipeline is not None: + self.pipeline = Compose(pipeline) + + # kitti map: nusc det name -> kitti eval name + self._kitti_name_mapping = { + 'car': 'car', + 'pedestrian': 'pedestrian', + } # we only eval these classes in kitti + + def __getitem__(self, idx): + if self.test_mode: + return self.prepare_test_data(idx) + while True: + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. 
+ In kitti's pcd, they are all the same, thus are all zeros + """ + self.flag = np.zeros(len(self), dtype=np.uint8) + + def _rand_another(self, idx): + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def __len__(self): + return len(self.infos) + + def prepare_train_data(self, index): + input_dict = self.get_sensor_data(index) + input_dict = self.train_pre_pipeline(input_dict) + if input_dict is None: + return None + example = self.pipeline(input_dict) + if len(example['gt_bboxes_3d']._data) == 0: + return None + return example + + def train_pre_pipeline(self, input_dict): + if len(input_dict['gt_bboxes_3d']) == 0: + return None + return input_dict + + def prepare_test_data(self, index): + input_dict = self.get_sensor_data(index) + # input_dict = self.test_pre_pipeline(input_dict) + example = self.pipeline(input_dict) + return example + + def test_pre_pipeline(self, input_dict): + gt_names = input_dict['gt_names'] + input_dict['gt_names_3d'] = copy.deepcopy(gt_names) + return input_dict + + def get_sensor_data(self, index): + info = self.infos[index] + points = np.fromfile( + info['lidar_path'], dtype=np.float32, count=-1).reshape([-1, 5]) + # standard protocal modified from SECOND.Pytorch + points[:, 3] /= 255 + points[:, 4] = 0 + sweep_points_list = [points] + ts = info['timestamp'] / 1e6 + + for idx, sweep in enumerate(info['sweeps']): + if idx >= self.max_sweeps: + break + points_sweep = np.fromfile( + sweep['data_path'], dtype=np.float32, + count=-1).reshape([-1, 5]) + sweep_ts = sweep['timestamp'] / 1e6 + points_sweep[:, 3] /= 255 + points_sweep[:, :3] = points_sweep[:, :3] @ sweep[ + 'sensor2lidar_rotation'].T + points_sweep[:, :3] += sweep['sensor2lidar_translation'] + points_sweep[:, 4] = ts - sweep_ts + sweep_points_list.append(points_sweep) + + points = np.concatenate(sweep_points_list, axis=0)[:, [0, 1, 2, 4]] + input_dict = dict( + points=points, + sample_idx=info['token'], + ) + + if self.modality['use_camera']: + # TODO support image + imgs = [] + ori_shapes = [] + image_paths = [] + lidar2img_rts = [] + for cam_type, cam_info in info['cams'].items(): + image_path = cam_info['data_path'] + # image_path = osp.join(self.data_root, image_path) + img = mmcv.imread(image_path) + imgs.append(img) + ori_shapes.append(img.shape) + image_paths.append(image_path) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + input_dict.update( + dict( + img=imgs, + img_shape=ori_shapes, + ori_shape=ori_shapes, + filename=image_paths, + lidar2img=lidar2img_rts, + )) + + if self.with_label: + annos = self.get_ann_info(index) + input_dict.update(annos) + + return input_dict + + def get_ann_info(self, index): + info = self.infos[index] + # filter out bbox containing no points + mask = info['num_lidar_pts'] > 0 + gt_bboxes_3d = info['gt_boxes'][mask] + # the nuscenes box center is [0.5, 0.5, 0.5], we keep it + # the same as KITTI [0.5, 0.5, 0] + box_np_ops.change_box3d_center_(gt_bboxes_3d, [0.5, 0.5, 0.5], + [0.5, 0.5, 0]) + gt_names_3d = info['gt_names'][mask] + + if self.with_velocity: + gt_velocity = 
info['gt_velocity'][mask] + nan_mask = np.isnan(gt_velocity[:, 0]) + gt_velocity[nan_mask] = [0.0, 0.0] + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) + + gt_bboxes_3d_mask = np.array( + [n in self.class_names for n in gt_names_3d], dtype=np.bool_) + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_names_3d=gt_names_3d, + gt_bboxes_3d_mask=gt_bboxes_3d_mask, + ) + return anns_results + + def _format_bbox(self, results, jsonfile_prefix=None): + nusc_annos = {} + mapped_class_names = self.class_names + token2info = {} + for info in self.infos: + token2info[info['token']] = info + print('Start to convert detection format...') + for det in mmcv.track_iter_progress(results): + annos = [] + boxes = output_to_nusc_box(det[0]) + boxes = lidar_nusc_box_to_global(token2info[det[0]['sample_idx']], + boxes, mapped_class_names, + self.eval_detection_configs, + self.eval_version) + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2: + if name in [ + 'car', + 'construction_vehicle', + 'bus', + 'truck', + 'trailer', + ]: + attr = 'vehicle.moving' + elif name in ['bicycle', 'motorcycle']: + attr = 'cycle.with_rider' + else: + attr = NuScenesDataset.DefaultAttribute[name] + else: + if name in ['pedestrian']: + attr = 'pedestrian.standing' + elif name in ['bus']: + attr = 'vehicle.stopped' + else: + attr = NuScenesDataset.DefaultAttribute[name] + + nusc_anno = dict( + sample_token=det[0]['sample_idx'], + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr) + annos.append(nusc_anno) + nusc_annos[det[0]['sample_idx']] = annos + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + mmcv.dump(nusc_submissions, res_path) + return res_path + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + result_name='pts_bbox'): + from nuscenes import NuScenes + from nuscenes.eval.detection.evaluate import NuScenesEval + + output_dir = osp.join(*osp.split(result_path)[:-1]) + nusc = NuScenes( + version=self.version, dataroot=self.data_root, verbose=False) + eval_set_map = { + 'v1.0-mini': 'mini_train', + 'v1.0-trainval': 'val', + } + nusc_eval = NuScenesEval( + nusc, + config=self.eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=False) + nusc_eval.main(render_curves=False) + + # record metrics + metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) + detail = dict() + metric_prefix = '{}_NuScenes'.format(result_name) + for name in self.class_names: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + return detail + + def format_results(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list): Testing results of the dataset. 
+ jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: (result_files, tmp_dir), result_files is a dict containing + the json filepaths, tmp_dir is the temporal directory created + for saving json files when jsonfile_prefix is not specified. + """ + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + if not isinstance(results[0], dict): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + result_files = dict() + for name in results[0]: + print('Formating bboxes of {}'.format(name)) + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox']): + """Evaluation in nuScenes protocol. + + Args: + results (list): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + dict[str: float] + """ + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name]) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files) + + if tmp_dir is not None: + tmp_dir.cleanup() + return results_dict + + +def output_to_nusc_box(detection): + box3d = detection['box3d_lidar'].numpy() + scores = detection['scores'].numpy() + labels = detection['label_preds'].numpy() + # TODO: check whether this is necessary + # with dir_offset & dir_limit in the head + box3d[:, 6] = -box3d[:, 6] - np.pi / 2 + # the trained model is in [0.5, 0.5, 0], + # change them back to nuscenes [0.5, 0.5, 0.5] + box_np_ops.change_box3d_center_(box3d, [0.5, 0.5, 0], [0.5, 0.5, 0.5]) + box_list = [] + for i in range(box3d.shape[0]): + quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box3d[i, 6]) + velocity = (*box3d[i, 7:9], 0.0) + # velo_val = np.linalg.norm(box3d[i, 7:9]) + # velo_ori = box3d[i, 6] + # velocity = ( + # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) + box = NuScenesBox( + box3d[i, :3], + box3d[i, 3:6], + quat, + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list + + +def lidar_nusc_box_to_global(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation'])) + box.translate(np.array(info['lidar2ego_translation'])) + # filter det in ego. 
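+ # boxes whose BEV distance to the ego vehicle exceeds the per-class
+ # evaluation range in eval_configs.class_range are dropped before the
+ # transform to global coordinates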
+ cls_range_map = eval_configs.class_range + radius = np.linalg.norm(box.center[:2], 2) + det_range = cls_range_map[classes[box.label]] + if radius > det_range: + continue + # Move box to global coord system + box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + return box_list diff --git a/mmdet3d/datasets/pipelines/__init__.py b/mmdet3d/datasets/pipelines/__init__.py new file mode 100644 index 0000000000..44863eed45 --- /dev/null +++ b/mmdet3d/datasets/pipelines/__init__.py @@ -0,0 +1,13 @@ +from mmdet.dataset import Compose +from .formating import (Collect, Collect3D, ImageToTensor, ToDataContainer, + ToTensor, Transpose, to_tensor) +from .train_aug import (GlobalRotScale, ObjectNoise, ObjectRangeFilter, + ObjectSample, PointShuffle, PointsRangeFilter, + RandomFlip3D) + +__all__ = [ + 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', + 'Transpose', 'Collect', 'PhotoMetricDistortion', 'ObjectSample', + 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScale', 'PointShuffle', + 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D' +] diff --git a/mmdet3d/datasets/pipelines/data_augment_utils.py b/mmdet3d/datasets/pipelines/data_augment_utils.py new file mode 100644 index 0000000000..268958cef6 --- /dev/null +++ b/mmdet3d/datasets/pipelines/data_augment_utils.py @@ -0,0 +1,326 @@ +import numba +import numpy as np + +from mmdet3d.core.bbox import box_np_ops + + +@numba.njit +def _rotation_box2d_jit_(corners, angle, rot_mat_T): + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + corners[:] = corners @ rot_mat_T + + +@numba.jit(nopython=True) +def box_collision_test(boxes, qboxes, clockwise=True): + N = boxes.shape[0] + K = qboxes.shape[0] + ret = np.zeros((N, K), dtype=np.bool_) + slices = np.array([1, 2, 3, 0]) + lines_boxes = np.stack((boxes, boxes[:, slices, :]), + axis=2) # [N, 4, 2(line), 2(xy)] + lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2) + # vec = np.zeros((2,), dtype=boxes.dtype) + boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes) + qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes) + for i in range(N): + for j in range(K): + # calculate standup first + iw = ( + min(boxes_standup[i, 2], qboxes_standup[j, 2]) - + max(boxes_standup[i, 0], qboxes_standup[j, 0])) + if iw > 0: + ih = ( + min(boxes_standup[i, 3], qboxes_standup[j, 3]) - + max(boxes_standup[i, 1], qboxes_standup[j, 1])) + if ih > 0: + for k in range(4): + for l in range(4): + A = lines_boxes[i, k, 0] + B = lines_boxes[i, k, 1] + C = lines_qboxes[j, l, 0] + D = lines_qboxes[j, l, 1] + acd = (D[1] - A[1]) * (C[0] - + A[0]) > (C[1] - A[1]) * ( + D[0] - A[0]) + bcd = (D[1] - B[1]) * (C[0] - + B[0]) > (C[1] - B[1]) * ( + D[0] - B[0]) + if acd != bcd: + abc = (C[1] - A[1]) * (B[0] - A[0]) > ( + B[1] - A[1]) * ( + C[0] - A[0]) + abd = (D[1] - A[1]) * (B[0] - A[0]) > ( + B[1] - A[1]) * ( + D[0] - A[0]) + if abc != abd: + ret[i, j] = True # collision. + break + if ret[i, j] is True: + break + if ret[i, j] is False: + # now check complete overlap. 
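+ # (the segment-intersection test above cannot detect the case where one
+ # box lies completely inside the other, so additionally check whether all
+ # corners of one box fall on the inner side of every edge of the other)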
+ # box overlap qbox: + box_overlap_qbox = True + for l in range(4): # point l in qboxes + for k in range(4): # corner k in boxes + vec = boxes[i, k] - boxes[i, (k + 1) % 4] + if clockwise: + vec = -vec + cross = vec[1] * ( + boxes[i, k, 0] - qboxes[j, l, 0]) + cross -= vec[0] * ( + boxes[i, k, 1] - qboxes[j, l, 1]) + if cross >= 0: + box_overlap_qbox = False + break + if box_overlap_qbox is False: + break + + if box_overlap_qbox is False: + qbox_overlap_box = True + for l in range(4): # point l in boxes + for k in range(4): # corner k in qboxes + vec = qboxes[j, k] - qboxes[j, (k + 1) % 4] + if clockwise: + vec = -vec + cross = vec[1] * ( + qboxes[j, k, 0] - boxes[i, l, 0]) + cross -= vec[0] * ( + qboxes[j, k, 1] - boxes[i, l, 1]) + if cross >= 0: # + qbox_overlap_box = False + break + if qbox_overlap_box is False: + break + if qbox_overlap_box: + ret[i, j] = True # collision. + else: + ret[i, j] = True # collision. + return ret + + +@numba.njit +def noise_per_box(boxes, valid_mask, loc_noises, rot_noises): + # boxes: [N, 5] + # valid_mask: [N] + # loc_noises: [N, M, 3] + # rot_noises: [N, M] + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + current_corners = np.zeros((4, 2), dtype=boxes.dtype) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes, ), dtype=np.int64) + # print(valid_mask) + for i in range(num_boxes): + if valid_mask[i]: + for j in range(num_tests): + current_corners[:] = box_corners[i] + current_corners -= boxes[i, :2] + _rotation_box2d_jit_(current_corners, rot_noises[i, j], + rot_mat_T) + current_corners += boxes[i, :2] + loc_noises[i, j, :2] + coll_mat = box_collision_test( + current_corners.reshape(1, 4, 2), box_corners) + coll_mat[0, i] = False + # print(coll_mat) + if not coll_mat.any(): + success_mask[i] = j + box_corners[i] = current_corners + break + return success_mask + + +@numba.njit +def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises, + global_rot_noises): + # boxes: [N, 5] + # valid_mask: [N] + # loc_noises: [N, M, 3] + # rot_noises: [N, M] + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + current_corners = np.zeros((4, 2), dtype=boxes.dtype) + current_box = np.zeros((1, 5), dtype=boxes.dtype) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + dst_pos = np.zeros((2, ), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes, ), dtype=np.int64) + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners_norm = corners_norm.reshape(4, 2) + for i in range(num_boxes): + if valid_mask[i]: + for j in range(num_tests): + current_box[0, :] = boxes[i] + current_radius = np.sqrt(boxes[i, 0]**2 + boxes[i, 1]**2) + current_grot = np.arctan2(boxes[i, 0], boxes[i, 1]) + dst_grot = current_grot + global_rot_noises[i, j] + dst_pos[0] = current_radius * np.sin(dst_grot) + dst_pos[1] = current_radius * np.cos(dst_grot) + current_box[0, :2] = dst_pos + current_box[0, -1] += (dst_grot - current_grot) + + rot_sin = np.sin(current_box[0, -1]) + rot_cos = np.cos(current_box[0, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + current_corners[:] = current_box[ + 0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2] + current_corners -= current_box[0, :2] + _rotation_box2d_jit_(current_corners, 
rot_noises[i, j], + rot_mat_T) + current_corners += current_box[0, :2] + loc_noises[i, j, :2] + coll_mat = box_collision_test( + current_corners.reshape(1, 4, 2), box_corners) + coll_mat[0, i] = False + if not coll_mat.any(): + success_mask[i] = j + box_corners[i] = current_corners + loc_noises[i, j, :2] += (dst_pos - boxes[i, :2]) + rot_noises[i, j] += (dst_grot - current_grot) + break + return success_mask + + +def _select_transform(transform, indices): + result = np.zeros((transform.shape[0], *transform.shape[2:]), + dtype=transform.dtype) + for i in range(transform.shape[0]): + if indices[i] != -1: + result[i] = transform[i, indices[i]] + return result + + +@numba.njit +def _rotation_matrix_3d_(rot_mat_T, angle, axis): + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + rot_mat_T[:] = np.eye(3) + if axis == 1: + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 2] = -rot_sin + rot_mat_T[2, 0] = rot_sin + rot_mat_T[2, 2] = rot_cos + elif axis == 2 or axis == -1: + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + elif axis == 0: + rot_mat_T[1, 1] = rot_cos + rot_mat_T[1, 2] = -rot_sin + rot_mat_T[2, 1] = rot_sin + rot_mat_T[2, 2] = rot_cos + + +@numba.njit +def points_transform_(points, centers, point_masks, loc_transform, + rot_transform, valid_mask): + num_box = centers.shape[0] + num_points = points.shape[0] + rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype) + for i in range(num_box): + _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2) + for i in range(num_points): + for j in range(num_box): + if valid_mask[j]: + if point_masks[i, j] == 1: + points[i, :3] -= centers[j, :3] + points[i:i + 1, :3] = points[i:i + 1, :3] @ rot_mat_T[j] + points[i, :3] += centers[j, :3] + points[i, :3] += loc_transform[j] + break # only apply first box's transform + + +@numba.njit +def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask): + num_box = boxes.shape[0] + for i in range(num_box): + if valid_mask[i]: + boxes[i, :3] += loc_transform[i] + boxes[i, 6] += rot_transform[i] + + +def noise_per_object_v3_(gt_boxes, + points=None, + valid_mask=None, + rotation_perturb=np.pi / 4, + center_noise_std=1.0, + global_random_rot_range=np.pi / 4, + num_try=100): + """random rotate or remove each groundtrutn independently. + use kitti viewer to test this function points_transform_ + + Args: + gt_boxes: [N, 7], gt box in lidar.points_transform_ + points: [M, 4], point cloud in lidar. 
+ """ + num_boxes = gt_boxes.shape[0] + if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): + rotation_perturb = [-rotation_perturb, rotation_perturb] + if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): + global_random_rot_range = [ + -global_random_rot_range, global_random_rot_range + ] + enable_grot = np.abs(global_random_rot_range[0] - + global_random_rot_range[1]) >= 1e-3 + + if not isinstance(center_noise_std, (list, tuple, np.ndarray)): + center_noise_std = [ + center_noise_std, center_noise_std, center_noise_std + ] + if valid_mask is None: + valid_mask = np.ones((num_boxes, ), dtype=np.bool_) + center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) + + loc_noises = np.random.normal( + scale=center_noise_std, size=[num_boxes, num_try, 3]) + rot_noises = np.random.uniform( + rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try]) + gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) + grot_lowers = global_random_rot_range[0] - gt_grots + grot_uppers = global_random_rot_range[1] - gt_grots + global_rot_noises = np.random.uniform( + grot_lowers[..., np.newaxis], + grot_uppers[..., np.newaxis], + size=[num_boxes, num_try]) + + origin = [0.5, 0.5, 0] + gt_box_corners = box_np_ops.center_to_corner_box3d( + gt_boxes[:, :3], + gt_boxes[:, 3:6], + gt_boxes[:, 6], + origin=origin, + axis=2) + + # TODO: rewrite this noise box function? + if not enable_grot: + selected_noise = noise_per_box(gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, loc_noises, rot_noises) + else: + selected_noise = noise_per_box_v2_(gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, loc_noises, rot_noises, + global_rot_noises) + + loc_transforms = _select_transform(loc_noises, selected_noise) + rot_transforms = _select_transform(rot_noises, selected_noise) + surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) + if points is not None: + # TODO: replace this points_in_convex function by my tools? 
+ point_masks = box_np_ops.points_in_convex_polygon_3d_jit( + points[:, :3], surfaces) + points_transform_(points, gt_boxes[:, :3], point_masks, loc_transforms, + rot_transforms, valid_mask) + + box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask) diff --git a/mmdet3d/datasets/pipelines/dbsampler.py b/mmdet3d/datasets/pipelines/dbsampler.py new file mode 100644 index 0000000000..f0357d2d40 --- /dev/null +++ b/mmdet3d/datasets/pipelines/dbsampler.py @@ -0,0 +1,509 @@ +import copy +import os +import pickle + +import cv2 +import mmcv +import numpy as np + +from mmdet3d.core.bbox import box_np_ops +from mmdet3d.datasets.pipelines import data_augment_utils +from ..registry import OBJECTSAMPLERS + + +class BatchSampler: + + def __init__(self, + sampled_list, + name=None, + epoch=None, + shuffle=True, + drop_reminder=False): + self._sampled_list = sampled_list + self._indices = np.arange(len(sampled_list)) + if shuffle: + np.random.shuffle(self._indices) + self._idx = 0 + self._example_num = len(sampled_list) + self._name = name + self._shuffle = shuffle + self._epoch = epoch + self._epoch_counter = 0 + self._drop_reminder = drop_reminder + + def _sample(self, num): + if self._idx + num >= self._example_num: + ret = self._indices[self._idx:].copy() + self._reset() + else: + ret = self._indices[self._idx:self._idx + num] + self._idx += num + return ret + + def _reset(self): + assert self._name is not None + # print("reset", self._name) + if self._shuffle: + np.random.shuffle(self._indices) + self._idx = 0 + + def sample(self, num): + indices = self._sample(num) + return [self._sampled_list[i] for i in indices] + + +@OBJECTSAMPLERS.register_module +class DataBaseSampler(object): + + def __init__(self, info_path, root_path, rate, prepare, object_rot_range, + sample_groups, use_road_plane): + super().__init__() + self.root_path = root_path + self.info_path = info_path + self.rate = rate + self.prepare = prepare + self.object_rot_range = object_rot_range + + with open(info_path, 'rb') as f: + db_infos = pickle.load(f) + + # filter database infos + from mmdet3d.apis import get_root_logger + logger = get_root_logger() + for k, v in db_infos.items(): + logger.info(f'load {len(v)} {k} database infos') + for prep_func, val in prepare.items(): + db_infos = getattr(self, prep_func)(db_infos, val) + logger.info('After filter database:') + for k, v in db_infos.items(): + logger.info(f'load {len(v)} {k} database infos') + + self.db_infos = db_infos + + # load sample groups + # TODO: more elegant way to load sample groups + self.sample_groups = [] + for name, num in sample_groups.items(): + self.sample_groups.append({name: int(num)}) + + self.group_db_infos = self.db_infos # just use db_infos + self.sample_classes = [] + self.sample_max_nums = [] + for group_info in self.sample_groups: + self.sample_classes += list(group_info.keys()) + self.sample_max_nums += list(group_info.values()) + + self.sampler_dict = {} + for k, v in self.group_db_infos.items(): + self.sampler_dict[k] = BatchSampler(v, k, shuffle=True) + + self.object_rot_range = object_rot_range + self.object_rot_enable = np.abs(self.object_rot_range[0] - + self.object_rot_range[1]) >= 1e-3 + + # TODO: No group_sampling currently + + @staticmethod + def filter_by_difficulty(db_infos, removed_difficulty): + new_db_infos = {} + for key, dinfos in db_infos.items(): + new_db_infos[key] = [ + info for info in dinfos + if info['difficulty'] not in removed_difficulty + ] + return new_db_infos + + @staticmethod + def 
filter_by_min_points(db_infos, min_gt_points_dict): + for name, min_num in min_gt_points_dict.items(): + min_num = int(min_num) + if min_num > 0: + filtered_infos = [] + for info in db_infos[name]: + if info['num_points_in_gt'] >= min_num: + filtered_infos.append(info) + db_infos[name] = filtered_infos + return db_infos + + def sample_all(self, gt_bboxes, gt_names, img=None): + sampled_num_dict = {} + sample_num_per_class = [] + for class_name, max_sample_num in zip(self.sample_classes, + self.sample_max_nums): + sampled_num = int(max_sample_num - + np.sum([n == class_name for n in gt_names])) + sampled_num = np.round(self.rate * sampled_num).astype(np.int64) + sampled_num_dict[class_name] = sampled_num + sample_num_per_class.append(sampled_num) + + sampled = [] + sampled_gt_bboxes = [] + avoid_coll_boxes = gt_bboxes + + for class_name, sampled_num in zip(self.sample_classes, + sample_num_per_class): + if sampled_num > 0: + sampled_cls = self.sample_class_v2(class_name, sampled_num, + avoid_coll_boxes) + + sampled += sampled_cls + if len(sampled_cls) > 0: + if len(sampled_cls) == 1: + sampled_gt_box = sampled_cls[0]['box3d_lidar'][ + np.newaxis, ...] + else: + sampled_gt_box = np.stack( + [s['box3d_lidar'] for s in sampled_cls], axis=0) + + sampled_gt_bboxes += [sampled_gt_box] + avoid_coll_boxes = np.concatenate( + [avoid_coll_boxes, sampled_gt_box], axis=0) + + ret = None + if len(sampled) > 0: + sampled_gt_bboxes = np.concatenate(sampled_gt_bboxes, axis=0) + # center = sampled_gt_bboxes[:, 0:3] + + num_sampled = len(sampled) + s_points_list = [] + count = 0 + for info in sampled: + file_path = os.path.join( + self.root_path, + info['path']) if self.root_path else info['path'] + s_points = np.fromfile( + file_path, dtype=np.float32).reshape([-1, 4]) + + if 'rot_transform' in info: + rot = info['rot_transform'] + s_points[:, :3] = box_np_ops.rotation_points_single_angle( + s_points[:, :3], rot, axis=2) + s_points[:, :3] += info['box3d_lidar'][:3] + + count += 1 + + s_points_list.append(s_points) + + ret = { + 'gt_names': + np.array([s['name'] for s in sampled]), + 'difficulty': + np.array([s['difficulty'] for s in sampled]), + 'gt_bboxes_3d': + sampled_gt_bboxes, + 'points': + np.concatenate(s_points_list, axis=0), + 'gt_masks': + np.ones((num_sampled, ), dtype=np.bool_), + 'group_ids': + np.arange(gt_bboxes.shape[0], + gt_bboxes.shape[0] + len(sampled)) + } + + return ret + + def sample_class_v2(self, name, num, gt_bboxes): + sampled = self.sampler_dict[name].sample(num) + sampled = copy.deepcopy(sampled) + num_gt = gt_bboxes.shape[0] + num_sampled = len(sampled) + gt_bboxes_bv = box_np_ops.center_to_corner_box2d( + gt_bboxes[:, 0:2], gt_bboxes[:, 3:5], gt_bboxes[:, 6]) + + sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0) + + valid_mask = np.zeros([gt_bboxes.shape[0]], dtype=np.bool_) + valid_mask = np.concatenate( + [valid_mask, + np.ones([sp_boxes.shape[0]], dtype=np.bool_)], axis=0) + boxes = np.concatenate([gt_bboxes, sp_boxes], axis=0).copy() + if self.object_rot_enable: + assert False, 'This part needs to be checked' + # place samples to any place in a circle. 
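+ # this branch never executes because of the assert above; note that
+ # self._global_rot_range used below is not defined in this class
+ # (presumably self.object_rot_range was intended)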
+ # TODO: rm it if not needed + data_augment_utils.noise_per_object_v3_( + boxes, + None, + valid_mask, + 0, + 0, + self._global_rot_range, + num_try=100) + + sp_boxes_new = boxes[gt_bboxes.shape[0]:] + sp_boxes_bv = box_np_ops.center_to_corner_box2d( + sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, 6]) + + total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0) + coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv) + diag = np.arange(total_bv.shape[0]) + coll_mat[diag, diag] = False + + valid_samples = [] + for i in range(num_gt, num_gt + num_sampled): + if coll_mat[i].any(): + coll_mat[i] = False + coll_mat[:, i] = False + else: + if self.object_rot_enable: + assert False, 'This part needs to be checked' + sampled[i - num_gt]['box3d_lidar'][:2] = boxes[i, :2] + sampled[i - num_gt]['box3d_lidar'][-1] = boxes[i, -1] + sampled[i - num_gt]['rot_transform'] = ( + boxes[i, -1] - sp_boxes[i - num_gt, -1]) + valid_samples.append(sampled[i - num_gt]) + return valid_samples + + +@OBJECTSAMPLERS.register_module +class MMDataBaseSampler(DataBaseSampler): + + def __init__(self, + info_path, + root_path, + rate, + prepare, + object_rot_range, + sample_groups, + check_2D_collision=False, + collision_thr=0, + collision_in_classes=False, + depth_consistent=False, + blending_type=None): + super(MMDataBaseSampler, self).__init__( + info_path=info_path, + root_path=root_path, + rate=rate, + prepare=prepare, + object_rot_range=object_rot_range, + sample_groups=sample_groups, + use_road_plane=False, + ) + self.blending_type = blending_type + self.depth_consistent = depth_consistent + self.check_2D_collision = check_2D_collision + self.collision_thr = collision_thr + self.collision_in_classes = collision_in_classes + + def sample_all(self, gt_bboxes_3d, gt_names, gt_bboxes_2d=None, img=None): + sampled_num_dict = {} + sample_num_per_class = [] + for class_name, max_sample_num in zip(self.sample_classes, + self.sample_max_nums): + sampled_num = int(max_sample_num - + np.sum([n == class_name for n in gt_names])) + sampled_num = np.round(self.rate * sampled_num).astype(np.int64) + sampled_num_dict[class_name] = sampled_num + sample_num_per_class.append(sampled_num) + + sampled = [] + sampled_gt_bboxes_3d = [] + sampled_gt_bboxes_2d = [] + avoid_coll_boxes_3d = gt_bboxes_3d + avoid_coll_boxes_2d = gt_bboxes_2d + + for class_name, sampled_num in zip(self.sample_classes, + sample_num_per_class): + if sampled_num > 0: + sampled_cls = self.sample_class_v2(class_name, sampled_num, + avoid_coll_boxes_3d, + avoid_coll_boxes_2d) + + sampled += sampled_cls + if len(sampled_cls) > 0: + if len(sampled_cls) == 1: + sampled_gt_box_3d = sampled_cls[0]['box3d_lidar'][ + np.newaxis, ...] + sampled_gt_box_2d = sampled_cls[0]['box2d_camera'][ + np.newaxis, ...] 
+ else: + sampled_gt_box_3d = np.stack( + [s['box3d_lidar'] for s in sampled_cls], axis=0) + sampled_gt_box_2d = np.stack( + [s['box2d_camera'] for s in sampled_cls], axis=0) + + sampled_gt_bboxes_3d += [sampled_gt_box_3d] + sampled_gt_bboxes_2d += [sampled_gt_box_2d] + if self.collision_in_classes: + # TODO: check whether check collision check among + # classes is necessary + avoid_coll_boxes_3d = np.concatenate( + [avoid_coll_boxes_3d, sampled_gt_box_3d], axis=0) + avoid_coll_boxes_2d = np.concatenate( + [avoid_coll_boxes_2d, sampled_gt_box_2d], axis=0) + + ret = None + if len(sampled) > 0: + sampled_gt_bboxes_3d = np.concatenate(sampled_gt_bboxes_3d, axis=0) + sampled_gt_bboxes_2d = np.concatenate(sampled_gt_bboxes_2d, axis=0) + + num_sampled = len(sampled) + s_points_list = [] + count = 0 + + if self.depth_consistent: + # change the paster order based on distance + center = sampled_gt_bboxes_3d[:, 0:3] + paste_order = np.argsort( + -np.power(np.sum(np.power(center, 2), axis=-1), 1 / 2), + axis=-1) + + for idx in range(len(sampled)): + if self.depth_consistent: + inds = np.where(paste_order == idx)[0][0] + info = sampled[inds] + else: + info = sampled[idx] + pcd_file_path = os.path.join( + self.root_path, + info['path']) if self.root_path else info['path'] + img_file_path = pcd_file_path + '.png' + mask_file_path = pcd_file_path + '.mask.png' + s_points = np.fromfile( + pcd_file_path, dtype=np.float32).reshape([-1, 4]) + s_patch = mmcv.imread(img_file_path) + s_mask = mmcv.imread(mask_file_path, 'grayscale') + + if 'rot_transform' in info: + rot = info['rot_transform'] + s_points[:, :3] = box_np_ops.rotation_points_single_angle( + s_points[:, :3], rot, axis=2) + # TODO: might need to rot 2d bbox in the future + + # the points of each sample already minus the object center + # so this time it needs to add the offset back + s_points[:, :3] += info['box3d_lidar'][:3] + img = self.paste_obj( + img, + s_patch, + s_mask, + bbox_2d=info['box2d_camera'].astype(np.int32)) + + count += 1 + s_points_list.append(s_points) + + ret = dict( + img=img, + gt_names=np.array([s['name'] for s in sampled]), + difficulty=np.array([s['difficulty'] for s in sampled]), + gt_bboxes_3d=sampled_gt_bboxes_3d, + gt_bboxes_2d=sampled_gt_bboxes_2d, + points=np.concatenate(s_points_list, axis=0), + gt_masks=np.ones((num_sampled, ), dtype=np.bool_), + group_ids=np.arange(gt_bboxes_3d.shape[0], + gt_bboxes_3d.shape[0] + len(sampled))) + + return ret + + def paste_obj(self, img, obj_img, obj_mask, bbox_2d): + # paste the image patch back + x1, y1, x2, y2 = bbox_2d + # the bbox might exceed the img size because the img is different + img_h, img_w = img.shape[:2] + w = np.maximum(min(x2, img_w - 1) - x1 + 1, 1) + h = np.maximum(min(y2, img_h - 1) - y1 + 1, 1) + obj_mask = obj_mask[:h, :w] + obj_img = obj_img[:h, :w] + + # choose a blend option + if not self.blending_type: + blending_op = 'none' + + else: + blending_choice = np.random.randint(len(self.blending_type)) + blending_op = self.blending_type[blending_choice] + + if blending_op.find('poisson') != -1: + # options: cv2.NORMAL_CLONE=1, or cv2.MONOCHROME_TRANSFER=3 + # cv2.MIXED_CLONE mixed the texture, thus is not used. 
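+ # in OpenCV these flags are cv2.NORMAL_CLONE == 1, cv2.MIXED_CLONE == 2
+ # and cv2.MONOCHROME_TRANSFER == 3, so np.random.choice([1, 3]) below
+ # picks between normal clone and monochrome transfer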
+ if blending_op == 'poisson': + mode = np.random.choice([1, 3], 1)[0] + elif blending_op == 'poisson_normal': + mode = cv2.NORMAL_CLONE + elif blending_op == 'poisson_transfer': + mode = cv2.MONOCHROME_TRANSFER + else: + raise NotImplementedError + center = (int(x1 + w / 2), int(y1 + h / 2)) + img = cv2.seamlessClone(obj_img, img, obj_mask * 255, center, mode) + else: + if blending_op == 'gaussian': + obj_mask = cv2.GaussianBlur( + obj_mask.astype(np.float32), (5, 5), 2) + elif blending_op == 'box': + obj_mask = cv2.blur(obj_mask.astype(np.float32), (3, 3)) + paste_mask = 1 - obj_mask + img[y1:y1 + h, + x1:x1 + w] = (img[y1:y1 + h, x1:x1 + w].astype(np.float32) * + paste_mask[..., None]).astype(np.uint8) + img[y1:y1 + h, x1:x1 + w] += (obj_img.astype(np.float32) * + obj_mask[..., None]).astype(np.uint8) + return img + + def sample_class_v2(self, name, num, gt_bboxes_3d, gt_bboxes_2d): + sampled = self.sampler_dict[name].sample(num) + sampled = copy.deepcopy(sampled) + num_gt = gt_bboxes_3d.shape[0] + num_sampled = len(sampled) + # avoid collision in BEV first + gt_bboxes_bv = box_np_ops.center_to_corner_box2d( + gt_bboxes_3d[:, 0:2], gt_bboxes_3d[:, 3:5], gt_bboxes_3d[:, 6]) + sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0) + sp_boxes_bv = box_np_ops.center_to_corner_box2d( + sp_boxes[:, 0:2], sp_boxes[:, 3:5], sp_boxes[:, 6]) + total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0) + coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv) + + # Then avoid collision in 2D space + if self.check_2D_collision: + sp_boxes_2d = np.stack([i['box2d_camera'] for i in sampled], + axis=0) + total_bbox_2d = np.concatenate([gt_bboxes_2d, sp_boxes_2d], + axis=0) # Nx4 + # random select a collision threshold + if isinstance(self.collision_thr, float): + collision_thr = self.collision_thr + elif isinstance(self.collision_thr, list): + collision_thr = np.random.choice(self.collision_thr) + elif isinstance(self.collision_thr, dict): + mode = self.collision_thr.get('mode', 'value') + if mode == 'value': + collision_thr = np.random.choice( + self.collision_thr['thr_range']) + elif mode == 'range': + collision_thr = np.random.uniform( + self.collision_thr['thr_range'][0], + self.collision_thr['thr_range'][1]) + + if collision_thr == 0: + # use similar collision test as BEV did + # Nx4 (x1, y1, x2, y2) -> corners: Nx4x2 + # ((x1, y1), (x2, y1), (x1, y2), (x2, y2)) + x1y1 = total_bbox_2d[:, :2] + x2y2 = total_bbox_2d[:, 2:] + x1y2 = np.stack([total_bbox_2d[:, 0], total_bbox_2d[:, 3]], + axis=-1) + x2y1 = np.stack([total_bbox_2d[:, 2], total_bbox_2d[:, 1]], + axis=-1) + total_2d = np.stack([x1y1, x2y1, x1y2, x2y2], axis=1) + coll_mat_2d = data_augment_utils.box_collision_test( + total_2d, total_2d) + else: + # use iof rather than iou to protect the foreground + overlaps = box_np_ops.iou_jit(total_bbox_2d, total_bbox_2d, + 'iof') + coll_mat_2d = overlaps > collision_thr + coll_mat = coll_mat + coll_mat_2d + + diag = np.arange(total_bv.shape[0]) + coll_mat[diag, diag] = False + + valid_samples = [] + for i in range(num_gt, num_gt + num_sampled): + if coll_mat[i].any(): + coll_mat[i] = False + coll_mat[:, i] = False + else: + valid_samples.append(sampled[i - num_gt]) + + return valid_samples diff --git a/mmdet3d/datasets/pipelines/formating.py b/mmdet3d/datasets/pipelines/formating.py new file mode 100644 index 0000000000..14eeaa96e3 --- /dev/null +++ b/mmdet3d/datasets/pipelines/formating.py @@ -0,0 +1,165 @@ +import numpy as np +from mmcv.parallel import DataContainer as DC 
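+ # DataContainer lets MMCV's collate and scatter treat fields differently:
+ # fields wrapped with stack=True are stacked into a batched tensor, while
+ # cpu_only=True fields (the img_meta dicts here) stay on CPU unstacked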
+ +from mmdet.datasets.pipelines import PIPELINES, to_tensor + +PIPELINES._module_dict.pop('DefaultFormatBundle') + + +@PIPELINES.register_module +class DefaultFormatBundle(object): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, + (3)to DataContainer (stack=True) + """ + + def __init__(self, ): + return + + def __call__(self, results): + if 'img' in results: + if isinstance(results['img'], list): + # process multiple imgs in single frame + imgs = [img.transpose(2, 0, 1) for img in results['img']] + imgs = np.ascontiguousarray(np.stack(imgs, axis=0)) + results['img'] = DC(to_tensor(imgs), stack=True) + else: + img = np.ascontiguousarray(results['img'].transpose(2, 0, 1)) + results['img'] = DC(to_tensor(img), stack=True) + for key in [ + 'proposals', 'gt_bboxes', 'gt_bboxes_3d', 'gt_bboxes_ignore', + 'gt_labels', 'gt_labels_3d' + ]: + if key not in results: + continue + if isinstance(results[key], list): + results[key] = DC([to_tensor(res) for res in results[key]]) + else: + results[key] = DC(to_tensor(results[key])) + if 'gt_masks' in results: + results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) + if 'gt_semantic_seg' in results: + results['gt_semantic_seg'] = DC( + to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) + return results + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module +class Collect3D(object): + + def __init__(self, + keys, + pcd_shape=[1, 1600, 1408], + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'pad_shape', 'scale_factor', 'flip', 'pcd_flip', + 'img_norm_cfg', 'rect', 'Trv2c', 'P2', 'pcd_trans', + 'sample_idx', 'pcd_scale_factor', 'pcd_rotation')): + self.keys = keys + self.meta_keys = meta_keys + self.pcd_shape = pcd_shape + + def __call__(self, results): + data = {} + img_meta = {} + for key in self.meta_keys: + if key in results: + img_meta[key] = results[key] + img_meta.update(pcd_shape=self.pcd_shape, pcd_pad_shape=self.pcd_shape) + data['img_meta'] = DC(img_meta, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + return self.__class__.__name__ + '(keys={}, meta_keys={})'.format( + self.keys, self.meta_keys) + + +@PIPELINES.register_module +class DefaultFormatBundle3D(DefaultFormatBundle): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields for voxels, + including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and + "gt_semantic_seg". + These fields are formatted as follows. 
+ + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + """ + + def __init__(self, class_names, with_gt=True, with_label=True): + super(DefaultFormatBundle3D, self).__init__() + self.class_names = class_names + self.with_gt = with_gt + self.with_label = with_label + + def __call__(self, results): + # Format 3D data + for key in [ + 'voxels', 'coors', 'voxel_centers', 'num_points', 'points' + ]: + if key not in results: + continue + results[key] = DC(to_tensor(results[key]), stack=False) + + if self.with_gt: + # Clean GT bboxes in the final + if 'gt_bboxes_3d_mask' in results: + gt_bboxes_3d_mask = results['gt_bboxes_3d_mask'] + results['gt_bboxes_3d'] = results['gt_bboxes_3d'][ + gt_bboxes_3d_mask] + results['gt_names_3d'] = results['gt_names_3d'][ + gt_bboxes_3d_mask] + if 'gt_bboxes_mask' in results: + gt_bboxes_mask = results['gt_bboxes_mask'] + if 'gt_bboxes' in results: + results['gt_bboxes'] = results['gt_bboxes'][gt_bboxes_mask] + results['gt_names'] = results['gt_names'][gt_bboxes_mask] + if self.with_label: + if 'gt_names' in results and len(results['gt_names']) == 0: + results['gt_labels'] = np.array([], dtype=np.int64) + elif 'gt_names' in results and isinstance( + results['gt_names'][0], list): + # gt_labels might be a list of list in multi-view setting + results['gt_labels'] = [ + np.array([self.class_names.index(n) for n in res], + dtype=np.int64) for res in results['gt_names'] + ] + elif 'gt_names' in results: + results['gt_labels'] = np.array([ + self.class_names.index(n) for n in results['gt_names'] + ], + dtype=np.int64) + # we still assume one pipeline for one frame LiDAR + # thus, the 3D name is list[string] + results['gt_labels_3d'] = np.array([ + self.class_names.index(n) for n in results['gt_names_3d'] + ], + dtype=np.int64) + results = super(DefaultFormatBundle3D, self).__call__(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += '(class_names={}, '.format(self.class_names) + repr_str += 'with_gt={}, with_label={})'.format( + self.with_gt, self.with_label) + return repr_str diff --git a/mmdet3d/datasets/pipelines/loading.py b/mmdet3d/datasets/pipelines/loading.py new file mode 100644 index 0000000000..31437faae4 --- /dev/null +++ b/mmdet3d/datasets/pipelines/loading.py @@ -0,0 +1,143 @@ +import os.path as osp + +import mmcv +import numpy as np +import pycocotools.mask as maskUtils + +from mmdet.datasets.pipelines import PIPELINES + + +@PIPELINES.register_module +class LoadImageFromFile(object): + + def __init__(self, to_float32=False): + self.to_float32 = to_float32 + + def __call__(self, results): + if results['img_prefix'] is not None: + filename = osp.join(results['img_prefix'], + results['img_info']['filename']) + else: + filename = results['img_info']['filename'] + img = mmcv.imread(filename) + if self.to_float32: + img = img.astype(np.float32) + results['filename'] = filename + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + return results + + def __repr__(self): + return self.__class__.__name__ + '(to_float32={})'.format( + self.to_float32) + + +@PIPELINES.register_module +class LoadAnnotations(object): + + def __init__(self, + with_bbox=True, + with_label=True, + with_mask=False, + with_seg=False, + poly2mask=True): + self.with_bbox = with_bbox + 
self.with_label = with_label + self.with_mask = with_mask + self.with_seg = with_seg + self.poly2mask = poly2mask + + def _load_bboxes(self, results): + ann_info = results['ann_info'] + results['gt_bboxes'] = ann_info['bboxes'] + + gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) + if gt_bboxes_ignore is not None: + results['gt_bboxes_ignore'] = gt_bboxes_ignore + results['bbox_fields'].append('gt_bboxes_ignore') + results['bbox_fields'].append('gt_bboxes') + return results + + def _load_labels(self, results): + results['gt_labels'] = results['ann_info']['labels'] + return results + + def _poly2mask(self, mask_ann, img_h, img_w): + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + def _load_masks(self, results): + h, w = results['img_info']['height'], results['img_info']['width'] + gt_masks = results['ann_info']['masks'] + if self.poly2mask: + gt_masks = [self._poly2mask(mask, h, w) for mask in gt_masks] + results['gt_masks'] = gt_masks + results['mask_fields'].append('gt_masks') + return results + + def _load_semantic_seg(self, results): + results['gt_semantic_seg'] = mmcv.imread( + osp.join(results['seg_prefix'], results['ann_info']['seg_map']), + flag='unchanged').squeeze() + results['seg_fields'].append('gt_semantic_seg') + return results + + def __call__(self, results): + if self.with_bbox: + results = self._load_bboxes(results) + if results is None: + return None + if self.with_label: + results = self._load_labels(results) + if self.with_mask: + results = self._load_masks(results) + if self.with_seg: + results = self._load_semantic_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += ('(with_bbox={}, with_label={}, with_mask={},' + ' with_seg={})').format(self.with_bbox, self.with_label, + self.with_mask, self.with_seg) + return repr_str + + +@PIPELINES.register_module +class LoadProposals(object): + + def __init__(self, num_max_proposals=None): + self.num_max_proposals = num_max_proposals + + def __call__(self, results): + proposals = results['proposals'] + if proposals.shape[1] not in (4, 5): + raise AssertionError( + 'proposals should have shapes (n, 4) or (n, 5), ' + 'but found {}'.format(proposals.shape)) + proposals = proposals[:, :4] + + if self.num_max_proposals is not None: + proposals = proposals[:self.num_max_proposals] + + if len(proposals) == 0: + proposals = np.array([[0, 0, 0, 0]], dtype=np.float32) + results['proposals'] = proposals + results['bbox_fields'].append('proposals') + return results + + def __repr__(self): + return self.__class__.__name__ + '(num_max_proposals={})'.format( + self.num_max_proposals) diff --git a/mmdet3d/datasets/pipelines/train_aug.py b/mmdet3d/datasets/pipelines/train_aug.py new file mode 100644 index 0000000000..cf8eb71de4 --- /dev/null +++ b/mmdet3d/datasets/pipelines/train_aug.py @@ -0,0 +1,326 @@ +import numpy as np + +from mmdet3d.core.bbox import box_np_ops +from mmdet3d.utils import build_from_cfg +from mmdet.datasets.registry import PIPELINES +from ..registry import OBJECTSAMPLERS +from .data_augment_utils import noise_per_object_v3_ +from .transforms import RandomFlip + + +@PIPELINES.register_module 
+class RandomFlip3D(RandomFlip): + """Flip the points & bbox. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + Args: + flip_ratio (float, optional): The flipping probability. + """ + + def __init__(self, sync_2d=True, **kwargs): + super(RandomFlip3D, self).__init__(**kwargs) + self.sync_2d = sync_2d + + def random_flip_points(self, gt_bboxes_3d, points): + gt_bboxes_3d[:, 1] = -gt_bboxes_3d[:, 1] + gt_bboxes_3d[:, 6] = -gt_bboxes_3d[:, 6] + np.pi + points[:, 1] = -points[:, 1] + if gt_bboxes_3d.shape[1] == 9: + # flip velocitys at the same time + gt_bboxes_3d[:, 8] = -gt_bboxes_3d[:, 8] + return gt_bboxes_3d, points + + def __call__(self, input_dict): + super(RandomFlip3D, self).__call__(input_dict) + if self.sync_2d: + input_dict['pcd_flip'] = input_dict['flip'] + else: + flip = True if np.random.rand() < self.flip_ratio else False + input_dict['pcd_flip'] = flip + if input_dict['pcd_flip']: + # flip image + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + points = input_dict['points'] + gt_bboxes_3d, points = self.random_flip_points( + gt_bboxes_3d, points) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['points'] = points + return input_dict + + +@PIPELINES.register_module +class ObjectSample(object): + + def __init__(self, db_sampler, sample_2d=False): + self.sampler_cfg = db_sampler + self.sample_2d = sample_2d + if 'type' not in db_sampler.keys(): + db_sampler['type'] = 'DataBaseSampler' + self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS) + + @staticmethod + def remove_points_in_boxes(points, boxes): + masks = box_np_ops.points_in_rbbox(points, boxes) + points = points[np.logical_not(masks.any(-1))] + return points + + def __call__(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_names_3d = input_dict['gt_names_3d'] + gt_bboxes_3d_mask = input_dict['gt_bboxes_3d_mask'] + # change to float for blending operation + points = input_dict['points'] + # rect = input_dict['rect'] + # Trv2c = input_dict['Trv2c'] + # P2 = input_dict['P2'] + if self.sample_2d: + img = input_dict['img'] # .astype(np.float32) + gt_bboxes_2d = input_dict['gt_bboxes'] + gt_bboxes_mask = input_dict['gt_bboxes_mask'] + gt_names = input_dict['gt_names'] + # Assume for now 3D & 2D bboxes are the same + sampled_dict = self.db_sampler.sample_all( + gt_bboxes_3d, gt_names_3d, gt_bboxes_2d=gt_bboxes_2d, img=img) + else: + sampled_dict = self.db_sampler.sample_all( + gt_bboxes_3d, gt_names_3d, img=None) + + if sampled_dict is not None: + sampled_gt_names = sampled_dict['gt_names'] + sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d'] + sampled_points = sampled_dict['points'] + sampled_gt_masks = sampled_dict['gt_masks'] + + gt_names_3d = np.concatenate([gt_names_3d, sampled_gt_names], + axis=0) + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, sampled_gt_bboxes_3d + ]).astype(np.float32) + gt_bboxes_3d_mask = np.concatenate( + [gt_bboxes_3d_mask, sampled_gt_masks], axis=0) + points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d) + # check the points dimension + dim_inds = points.shape[-1] + points = np.concatenate([sampled_points[:, :dim_inds], points], + axis=0) + + if self.sample_2d: + sampled_gt_bboxes_2d = sampled_dict['gt_bboxes_2d'] + gt_bboxes_2d = np.concatenate( + [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32) + gt_bboxes_mask = np.concatenate( + [gt_bboxes_mask, sampled_gt_masks], axis=0) + gt_names = np.concatenate([gt_names, sampled_gt_names], 
axis=0) + input_dict['gt_names'] = gt_names + input_dict['gt_bboxes'] = gt_bboxes_2d + input_dict['gt_bboxes_mask'] = gt_bboxes_mask + input_dict['img'] = sampled_dict['img'] # .astype(np.uint8) + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_names_3d'] = gt_names_3d + input_dict['points'] = points + input_dict['gt_bboxes_3d_mask'] = gt_bboxes_3d_mask + return input_dict + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module +class ObjectNoise(object): + + def __init__(self, + loc_noise_std=[0.25, 0.25, 0.25], + global_rot_range=[0.0, 0.0], + rot_uniform_noise=[-0.15707963267, 0.15707963267], + num_try=100): + self.loc_noise_std = loc_noise_std + self.global_rot_range = global_rot_range + self.rot_uniform_noise = rot_uniform_noise + self.num_try = num_try + + def __call__(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + points = input_dict['points'] + gt_bboxes_3d_mask = input_dict['gt_bboxes_3d_mask'] + # TODO: check this inplace function + noise_per_object_v3_( + gt_bboxes_3d, + points, + gt_bboxes_3d_mask, + rotation_perturb=self.rot_uniform_noise, + center_noise_std=self.loc_noise_std, + global_random_rot_range=self.global_rot_range, + num_try=self.num_try) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d.astype('float32') + input_dict['points'] = points + return input_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += '(num_try={},'.format(self.num_try) + repr_str += ' loc_noise_std={},'.format(self.loc_noise_std) + repr_str += ' global_rot_range={},'.format(self.global_rot_range) + repr_str += ' rot_uniform_noise={})'.format(self.rot_uniform_noise) + return repr_str + + +@PIPELINES.register_module +class GlobalRotScale(object): + + def __init__(self, + rot_uniform_noise=[-0.78539816, 0.78539816], + scaling_uniform_noise=[0.95, 1.05], + trans_normal_noise=[0, 0, 0]): + self.rot_uniform_noise = rot_uniform_noise + self.scaling_uniform_noise = scaling_uniform_noise + self.trans_normal_noise = trans_normal_noise + + def _trans_bbox_points(self, gt_boxes, points): + noise_trans = np.random.normal(0, self.trans_normal_noise[0], 3).T + points[:, :3] += noise_trans + gt_boxes[:, :3] += noise_trans + return gt_boxes, points, noise_trans + + def _rot_bbox_points(self, gt_boxes, points, rotation=np.pi / 4): + if not isinstance(rotation, list): + rotation = [-rotation, rotation] + noise_rotation = np.random.uniform(rotation[0], rotation[1]) + points[:, :3], rot_mat_T = box_np_ops.rotation_points_single_angle( + points[:, :3], noise_rotation, axis=2) + gt_boxes[:, :3], _ = box_np_ops.rotation_points_single_angle( + gt_boxes[:, :3], noise_rotation, axis=2) + gt_boxes[:, 6] += noise_rotation + if gt_boxes.shape[1] == 9: + # rotate velo vector + rot_cos = np.cos(noise_rotation) + rot_sin = np.sin(noise_rotation) + rot_mat_T_bev = np.array([[rot_cos, -rot_sin], [rot_sin, rot_cos]], + dtype=points.dtype) + gt_boxes[:, 7:9] = gt_boxes[:, 7:9] @ rot_mat_T_bev + return gt_boxes, points, rot_mat_T + + def _scale_bbox_points(self, + gt_boxes, + points, + min_scale=0.95, + max_scale=1.05): + noise_scale = np.random.uniform(min_scale, max_scale) + points[:, :3] *= noise_scale + gt_boxes[:, :6] *= noise_scale + if gt_boxes.shape[1] == 9: + gt_boxes[:, 7:] *= noise_scale + return gt_boxes, points, noise_scale + + def __call__(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + points = input_dict['points'] + + gt_bboxes_3d, points, rotation_factor = self._rot_bbox_points( + gt_bboxes_3d, points, 
rotation=self.rot_uniform_noise) + gt_bboxes_3d, points, scale_factor = self._scale_bbox_points( + gt_bboxes_3d, points, *self.scaling_uniform_noise) + gt_bboxes_3d, points, trans_factor = self._trans_bbox_points( + gt_bboxes_3d, points) + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d.astype('float32') + input_dict['points'] = points + input_dict['pcd_scale_factor'] = scale_factor + input_dict['pcd_rotation'] = rotation_factor + input_dict['pcd_trans'] = trans_factor + return input_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += '(rot_uniform_noise={},'.format(self.rot_uniform_noise) + repr_str += ' scaling_uniform_noise={},'.format( + self.scaling_uniform_noise) + repr_str += ' trans_normal_noise={})'.format(self.trans_normal_noise) + return repr_str + + +@PIPELINES.register_module +class PointShuffle(object): + + def __call__(self, input_dict): + np.random.shuffle(input_dict['points']) + return input_dict + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module +class ObjectRangeFilter(object): + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + self.bev_range = self.pcd_range[[0, 1, 3, 4]] + + @staticmethod + def limit_period(val, offset=0.5, period=np.pi): + return val - np.floor(val / period + offset) * period + + @staticmethod + def filter_gt_box_outside_range(gt_bboxes_3d, limit_range): + """remove gtbox outside training range. + this function should be applied after other prep functions + Args: + gt_bboxes_3d ([type]): [description] + limit_range ([type]): [description] + """ + gt_bboxes_3d_bv = box_np_ops.center_to_corner_box2d( + gt_bboxes_3d[:, [0, 1]], gt_bboxes_3d[:, [3, 3 + 1]], + gt_bboxes_3d[:, 6]) + bounding_box = box_np_ops.minmax_to_corner_2d( + np.asarray(limit_range)[np.newaxis, ...]) + ret = box_np_ops.points_in_convex_polygon_jit( + gt_bboxes_3d_bv.reshape(-1, 2), bounding_box) + return np.any(ret.reshape(-1, 4), axis=1) + + def __call__(self, input_dict): + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_names_3d = input_dict['gt_names_3d'] + gt_bboxes_3d_mask = input_dict['gt_bboxes_3d_mask'] + mask = self.filter_gt_box_outside_range(gt_bboxes_3d, self.bev_range) + gt_bboxes_3d = gt_bboxes_3d[mask] + gt_names_3d = gt_names_3d[mask] + # the mask should also be updated + gt_bboxes_3d_mask = gt_bboxes_3d_mask[mask] + + # limit rad to [-pi, pi] + gt_bboxes_3d[:, 6] = self.limit_period( + gt_bboxes_3d[:, 6], offset=0.5, period=2 * np.pi) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d.astype('float32') + input_dict['gt_names_3d'] = gt_names_3d + input_dict['gt_bboxes_3d_mask'] = gt_bboxes_3d_mask + return input_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += '(point_cloud_range={})'.format(self.pcd_range.tolist()) + return repr_str + + +@PIPELINES.register_module +class PointsRangeFilter(object): + + def __init__(self, point_cloud_range): + self.pcd_range = np.array( + point_cloud_range, dtype=np.float32)[np.newaxis, :] + + def __call__(self, input_dict): + points = input_dict['points'] + points_mask = ((points[:, :3] >= self.pcd_range[:, :3]) + & (points[:, :3] < self.pcd_range[:, 3:])) + points_mask = points_mask[:, 0] & points_mask[:, 1] & points_mask[:, 2] + clean_points = points[points_mask, :] + input_dict['points'] = clean_points + return input_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += '(point_cloud_range={})'.format(self.pcd_range.tolist()) + return repr_str diff --git 
a/mmdet3d/datasets/registry.py b/mmdet3d/datasets/registry.py new file mode 100644 index 0000000000..b1acde485c --- /dev/null +++ b/mmdet3d/datasets/registry.py @@ -0,0 +1,3 @@ +from mmdet3d.utils import Registry + +OBJECTSAMPLERS = Registry('object_sampler') diff --git a/mmdet3d/datasets/utils.py b/mmdet3d/datasets/utils.py new file mode 100644 index 0000000000..9e3a7a2fc3 --- /dev/null +++ b/mmdet3d/datasets/utils.py @@ -0,0 +1,37 @@ +from collections import Sequence + +import mmcv +import numpy as np +import torch + + +def remove_dontcare(image_anno): + img_filtered_annotations = {} + relevant_annotation_indices = [ + i for i, x in enumerate(image_anno['name']) if x != 'DontCare' + ] + for key in image_anno.keys(): + img_filtered_annotations[key] = ( + image_anno[key][relevant_annotation_indices]) + return img_filtered_annotations + + +def to_tensor(data): + # TODO: remove this duplicated method in the future + """Convert objects of various python types to :obj:`torch.Tensor`. + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + """ + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not mmcv.is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError('type {} cannot be converted to tensor.'.format( + type(data))) diff --git a/mmdet3d/models/__init__.py b/mmdet3d/models/__init__.py new file mode 100644 index 0000000000..4e2b48972e --- /dev/null +++ b/mmdet3d/models/__init__.py @@ -0,0 +1,21 @@ +from .anchor_heads import * # noqa: F401,F403 +from .backbones import * # noqa: F401,F403 +from .bbox_heads import * # noqa: F401,F403 +from .builder import (build_backbone, build_detector, build_head, build_loss, + build_neck, build_roi_extractor, build_shared_head) +from .detectors import * # noqa: F401,F403 +from .fusion_layers import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .middle_encoders import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .registry import (BACKBONES, DETECTORS, HEADS, LOSSES, MIDDLE_ENCODERS, + NECKS, ROI_EXTRACTORS, SHARED_HEADS, VOXEL_ENCODERS) +from .roi_extractors import * # noqa: F401,F403 +from .voxel_encoders import * # noqa: F401,F403 + +__all__ = [ + 'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'SHARED_HEADS', 'HEADS', 'LOSSES', + 'VOXEL_ENCODERS', 'MIDDLE_ENCODERS', 'DETECTORS', 'build_backbone', + 'build_neck', 'build_roi_extractor', 'build_shared_head', 'build_head', + 'build_loss', 'build_detector' +] diff --git a/mmdet3d/models/anchor_heads/__init__.py b/mmdet3d/models/anchor_heads/__init__.py new file mode 100644 index 0000000000..a86c226f0c --- /dev/null +++ b/mmdet3d/models/anchor_heads/__init__.py @@ -0,0 +1,4 @@ +from .boxvelo_head import Anchor3DVeloHead +from .second_head import SECONDHead + +__all__ = ['Anchor3DVeloHead', 'SECONDHead'] diff --git a/mmdet3d/models/anchor_heads/boxvelo_head.py b/mmdet3d/models/anchor_heads/boxvelo_head.py new file mode 100644 index 0000000000..d30d759784 --- /dev/null +++ b/mmdet3d/models/anchor_heads/boxvelo_head.py @@ -0,0 +1,224 @@ +import numpy as np +import torch +from mmcv.cnn import normal_init + +from mmdet3d.core import box_torch_ops, boxes3d_to_bev_torch_lidar +from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu +from ..registry import HEADS 
+from ..utils import bias_init_with_prob +from .second_head import SECONDHead + + +@HEADS.register_module +class Anchor3DVeloHead(SECONDHead): + """Anchor-based head for 3D anchor with velocity + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of channels of the feature map. + anchor_scales (Iterable): Anchor scales. + anchor_ratios (Iterable): Anchor aspect ratios. + anchor_strides (Iterable): Anchor strides. + anchor_base_sizes (Iterable): Anchor base sizes. + target_means (Iterable): Mean values of regression targets. + target_stds (Iterable): Std values of regression targets. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + """ # noqa: W605 + + def __init__(self, + class_names, + num_classes, + in_channels, + train_cfg, + test_cfg, + cache_anchor=False, + feat_channels=256, + use_direction_classifier=True, + encode_bg_as_zeros=False, + box_code_size=9, + anchor_generator=dict(type='AnchorGeneratorRange', ), + anchor_range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + anchor_custom_values=[0, 0], + assigner_per_size=False, + assign_per_class=False, + diff_rad_by_sin=True, + dir_offset=0, + dir_limit_offset=1, + target_means=(.0, .0, .0, .0), + target_stds=(1.0, 1.0, 1.0, 1.0), + bbox_coder=dict(type='ResidualCoder', ), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2)): + super().__init__(class_names, in_channels, train_cfg, test_cfg, + cache_anchor, feat_channels, use_direction_classifier, + encode_bg_as_zeros, box_code_size, anchor_generator, + anchor_range, anchor_strides, anchor_sizes, + anchor_rotations, anchor_custom_values, + assigner_per_size, assign_per_class, diff_rad_by_sin, + dir_offset, dir_limit_offset, target_means, + target_stds, bbox_coder, loss_cls, loss_bbox, + loss_dir) + self.num_classes = num_classes + # build head layers & losses + if not self.use_sigmoid_cls: + self.num_classes += 1 + self._init_layers() + + def init_weights(self): + # pass + # use the initialization when ready + bias_cls = bias_init_with_prob(0.01) + normal_init(self.conv_cls, std=0.01, bias=bias_cls) + normal_init(self.conv_reg, std=0.01) + + @staticmethod + def add_sin_difference(boxes1, boxes2): + # Caution: the 7th dim is the rotation, (last dim without velo) + rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( + boxes2[..., 6:7]) + rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[..., + 6:7]) + boxes1 = torch.cat( + [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1) + boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]], + dim=-1) + return boxes1, boxes2 + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + mlvl_anchors, + input_meta, + rescale=False): + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_dir_scores = [] + for cls_score, bbox_pred, dir_cls_pred, anchors in zip( + cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] + dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + + cls_score = cls_score.permute(1, 2, + 
0).reshape(-1, self.num_classes) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, + 0).reshape(-1, self.box_code_size) + + nms_pre = self.test_cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + dir_cls_score = dir_cls_score[topk_inds] + + bboxes = self.bbox_coder.decode_torch(anchors, bbox_pred, + self.target_means, + self.target_stds) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_dir_scores.append(dir_cls_score) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_bboxes_for_nms = boxes3d_to_bev_torch_lidar(mlvl_bboxes) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + + if self.use_sigmoid_cls: + # Add a dummy background class to the front when using sigmoid + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + + score_thr = self.test_cfg.get('score_thr', 0) + result = self.multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, + mlvl_scores, mlvl_dir_scores, score_thr, + self.test_cfg.max_per_img) + + result.update(dict(sample_idx=input_meta['sample_idx'])) + return result + + def multiclass_nms(self, mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores, + mlvl_dir_scores, score_thr, max_num): + # do multi class nms + # the fg class id range: [0, num_classes-1] + num_classes = mlvl_scores.shape[1] - 1 + bboxes = [] + scores = [] + labels = [] + dir_scores = [] + for i in range(0, num_classes): + # get bboxes and scores of this class + cls_inds = mlvl_scores[:, i] > score_thr + if not cls_inds.any(): + continue + _scores = mlvl_scores[cls_inds, i] + _bboxes_for_nms = mlvl_bboxes_for_nms[cls_inds, :] + if self.test_cfg.use_rotate_nms: + nms_func = nms_gpu + else: + nms_func = nms_normal_gpu + selected = nms_func(_bboxes_for_nms, _scores, + self.test_cfg.nms_thr) + + _mlvl_bboxes = mlvl_bboxes[cls_inds, :] + _mlvl_dir_scores = mlvl_dir_scores[cls_inds] + + if len(selected) > 0: + bboxes.append(_mlvl_bboxes[selected]) + scores.append(_scores[selected]) + dir_scores.append(_mlvl_dir_scores[selected]) + dir_rot = box_torch_ops.limit_period( + bboxes[-1][..., 6] - self.dir_offset, + self.dir_limit_offset, np.pi) + bboxes[-1][..., 6] = ( + dir_rot + self.dir_offset + + np.pi * dir_scores[-1].to(bboxes[-1].dtype)) + + cls_label = mlvl_bboxes.new_full((len(selected), ), + i, + dtype=torch.long) + labels.append(cls_label) + + if bboxes: + bboxes = torch.cat(bboxes, dim=0) + scores = torch.cat(scores, dim=0) + labels = torch.cat(labels, dim=0) + dir_scores = torch.cat(dir_scores, dim=0) + if bboxes.shape[0] > max_num: + _, inds = scores.sort(descending=True) + inds = inds[:max_num] + bboxes = bboxes[inds, :] + labels = labels[inds] + scores = scores[inds] + dir_scores = dir_scores[inds] + return dict( + box3d_lidar=bboxes.cpu(), + scores=scores.cpu(), + label_preds=labels.cpu(), + ) + else: + return dict( + box3d_lidar=mlvl_bboxes.new_zeros([0, + self.box_code_size]).cpu(), + scores=mlvl_bboxes.new_zeros([0]).cpu(), + label_preds=mlvl_bboxes.new_zeros([0, 4]).cpu(), + ) diff --git a/mmdet3d/models/anchor_heads/second_head.py b/mmdet3d/models/anchor_heads/second_head.py new file mode 100644 index 0000000000..fa59ffa717 --- /dev/null 
+++ b/mmdet3d/models/anchor_heads/second_head.py @@ -0,0 +1,405 @@ +from __future__ import division + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import normal_init + +from mmdet3d.core import (PseudoSampler, box_torch_ops, + boxes3d_to_bev_torch_lidar, build_anchor_generator, + build_assigner, build_bbox_coder, build_sampler, + multi_apply) +from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu +from ..builder import build_loss +from ..registry import HEADS +from ..utils import bias_init_with_prob +from .train_mixins import AnchorTrainMixin + + +@HEADS.register_module +class SECONDHead(nn.Module, AnchorTrainMixin): + """Anchor-based head (RPN, RetinaNet, SSD, etc.). + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of channels of the feature map. + anchor_scales (Iterable): Anchor scales. + anchor_ratios (Iterable): Anchor aspect ratios. + anchor_strides (Iterable): Anchor strides. + anchor_base_sizes (Iterable): Anchor base sizes. + target_means (Iterable): Mean values of regression targets. + target_stds (Iterable): Std values of regression targets. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + """ # noqa: W605 + + def __init__(self, + class_name, + in_channels, + train_cfg, + test_cfg, + cache_anchor=False, + feat_channels=256, + use_direction_classifier=True, + encode_bg_as_zeros=False, + box_code_size=7, + anchor_generator=dict(type='AnchorGeneratorRange'), + anchor_range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], + anchor_strides=[2], + anchor_sizes=[[1.6, 3.9, 1.56]], + anchor_rotations=[0, 1.57], + anchor_custom_values=[], + assigner_per_size=False, + assign_per_class=False, + diff_rad_by_sin=True, + dir_offset=0, + dir_limit_offset=1, + target_means=(.0, .0, .0, .0), + target_stds=(1.0, 1.0, 1.0, 1.0), + bbox_coder=dict(type='ResidualCoder'), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2)): + super().__init__() + self.in_channels = in_channels + self.num_classes = len(class_name) + self.feat_channels = feat_channels + self.diff_rad_by_sin = diff_rad_by_sin + self.use_direction_classifier = use_direction_classifier + # self.encode_background_as_zeros = encode_bg_as_zeros + self.box_code_size = box_code_size + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.bbox_coder = build_bbox_coder(bbox_coder) + self.assigner_per_size = assigner_per_size + self.assign_per_class = assign_per_class + self.dir_offset = dir_offset + self.dir_limit_offset = dir_limit_offset + + # build target assigner & sampler + if train_cfg is not None: + self.sampling = loss_cls['type'] not in ['FocalLoss', 'GHMC'] + if self.sampling: + self.bbox_sampler = build_sampler(train_cfg.sampler) + else: + self.bbox_sampler = PseudoSampler() + if isinstance(train_cfg.assigner, dict): + self.bbox_assigner = build_assigner(train_cfg.assigner) + elif isinstance(train_cfg.assigner, list): + self.bbox_assigner = [ + build_assigner(res) for res in train_cfg.assigner + ] + + # build anchor generator + self.anchor_range = anchor_range + self.anchor_rotations = anchor_rotations + self.anchor_strides = anchor_strides + self.anchor_sizes = anchor_sizes + self.target_means = target_means + self.target_stds = target_stds + self.anchor_generators = [] + # In 3D detection, the anchor stride is connected with 
anchor size + self.num_anchors = ( + len(self.anchor_rotations) * len(self.anchor_sizes)) + # if len(self.anchor_sizes) != self.anchor_strides: + # # this means different anchor in the same anchor strides + # anchor_sizes = [self.anchor_sizes] + for anchor_stride in self.anchor_strides: + anchor_generator.update( + anchor_ranges=anchor_range, + sizes=self.anchor_sizes, + stride=anchor_stride, + rotations=anchor_rotations, + custom_values=anchor_custom_values, + cache_anchor=cache_anchor) + self.anchor_generators.append( + build_anchor_generator(anchor_generator)) + + self._init_layers() + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if not self.use_sigmoid_cls: + self.num_classes += 1 + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.loss_dir = build_loss(loss_dir) + self.fp16_enabled = False + + def _init_layers(self): + self.cls_out_channels = self.num_anchors * self.num_classes + self.conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1) + self.conv_reg = nn.Conv2d(self.feat_channels, + self.num_anchors * self.box_code_size, 1) + if self.use_direction_classifier: + self.conv_dir_cls = nn.Conv2d(self.feat_channels, + self.num_anchors * 2, 1) + + def init_weights(self): + bias_cls = bias_init_with_prob(0.01) + normal_init(self.conv_cls, std=0.01, bias=bias_cls) + normal_init(self.conv_reg, std=0.01) + + def forward_single(self, x): + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + dir_cls_preds = None + if self.use_direction_classifier: + dir_cls_preds = self.conv_dir_cls(x) + return cls_score, bbox_pred, dir_cls_preds + + def forward(self, feats): + return multi_apply(self.forward_single, feats) + + def get_anchors(self, featmap_sizes, input_metas): + """Get anchors according to feature map sizes. + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + input_metas (list[dict]): contain pcd and img's meta info. 
+ Returns: + tuple: anchors of each image, valid flags of each image + """ + num_imgs = len(input_metas) + num_levels = len(featmap_sizes) + + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = [] + for i in range(num_levels): + anchors = self.anchor_generators[i].grid_anchors(featmap_sizes[i]) + if not self.assigner_per_size: + anchors = anchors.reshape(-1, anchors.size(-1)) + multi_level_anchors.append(anchors) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + return anchor_list + + def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, + label_weights, bbox_targets, bbox_weights, dir_targets, + dir_weights, num_total_samples): + # classification loss + if num_total_samples is None: + num_total_samples = int(cls_score.shape[0]) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.num_classes) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=num_total_samples) + + # regression loss + bbox_targets = bbox_targets.reshape(-1, self.box_code_size) + bbox_weights = bbox_weights.reshape(-1, self.box_code_size) + code_weight = self.train_cfg.get('code_weight', None) + + if code_weight: + bbox_weights = bbox_weights * bbox_weights.new_tensor(code_weight) + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, self.box_code_size) + if self.diff_rad_by_sin: + bbox_pred, bbox_targets = self.add_sin_difference( + bbox_pred, bbox_targets) + loss_bbox = self.loss_bbox( + bbox_pred, + bbox_targets, + bbox_weights, + avg_factor=num_total_samples) + + # direction classification loss + loss_dir = None + if self.use_direction_classifier: + dir_cls_preds = dir_cls_preds.permute(0, 2, 3, 1).reshape(-1, 2) + dir_targets = dir_targets.reshape(-1) + dir_weights = dir_weights.reshape(-1) + loss_dir = self.loss_dir( + dir_cls_preds, + dir_targets, + dir_weights, + avg_factor=num_total_samples) + + return loss_cls, loss_bbox, loss_dir + + @staticmethod + def add_sin_difference(boxes1, boxes2): + rad_pred_encoding = torch.sin(boxes1[..., -1:]) * torch.cos( + boxes2[..., -1:]) + rad_tg_encoding = torch.cos(boxes1[..., -1:]) * torch.sin(boxes2[..., + -1:]) + boxes1 = torch.cat([boxes1[..., :-1], rad_pred_encoding], dim=-1) + boxes2 = torch.cat([boxes2[..., :-1], rad_tg_encoding], dim=-1) + return boxes1, boxes2 + + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + gt_bboxes, + gt_labels, + input_metas, + gt_bboxes_ignore=None): + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == len(self.anchor_generators) + + anchor_list = self.get_anchors(featmap_sizes, input_metas) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.anchor_target_3d( + anchor_list, + gt_bboxes, + input_metas, + self.target_means, + self.target_stds, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + num_classes=self.num_classes, + label_channels=label_channels, + sampling=self.sampling) + + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + dir_targets_list, dir_weights_list, num_total_pos, + num_total_neg) = cls_reg_targets + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + + # num_total_samples = None + losses_cls, losses_bbox, losses_dir = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + dir_cls_preds, 
+ labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + dir_targets_list, + dir_weights_list, + num_total_samples=num_total_samples) + return dict( + loss_cls_3d=losses_cls, + loss_bbox_3d=losses_bbox, + loss_dir_3d=losses_dir) + + def get_bboxes(self, + cls_scores, + bbox_preds, + dir_cls_preds, + input_metas, + rescale=False): + assert len(cls_scores) == len(bbox_preds) + assert len(cls_scores) == len(dir_cls_preds) + num_levels = len(cls_scores) + + mlvl_anchors = [ + self.anchor_generators[i].grid_anchors( + cls_scores[i].size()[-2:]).reshape(-1, self.box_code_size) + for i in range(num_levels) + ] + result_list = [] + for img_id in range(len(input_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + dir_cls_pred_list = [ + dir_cls_preds[i][img_id].detach() for i in range(num_levels) + ] + + input_meta = input_metas[img_id] + proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, + dir_cls_pred_list, mlvl_anchors, + input_meta, rescale) + result_list.append(proposals) + return result_list + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + mlvl_anchors, + input_meta, + rescale=False): + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_dir_scores = [] + mlvl_bboxes_for_nms = [] + for cls_score, bbox_pred, dir_cls_pred, anchors in zip( + cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + if self.use_direction_classifier: + assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] + + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.num_classes) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, + 0).reshape(-1, self.box_code_size) + dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + + score_thr = self.test_cfg.get('score_thr', 0) + if score_thr > 0: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + max_scores, _ = scores[:, 1:].max(dim=1) + thr_inds = (max_scores >= score_thr) + anchors = anchors[thr_inds] + bbox_pred = bbox_pred[thr_inds] + scores = scores[thr_inds] + dir_cls_scores = dir_cls_score[thr_inds] + bboxes = self.bbox_coder.decode_torch(anchors, bbox_pred, + self.target_means, + self.target_stds) + bboxes_for_nms = boxes3d_to_bev_torch_lidar(bboxes) + mlvl_bboxes_for_nms.append(bboxes_for_nms) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_dir_scores.append(dir_cls_scores) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_bboxes_for_nms = torch.cat(mlvl_bboxes_for_nms) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + + if len(mlvl_scores) > 0: + mlvl_scores, mlvl_label_preds = mlvl_scores.max(dim=-1) + if self.test_cfg.use_rotate_nms: + nms_func = nms_gpu + else: + nms_func = nms_normal_gpu + selected = nms_func(mlvl_bboxes_for_nms, mlvl_scores, + self.test_cfg.nms_thr) + else: + selected = [] + + if len(selected) > 0: + selected_bboxes = mlvl_bboxes[selected] + selected_scores = mlvl_scores[selected] + selected_label_preds = mlvl_label_preds[selected] + selected_dir_scores = mlvl_dir_scores[selected] + dir_rot = box_torch_ops.limit_period( + selected_bboxes[..., -1] - self.dir_offset, + self.dir_limit_offset, np.pi) + 
selected_bboxes[..., -1] = ( + dir_rot + self.dir_offset + + np.pi * selected_dir_scores.to(selected_bboxes.dtype)) + + return dict( + box3d_lidar=selected_bboxes.cpu(), + scores=selected_scores.cpu(), + label_preds=selected_label_preds.cpu(), + sample_idx=input_meta['sample_idx'], + ) + + return dict( + box3d_lidar=mlvl_scores.new_zeros([0, 7]).cpu(), + scores=mlvl_scores.new_zeros([0]).cpu(), + label_preds=mlvl_scores.new_zeros([0, 4]).cpu(), + sample_idx=input_meta['sample_idx'], + ) diff --git a/mmdet3d/models/anchor_heads/train_mixins.py b/mmdet3d/models/anchor_heads/train_mixins.py new file mode 100644 index 0000000000..721a7c68db --- /dev/null +++ b/mmdet3d/models/anchor_heads/train_mixins.py @@ -0,0 +1,245 @@ +import numpy as np +import torch + +from mmdet3d.core import box_torch_ops, images_to_levels, multi_apply + + +class AnchorTrainMixin(object): + + def anchor_target_3d(self, + anchor_list, + gt_bboxes_list, + input_metas, + target_means, + target_stds, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + num_classes=1, + sampling=True): + """Compute regression and classification targets for anchors. + + Args: + anchor_list (list[list]): Multi level anchors of each image. + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image. + img_metas (list[dict]): Meta info of each image. + target_means (Iterable): Mean value of regression targets. + target_stds (Iterable): Std value of regression targets. + + Returns: + tuple + """ + num_imgs = len(input_metas) + assert len(anchor_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [ + anchors.view(-1, self.box_code_size).size(0) + for anchors in anchor_list[0] + ] + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + anchor_list[i] = torch.cat(anchor_list[i]) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + + (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, + all_dir_targets, all_dir_weights, pos_inds_list, + neg_inds_list) = multi_apply( + self.anchor_target_3d_single, + anchor_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + input_metas, + target_means=target_means, + target_stds=target_stds, + label_channels=label_channels, + num_classes=num_classes, + sampling=sampling) + + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. 
multiple levels + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + dir_targets_list = images_to_levels(all_dir_targets, num_level_anchors) + dir_weights_list = images_to_levels(all_dir_weights, num_level_anchors) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, dir_targets_list, dir_weights_list, + num_total_pos, num_total_neg) + + def anchor_target_3d_single(self, + anchors, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + input_meta, + target_means, + target_stds, + label_channels=1, + num_classes=1, + sampling=True): + if isinstance(self.bbox_assigner, list): + feat_size = anchors.size(0) * anchors.size(1) * anchors.size(2) + rot_angles = anchors.size(-2) + assert len(self.bbox_assigner) == anchors.size(-3) + (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], [] + current_anchor_num = 0 + for i, assigner in enumerate(self.bbox_assigner): + current_anchors = anchors[..., i, :, :].reshape( + -1, self.box_code_size) + current_anchor_num += current_anchors.size(0) + if self.assign_per_class: + gt_per_cls = (gt_labels == i) + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes[gt_per_cls, :], + gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta, + target_means, target_stds, label_channels, num_classes, + sampling) + else: + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes, gt_bboxes_ignore, + gt_labels, input_meta, target_means, target_stds, + label_channels, num_classes, sampling) + + (labels, label_weights, bbox_targets, bbox_weights, + dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets + total_labels.append(labels.reshape(feat_size, 1, rot_angles)) + total_label_weights.append( + label_weights.reshape(feat_size, 1, rot_angles)) + total_bbox_targets.append( + bbox_targets.reshape(feat_size, 1, rot_angles, + anchors.size(-1))) + total_bbox_weights.append( + bbox_weights.reshape(feat_size, 1, rot_angles, + anchors.size(-1))) + total_dir_targets.append( + dir_targets.reshape(feat_size, 1, rot_angles)) + total_dir_weights.append( + dir_weights.reshape(feat_size, 1, rot_angles)) + total_pos_inds.append(pos_inds) + total_neg_inds.append(neg_inds) + + total_labels = torch.cat(total_labels, dim=-2).reshape(-1) + total_label_weights = torch.cat( + total_label_weights, dim=-2).reshape(-1) + total_bbox_targets = torch.cat( + total_bbox_targets, dim=-3).reshape(-1, anchors.size(-1)) + total_bbox_weights = torch.cat( + total_bbox_weights, dim=-3).reshape(-1, anchors.size(-1)) + total_dir_targets = torch.cat( + total_dir_targets, dim=-2).reshape(-1) + total_dir_weights = torch.cat( + total_dir_weights, dim=-2).reshape(-1) + total_pos_inds = torch.cat(total_pos_inds, dim=0).reshape(-1) + total_neg_inds = torch.cat(total_neg_inds, dim=0).reshape(-1) + return (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) + else: + return self.anchor_target_single_assigner( + self.bbox_assigner, anchors, gt_bboxes, gt_bboxes_ignore, + gt_labels, input_meta, target_means, target_stds, + label_channels, num_classes, 
sampling) + + def anchor_target_single_assigner(self, + bbox_assigner, + anchors, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + input_meta, + target_means, + target_stds, + label_channels=1, + num_classes=1, + sampling=True): + anchors = anchors.reshape(-1, anchors.size(-1)) + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + dir_targets = anchors.new_zeros((anchors.shape[0]), dtype=torch.long) + dir_weights = anchors.new_zeros((anchors.shape[0]), dtype=torch.float) + labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + if len(gt_bboxes) > 0: + assign_result = bbox_assigner.assign(anchors, gt_bboxes, + gt_bboxes_ignore, gt_labels) + sampling_result = self.bbox_sampler.sample(assign_result, anchors, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + else: + pos_inds = torch.nonzero( + anchors.new_zeros((anchors.shape[0], ), dtype=torch.long) > 0 + ).squeeze(-1).unique() + neg_inds = torch.nonzero( + anchors.new_zeros((anchors.shape[0], ), dtype=torch.long) == + 0).squeeze(-1).unique() + + if gt_labels is not None: + labels += num_classes + if len(pos_inds) > 0: + pos_bbox_targets = self.bbox_coder.encode_torch( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes, + target_means, target_stds) + pos_dir_targets = get_direction_target( + sampling_result.pos_bboxes, + pos_bbox_targets, + self.dir_offset, + one_hot=False) + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + dir_targets[pos_inds] = pos_dir_targets + dir_weights[pos_inds] = 1.0 + + if gt_labels is None: + labels[pos_inds] = 1 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + return (labels, label_weights, bbox_targets, bbox_weights, dir_targets, + dir_weights, pos_inds, neg_inds) + + +def get_direction_target(anchors, + reg_targets, + dir_offset=0, + num_bins=2, + one_hot=True): + rot_gt = reg_targets[..., 6] + anchors[..., 6] + offset_rot = box_torch_ops.limit_period(rot_gt - dir_offset, 0, 2 * np.pi) + dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long() + dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) + if one_hot: + dir_targets = torch.zeros( + *list(dir_cls_targets.shape), + num_bins, + dtype=anchors.dtype, + device=dir_cls_targets.device) + dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0) + dir_cls_targets = dir_targets + return dir_cls_targets diff --git a/mmdet3d/models/backbones/__init__.py b/mmdet3d/models/backbones/__init__.py new file mode 100644 index 0000000000..f3070c1133 --- /dev/null +++ b/mmdet3d/models/backbones/__init__.py @@ -0,0 +1,4 @@ +from mmdet.models.backbone import SSDVGG, HRNet, ResNet, ResNetV1d, ResNeXt +from .second import SECOND + +__all__ = ['ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'SECOND'] diff --git a/mmdet3d/models/backbones/second.py b/mmdet3d/models/backbones/second.py new file mode 100644 index 0000000000..0f1e18eb31 --- /dev/null +++ b/mmdet3d/models/backbones/second.py @@ -0,0 +1,84 @@ +from functools import partial + +import torch.nn as nn +from mmcv.runner import load_checkpoint + +from ..registry import BACKBONES +from ..utils import build_norm_layer + + 
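+# Note: `Empty` below is an identity pass-through module. It is used in
+# place of the normalization layer when `norm_cfg` is None, so that the
+# block definition in SECOND keeps the same layout with or without BN.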
+class Empty(nn.Module): + + def __init__(self, *args, **kwargs): + super(Empty, self).__init__() + + def forward(self, *args, **kwargs): + if len(args) == 1: + return args[0] + elif len(args) == 0: + return None + return args + + +@BACKBONES.register_module +class SECOND(nn.Module): + """Compare with RPN, RPNV2 support arbitrary number of stage. + """ + + def __init__(self, + in_channels=128, + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + num_filters=[128, 128, 256], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01)): + super(SECOND, self).__init__() + assert len(layer_strides) == len(layer_nums) + assert len(num_filters) == len(layer_nums) + + if norm_cfg is not None: + Conv2d = partial(nn.Conv2d, bias=False) + else: + Conv2d = partial(nn.Conv2d, bias=True) + + in_filters = [in_channels, *num_filters[:-1]] + # note that when stride > 1, conv2d with same padding isn't + # equal to pad-conv2d. we should use pad-conv2d. + blocks = [] + + for i, layer_num in enumerate(layer_nums): + norm_layer = ( + build_norm_layer(norm_cfg, num_filters[i])[1] + if norm_cfg is not None else Empty) + block = [ + nn.ZeroPad2d(1), + Conv2d( + in_filters[i], num_filters[i], 3, stride=layer_strides[i]), + norm_layer, + nn.ReLU(inplace=True), + ] + for j in range(layer_num): + norm_layer = ( + build_norm_layer(norm_cfg, num_filters[i])[1] + if norm_cfg is not None else Empty) + block.append( + Conv2d(num_filters[i], num_filters[i], 3, padding=1)) + block.append(norm_layer) + block.append(nn.ReLU(inplace=True)) + + block = nn.Sequential(*block) + blocks.append(block) + + self.blocks = nn.ModuleList(blocks) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + from mmdet3d.apis import get_root_logger + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + + def forward(self, x): + outs = [] + for i in range(len(self.blocks)): + x = self.blocks[i](x) + outs.append(x) + return tuple(outs) diff --git a/mmdet3d/models/bbox_heads/__init__.py b/mmdet3d/models/bbox_heads/__init__.py new file mode 100644 index 0000000000..41998d7d17 --- /dev/null +++ b/mmdet3d/models/bbox_heads/__init__.py @@ -0,0 +1,8 @@ +from mmdet.models.bbox_heads import (BBoxHead, ConvFCBBoxHead, + DoubleConvFCBBoxHead, Shared2FCBBoxHead, + Shared4Conv1FCBBoxHead) + +__all__ = [ + 'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead', + 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead' +] diff --git a/mmdet3d/models/builder.py b/mmdet3d/models/builder.py new file mode 100644 index 0000000000..8d101b18cb --- /dev/null +++ b/mmdet3d/models/builder.py @@ -0,0 +1,56 @@ +from torch import nn + +from mmdet.models.registry import (BACKBONES, DETECTORS, HEADS, LOSSES, NECKS, + ROI_EXTRACTORS, SHARED_HEADS) +from ..utils import build_from_cfg +from .registry import FUSION_LAYERS, MIDDLE_ENCODERS, VOXEL_ENCODERS + + +def build(cfg, registry, default_args=None): + if isinstance(cfg, list): + modules = [ + build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg + ] + return nn.Sequential(*modules) + else: + return build_from_cfg(cfg, registry, default_args) + + +def build_backbone(cfg): + return build(cfg, BACKBONES) + + +def build_neck(cfg): + return build(cfg, NECKS) + + +def build_roi_extractor(cfg): + return build(cfg, ROI_EXTRACTORS) + + +def build_shared_head(cfg): + return build(cfg, SHARED_HEADS) + + +def build_head(cfg): + return build(cfg, HEADS) + + +def build_loss(cfg): + return build(cfg, LOSSES) + + +def build_detector(cfg, train_cfg=None, test_cfg=None): + return build(cfg, 
DETECTORS, dict(train_cfg=train_cfg, test_cfg=test_cfg)) + + +def build_voxel_encoder(cfg): + return build(cfg, VOXEL_ENCODERS) + + +def build_middle_encoder(cfg): + return build(cfg, MIDDLE_ENCODERS) + + +def build_fusion_layer(cfg): + return build(cfg, FUSION_LAYERS) diff --git a/mmdet3d/models/detectors/__init__.py b/mmdet3d/models/detectors/__init__.py new file mode 100644 index 0000000000..15fb21656e --- /dev/null +++ b/mmdet3d/models/detectors/__init__.py @@ -0,0 +1,14 @@ +from .base import BaseDetector +from .mvx_faster_rcnn import (DynamicMVXFasterRCNN, DynamicMVXFasterRCNNV2, + DynamicMVXFasterRCNNV3) +from .mvx_single_stage import MVXSingleStageDetector +from .mvx_two_stage import MVXTwoStageDetector +from .single_stage import SingleStageDetector +from .two_stage import TwoStageDetector +from .voxelnet import DynamicVoxelNet, VoxelNet + +__all__ = [ + 'BaseDetector', 'SingleStageDetector', 'VoxelNet', 'DynamicVoxelNet', + 'TwoStageDetector', 'MVXSingleStageDetector', 'MVXTwoStageDetector', + 'DynamicMVXFasterRCNN', 'DynamicMVXFasterRCNNV2', 'DynamicMVXFasterRCNNV3' +] diff --git a/mmdet3d/models/detectors/base.py b/mmdet3d/models/detectors/base.py new file mode 100644 index 0000000000..83df170c51 --- /dev/null +++ b/mmdet3d/models/detectors/base.py @@ -0,0 +1,110 @@ +from abc import ABCMeta, abstractmethod + +import torch.nn as nn + + +class BaseDetector(nn.Module, metaclass=ABCMeta): + """Base class for detectors""" + + def __init__(self): + super(BaseDetector, self).__init__() + self.fp16_enabled = False + + @property + def with_neck(self): + return hasattr(self, 'neck') and self.neck is not None + + @property + def with_voxel_encoder(self): + return hasattr(self, + 'voxel_encoder') and self.voxel_encoder is not None + + @property + def with_middle_encoder(self): + return hasattr(self, + 'middle_encoder') and self.middle_encoder is not None + + @property + def with_shared_head(self): + return hasattr(self, 'shared_head') and self.shared_head is not None + + @property + def with_bbox(self): + return hasattr(self, 'bbox_head') and self.bbox_head is not None + + @property + def with_mask(self): + return hasattr(self, 'mask_head') and self.mask_head is not None + + @abstractmethod + def extract_feat(self, imgs): + pass + + def extract_feats(self, imgs): + assert isinstance(imgs, list) + for img in imgs: + yield self.extract_feat(img) + + @abstractmethod + def forward_train(self, **kwargs): + pass + + @abstractmethod + def simple_test(self, **kwargs): + pass + + @abstractmethod + def aug_test(self, **kwargs): + pass + + def init_weights(self, pretrained=None): + if pretrained is not None: + from mmdet3d.apis import get_root_logger + logger = get_root_logger() + logger.info('load model from: {}'.format(pretrained)) + + def forward_test(self, imgs, img_metas, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_meta (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) 
and the inner list indicates + images in a batch + """ + for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError( + 'num of augmentations ({}) != num of image meta ({})'.format( + len(imgs), len(img_metas))) + # TODO: remove the restriction of imgs_per_gpu == 1 when prepared + imgs_per_gpu = imgs[0].size(0) + assert imgs_per_gpu == 1 + + if num_augs == 1: + return self.simple_test(imgs[0], img_metas[0], **kwargs) + else: + return self.aug_test(imgs, img_metas, **kwargs) + + def forward(self, img, img_meta, return_loss=True, **kwargs): + """ + Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + """ + + # TODO: current version only support 2D detector now, find + # a better way to be compatible with both + if return_loss: + return self.forward_train(img, img_meta, **kwargs) + else: + return self.forward_test(img, img_meta, **kwargs) diff --git a/mmdet3d/models/detectors/mvx_faster_rcnn.py b/mmdet3d/models/detectors/mvx_faster_rcnn.py new file mode 100644 index 0000000000..34cdb420bc --- /dev/null +++ b/mmdet3d/models/detectors/mvx_faster_rcnn.py @@ -0,0 +1,103 @@ +import torch +import torch.nn.functional as F + +from mmdet.models.registry import DETECTORS +from .mvx_two_stage import MVXTwoStageDetector + + +@DETECTORS.register_module +class DynamicMVXFasterRCNN(MVXTwoStageDetector): + + def __init__(self, **kwargs): + super(DynamicMVXFasterRCNN, self).__init__(**kwargs) + + def extract_pts_feat(self, points, img_feats, img_meta): + if not self.with_pts_bbox: + return None + voxels, coors = self.voxelize(points) + # adopt an early fusion strategy + if self.with_fusion: + voxels = self.pts_fusion_layer(img_feats, points, voxels, img_meta) + voxel_features, feature_coors = self.pts_voxel_encoder(voxels, coors) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + @torch.no_grad() + def voxelize(self, points): + coors = [] + # dynamic voxelization only provide a coors mapping + for res in points: + res_coors = self.pts_voxel_layer(res) + coors.append(res_coors) + points = torch.cat(points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return points, coors_batch + + +@DETECTORS.register_module +class DynamicMVXFasterRCNNV2(DynamicMVXFasterRCNN): + + def __init__(self, **kwargs): + super(DynamicMVXFasterRCNNV2, self).__init__(**kwargs) + + def extract_pts_feat(self, points, img_feats, img_meta): + if not self.with_pts_bbox: + return None + voxels, coors = self.voxelize(points) + voxel_features, feature_coors = self.pts_voxel_encoder( + voxels, coors, points, img_feats, img_meta) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + 
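+# Illustrative sketch (added for clarity, not part of the original patch):
+# how the dynamic variants above consume ``voxelize``. The tensor shapes are
+# assumptions for a batch of two point clouds with 4 features per point.
+#
+#   points = [cloud_a, cloud_b]            # e.g. (1000, 4) and (800, 4)
+#   pts, coors = self.voxelize(points)
+#   # pts:   (1800, 4)  all points concatenated over the batch
+#   # coors: (1800, 4)  (batch_idx, z, y, x) after F.pad(..., value=i)
+#   feats, feat_coors = self.pts_voxel_encoder(pts, coors)
+#   batch_size = coors[-1, 0] + 1          # assumes coors ordered by batch
+
+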
+@DETECTORS.register_module +class MVXFasterRCNNV2(MVXTwoStageDetector): + + def __init__(self, **kwargs): + super(MVXFasterRCNNV2, self).__init__(**kwargs) + + def extract_pts_feat(self, pts, img_feats, img_meta): + if not self.with_pts_bbox: + return None + voxels, num_points, coors = self.voxelize(pts) + voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, + img_feats, img_meta) + + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, coors, batch_size) + x = self.pts_backbone(x) + + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + +@DETECTORS.register_module +class DynamicMVXFasterRCNNV3(DynamicMVXFasterRCNN): + + def __init__(self, **kwargs): + super(DynamicMVXFasterRCNNV3, self).__init__(**kwargs) + + def extract_pts_feat(self, points, img_feats, img_meta): + if not self.with_pts_bbox: + return None + voxels, coors = self.voxelize(points) + voxel_features, feature_coors = self.pts_voxel_encoder(voxels, coors) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x, coors, points, img_feats, img_meta) + return x diff --git a/mmdet3d/models/detectors/mvx_single_stage.py b/mmdet3d/models/detectors/mvx_single_stage.py new file mode 100644 index 0000000000..5bb7890d7a --- /dev/null +++ b/mmdet3d/models/detectors/mvx_single_stage.py @@ -0,0 +1,330 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet3d.ops import Voxelization +from mmdet.models.registry import DETECTORS +from .. import builder +from .base import BaseDetector + + +@DETECTORS.register_module +class MVXSingleStageDetector(BaseDetector): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + fusion_layer, + img_backbone, + pts_backbone, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(MVXSingleStageDetector, self).__init__() + self.voxel_layer = Voxelization(**voxel_layer) + self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) + self.middle_encoder = builder.build_middle_encoder(middle_encoder) + self.pts_backbone = builder.build_backbone(pts_backbone) + + if fusion_layer: + self.fusion_layer = builder.build_fusion_layer(fusion_layer) + if img_backbone: + self.img_backbone = builder.build_backbone(img_backbone) + + pts_bbox_head.update(train_cfg=train_cfg) + pts_bbox_head.update(test_cfg=test_cfg) + self.pts_bbox_head = builder.build_head(pts_bbox_head) + if img_neck is not None: + self.img_neck = builder.build_neck(img_neck) + if pts_neck is not None: + self.pts_neck = builder.build_neck(pts_neck) + if img_bbox_head is not None: + self.img_bbox_head = builder.build_head(img_bbox_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + super(MVXSingleStageDetector, self).init_weights(pretrained) + if self.with_img_backbone: + self.img_backbone.init_weights(pretrained=pretrained) + if self.with_img_neck: + if isinstance(self.img_neck, nn.Sequential): + for m in self.img_neck: + m.init_weights() + else: + self.img_neck.init_weights() + if self.with_img_bbox: + self.img_bbox_head.init_weights() + if self.with_pts_bbox: + self.pts_bbox_head.init_weights() + + @property + def with_pts_bbox(self): + return hasattr(self, + 'pts_bbox_head') and self.pts_bbox_head is not None + + @property + def with_img_bbox(self): + 
return hasattr(self, + 'img_bbox_head') and self.img_bbox_head is not None + + @property + def with_img_backbone(self): + return hasattr(self, 'img_backbone') and self.img_backbone is not None + + @property + def with_fusion(self): + return hasattr(self, 'fusion_layer') and self.fusion_layer is not None + + @property + def with_img_neck(self): + return hasattr(self, 'img_neck') and self.img_neck is not None + + @property + def with_pts_neck(self): + return hasattr(self, 'pts_neck') and self.pts_neck is not None + + def extract_feat(self, points, img, img_meta): + if self.with_img_backbone: + img_feats = self.img_backbone(img) + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + voxels, num_points, coors = self.voxelize(points) + voxel_features = self.voxel_encoder(voxels, num_points, coors) + batch_size = coors[-1, 0] + 1 + x = self.middle_encoder(voxel_features, coors, batch_size) + x = self.pts_backbone(x) + if self.with_neck: + x = self.pts_neck(x) + return x + + @torch.no_grad() + def voxelize(self, points): + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train(self, + points, + img_meta, + gt_bboxes_3d, + gt_labels, + img=None, + gt_bboxes_ignore=None): + x = self.extract_feat(points, img=img, img_meta=img_meta) + outs = self.pts_bbox_head(x) + loss_inputs = outs + (gt_bboxes_3d, gt_labels, img_meta) + losses = self.pts_bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def forward_test(self, **kwargs): + return self.simple_test(**kwargs) + + def forward(self, return_loss=True, **kwargs): + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def simple_test(self, + points, + img_meta, + img=None, + gt_bboxes_3d=None, + rescale=False): + x = self.extract_feat(points, img, img_meta) + outs = self.pts_bbox_head(x) + bbox_inputs = outs + (img_meta, rescale) + bbox_list = self.pts_bbox_head.get_bboxes(*bbox_inputs) + return bbox_list + + def aug_test(self, points, imgs, img_metas, rescale=False): + raise NotImplementedError + + +@DETECTORS.register_module +class DynamicMVXNet(MVXSingleStageDetector): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + pts_backbone, + fusion_layer=None, + img_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(DynamicMVXNet, self).__init__( + voxel_layer=voxel_layer, + voxel_encoder=voxel_encoder, + middle_encoder=middle_encoder, + img_backbone=img_backbone, + fusion_layer=fusion_layer, + pts_backbone=pts_backbone, + pts_neck=pts_neck, + img_neck=img_neck, + img_bbox_head=img_bbox_head, + pts_bbox_head=pts_bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + ) + + def extract_feat(self, points, img, img_meta): + if self.with_img_backbone: + img_feats = self.img_backbone(img) + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + voxels, coors = self.voxelize(points) + # adopt an early fusion strategy + if 
self.with_fusion: + voxels = self.fusion_layer(img_feats, points, voxels, img_meta) + + voxel_features, feature_coors = self.voxel_encoder(voxels, coors) + batch_size = coors[-1, 0] + 1 + x = self.middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + @torch.no_grad() + def voxelize(self, points): + coors = [] + # dynamic voxelization only provide a coors mapping + for res in points: + res_coors = self.voxel_layer(res) + coors.append(res_coors) + points = torch.cat(points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return points, coors_batch + + +@DETECTORS.register_module +class DynamicMVXNetV2(DynamicMVXNet): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + pts_backbone, + fusion_layer=None, + img_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(DynamicMVXNetV2, self).__init__( + voxel_layer=voxel_layer, + voxel_encoder=voxel_encoder, + middle_encoder=middle_encoder, + img_backbone=img_backbone, + fusion_layer=fusion_layer, + pts_backbone=pts_backbone, + pts_neck=pts_neck, + img_neck=img_neck, + img_bbox_head=img_bbox_head, + pts_bbox_head=pts_bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + ) + + def extract_feat(self, points, img, img_meta): + if self.with_img_backbone: + img_feats = self.img_backbone(img) + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + voxels, coors = self.voxelize(points) + + voxel_features, feature_coors = self.voxel_encoder( + voxels, coors, points, img_feats, img_meta) + batch_size = coors[-1, 0] + 1 + x = self.middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + +@DETECTORS.register_module +class DynamicMVXNetV3(DynamicMVXNet): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + pts_backbone, + fusion_layer=None, + img_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(DynamicMVXNetV3, self).__init__( + voxel_layer=voxel_layer, + voxel_encoder=voxel_encoder, + middle_encoder=middle_encoder, + img_backbone=img_backbone, + fusion_layer=fusion_layer, + pts_backbone=pts_backbone, + pts_neck=pts_neck, + img_neck=img_neck, + img_bbox_head=img_bbox_head, + pts_bbox_head=pts_bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + ) + + def extract_feat(self, points, img, img_meta): + if self.with_img_backbone: + img_feats = self.img_backbone(img) + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + voxels, coors = self.voxelize(points) + voxel_features, feature_coors = self.voxel_encoder(voxels, coors) + batch_size = coors[-1, 0] + 1 + x = self.middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x, coors, points, img_feats, img_meta) + return x diff --git a/mmdet3d/models/detectors/mvx_two_stage.py b/mmdet3d/models/detectors/mvx_two_stage.py new file mode 100644 index 0000000000..b085c632a2 --- /dev/null +++ b/mmdet3d/models/detectors/mvx_two_stage.py @@ -0,0 +1,376 @@ +import torch +import torch.nn as nn 
+import torch.nn.functional as F + +from mmdet3d.core import (bbox2result_coco, bbox2roi, build_assigner, + build_sampler) +from mmdet3d.ops import Voxelization +from mmdet.models.registry import DETECTORS +from .. import builder +from .base import BaseDetector +from .test_mixins import BBoxTestMixin, RPNTestMixin + + +@DETECTORS.register_module +class MVXTwoStageDetector(BaseDetector, RPNTestMixin, BBoxTestMixin): + + def __init__(self, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_bbox_head=None, + img_shared_head=None, + img_rpn_head=None, + img_bbox_roi_extractor=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(MVXTwoStageDetector, self).__init__() + + if pts_voxel_layer: + self.pts_voxel_layer = Voxelization(**pts_voxel_layer) + if pts_voxel_encoder: + self.pts_voxel_encoder = builder.build_voxel_encoder( + pts_voxel_encoder) + if pts_middle_encoder: + self.pts_middle_encoder = builder.build_middle_encoder( + pts_middle_encoder) + if pts_backbone: + self.pts_backbone = builder.build_backbone(pts_backbone) + if pts_fusion_layer: + self.pts_fusion_layer = builder.build_fusion_layer( + pts_fusion_layer) + if pts_neck is not None: + self.pts_neck = builder.build_neck(pts_neck) + if pts_bbox_head: + pts_train_cfg = train_cfg.pts if train_cfg else None + pts_bbox_head.update(train_cfg=pts_train_cfg) + pts_test_cfg = test_cfg.pts if test_cfg else None + pts_bbox_head.update(test_cfg=pts_test_cfg) + self.pts_bbox_head = builder.build_head(pts_bbox_head) + + if img_backbone: + self.img_backbone = builder.build_backbone(img_backbone) + if img_neck is not None: + self.img_neck = builder.build_neck(img_neck) + if img_shared_head is not None: + self.img_shared_head = builder.build_shared_head(img_shared_head) + if img_rpn_head is not None: + self.img_rpn_head = builder.build_head(img_rpn_head) + if img_bbox_head is not None: + self.img_bbox_roi_extractor = builder.build_roi_extractor( + img_bbox_roi_extractor) + self.img_bbox_head = builder.build_head(img_bbox_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + super(MVXTwoStageDetector, self).init_weights(pretrained) + if self.with_img_backbone: + self.img_backbone.init_weights(pretrained=pretrained) + if self.with_img_neck: + if isinstance(self.img_neck, nn.Sequential): + for m in self.img_neck: + m.init_weights() + else: + self.img_neck.init_weights() + if self.with_shared_head: + self.img_shared_head.init_weights(pretrained=pretrained) + if self.with_img_rpn: + self.img_rpn_head.init_weights() + if self.with_img_bbox: + self.img_bbox_roi_extractor.init_weights() + self.img_bbox_head.init_weights() + if self.with_pts_bbox: + self.pts_bbox_head.init_weights() + + @property + def with_img_shared_head(self): + return hasattr(self, + 'img_shared_head') and self.img_shared_head is not None + + @property + def with_pts_bbox(self): + return hasattr(self, + 'pts_bbox_head') and self.pts_bbox_head is not None + + @property + def with_img_bbox(self): + return hasattr(self, + 'img_bbox_head') and self.img_bbox_head is not None + + @property + def with_img_backbone(self): + return hasattr(self, 'img_backbone') and self.img_backbone is not None + + @property + def with_fusion(self): + return hasattr(self, + 'pts_fusion_layer') and self.fusion_layer is not None + + @property + def 
with_img_neck(self): + return hasattr(self, 'img_neck') and self.img_neck is not None + + @property + def with_pts_neck(self): + return hasattr(self, 'pts_neck') and self.pts_neck is not None + + @property + def with_img_rpn(self): + return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None + + def extract_img_feat(self, img, img_meta): + if self.with_img_backbone: + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.view(B * N, C, H, W) + img_feats = self.img_backbone(img) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + if torch.isnan(img_feats[0]).any(): + import pdb + pdb.set_trace() + return img_feats + + def extract_pts_feat(self, pts, img_feats, img_meta): + if not self.with_pts_bbox: + return None + voxels, num_points, coors = self.voxelize(pts) + voxel_features = self.pts_voxel_encoder(voxels, num_points, coors) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + def extract_feat(self, points, img, img_meta): + img_feats = self.extract_img_feat(img, img_meta) + pts_feats = self.extract_pts_feat(points, img_feats, img_meta) + return (img_feats, pts_feats) + + @torch.no_grad() + def voxelize(self, points): + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train(self, + points=None, + img_meta=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None): + img_feats, pts_feats = self.extract_feat( + points, img=img, img_meta=img_meta) + losses = dict() + if pts_feats: + losses_pts = self.forward_pts_train(pts_feats, gt_bboxes_3d, + gt_labels_3d, img_meta, + gt_bboxes_ignore) + losses.update(losses_pts) + if img_feats: + losses_img = self.forward_img_train( + img_feats, + img_meta=img_meta, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_bboxes_ignore=gt_bboxes_ignore, + proposals=proposals, + ) + losses.update(losses_img) + return losses + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + img_meta, + gt_bboxes_ignore=None): + outs = self.pts_bbox_head(pts_feats) + loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_meta) + losses = self.pts_bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def forward_img_train(self, + x, + img_meta, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + proposals=None): + losses = dict() + # RPN forward and loss + if self.with_img_rpn: + rpn_outs = self.img_rpn_head(x) + rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, + self.train_cfg.img_rpn) + rpn_losses = self.img_rpn_head.loss( + *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + losses.update(rpn_losses) + + proposal_cfg = self.train_cfg.get('img_rpn_proposal', + self.test_cfg.img_rpn) + proposal_inputs = rpn_outs + (img_meta, proposal_cfg) + proposal_list = 
self.img_rpn_head.get_bboxes(*proposal_inputs) + else: + proposal_list = proposals + + # assign gts and sample proposals + if self.with_img_bbox: + bbox_assigner = build_assigner(self.train_cfg.img_rcnn.assigner) + bbox_sampler = build_sampler( + self.train_cfg.img_rcnn.sampler, context=self) + num_imgs = len(img_meta) + if gt_bboxes_ignore is None: + gt_bboxes_ignore = [None for _ in range(num_imgs)] + sampling_results = [] + for i in range(num_imgs): + assign_result = bbox_assigner.assign(proposal_list[i], + gt_bboxes[i], + gt_bboxes_ignore[i], + gt_labels[i]) + sampling_result = bbox_sampler.sample( + assign_result, + proposal_list[i], + gt_bboxes[i], + gt_labels[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + if self.with_img_bbox: + rois = bbox2roi([res.bboxes for res in sampling_results]) + # TODO: a more flexible way to decide which feature maps to use + bbox_feats = self.img_bbox_roi_extractor( + x[:self.img_bbox_roi_extractor.num_inputs], rois) + if self.with_shared_head: + bbox_feats = self.img_shared_head(bbox_feats) + cls_score, bbox_pred = self.img_bbox_head(bbox_feats) + + bbox_targets = self.img_bbox_head.get_target( + sampling_results, gt_bboxes, gt_labels, + self.train_cfg.img_rcnn) + loss_bbox = self.img_bbox_head.loss(cls_score, bbox_pred, + *bbox_targets) + losses.update(loss_bbox) + + return losses + + def forward_test(self, **kwargs): + return self.simple_test(**kwargs) + + def forward(self, return_loss=True, **kwargs): + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def simple_test_img(self, x, img_meta, proposals=None, rescale=False): + """Test without augmentation.""" + if proposals is None: + proposal_list = self.simple_test_rpn(x, img_meta, + self.test_cfg.img_rpn) + else: + proposal_list = proposals + + det_bboxes, det_labels = self.simple_test_bboxes( + x, + img_meta, + proposal_list, + self.test_cfg.img_rcnn, + rescale=rescale) + bbox_results = bbox2result_coco(det_bboxes, det_labels, + self.img_bbox_head.num_classes) + + return bbox_results + + def simple_test_bboxes(self, + x, + img_meta, + proposals, + rcnn_test_cfg, + rescale=False): + """Test only det bboxes without augmentation.""" + rois = bbox2roi(proposals) + roi_feats = self.img_bbox_roi_extractor( + x[:len(self.img_bbox_roi_extractor.featmap_strides)], rois) + if self.with_img_shared_head: + roi_feats = self.img_shared_head(roi_feats) + cls_score, bbox_pred = self.img_bbox_head(roi_feats) + + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + det_bboxes, det_labels = self.img_bbox_head.get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=rescale, + cfg=rcnn_test_cfg) + return det_bboxes, det_labels + + def simple_test_rpn(self, x, img_meta, rpn_test_cfg): + rpn_outs = self.img_rpn_head(x) + proposal_inputs = rpn_outs + (img_meta, rpn_test_cfg) + proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) + return proposal_list + + def simple_test_pts(self, x, img_meta, rescale=False): + outs = self.pts_bbox_head(x) + bbox_inputs = outs + (img_meta, rescale) + bbox_list = self.pts_bbox_head.get_bboxes(*bbox_inputs) + return bbox_list + + def simple_test(self, + points, + img_meta, + img=None, + gt_bboxes_3d=None, + rescale=False): + img_feats, pts_feats = self.extract_feat( + points, img=img, img_meta=img_meta) + + bbox_list = dict() + if pts_feats and self.with_pts_bbox: + bbox_pts = 
self.simple_test_pts( + pts_feats, img_meta, rescale=rescale) + bbox_list.update(pts_bbox=bbox_pts) + if img_feats and self.with_img_bbox: + bbox_img = self.simple_test_img( + img_feats, img_meta, rescale=rescale) + bbox_list.update(img_bbox=bbox_img) + return bbox_list + + def aug_test(self, points, imgs, img_metas, rescale=False): + raise NotImplementedError diff --git a/mmdet3d/models/detectors/single_stage.py b/mmdet3d/models/detectors/single_stage.py new file mode 100644 index 0000000000..e10eb25b61 --- /dev/null +++ b/mmdet3d/models/detectors/single_stage.py @@ -0,0 +1,89 @@ +import torch.nn as nn + +from mmdet3d.core import bbox2result_coco +from mmdet.models.registry import DETECTORS +from .. import builder +from .base import BaseDetector + + +@DETECTORS.register_module +class SingleStageDetector(BaseDetector): + """Base class for single-stage detectors. + + Single-stage detectors directly and densely predict bounding boxes on the + output features of the backbone+neck. + """ + + def __init__(self, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(SingleStageDetector, self).__init__() + self.backbone = builder.build_backbone(backbone) + if neck is not None: + self.neck = builder.build_neck(neck) + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = builder.build_head(bbox_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + super(SingleStageDetector, self).init_weights(pretrained) + self.backbone.init_weights(pretrained=pretrained) + if self.with_neck: + if isinstance(self.neck, nn.Sequential): + for m in self.neck: + m.init_weights() + else: + self.neck.init_weights() + self.bbox_head.init_weights() + + def extract_feat(self, img): + """Directly extract features from the backbone+neck + """ + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def forward_dummy(self, img): + """Used for computing network flops. 
+ + See `mmedetection/tools/get_flops.py` + """ + x = self.extract_feat(img) + outs = self.bbox_head(x) + return outs + + def forward_train(self, + img, + img_metas, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None): + x = self.extract_feat(img) + outs = self.bbox_head(x) + loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, self.train_cfg) + losses = self.bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def simple_test(self, img, img_meta, rescale=False): + x = self.extract_feat(img) + outs = self.bbox_head(x) + bbox_inputs = outs + (img_meta, self.test_cfg, rescale) + bbox_list = self.bbox_head.get_bboxes(*bbox_inputs) + bbox_results = [ + bbox2result_coco(det_bboxes, det_labels, + self.bbox_head.num_classes) + for det_bboxes, det_labels in bbox_list + ] + return bbox_results[0] + + def aug_test(self, imgs, img_metas, rescale=False): + raise NotImplementedError diff --git a/mmdet3d/models/detectors/test_mixins.py b/mmdet3d/models/detectors/test_mixins.py new file mode 100644 index 0000000000..a457e523f3 --- /dev/null +++ b/mmdet3d/models/detectors/test_mixins.py @@ -0,0 +1,266 @@ +import logging +import sys + +import torch + +from mmdet3d.core import (bbox2roi, bbox_mapping, merge_aug_bboxes, + merge_aug_masks, merge_aug_proposals, multiclass_nms) + +logger = logging.getLogger(__name__) + +if sys.version_info >= (3, 7): + from mmdet3d.utils.contextmanagers import completed + + +class RPNTestMixin(object): + + if sys.version_info >= (3, 7): + + async def async_test_rpn(self, x, img_meta, rpn_test_cfg): + sleep_interval = rpn_test_cfg.pop('async_sleep_interval', 0.025) + async with completed( + __name__, 'rpn_head_forward', + sleep_interval=sleep_interval): + rpn_outs = self.rpn_head(x) + + proposal_inputs = rpn_outs + (img_meta, rpn_test_cfg) + + proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) + return proposal_list + + def simple_test_rpn(self, x, img_meta, rpn_test_cfg): + rpn_outs = self.rpn_head(x) + proposal_inputs = rpn_outs + (img_meta, rpn_test_cfg) + proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) + return proposal_list + + def aug_test_rpn(self, feats, img_metas, rpn_test_cfg): + imgs_per_gpu = len(img_metas[0]) + aug_proposals = [[] for _ in range(imgs_per_gpu)] + for x, img_meta in zip(feats, img_metas): + proposal_list = self.simple_test_rpn(x, img_meta, rpn_test_cfg) + for i, proposals in enumerate(proposal_list): + aug_proposals[i].append(proposals) + # reorganize the order of 'img_metas' to match the dimensions + # of 'aug_proposals' + aug_img_metas = [] + for i in range(imgs_per_gpu): + aug_img_meta = [] + for j in range(len(img_metas)): + aug_img_meta.append(img_metas[j][i]) + aug_img_metas.append(aug_img_meta) + # after merging, proposals will be rescaled to the original image size + merged_proposals = [ + merge_aug_proposals(proposals, aug_img_meta, rpn_test_cfg) + for proposals, aug_img_meta in zip(aug_proposals, aug_img_metas) + ] + return merged_proposals + + +class BBoxTestMixin(object): + + if sys.version_info >= (3, 7): + + async def async_test_bboxes(self, + x, + img_meta, + proposals, + rcnn_test_cfg, + rescale=False, + bbox_semaphore=None, + global_lock=None): + """Async test only det bboxes without augmentation.""" + rois = bbox2roi(proposals) + roi_feats = self.bbox_roi_extractor( + x[:len(self.bbox_roi_extractor.featmap_strides)], rois) + if self.with_shared_head: + roi_feats = self.shared_head(roi_feats) + sleep_interval = rcnn_test_cfg.get('async_sleep_interval', 0.017) + + async with 
completed( + __name__, 'bbox_head_forward', + sleep_interval=sleep_interval): + cls_score, bbox_pred = self.bbox_head(roi_feats) + + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + det_bboxes, det_labels = self.bbox_head.get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=rescale, + cfg=rcnn_test_cfg) + return det_bboxes, det_labels + + def simple_test_bboxes(self, + x, + img_meta, + proposals, + rcnn_test_cfg, + rescale=False): + """Test only det bboxes without augmentation.""" + rois = bbox2roi(proposals) + roi_feats = self.bbox_roi_extractor( + x[:len(self.bbox_roi_extractor.featmap_strides)], rois) + if self.with_shared_head: + roi_feats = self.shared_head(roi_feats) + cls_score, bbox_pred = self.bbox_head(roi_feats) + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + det_bboxes, det_labels = self.bbox_head.get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=rescale, + cfg=rcnn_test_cfg) + return det_bboxes, det_labels + + def aug_test_bboxes(self, feats, img_metas, proposal_list, rcnn_test_cfg): + aug_bboxes = [] + aug_scores = [] + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + # TODO more flexible + proposals = bbox_mapping(proposal_list[0][:, :4], img_shape, + scale_factor, flip) + rois = bbox2roi([proposals]) + # recompute feature maps to save GPU memory + roi_feats = self.bbox_roi_extractor( + x[:len(self.bbox_roi_extractor.featmap_strides)], rois) + if self.with_shared_head: + roi_feats = self.shared_head(roi_feats) + cls_score, bbox_pred = self.bbox_head(roi_feats) + bboxes, scores = self.bbox_head.get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=False, + cfg=None) + aug_bboxes.append(bboxes) + aug_scores.append(scores) + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas, rcnn_test_cfg) + det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img) + return det_bboxes, det_labels + + +class MaskTestMixin(object): + + if sys.version_info >= (3, 7): + + async def async_test_mask(self, + x, + img_meta, + det_bboxes, + det_labels, + rescale=False, + mask_test_cfg=None): + # image shape of the first image in the batch (only one) + ori_shape = img_meta[0]['ori_shape'] + scale_factor = img_meta[0]['scale_factor'] + if det_bboxes.shape[0] == 0: + segm_result = [[] + for _ in range(self.mask_head.num_classes - 1)] + else: + _bboxes = ( + det_bboxes[:, :4] * + scale_factor if rescale else det_bboxes) + mask_rois = bbox2roi([_bboxes]) + mask_feats = self.mask_roi_extractor( + x[:len(self.mask_roi_extractor.featmap_strides)], + mask_rois) + + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + if mask_test_cfg and mask_test_cfg.get('async_sleep_interval'): + sleep_interval = mask_test_cfg['async_sleep_interval'] + else: + sleep_interval = 0.035 + async with completed( + __name__, + 'mask_head_forward', + sleep_interval=sleep_interval): + mask_pred = self.mask_head(mask_feats) + segm_result = self.mask_head.get_seg_masks( + mask_pred, _bboxes, det_labels, self.test_cfg.rcnn, + ori_shape, scale_factor, rescale) + return segm_result + + def simple_test_mask(self, + 
x, + img_meta, + det_bboxes, + det_labels, + rescale=False): + # image shape of the first image in the batch (only one) + ori_shape = img_meta[0]['ori_shape'] + scale_factor = img_meta[0]['scale_factor'] + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes)] + else: + # if det_bboxes is rescaled to the original image size, we need to + # rescale it back to the testing scale to obtain RoIs. + if rescale and not isinstance(scale_factor, float): + scale_factor = torch.from_numpy(scale_factor).to( + det_bboxes.device) + _bboxes = ( + det_bboxes[:, :4] * scale_factor if rescale else det_bboxes) + mask_rois = bbox2roi([_bboxes]) + mask_feats = self.mask_roi_extractor( + x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois) + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + mask_pred = self.mask_head(mask_feats) + segm_result = self.mask_head.get_seg_masks(mask_pred, _bboxes, + det_labels, + self.test_cfg.rcnn, + ori_shape, scale_factor, + rescale) + return segm_result + + def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels): + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes)] + else: + aug_masks = [] + for x, img_meta in zip(feats, img_metas): + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, + scale_factor, flip) + mask_rois = bbox2roi([_bboxes]) + mask_feats = self.mask_roi_extractor( + x[:len(self.mask_roi_extractor.featmap_strides)], + mask_rois) + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + mask_pred = self.mask_head(mask_feats) + # convert to numpy array to save memory + aug_masks.append(mask_pred.sigmoid().cpu().numpy()) + merged_masks = merge_aug_masks(aug_masks, img_metas, + self.test_cfg.rcnn) + + ori_shape = img_metas[0][0]['ori_shape'] + segm_result = self.mask_head.get_seg_masks( + merged_masks, + det_bboxes, + det_labels, + self.test_cfg.rcnn, + ori_shape, + scale_factor=1.0, + rescale=False) + return segm_result diff --git a/mmdet3d/models/detectors/two_stage.py b/mmdet3d/models/detectors/two_stage.py new file mode 100644 index 0000000000..91a0e1ba33 --- /dev/null +++ b/mmdet3d/models/detectors/two_stage.py @@ -0,0 +1,314 @@ +import torch +import torch.nn as nn + +from mmdet3d.core import (bbox2result_coco, bbox2roi, build_assigner, + build_sampler) +from mmdet.models.registry import DETECTORS +from .. import builder +from .base import BaseDetector +from .test_mixins import BBoxTestMixin, MaskTestMixin, RPNTestMixin + + +@DETECTORS.register_module +class TwoStageDetector(BaseDetector, RPNTestMixin, BBoxTestMixin, + MaskTestMixin): + """Base class for two-stage detectors. + + Two-stage detectors typically consisting of a region proposal network and a + task-specific regression head. 
+ """ + + def __init__(self, + backbone, + neck=None, + shared_head=None, + rpn_head=None, + bbox_roi_extractor=None, + bbox_head=None, + mask_roi_extractor=None, + mask_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(TwoStageDetector, self).__init__() + self.backbone = builder.build_backbone(backbone) + + if neck is not None: + self.neck = builder.build_neck(neck) + + if shared_head is not None: + self.shared_head = builder.build_shared_head(shared_head) + + if rpn_head is not None: + self.rpn_head = builder.build_head(rpn_head) + + if bbox_head is not None: + self.bbox_roi_extractor = builder.build_roi_extractor( + bbox_roi_extractor) + self.bbox_head = builder.build_head(bbox_head) + + if mask_head is not None: + if mask_roi_extractor is not None: + self.mask_roi_extractor = builder.build_roi_extractor( + mask_roi_extractor) + self.share_roi_extractor = False + else: + self.share_roi_extractor = True + self.mask_roi_extractor = self.bbox_roi_extractor + self.mask_head = builder.build_head(mask_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self.init_weights(pretrained=pretrained) + + @property + def with_rpn(self): + return hasattr(self, 'rpn_head') and self.rpn_head is not None + + def init_weights(self, pretrained=None): + super(TwoStageDetector, self).init_weights(pretrained) + self.backbone.init_weights(pretrained=pretrained) + if self.with_neck: + if isinstance(self.neck, nn.Sequential): + for m in self.neck: + m.init_weights() + else: + self.neck.init_weights() + if self.with_shared_head: + self.shared_head.init_weights(pretrained=pretrained) + if self.with_rpn: + self.rpn_head.init_weights() + if self.with_bbox: + self.bbox_roi_extractor.init_weights() + self.bbox_head.init_weights() + if self.with_mask: + self.mask_head.init_weights() + if not self.share_roi_extractor: + self.mask_roi_extractor.init_weights() + + def extract_feat(self, img): + """Directly extract features from the backbone+neck + """ + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def forward_dummy(self, img): + """Used for computing network flops. + + See `mmedetection/tools/get_flops.py` + """ + outs = () + # backbone + x = self.extract_feat(img) + # rpn + if self.with_rpn: + rpn_outs = self.rpn_head(x) + outs = outs + (rpn_outs, ) + proposals = torch.randn(1000, 4).cuda() + # bbox head + rois = bbox2roi([proposals]) + if self.with_bbox: + bbox_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + if self.with_shared_head: + bbox_feats = self.shared_head(bbox_feats) + cls_score, bbox_pred = self.bbox_head(bbox_feats) + outs = outs + (cls_score, bbox_pred) + # mask head + if self.with_mask: + mask_rois = rois[:100] + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], mask_rois) + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + mask_pred = self.mask_head(mask_feats) + outs = outs + (mask_pred, ) + return outs + + def forward_train(self, + img, + img_meta, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + gt_masks=None, + proposals=None): + """ + Args: + img (Tensor): of shape (N, C, H, W) encoding input images. + Typically these should be mean centered and std scaled. + + img_meta (list[dict]): list of image info dict where each dict has: + 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. 
+ For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + + gt_bboxes (list[Tensor]): each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + + gt_labels (list[Tensor]): class indices corresponding to each box + + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + gt_masks (None | Tensor) : true segmentation masks for each box + used if the architecture supports a segmentation task. + + proposals : override rpn proposals with custom proposals. Use when + `with_rpn` is False. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + x = self.extract_feat(img) + + losses = dict() + + # RPN forward and loss + if self.with_rpn: + rpn_outs = self.rpn_head(x) + rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, + self.train_cfg.rpn) + rpn_losses = self.rpn_head.loss( + *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + losses.update(rpn_losses) + + proposal_cfg = self.train_cfg.get('rpn_proposal', + self.test_cfg.rpn) + proposal_inputs = rpn_outs + (img_meta, proposal_cfg) + proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) + else: + proposal_list = proposals + + # assign gts and sample proposals + if self.with_bbox or self.with_mask: + bbox_assigner = build_assigner(self.train_cfg.rcnn.assigner) + bbox_sampler = build_sampler( + self.train_cfg.rcnn.sampler, context=self) + num_imgs = img.size(0) + if gt_bboxes_ignore is None: + gt_bboxes_ignore = [None for _ in range(num_imgs)] + sampling_results = [] + for i in range(num_imgs): + assign_result = bbox_assigner.assign(proposal_list[i], + gt_bboxes[i], + gt_bboxes_ignore[i], + gt_labels[i]) + sampling_result = bbox_sampler.sample( + assign_result, + proposal_list[i], + gt_bboxes[i], + gt_labels[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + if self.with_bbox: + rois = bbox2roi([res.bboxes for res in sampling_results]) + # TODO: a more flexible way to decide which feature maps to use + bbox_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + if self.with_shared_head: + bbox_feats = self.shared_head(bbox_feats) + cls_score, bbox_pred = self.bbox_head(bbox_feats) + + bbox_targets = self.bbox_head.get_target(sampling_results, + gt_bboxes, gt_labels, + self.train_cfg.rcnn) + loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, + *bbox_targets) + losses.update(loss_bbox) + + # mask head forward and loss + if self.with_mask: + if not self.share_roi_extractor: + pos_rois = bbox2roi( + [res.pos_bboxes for res in sampling_results]) + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], pos_rois) + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + else: + pos_inds = [] + device = bbox_feats.device + for res in sampling_results: + pos_inds.append( + torch.ones( + res.pos_bboxes.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds.append( + torch.zeros( + res.neg_bboxes.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds = torch.cat(pos_inds) + mask_feats = bbox_feats[pos_inds] + + if mask_feats.shape[0] > 0: + mask_pred = self.mask_head(mask_feats) + mask_targets = self.mask_head.get_target( + sampling_results, gt_masks, self.train_cfg.rcnn) + pos_labels = torch.cat( + [res.pos_gt_labels for res in sampling_results]) + loss_mask = self.mask_head.loss(mask_pred, mask_targets, + pos_labels) + losses.update(loss_mask) + + 
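+        # ``losses`` now aggregates the RPN, bbox and (optional) mask terms;
+        # the exact key names (e.g. 'loss_rpn_cls', 'loss_cls', 'loss_bbox',
+        # 'loss_mask') depend on the configured heads.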
return losses + + def simple_test(self, img, img_meta, proposals=None, rescale=False): + """Test without augmentation.""" + assert self.with_bbox, 'Bbox head must be implemented.' + x = self.extract_feat(img) + + if proposals is None: + proposal_list = self.simple_test_rpn(x, img_meta, + self.test_cfg.rpn) + else: + proposal_list = proposals + + det_bboxes, det_labels = self.simple_test_bboxes( + x, img_meta, proposal_list, self.test_cfg.rcnn, rescale=rescale) + bbox_results = bbox2result_coco(det_bboxes, det_labels, + self.bbox_head.num_classes) + + if not self.with_mask: + return bbox_results + else: + segm_results = self.simple_test_mask( + x, img_meta, det_bboxes, det_labels, rescale=rescale) + return bbox_results, segm_results + + def aug_test(self, imgs, img_metas, rescale=False): + """Test with augmentations. + + If rescale is False, then returned bboxes and masks will fit the scale + of imgs[0]. + """ + # recompute feats to save memory + proposal_list = self.aug_test_rpn( + self.extract_feats(imgs), img_metas, self.test_cfg.rpn) + det_bboxes, det_labels = self.aug_test_bboxes( + self.extract_feats(imgs), img_metas, proposal_list, + self.test_cfg.rcnn) + + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= img_metas[0][0]['scale_factor'] + bbox_results = bbox2result_coco(_det_bboxes, det_labels, + self.bbox_head.num_classes) + + # det_bboxes always keep the original scale + if self.with_mask: + segm_results = self.aug_test_mask( + self.extract_feats(imgs), img_metas, det_bboxes, det_labels) + return bbox_results, segm_results + else: + return bbox_results diff --git a/mmdet3d/models/detectors/voxelnet.py b/mmdet3d/models/detectors/voxelnet.py new file mode 100644 index 0000000000..5095cf51e1 --- /dev/null +++ b/mmdet3d/models/detectors/voxelnet.py @@ -0,0 +1,140 @@ +import torch +import torch.nn.functional as F + +from mmdet3d.ops import Voxelization +from mmdet.models.registry import DETECTORS +from .. 
import builder +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module +class VoxelNet(SingleStageDetector): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(VoxelNet, self).__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + ) + self.voxel_layer = Voxelization(**voxel_layer) + self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) + self.middle_encoder = builder.build_middle_encoder(middle_encoder) + + def extract_feat(self, points, img_meta): + voxels, num_points, coors = self.voxelize(points) + voxel_features = self.voxel_encoder(voxels, num_points, coors) + batch_size = coors[-1, 0].item() + 1 + x = self.middle_encoder(voxel_features, coors, batch_size) + x = self.backbone(x) + if self.with_neck: + x = self.neck(x) + return x + + @torch.no_grad() + def voxelize(self, points): + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train(self, + points, + img_meta, + gt_bboxes_3d, + gt_labels_3d, + gt_bboxes_ignore=None): + x = self.extract_feat(points, img_meta) + outs = self.bbox_head(x) + loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_meta) + losses = self.bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def forward_test(self, **kwargs): + return self.simple_test(**kwargs) + + def forward(self, return_loss=True, **kwargs): + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def simple_test(self, points, img_meta, gt_bboxes_3d=None, rescale=False): + x = self.extract_feat(points, img_meta) + outs = self.bbox_head(x) + bbox_inputs = outs + (img_meta, rescale) + bbox_list = self.bbox_head.get_bboxes(*bbox_inputs) + return bbox_list + + +@DETECTORS.register_module +class DynamicVoxelNet(VoxelNet): + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(DynamicVoxelNet, self).__init__( + voxel_layer=voxel_layer, + voxel_encoder=voxel_encoder, + middle_encoder=middle_encoder, + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + ) + + def extract_feat(self, points, img_meta): + voxels, coors = self.voxelize(points) + voxel_features, feature_coors = self.voxel_encoder(voxels, coors) + batch_size = coors[-1, 0].item() + 1 + x = self.middle_encoder(voxel_features, feature_coors, batch_size) + x = self.backbone(x) + if self.with_neck: + x = self.neck(x) + return x + + @torch.no_grad() + def voxelize(self, points): + coors = [] + # dynamic voxelization only provide a coors mapping + for res in points: + res_coors = self.voxel_layer(res) + coors.append(res_coors) + points = torch.cat(points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), 
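+                # (1, 0) pads one column on the left of the last dimension
+                # and value=i fills it with the sample index, turning the
+                # per-sample (z, y, x) coords into (batch_idx, z, y, x).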
mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return points, coors_batch diff --git a/mmdet3d/models/fusion_layers/__init__.py b/mmdet3d/models/fusion_layers/__init__.py new file mode 100644 index 0000000000..93142ced2f --- /dev/null +++ b/mmdet3d/models/fusion_layers/__init__.py @@ -0,0 +1,3 @@ +from .point_fusion import PointFusion + +__all__ = ['PointFusion'] diff --git a/mmdet3d/models/fusion_layers/point_fusion.py b/mmdet3d/models/fusion_layers/point_fusion.py new file mode 100644 index 0000000000..005306f269 --- /dev/null +++ b/mmdet3d/models/fusion_layers/point_fusion.py @@ -0,0 +1,287 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init + +from mmdet3d.models.utils import ConvModule +from ..plugins import NonLocal2D +from ..registry import FUSION_LAYERS + + +def point_sample( + img_features, + points, + lidar2img_rt, + pcd_rotate_mat, + img_scale_factor, + img_crop_offset, + pcd_trans_factor, + pcd_scale_factor, + pcd_flip, + img_flip, + img_pad_shape, + img_shape, + aligned=True, + padding_mode='zeros', + align_corners=True, +): + """sample image features using point coordinates + + Arguments: + img_features (Tensor): 1xCxHxW image features + points (Tensor): Nx3 point cloud coordinates + P (Tensor): 4x4 transformation matrix + scale_factor (Tensor): scale_factor of images + img_pad_shape (int, int): int tuple indicates the h & w after padding, + this is necessary to obtain features in feature map + img_shape (int, int): int tuple indicates the h & w before padding + after scaling, this is necessary for flipping coordinates + return: + (Tensor): NxC image features sampled by point coordinates + """ + # aug order: flip -> trans -> scale -> rot + # The transformation follows the augmentation order in data pipeline + if pcd_flip: + # if the points are flipped, flip them back first + points[:, 1] = -points[:, 1] + + points -= pcd_trans_factor + # the points should be scaled to the original scale in velo coordinate + points /= pcd_scale_factor + # the points should be rotated back + # pcd_rotate_mat @ pcd_rotate_mat.inverse() is not exactly an identity + # matrix, use angle to create the inverse rot matrix neither. 
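+    # In other words, right-multiplying by the numerical inverse undoes the
+    # rotation applied during augmentation, up to floating point error.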
+ points = points @ pcd_rotate_mat.inverse() + + # project points from velo coordinate to camera coordinate + num_points = points.shape[0] + pts_4d = torch.cat([points, points.new_ones(size=(num_points, 1))], dim=-1) + pts_2d = pts_4d @ lidar2img_rt.t() + + # cam_points is Tensor of Nx4 whose last column is 1 + # transform camera coordinate to image coordinate + + pts_2d[:, 2] = torch.clamp(pts_2d[:, 2], min=1e-5) + pts_2d[:, 0] /= pts_2d[:, 2] + pts_2d[:, 1] /= pts_2d[:, 2] + + # img transformation: scale -> crop -> flip + # the image is resized by img_scale_factor + img_coors = pts_2d[:, 0:2] * img_scale_factor # Nx2 + img_coors -= img_crop_offset + + # grid sample, the valid grid range should be in [-1,1] + coor_x, coor_y = torch.split(img_coors, 1, dim=1) # each is Nx1 + + if img_flip: + # by default we take it as horizontal flip + # use img_shape before padding for flip + orig_h, orig_w = img_shape + coor_x = orig_w - coor_x + + h, w = img_pad_shape + coor_y = coor_y / h * 2 - 1 + coor_x = coor_x / w * 2 - 1 + grid = torch.cat([coor_x, coor_y], + dim=1).unsqueeze(0).unsqueeze(0) # Nx2 -> 1x1xNx2 + + # align_corner=True provides higher performance + mode = 'bilinear' if aligned else 'nearest' + point_features = F.grid_sample( + img_features, + grid, + mode=mode, + padding_mode=padding_mode, + align_corners=align_corners) # 1xCx1xN feats + + return point_features.squeeze().t() + + +@FUSION_LAYERS.register_module +class PointFusion(nn.Module): + """Fuse image features from fused single scale features + """ + + def __init__(self, + img_channels, + pts_channels, + mid_channels, + out_channels, + img_levels=3, + conv_cfg=None, + norm_cfg=None, + activation=None, + activate_out=True, + fuse_out=False, + refine_type=None, + dropout_ratio=0, + aligned=True, + align_corners=True, + padding_mode='zeros', + lateral_conv=True): + super(PointFusion, self).__init__() + if isinstance(img_levels, int): + img_levels = [img_levels] + if isinstance(img_channels, int): + img_channels = [img_channels] * len(img_levels) + assert isinstance(img_levels, list) + assert isinstance(img_channels, list) + assert len(img_channels) == len(img_levels) + + self.img_levels = img_levels + self.activation = activation + self.activate_out = activate_out + self.fuse_out = fuse_out + self.refine_type = refine_type + self.dropout_ratio = dropout_ratio + self.img_channels = img_channels + self.aligned = aligned + self.align_corners = align_corners + self.padding_mode = padding_mode + + self.lateral_convs = None + if lateral_conv: + self.lateral_convs = nn.ModuleList() + for i in range(len(img_channels)): + l_conv = ConvModule( + img_channels[i], + mid_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + activation=self.activation, + inplace=False) + self.lateral_convs.append(l_conv) + self.img_transform = nn.Sequential( + nn.Linear(mid_channels * len(img_channels), out_channels), + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + ) + else: + self.img_transform = nn.Sequential( + nn.Linear(sum(img_channels), out_channels), + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + ) + self.pts_transform = nn.Sequential( + nn.Linear(pts_channels, out_channels), + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + ) + + if self.fuse_out: + self.fuse_conv = nn.Sequential( + nn.Linear(mid_channels, out_channels), + # For pts the BN is initialized differently by default + # TODO: check whether this is necessary + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + nn.ReLU(inplace=False)) + + if 
self.refine_type == 'non_local': + self.refine = NonLocal2D( + out_channels, + reduction=1, + use_scale=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + self.init_weights() + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Linear)): + xavier_init(m, distribution='uniform') + + def forward(self, img_feats, pts, pts_feats, img_meta): + """ + img_feats (List[Tensor]): img features + pts: [List[Tensor]]: a batch of points with shape Nx3 + pts_feats (Tensor): a tensor consist of point features of the + total batch + + """ + img_pts = self.obtain_mlvl_feats(img_feats, pts, img_meta) + img_pre_fuse = self.img_transform(img_pts) + if self.training and self.dropout_ratio > 0: + img_pre_fuse = F.dropout(img_pre_fuse, self.dropout_ratio) + pts_pre_fuse = self.pts_transform(pts_feats) + + fuse_out = img_pre_fuse + pts_pre_fuse + if self.activate_out: + fuse_out = F.relu(fuse_out) + if self.fuse_out: + fuse_out = self.fuse_conv(fuse_out) + + if self.refine_type is not None: + fuse_out_T = fuse_out.t()[None, ..., None] # NxC -> 1xCxNx1 + batch_idx = 0 + attentive = [] + for i in range(len(pts)): + end_idx = batch_idx + len(pts[i]) + attentive.append( + self.refine(fuse_out_T[:, :, batch_idx:end_idx])) + batch_idx = end_idx + fuse_out = torch.cat(attentive, dim=-2).squeeze().t() + return fuse_out + + def obtain_mlvl_feats(self, img_feats, pts, img_meta): + if self.lateral_convs is not None: + img_ins = [ + lateral_conv(img_feats[i]) + for i, lateral_conv in zip(self.img_levels, self.lateral_convs) + ] + else: + img_ins = img_feats + img_feats_per_point = [] + # Sample multi-level features + for i in range(len(img_meta)): + mlvl_img_feats = [] + for level in range(len(self.img_levels)): + if torch.isnan(img_ins[level][i:i + 1]).any(): + import pdb + pdb.set_trace() + mlvl_img_feats.append( + self.sample_single(img_ins[level][i:i + 1], pts[i][:, :3], + img_meta[i])) + mlvl_img_feats = torch.cat(mlvl_img_feats, dim=-1) + img_feats_per_point.append(mlvl_img_feats) + + img_pts = torch.cat(img_feats_per_point, dim=0) + return img_pts + + def sample_single(self, img_feats, pts, img_meta): + pcd_scale_factor = ( + img_meta['pcd_scale_factor'] + if 'pcd_scale_factor' in img_meta.keys() else 1) + pcd_trans_factor = ( + pts.new_tensor(img_meta['pcd_trans']) + if 'pcd_trans' in img_meta.keys() else 0) + pcd_rotate_mat = ( + pts.new_tensor(img_meta['pcd_rotation']) + if 'pcd_rotation' in img_meta.keys() else + torch.eye(3).type_as(pts).to(pts.device)) + img_scale_factor = ( + img_meta['scale_factor'] + if 'scale_factor' in img_meta.keys() else 1) + pcd_flip = img_meta['pcd_flip'] if 'pcd_flip' in img_meta.keys( + ) else False + img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False + img_crop_offset = ( + pts.new_tensor(img_meta['img_crop_offset']) + if 'img_crop_offset' in img_meta.keys() else 0) + img_pts = point_sample( + img_feats, + pts, + pts.new_tensor(img_meta['lidar2img']), + pcd_rotate_mat, + img_scale_factor, + img_crop_offset, + pcd_trans_factor, + pcd_scale_factor, + pcd_flip=pcd_flip, + img_flip=img_flip, + img_pad_shape=img_meta['pad_shape'][:2], + img_shape=img_meta['img_shape'][:2], + aligned=self.aligned, + padding_mode=self.padding_mode, + align_corners=self.align_corners, + ) + return img_pts diff --git a/mmdet3d/models/losses/__init__.py b/mmdet3d/models/losses/__init__.py new file mode 100644 index 0000000000..e2da0a0955 --- /dev/null +++ b/mmdet3d/models/losses/__init__.py @@ 
-0,0 +1,3 @@ +from mmdet.models.losses import FocalLoss, SmoothL1Loss + +__all__ = ['FocalLoss', 'SmoothL1Loss'] diff --git a/mmdet3d/models/middle_encoders/__init__.py b/mmdet3d/models/middle_encoders/__init__.py new file mode 100644 index 0000000000..b20bcb049a --- /dev/null +++ b/mmdet3d/models/middle_encoders/__init__.py @@ -0,0 +1,4 @@ +from .pillar_scatter import PointPillarsScatter +from .sparse_encoder import SparseEncoder + +__all__ = ['PointPillarsScatter', 'SparseEncoder'] diff --git a/mmdet3d/models/middle_encoders/pillar_scatter.py b/mmdet3d/models/middle_encoders/pillar_scatter.py new file mode 100644 index 0000000000..5e502ed4f5 --- /dev/null +++ b/mmdet3d/models/middle_encoders/pillar_scatter.py @@ -0,0 +1,85 @@ +import torch +from torch import nn + +from ..registry import MIDDLE_ENCODERS + + +@MIDDLE_ENCODERS.register_module +class PointPillarsScatter(nn.Module): + + def __init__(self, in_channels, output_shape): + """ + Point Pillar's Scatter. + Converts learned features from dense tensor to sparse pseudo image. + + Args: + output_shape (list[int]): Required output shape of features. + in_channels (int): Number of input features. + """ + + super().__init__() + self.name = 'PointPillarsScatter' + self.output_shape = output_shape + self.ny = output_shape[0] + self.nx = output_shape[1] + self.nchannels = in_channels + + def forward(self, voxel_features, coors, batch_size=None): + # TODO: rewrite the function in a batch manner + # no need to deal with different batch cases + if batch_size is not None: + return self.forward_batch(voxel_features, coors, batch_size) + else: + return self.forward_single(voxel_features, coors) + + def forward_single(self, voxel_features, coors): + # Create the canvas for this sample + canvas = torch.zeros( + self.nchannels, + self.nx * self.ny, + dtype=voxel_features.dtype, + device=voxel_features.device) + + indices = coors[:, 1] * self.nx + coors[:, 2] + indices = indices.long() + voxels = voxel_features.t() + # Now scatter the blob back to the canvas. + canvas[:, indices] = voxels + # Undo the column stacking to final 4-dim tensor + canvas = canvas.view(1, self.nchannels, self.ny, self.nx) + return [canvas] + + def forward_batch(self, voxel_features, coors, batch_size): + + # batch_canvas will be the final output. + batch_canvas = [] + for batch_itt in range(batch_size): + # Create the canvas for this sample + canvas = torch.zeros( + self.nchannels, + self.nx * self.ny, + dtype=voxel_features.dtype, + device=voxel_features.device) + + # Only include non-empty pillars + batch_mask = coors[:, 0] == batch_itt + this_coors = coors[batch_mask, :] + indices = this_coors[:, 2] * self.nx + this_coors[:, 3] + indices = indices.type(torch.long) + voxels = voxel_features[batch_mask, :] + voxels = voxels.t() + + # Now scatter the blob back to the canvas. + canvas[:, indices] = voxels + + # Append to a list for later stacking. 
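+            # each canvas is a dense (nchannels, nx * ny) grid; pillars that
+            # received no points stay zero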
+ batch_canvas.append(canvas) + + # Stack to 3-dim tensor (batch-size, nchannels, nrows*ncols) + batch_canvas = torch.stack(batch_canvas, 0) + + # Undo the column stacking to final 4-dim tensor + batch_canvas = batch_canvas.view(batch_size, self.nchannels, self.ny, + self.nx) + + return batch_canvas diff --git a/mmdet3d/models/middle_encoders/sparse_encoder.py b/mmdet3d/models/middle_encoders/sparse_encoder.py new file mode 100644 index 0000000000..70b437a47a --- /dev/null +++ b/mmdet3d/models/middle_encoders/sparse_encoder.py @@ -0,0 +1,215 @@ +import torch.nn as nn + +import mmdet3d.ops.spconv as spconv +from ..registry import MIDDLE_ENCODERS +from ..utils import build_norm_layer + + +@MIDDLE_ENCODERS.register_module +class SparseEncoder(nn.Module): + + def __init__(self, + in_channels, + output_shape, + pre_act, + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01)): + super().__init__() + self.sparse_shape = output_shape + self.output_shape = output_shape + self.in_channels = in_channels + self.pre_act = pre_act + # Spconv init all weight on its own + # TODO: make the network could be modified + + if pre_act: + self.conv_input = spconv.SparseSequential( + spconv.SubMConv3d( + in_channels, + 16, + 3, + padding=1, + bias=False, + indice_key='subm1'), ) + block = self.pre_act_block + else: + norm_name, norm_layer = build_norm_layer(norm_cfg, 16) + self.conv_input = spconv.SparseSequential( + spconv.SubMConv3d( + in_channels, + 16, + 3, + padding=1, + bias=False, + indice_key='subm1'), + norm_layer, + nn.ReLU(), + ) + block = self.post_act_block + + self.conv1 = spconv.SparseSequential( + block(16, 16, 3, norm_cfg=norm_cfg, padding=1, + indice_key='subm1'), ) + + self.conv2 = spconv.SparseSequential( + # [1600, 1408, 41] -> [800, 704, 21] + block( + 16, + 32, + 3, + norm_cfg=norm_cfg, + stride=2, + padding=1, + indice_key='spconv2', + conv_type='spconv'), + block(32, 32, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm2'), + block(32, 32, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm2'), + ) + + self.conv3 = spconv.SparseSequential( + # [800, 704, 21] -> [400, 352, 11] + block( + 32, + 64, + 3, + norm_cfg=norm_cfg, + stride=2, + padding=1, + indice_key='spconv3', + conv_type='spconv'), + block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm3'), + block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm3'), + ) + + self.conv4 = spconv.SparseSequential( + # [400, 352, 11] -> [200, 176, 5] + block( + 64, + 64, + 3, + norm_cfg=norm_cfg, + stride=2, + padding=(0, 1, 1), + indice_key='spconv4', + conv_type='spconv'), + block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm4'), + block(64, 64, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm4'), + ) + + norm_name, norm_layer = build_norm_layer(norm_cfg, 128) + self.conv_out = spconv.SparseSequential( + # [200, 176, 5] -> [200, 176, 2] + spconv.SparseConv3d( + 128, + 128, (3, 1, 1), + stride=(2, 1, 1), + padding=0, + bias=False, + indice_key='spconv_down2'), + norm_layer, + nn.ReLU(), + ) + + def forward(self, voxel_features, coors, batch_size): + """ + :param voxel_features: (N, C) + :param coors: (N, 4) [batch_idx, z_idx, y_idx, x_idx] + :param batch_size: + :return: + """ + coors = coors.int() + input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors, + self.sparse_shape, + batch_size) + x = self.conv_input(input_sp_tensor) + + x_conv1 = self.conv1(x) + x_conv2 = self.conv2(x_conv1) + x_conv3 = self.conv3(x_conv2) + x_conv4 = self.conv4(x_conv3) + + # for detection head + # [200, 176, 5] -> [200, 176, 2] 
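+        # conv_out only strides the height (D) axis, so the dense output can
+        # be flattened into BEV feature maps of C * D channels below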
+ out = self.conv_out(x_conv4) + spatial_features = out.dense() + + N, C, D, H, W = spatial_features.shape + spatial_features = spatial_features.view(N, C * D, H, W) + + return spatial_features + + def pre_act_block(self, + in_channels, + out_channels, + kernel_size, + indice_key=None, + stride=1, + padding=0, + conv_type='subm', + norm_cfg=None): + norm_name, norm_layer = build_norm_layer(norm_cfg, in_channels) + if conv_type == 'subm': + m = spconv.SparseSequential( + norm_layer, + nn.ReLU(inplace=True), + spconv.SubMConv3d( + in_channels, + out_channels, + kernel_size, + padding=padding, + bias=False, + indice_key=indice_key), + ) + elif conv_type == 'spconv': + m = spconv.SparseSequential( + norm_layer, + nn.ReLU(inplace=True), + spconv.SparseConv3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=False, + indice_key=indice_key), + ) + else: + raise NotImplementedError + return m + + def post_act_block(self, + in_channels, + out_channels, + kernel_size, + indice_key, + stride=1, + padding=0, + conv_type='subm', + norm_cfg=None): + norm_name, norm_layer = build_norm_layer(norm_cfg, out_channels) + if conv_type == 'subm': + m = spconv.SparseSequential( + spconv.SubMConv3d( + in_channels, + out_channels, + kernel_size, + bias=False, + indice_key=indice_key), + norm_layer, + nn.ReLU(inplace=True), + ) + elif conv_type == 'spconv': + m = spconv.SparseSequential( + spconv.SparseConv3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=False, + indice_key=indice_key), + norm_layer, + nn.ReLU(inplace=True), + ) + else: + raise NotImplementedError + return m diff --git a/mmdet3d/models/necks/__init__.py b/mmdet3d/models/necks/__init__.py new file mode 100644 index 0000000000..85904b497c --- /dev/null +++ b/mmdet3d/models/necks/__init__.py @@ -0,0 +1,4 @@ +from mmdet.models.necks.fpn import FPN +from .second_fpn import SECONDFPN + +__all__ = ['FPN', 'SECONDFPN'] diff --git a/mmdet3d/models/necks/second_fpn.py b/mmdet3d/models/necks/second_fpn.py new file mode 100644 index 0000000000..59d676ffea --- /dev/null +++ b/mmdet3d/models/necks/second_fpn.py @@ -0,0 +1,147 @@ +import logging +from functools import partial + +import torch +import torch.nn as nn +from mmcv.cnn import constant_init, kaiming_init +from mmcv.runner import load_checkpoint +from torch.nn import Sequential +from torch.nn.modules.batchnorm import _BatchNorm + +from .. import builder +from ..registry import NECKS +from ..utils import build_norm_layer + + +class Empty(nn.Module): + + def __init__(self, *args, **kwargs): + super(Empty, self).__init__() + + def forward(self, *args, **kwargs): + if len(args) == 1: + return args[0] + elif len(args) == 0: + return None + return args + + +@NECKS.register_module +class SECONDFPN(nn.Module): + """Compare with RPN, RPNV2 support arbitrary number of stage. 
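+    Each input feature map is upsampled by its own deblock (ConvTranspose2d,
+    optional norm, ReLU) and the outputs are concatenated along the channel
+    dimension.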
+ """ + + def __init__(self, + use_norm=True, + in_channels=[128, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[256, 256, 256], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01)): + # if for GroupNorm, + # cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True) + super(SECONDFPN, self).__init__() + assert len(num_upsample_filters) == len(upsample_strides) + self.in_channels = in_channels + + if norm_cfg is not None: + ConvTranspose2d = partial(nn.ConvTranspose2d, bias=False) + else: + ConvTranspose2d = partial(nn.ConvTranspose2d, bias=True) + + deblocks = [] + + for i, num_upsample_filter in enumerate(num_upsample_filters): + norm_layer = ( + build_norm_layer(norm_cfg, num_upsample_filter)[1] + if norm_cfg is not None else Empty) + deblock = Sequential( + ConvTranspose2d( + in_channels[i], + num_upsample_filter, + upsample_strides[i], + stride=upsample_strides[i]), + norm_layer, + nn.ReLU(inplace=True), + ) + deblocks.append(deblock) + self.deblocks = nn.ModuleList(deblocks) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + # keeping the initiation yields better results + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + return + + def forward(self, inputs): + assert len(inputs) == len(self.in_channels) + ups = [deblock(inputs[i]) for i, deblock in enumerate(self.deblocks)] + + if len(ups) > 1: + x = torch.cat(ups, dim=1) + else: + x = ups[0] + return [x] + + +@NECKS.register_module +class SECONDFusionFPN(SECONDFPN): + """Compare with RPN, RPNV2 support arbitrary number of stage. 
+ """ + + def __init__(self, + use_norm=True, + in_channels=[128, 128, 256], + upsample_strides=[1, 2, 4], + num_upsample_filters=[256, 256, 256], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + down_sample_rate=[40, 8, 8], + fusion_layer=None, + cat_points=False): + super(SECONDFusionFPN, self).__init__( + use_norm, + in_channels, + upsample_strides, + num_upsample_filters, + norm_cfg, + ) + self.fusion_layer = None + if fusion_layer is not None: + self.fusion_layer = builder.build_fusion_layer(fusion_layer) + self.cat_points = cat_points + self.down_sample_rate = down_sample_rate + + def forward(self, + inputs, + coors=None, + points=None, + img_feats=None, + img_meta=None): + assert len(inputs) == len(self.in_channels) + ups = [deblock(inputs[i]) for i, deblock in enumerate(self.deblocks)] + + if len(ups) > 1: + x = torch.cat(ups, dim=1) + else: + x = ups[0] + if (self.fusion_layer is not None and img_feats is not None): + downsample_pts_coors = torch.zeros_like(coors) + downsample_pts_coors[:, 0] = coors[:, 0] + downsample_pts_coors[:, 1] = ( + coors[:, 1] / self.down_sample_rate[0]) + downsample_pts_coors[:, 2] = ( + coors[:, 2] / self.down_sample_rate[1]) + downsample_pts_coors[:, 3] = ( + coors[:, 3] / self.down_sample_rate[2]) + # fusion for each point + x = self.fusion_layer(img_feats, points, x, downsample_pts_coors, + img_meta) + return [x] diff --git a/mmdet3d/models/registry.py b/mmdet3d/models/registry.py new file mode 100644 index 0000000000..9eb47d3ba6 --- /dev/null +++ b/mmdet3d/models/registry.py @@ -0,0 +1,5 @@ +from mmdet.utils import Registry + +VOXEL_ENCODERS = Registry('voxel_encoder') +MIDDLE_ENCODERS = Registry('middle_encoder') +FUSION_LAYERS = Registry('fusion_layer') diff --git a/mmdet3d/models/roi_extractors/__init__.py b/mmdet3d/models/roi_extractors/__init__.py new file mode 100644 index 0000000000..80c3c30f88 --- /dev/null +++ b/mmdet3d/models/roi_extractors/__init__.py @@ -0,0 +1,3 @@ +from mmdet.models.roi_extractors.single_level import SingleRoIExtractor + +__all__ = ['SingleRoIExtractor'] diff --git a/mmdet3d/models/utils/__init__.py b/mmdet3d/models/utils/__init__.py new file mode 100644 index 0000000000..8cd39f7324 --- /dev/null +++ b/mmdet3d/models/utils/__init__.py @@ -0,0 +1,3 @@ +from mmdet.models.utils import ResLayer, bias_init_with_prob + +__all__ = ['bias_init_with_prob', 'ResLayer'] diff --git a/mmdet3d/models/utils/weight_init.py b/mmdet3d/models/utils/weight_init.py new file mode 100644 index 0000000000..17d49880fd --- /dev/null +++ b/mmdet3d/models/utils/weight_init.py @@ -0,0 +1,46 @@ +import numpy as np +import torch.nn as nn + + +def xavier_init(module, gain=1, bias=0, distribution='normal'): + assert distribution in ['uniform', 'normal'] + if distribution == 'uniform': + nn.init.xavier_uniform_(module.weight, gain=gain) + else: + nn.init.xavier_normal_(module.weight, gain=gain) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) + + +def normal_init(module, mean=0, std=1, bias=0): + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) + + +def uniform_init(module, a=0, b=1, bias=0): + nn.init.uniform_(module.weight, a, b) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) + + +def kaiming_init(module, + mode='fan_out', + nonlinearity='relu', + bias=0, + distribution='normal'): + assert distribution in ['uniform', 'normal'] + if distribution == 'uniform': + nn.init.kaiming_uniform_( + module.weight, mode=mode, nonlinearity=nonlinearity) + 
else:
+        nn.init.kaiming_normal_(
+            module.weight, mode=mode, nonlinearity=nonlinearity)
+    if hasattr(module, 'bias'):
+        nn.init.constant_(module.bias, bias)
+
+
+def bias_init_with_prob(prior_prob):
+    """Initialize conv/fc bias value according to a given probability."""
+    bias_init = float(-np.log((1 - prior_prob) / prior_prob))
+    return bias_init
diff --git a/mmdet3d/models/voxel_encoders/__init__.py b/mmdet3d/models/voxel_encoders/__init__.py
new file mode 100644
index 0000000000..96f13579b8
--- /dev/null
+++ b/mmdet3d/models/voxel_encoders/__init__.py
@@ -0,0 +1,8 @@
+from .pillar_encoder import AlignedPillarFeatureNet, PillarFeatureNet
+from .voxel_encoder import (DynamicVFE, VoxelFeatureExtractor,
+                            VoxelFeatureExtractorV2, VoxelFeatureExtractorV3)
+
+__all__ = [
+    'PillarFeatureNet', 'AlignedPillarFeatureNet', 'VoxelFeatureExtractor',
+    'DynamicVFE', 'VoxelFeatureExtractorV2', 'VoxelFeatureExtractorV3'
+]
diff --git a/mmdet3d/models/voxel_encoders/pillar_encoder.py b/mmdet3d/models/voxel_encoders/pillar_encoder.py
new file mode 100644
index 0000000000..21cf57acf8
--- /dev/null
+++ b/mmdet3d/models/voxel_encoders/pillar_encoder.py
@@ -0,0 +1,378 @@
+import torch
+from torch import nn
+
+from mmdet3d.ops import DynamicScatter, build_norm_layer
+from ..registry import VOXEL_ENCODERS
+from .utils import PFNLayer, get_paddings_indicator
+
+
+@VOXEL_ENCODERS.register_module
+class PillarFeatureNet(nn.Module):
+
+    def __init__(self,
+                 num_input_features=4,
+                 use_norm=True,
+                 num_filters=(64, ),
+                 with_distance=False,
+                 with_cluster_center=True,
+                 with_voxel_center=True,
+                 voxel_size=(0.2, 0.2, 4),
+                 point_cloud_range=(0, -40, -3, 70.4, 40, 1),
+                 mode='max'):
+        """ Pillar Feature Net.
+        The network prepares the pillar features and performs forward pass
+        through PFNLayers.
+
+        Args:
+            num_input_features (int): Number of input features,
+                either x, y, z or x, y, z, r.
+            use_norm (bool): Whether to include BatchNorm.
+            num_filters (list[int]): Number of features in each of the
+                N PFNLayers.
+            with_distance (bool): Whether to include Euclidean distance
+                to points.
+            voxel_size (list[float]): Size of voxels, only utilize x and y
+                size.
+            point_cloud_range (list[float]): Point cloud range, only
+                utilize x and y min.
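+            with_cluster_center (bool): Whether to append, for each point,
+                its offset to the mean of the points in its pillar.
+            with_voxel_center (bool): Whether to append, for each point,
+                its offset to the center of its pillar.
+            mode (str): How points in a pillar are pooled by the PFNLayers,
+                'max' or 'avg'.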
+ """ + + super(PillarFeatureNet, self).__init__() + assert len(num_filters) > 0 + if with_cluster_center: + num_input_features += 3 + if with_voxel_center: + num_input_features += 2 + if with_distance: + num_input_features += 1 + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + + # Create PillarFeatureNet layers + self.num_input_features = num_input_features + num_filters = [num_input_features] + list(num_filters) + pfn_layers = [] + for i in range(len(num_filters) - 1): + in_filters = num_filters[i] + out_filters = num_filters[i + 1] + if i < len(num_filters) - 2: + last_layer = False + else: + last_layer = True + pfn_layers.append( + PFNLayer( + in_filters, + out_filters, + use_norm, + last_layer=last_layer, + mode=mode)) + self.pfn_layers = nn.ModuleList(pfn_layers) + + # Need pillar (voxel) size and x/y offset in order to calculate offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.point_cloud_range = point_cloud_range + + def forward(self, features, num_points, coors): + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + points_mean = features[:, :, :3].sum( + dim=1, keepdim=True) / num_points.type_as(features).view( + -1, 1, 1) + f_cluster = features[:, :, :3] - points_mean + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features[:, :, :2] + f_center[:, :, 0] = f_center[:, :, 0] - ( + coors[:, 3].type_as(features).unsqueeze(1) * self.vx + + self.x_offset) + f_center[:, :, 1] = f_center[:, :, 1] - ( + coors[:, 2].type_as(features).unsqueeze(1) * self.vy + + self.y_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + features = torch.cat(features_ls, dim=-1) + # The feature decorations were calculated without regard to whether + # pillar was empty. Need to ensure that + # empty pillars remain set to zeros. + voxel_count = features.shape[1] + mask = get_paddings_indicator(num_points, voxel_count, axis=0) + mask = torch.unsqueeze(mask, -1).type_as(features) + features *= mask + + for pfn in self.pfn_layers: + features = pfn(features, num_points) + + return features.squeeze() + + +@VOXEL_ENCODERS.register_module +class DynamicPillarFeatureNet(PillarFeatureNet): + + def __init__(self, + num_input_features=4, + use_norm=True, + num_filters=(64, ), + with_distance=False, + with_cluster_center=True, + with_voxel_center=True, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + mode='max'): + """ + Dynamic Pillar Feature Net for Dynamic Voxelization. 
+ The difference is in the forward part + """ + + super(DynamicPillarFeatureNet, self).__init__( + num_input_features, + use_norm, + num_filters, + with_distance, + with_cluster_center=with_cluster_center, + with_voxel_center=with_voxel_center, + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + mode=mode) + + num_filters = [self.num_input_features] + list(num_filters) + pfn_layers = [] + # TODO: currently only support one PFNLayer + + for i in range(len(num_filters) - 1): + in_filters = num_filters[i] + out_filters = num_filters[i + 1] + if i > 0: + in_filters *= 2 + norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) + pfn_layers.append( + nn.Sequential( + nn.Linear(in_filters, out_filters, bias=False), norm_layer, + nn.ReLU(inplace=True))) + self.num_pfn = len(pfn_layers) + self.pfn_layers = nn.ModuleList(pfn_layers) + self.pfn_scatter = DynamicScatter(voxel_size, point_cloud_range, + (mode != 'max')) + self.cluster_scatter = DynamicScatter( + voxel_size, point_cloud_range, average_points=True) + + def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): + # Step 1: scatter voxel into canvas + # Calculate necessary things for canvas creation + canvas_y = int( + (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy) + canvas_x = int( + (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx) + canvas_channel = voxel_mean.size(1) + batch_size = pts_coors[-1, 0] + 1 + canvas_len = canvas_y * canvas_x * batch_size + # Create the canvas for this sample + canvas = voxel_mean.new_zeros(canvas_channel, canvas_len) + # Only include non-empty pillars + indices = ( + voxel_coors[:, 0] * canvas_y * canvas_x + + voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]) + # Scatter the blob back to the canvas + canvas[:, indices.long()] = voxel_mean.t() + + # Step 2: get voxel mean for each point + voxel_index = ( + pts_coors[:, 0] * canvas_y * canvas_x + + pts_coors[:, 2] * canvas_x + pts_coors[:, 3]) + center_per_point = canvas[:, voxel_index.long()].t() + return center_per_point + + def forward(self, features, coors): + """ + features (torch.Tensor): NxC + coors (torch.Tensor): Nx(1+NDim) + """ + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + voxel_mean, mean_coors = self.cluster_scatter(features, coors) + points_mean = self.map_voxel_center_to_point( + coors, voxel_mean, mean_coors) + # TODO: maybe also do cluster for reflectivity + f_cluster = features[:, :3] - points_mean[:, :3] + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features.new_zeros(size=(features.size(0), 2)) + f_center[:, 0] = features[:, 0] - ( + coors[:, 3].type_as(features) * self.vx + self.x_offset) + f_center[:, 1] = features[:, 1] - ( + coors[:, 2].type_as(features) * self.vy + self.y_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + features = torch.cat(features_ls, dim=-1) + for i, pfn in enumerate(self.pfn_layers): + point_feats = pfn(features) + voxel_feats, voxel_coors = self.pfn_scatter(point_feats, coors) + if i != len(self.pfn_layers) - 1: + # need to concat voxel feats if it is not the last pfn + feat_per_point = self.map_voxel_center_to_point( + coors, voxel_feats, voxel_coors) + features = torch.cat([point_feats, feat_per_point], dim=1) + + return 
voxel_feats, voxel_coors + + +@VOXEL_ENCODERS.register_module +class AlignedPillarFeatureNet(nn.Module): + + def __init__(self, + num_input_features=4, + use_norm=True, + num_filters=(64, ), + with_distance=False, + with_cluster_center=True, + with_voxel_center=True, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + mode='max'): + """ Pillar Feature Net. + + The network prepares the pillar features and performs forward pass + through PFNLayers. + + Args: + num_input_features (int): Number of input features, either x, y, z + or x, y, z, r. + use_norm (bool): Whether to include BatchNorm. + num_filters (list[int]): Number of features in each of the N + PFNLayers. + with_distance (bool): Whether to include Euclidean distance to + points. + voxel_size (list[float]): Size of voxels, only utilize x and y + size. + point_cloud_range: (list[float]): Point cloud range, only + utilize x and y min. + """ + + super(AlignedPillarFeatureNet, self).__init__() + + assert len(num_filters) > 0 + if with_cluster_center: + print('Use cluster center') + num_input_features += 3 + if with_voxel_center: + print('Use voxel center') + num_input_features += 2 + if with_distance: + num_input_features += 1 + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + + # Create PillarFeatureNet layers + num_filters = [num_input_features] + list(num_filters) + pfn_layers = [] + for i in range(len(num_filters) - 1): + in_filters = num_filters[i] + out_filters = num_filters[i + 1] + if i < len(num_filters) - 2: + last_layer = False + else: + last_layer = True + pfn_layers.append( + PFNLayer( + in_filters, + out_filters, + use_norm, + last_layer=last_layer, + mode=mode)) + self.pfn_layers = nn.ModuleList(pfn_layers) + + # Need pillar (voxel) size and x/y offset in order to + # calculate pillar offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.vz = voxel_size[2] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.z_offset = self.vz / 2 + point_cloud_range[2] + + def forward(self, features, num_points, coors): + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + points_mean = features[:, :, :3].sum( + dim=1, keepdim=True) / num_points.type_as(features).view( + -1, 1, 1) + f_cluster = features[:, :, :3] - points_mean + features_ls.append(f_cluster) + + x_distance = features[:, :, 0] - ( + coors[:, 3].type_as(features).unsqueeze(1) * self.vx + + self.x_offset) + y_distance = features[:, :, 1] - ( + coors[:, 2].type_as(features).unsqueeze(1) * self.vy + + self.y_offset) + z_distance = features[:, :, 2] - ( + coors[:, 1].type_as(features).unsqueeze(1) * self.vz + + self.z_offset) + + normed_x_distance = 1 - torch.abs(x_distance / self.vx) + normed_y_distance = 1 - torch.abs(y_distance / self.vy) + normed_z_distance = 1 - torch.abs(z_distance / self.vz) + + x_mask = torch.gt(normed_x_distance, 0).type_as(features) + y_mask = torch.gt(normed_y_distance, 0).type_as(features) + z_mask = torch.gt(normed_z_distance, 0).type_as(features) + + nonzero_points_mask = x_mask.mul(y_mask).mul(z_mask) + aligned_distance = normed_x_distance.mul(normed_y_distance).mul( + normed_z_distance).mul(nonzero_points_mask) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features[:, :, :2] + f_center[:, :, 0] = f_center[:, :, 0] - ( + coors[:, 
3].type_as(features).unsqueeze(1) * self.vx + + self.x_offset) + f_center[:, :, 1] = f_center[:, :, 1] - ( + coors[:, 2].type_as(features).unsqueeze(1) * self.vy + + self.y_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + features = torch.cat(features_ls, dim=-1) + + # The feature decorations were calculated without regard to + # whether pillar was empty. Need to ensure that + # empty pillars remain set to zeros. + voxel_count = features.shape[1] + mask = get_paddings_indicator(num_points, voxel_count, axis=0) + mask = torch.unsqueeze(mask, -1).type_as(features) + features *= mask + + for pfn in self.pfn_layers: + if pfn.last_vfe: + features = pfn(features, aligned_distance) + else: + features = pfn(features) + + return features.squeeze() diff --git a/mmdet3d/models/voxel_encoders/utils.py b/mmdet3d/models/voxel_encoders/utils.py new file mode 100644 index 0000000000..c81a6b92fb --- /dev/null +++ b/mmdet3d/models/voxel_encoders/utils.py @@ -0,0 +1,148 @@ +import torch +from torch import nn +from torch.nn import functional as F + +from ..utils import build_norm_layer + + +class Empty(nn.Module): + + def __init__(self, *args, **kwargs): + super(Empty, self).__init__() + + def forward(self, *args, **kwargs): + if len(args) == 1: + return args[0] + elif len(args) == 0: + return None + return args + + +def get_paddings_indicator(actual_num, max_num, axis=0): + """Create boolean mask by actually number of a padded tensor. + + Args: + actual_num ([type]): [description] + max_num ([type]): [description] + + Returns: + [type]: [description] + """ + actual_num = torch.unsqueeze(actual_num, axis + 1) + # tiled_actual_num: [N, M, 1] + max_num_shape = [1] * len(actual_num.shape) + max_num_shape[axis + 1] = -1 + max_num = torch.arange( + max_num, dtype=torch.int, device=actual_num.device).view(max_num_shape) + # tiled_actual_num: [[3,3,3,3,3], [4,4,4,4,4], [2,2,2,2,2]] + # tiled_max_num: [[0,1,2,3,4], [0,1,2,3,4], [0,1,2,3,4]] + paddings_indicator = actual_num.int() > max_num + # paddings_indicator shape: [batch_size, max_num] + return paddings_indicator + + +class VFELayer(nn.Module): + + def __init__(self, + in_channels, + out_channels, + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + max_out=True, + cat_max=True): + super(VFELayer, self).__init__() + self.cat_max = cat_max + self.max_out = max_out + # self.units = int(out_channels / 2) + if norm_cfg: + norm_name, norm_layer = build_norm_layer(norm_cfg, out_channels) + self.norm = norm_layer + self.linear = nn.Linear(in_channels, out_channels, bias=False) + else: + self.norm = Empty(out_channels) + self.linear = nn.Linear(in_channels, out_channels, bias=True) + + def forward(self, inputs): + # [K, T, 7] tensordot [7, units] = [K, T, units] + voxel_count = inputs.shape[1] + x = self.linear(inputs) + x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2, + 1).contiguous() + pointwise = F.relu(x) + # [K, T, units] + if self.max_out: + aggregated = torch.max(pointwise, dim=1, keepdim=True)[0] + else: + # this is for fusion layer + return pointwise + + if not self.cat_max: + return aggregated.squeeze(1) + else: + # [K, 1, units] + repeated = aggregated.repeat(1, voxel_count, 1) + concatenated = torch.cat([pointwise, repeated], dim=2) + # [K, T, 2 * units] + return concatenated + + +class PFNLayer(nn.Module): + + def __init__(self, + in_channels, + out_channels, + use_norm=True, + 
last_layer=False,
+                 mode='max'):
+        """ Pillar Feature Net Layer.
+
+        The Pillar Feature Net is composed of a series of these layers, but the
+        PointPillars paper results only used a single PFNLayer.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            use_norm (bool): Whether to include BatchNorm.
+            last_layer (bool): If last_layer, there is no concatenation of
+                features.
+        """
+
+        super().__init__()
+        self.name = 'PFNLayer'
+        self.last_vfe = last_layer
+        if not self.last_vfe:
+            out_channels = out_channels // 2
+        self.units = out_channels
+
+        if use_norm:
+            self.norm = nn.BatchNorm1d(self.units, eps=1e-3, momentum=0.01)
+            self.linear = nn.Linear(in_channels, self.units, bias=False)
+        else:
+            self.norm = Empty(self.units)
+            self.linear = nn.Linear(in_channels, self.units, bias=True)
+
+        self.mode = mode
+
+    def forward(self, inputs, num_voxels=None, aligned_distance=None):
+
+        x = self.linear(inputs)
+        x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2,
+                                                               1).contiguous()
+        x = F.relu(x)
+
+        if self.mode == 'max':
+            if aligned_distance is not None:
+                x = x.mul(aligned_distance.unsqueeze(-1))
+            x_max = torch.max(x, dim=1, keepdim=True)[0]
+        elif self.mode == 'avg':
+            if aligned_distance is not None:
+                x = x.mul(aligned_distance.unsqueeze(-1))
+            x_max = x.sum(
+                dim=1, keepdim=True) / num_voxels.type_as(inputs).view(
+                    -1, 1, 1)
+
+        if self.last_vfe:
+            return x_max
+        else:
+            x_repeat = x_max.repeat(1, inputs.shape[1], 1)
+            x_concatenated = torch.cat([x, x_repeat], dim=2)
+            return x_concatenated
diff --git a/mmdet3d/models/voxel_encoders/voxel_encoder.py b/mmdet3d/models/voxel_encoders/voxel_encoder.py
new file mode 100644
index 0000000000..c8afaf2216
--- /dev/null
+++ b/mmdet3d/models/voxel_encoders/voxel_encoder.py
@@ -0,0 +1,478 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from mmdet3d.ops import DynamicScatter
+from .. 
import builder +from ..registry import VOXEL_ENCODERS +from ..utils import build_norm_layer +from .utils import Empty, VFELayer, get_paddings_indicator + + +@VOXEL_ENCODERS.register_module +class VoxelFeatureExtractor(nn.Module): + + def __init__(self, + num_input_features=4, + use_norm=True, + num_filters=[32, 128], + with_distance=False, + name='VoxelFeatureExtractor'): + super(VoxelFeatureExtractor, self).__init__() + self.name = name + assert len(num_filters) == 2 + num_input_features += 3 # add mean features + if with_distance: + num_input_features += 1 + self._with_distance = with_distance + self.vfe1 = VFELayer(num_input_features, num_filters[0], use_norm) + self.vfe2 = VFELayer(num_filters[0], num_filters[1], use_norm) + + if use_norm: + self.linear = nn.Linear(num_filters[1], num_filters[1], bias=False) + self.norm = nn.BatchNorm1d(num_filters[1], eps=1e-3, momentum=0.01) + else: + self.linear = nn.Linear(num_filters[1], num_filters[1], bias=True) + self.norm = Empty(num_filters[1]) + + def forward(self, features, num_voxels, **kwargs): + # features: [concated_num_points, num_voxel_size, 3(4)] + # num_voxels: [concated_num_points] + # t = time.time() + # torch.cuda.synchronize() + + points_mean = features[:, :, :3].sum( + dim=1, keepdim=True) / num_voxels.type_as(features).view(-1, 1, 1) + features_relative = features[:, :, :3] - points_mean + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) + features = torch.cat([features, features_relative, points_dist], + dim=-1) + else: + features = torch.cat([features, features_relative], dim=-1) + voxel_count = features.shape[1] + mask = get_paddings_indicator(num_voxels, voxel_count, axis=0) + mask = torch.unsqueeze(mask, -1).type_as(features) + # mask = features.max(dim=2, keepdim=True)[0] != 0 + + # torch.cuda.synchronize() + # print("vfe prep forward time", time.time() - t) + x = self.vfe1(features) + x *= mask + x = self.vfe2(x) + x *= mask + x = self.linear(x) + x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2, + 1).contiguous() + x = F.relu(x) + x *= mask + # x: [concated_num_points, num_voxel_size, 128] + voxelwise = torch.max(x, dim=1)[0] + return voxelwise + + +@VOXEL_ENCODERS.register_module +class VoxelFeatureExtractorV2(nn.Module): + + def __init__(self, + num_input_features=4, + use_norm=True, + num_filters=[32, 128], + with_distance=False, + name='VoxelFeatureExtractor'): + super(VoxelFeatureExtractorV2, self).__init__() + self.name = name + assert len(num_filters) > 0 + num_input_features += 3 + if with_distance: + num_input_features += 1 + self._with_distance = with_distance + + num_filters = [num_input_features] + num_filters + filters_pairs = [[num_filters[i], num_filters[i + 1]] + for i in range(len(num_filters) - 1)] + self.vfe_layers = nn.ModuleList( + [VFELayer(i, o, use_norm) for i, o in filters_pairs]) + + if use_norm: + self.linear = nn.Linear( + num_filters[-1], num_filters[-1], bias=False) + self.norm = nn.BatchNorm1d( + num_filters[-1], eps=1e-3, momentum=0.01) + else: + self.linear = nn.Linear( + num_filters[-1], num_filters[-1], bias=True) + self.norm = Empty(num_filters[-1]) + + def forward(self, features, num_voxels, **kwargs): + # features: [concated_num_points, num_voxel_size, 3(4)] + # num_voxels: [concated_num_points] + points_mean = features[:, :, :3].sum( + dim=1, keepdim=True) / num_voxels.type_as(features).view(-1, 1, 1) + features_relative = features[:, :, :3] - points_mean + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 
2, 2, keepdim=True) + features = torch.cat([features, features_relative, points_dist], + dim=-1) + else: + features = torch.cat([features, features_relative], dim=-1) + voxel_count = features.shape[1] + mask = get_paddings_indicator(num_voxels, voxel_count, axis=0) + mask = torch.unsqueeze(mask, -1).type_as(features) + for vfe in self.vfe_layers: + features = vfe(features) + features *= mask + features = self.linear(features) + features = self.norm(features.permute(0, 2, 1).contiguous()).permute( + 0, 2, 1).contiguous() + features = F.relu(features) + features *= mask + # x: [concated_num_points, num_voxel_size, 128] + voxelwise = torch.max(features, dim=1)[0] + return voxelwise + + +@VOXEL_ENCODERS.register_module +class VoxelFeatureExtractorV3(nn.Module): + + def __init__(self, + num_input_features=4, + use_norm=True, + num_filters=[32, 128], + with_distance=False, + name='VoxelFeatureExtractor'): + super(VoxelFeatureExtractorV3, self).__init__() + self.name = name + + def forward(self, features, num_points, coors): + # features: [concated_num_points, num_voxel_size, 3(4)] + # num_points: [concated_num_points] + points_mean = features[:, :, :4].sum( + dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1) + return points_mean.contiguous() + + +@VOXEL_ENCODERS.register_module +class DynamicVFEV3(nn.Module): + + def __init__(self, + num_input_features=4, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1)): + super(DynamicVFEV3, self).__init__() + self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) + + @torch.no_grad() + def forward(self, features, coors): + # This function is used from the start of the voxelnet + # num_points: [concated_num_points] + features, features_coors = self.scatter(features, coors) + return features, features_coors + + +@VOXEL_ENCODERS.register_module +class DynamicVFE(nn.Module): + + def __init__(self, + num_input_features=4, + num_filters=[], + with_distance=False, + with_cluster_center=False, + with_voxel_center=False, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + mode='max', + fusion_layer=None, + return_point_feats=False): + super(DynamicVFE, self).__init__() + assert len(num_filters) > 0 + if with_cluster_center: + num_input_features += 3 + if with_voxel_center: + num_input_features += 3 + if with_distance: + num_input_features += 3 + self.num_input_features = num_input_features + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + self.return_point_feats = return_point_feats + + # Need pillar (voxel) size and x/y offset in order to calculate offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.vz = voxel_size[2] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.z_offset = self.vz / 2 + point_cloud_range[2] + self.point_cloud_range = point_cloud_range + self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) + + num_filters = [self.num_input_features] + list(num_filters) + vfe_layers = [] + for i in range(len(num_filters) - 1): + in_filters = num_filters[i] + out_filters = num_filters[i + 1] + if i > 0: + in_filters *= 2 + norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) + vfe_layers.append( + nn.Sequential( + nn.Linear(in_filters, out_filters, bias=False), norm_layer, + nn.ReLU(inplace=True))) + self.vfe_layers = nn.ModuleList(vfe_layers) + 
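+        # From the second block on the input width is doubled (in_filters *= 2)
+        # because forward() concatenates each point feature with the pooled
+        # feature of its voxel before the next block.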
self.num_vfe = len(vfe_layers) + self.vfe_scatter = DynamicScatter(voxel_size, point_cloud_range, + (mode != 'max')) + self.cluster_scatter = DynamicScatter( + voxel_size, point_cloud_range, average_points=True) + self.fusion_layer = None + if fusion_layer is not None: + self.fusion_layer = builder.build_fusion_layer(fusion_layer) + + def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): + # Step 1: scatter voxel into canvas + # Calculate necessary things for canvas creation + canvas_z = int( + (self.point_cloud_range[5] - self.point_cloud_range[2]) / self.vz) + canvas_y = int( + (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy) + canvas_x = int( + (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx) + # canvas_channel = voxel_mean.size(1) + batch_size = pts_coors[-1, 0] + 1 + canvas_len = canvas_z * canvas_y * canvas_x * batch_size + # Create the canvas for this sample + canvas = voxel_mean.new_zeros(canvas_len, dtype=torch.long) + # Only include non-empty pillars + indices = ( + voxel_coors[:, 0] * canvas_z * canvas_y * canvas_x + + voxel_coors[:, 1] * canvas_y * canvas_x + + voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]) + # Scatter the blob back to the canvas + canvas[indices.long()] = torch.arange( + start=0, end=voxel_mean.size(0), device=voxel_mean.device) + + # Step 2: get voxel mean for each point + voxel_index = ( + pts_coors[:, 0] * canvas_z * canvas_y * canvas_x + + pts_coors[:, 1] * canvas_y * canvas_x + + pts_coors[:, 2] * canvas_x + pts_coors[:, 3]) + voxel_inds = canvas[voxel_index.long()] + center_per_point = voxel_mean[voxel_inds, ...] + return center_per_point + + def forward(self, + features, + coors, + points=None, + img_feats=None, + img_meta=None): + """ + features (torch.Tensor): NxC + coors (torch.Tensor): Nx(1+NDim) + """ + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + voxel_mean, mean_coors = self.cluster_scatter(features, coors) + points_mean = self.map_voxel_center_to_point( + coors, voxel_mean, mean_coors) + # TODO: maybe also do cluster for reflectivity + f_cluster = features[:, :3] - points_mean[:, :3] + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features.new_zeros(size=(features.size(0), 3)) + f_center[:, 0] = features[:, 0] - ( + coors[:, 3].type_as(features) * self.vx + self.x_offset) + f_center[:, 1] = features[:, 1] - ( + coors[:, 2].type_as(features) * self.vy + self.y_offset) + f_center[:, 2] = features[:, 2] - ( + coors[:, 1].type_as(features) * self.vz + self.z_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + features = torch.cat(features_ls, dim=-1) + for i, vfe in enumerate(self.vfe_layers): + point_feats = vfe(features) + if (i == len(self.vfe_layers) - 1 and self.fusion_layer is not None + and img_feats is not None): + point_feats = self.fusion_layer(img_feats, points, point_feats, + img_meta) + voxel_feats, voxel_coors = self.vfe_scatter(point_feats, coors) + if i != len(self.vfe_layers) - 1: + # need to concat voxel feats if it is not the last vfe + feat_per_point = self.map_voxel_center_to_point( + coors, voxel_feats, voxel_coors) + features = torch.cat([point_feats, feat_per_point], dim=1) + + if self.return_point_feats: + return point_feats + return voxel_feats, voxel_coors + + 
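+# HardVFE below is the hard-voxelization counterpart of DynamicVFE: it takes
+# padded (num_voxels, max_points, C) tensors together with per-voxel point
+# counts instead of a flat point list handled by DynamicScatter.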
+@VOXEL_ENCODERS.register_module +class HardVFE(nn.Module): + + def __init__(self, + num_input_features=4, + num_filters=[], + with_distance=False, + with_cluster_center=False, + with_voxel_center=False, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + mode='max', + fusion_layer=None, + return_point_feats=False): + super(HardVFE, self).__init__() + assert len(num_filters) > 0 + if with_cluster_center: + num_input_features += 3 + if with_voxel_center: + num_input_features += 3 + if with_distance: + num_input_features += 3 + self.num_input_features = num_input_features + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + self.return_point_feats = return_point_feats + + # Need pillar (voxel) size and x/y offset to calculate pillar offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.vz = voxel_size[2] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.z_offset = self.vz / 2 + point_cloud_range[2] + self.point_cloud_range = point_cloud_range + self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) + + num_filters = [self.num_input_features] + list(num_filters) + vfe_layers = [] + for i in range(len(num_filters) - 1): + in_filters = num_filters[i] + out_filters = num_filters[i + 1] + if i > 0: + in_filters *= 2 + # TODO: pass norm_cfg to VFE + # norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) + if i == (len(num_filters) - 2): + cat_max = False + max_out = True + if fusion_layer: + max_out = False + else: + max_out = True + cat_max = True + vfe_layers.append( + VFELayer( + in_filters, + out_filters, + norm_cfg=norm_cfg, + max_out=max_out, + cat_max=cat_max)) + self.vfe_layers = nn.ModuleList(vfe_layers) + self.num_vfe = len(vfe_layers) + + self.fusion_layer = None + if fusion_layer is not None: + self.fusion_layer = builder.build_fusion_layer(fusion_layer) + + def forward(self, + features, + num_points, + coors, + img_feats=None, + img_meta=None): + """ + features (torch.Tensor): NxMxC + coors (torch.Tensor): Nx(1+NDim) + """ + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + points_mean = ( + features[:, :, :3].sum(dim=1, keepdim=True) / + num_points.type_as(features).view(-1, 1, 1)) + # TODO: maybe also do cluster for reflectivity + f_cluster = features[:, :, :3] - points_mean + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features.new_zeros( + size=(features.size(0), features.size(1), 3)) + f_center[:, :, 0] = features[:, :, 0] - ( + coors[:, 3].type_as(features).unsqueeze(1) * self.vx + + self.x_offset) + f_center[:, :, 1] = features[:, :, 1] - ( + coors[:, 2].type_as(features).unsqueeze(1) * self.vy + + self.y_offset) + f_center[:, :, 2] = features[:, :, 2] - ( + coors[:, 1].type_as(features).unsqueeze(1) * self.vz + + self.z_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + voxel_feats = torch.cat(features_ls, dim=-1) + # The feature decorations were calculated without regard to whether + # pillar was empty. + # Need to ensure that empty voxels remain set to zeros. 
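+        # get_paddings_indicator marks the first num_points entries of every
+        # voxel as valid so the padded slots are zeroed out below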
+ voxel_count = voxel_feats.shape[1] + mask = get_paddings_indicator(num_points, voxel_count, axis=0) + voxel_feats *= mask.unsqueeze(-1).type_as(voxel_feats) + + for i, vfe in enumerate(self.vfe_layers): + voxel_feats = vfe(voxel_feats) + if torch.isnan(voxel_feats).any(): + import pdb + pdb.set_trace() + if (self.fusion_layer is not None and img_feats is not None): + voxel_feats = self.fusion_with_mask(features, mask, voxel_feats, + coors, img_feats, img_meta) + if torch.isnan(voxel_feats).any(): + import pdb + pdb.set_trace() + return voxel_feats + + def fusion_with_mask(self, features, mask, voxel_feats, coors, img_feats, + img_meta): + # the features is consist of a batch of points + batch_size = coors[-1, 0] + 1 + points = [] + for i in range(batch_size): + single_mask = (coors[:, 0] == i) + points.append(features[single_mask][mask[single_mask]]) + + point_feats = voxel_feats[mask] + if torch.isnan(point_feats).any(): + import pdb + pdb.set_trace() + point_feats = self.fusion_layer(img_feats, points, point_feats, + img_meta) + if torch.isnan(point_feats).any(): + import pdb + pdb.set_trace() + voxel_canvas = voxel_feats.new_zeros( + size=(voxel_feats.size(0), voxel_feats.size(1), + point_feats.size(-1))) + voxel_canvas[mask] = point_feats + out = torch.max(voxel_canvas, dim=1)[0] + if torch.isnan(out).any(): + import pdb + pdb.set_trace() + return out diff --git a/mmdet3d/ops/__init__.py b/mmdet3d/ops/__init__.py new file mode 100644 index 0000000000..6489651139 --- /dev/null +++ b/mmdet3d/ops/__init__.py @@ -0,0 +1,11 @@ +from mmdet.ops import (RoIAlign, SigmoidFocalLoss, build_norm_layer, + get_compiler_version, get_compiling_cuda_version, nms, + roi_align, sigmoid_focal_loss) +from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization + +__all__ = [ + 'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'get_compiler_version', + 'get_compiling_cuda_version', 'build_conv_layer', 'build_norm_layer', + 'batched_nms', 'Voxelization', 'voxelization', 'dynamic_scatter', + 'DynamicScatter', 'sigmoid_focal_loss', 'SigmoidFocalLoss' +] diff --git a/mmdet3d/ops/iou3d/__init__.py b/mmdet3d/ops/iou3d/__init__.py new file mode 100644 index 0000000000..df156f916f --- /dev/null +++ b/mmdet3d/ops/iou3d/__init__.py @@ -0,0 +1,4 @@ +from .iou3d_utils import (boxes_iou3d_gpu, boxes_iou_bev, nms_gpu, + nms_normal_gpu) + +__all__ = ['boxes_iou_bev', 'boxes_iou3d_gpu', 'nms_gpu', 'nms_normal_gpu'] diff --git a/mmdet3d/ops/iou3d/iou3d_utils.py b/mmdet3d/ops/iou3d/iou3d_utils.py new file mode 100644 index 0000000000..a12578e1de --- /dev/null +++ b/mmdet3d/ops/iou3d/iou3d_utils.py @@ -0,0 +1,113 @@ +import torch + +from . import iou3d_cuda + + +def boxes_iou_bev(boxes_a, boxes_b): + """ + :param boxes_a: (M, 5) + :param boxes_b: (N, 5) + :return: + ans_iou: (M, N) + """ + + ans_iou = torch.cuda.FloatTensor( + torch.Size((boxes_a.shape[0], boxes_b.shape[0]))).zero_() + + iou3d_cuda.boxes_iou_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(), + ans_iou) + + return ans_iou + + +def boxes_iou3d_gpu(boxes_a, boxes_b, mode='iou'): + """ + :param boxes_a: (N, 7) [x, y, z, h, w, l, ry] + :param boxes_b: (M, 7) [x, y, z, h, w, l, ry] + :param mode "iou" (intersection over union) or iof (intersection over + foreground). 
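+    As computed here, the vertical extent of each box spans from y - h to y
+    along the camera y axis.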
+ :return: + ans_iou: (M, N) + """ + boxes_a_bev = boxes3d_to_bev_torch(boxes_a) + boxes_b_bev = boxes3d_to_bev_torch(boxes_b) + + # bev overlap + overlaps_bev = torch.cuda.FloatTensor( + torch.Size((boxes_a.shape[0], boxes_b.shape[0]))).zero_() # (N, M) + iou3d_cuda.boxes_overlap_bev_gpu(boxes_a_bev.contiguous(), + boxes_b_bev.contiguous(), overlaps_bev) + + # height overlap + boxes_a_height_min = (boxes_a[:, 1] - boxes_a[:, 3]).view(-1, 1) + boxes_a_height_max = boxes_a[:, 1].view(-1, 1) + boxes_b_height_min = (boxes_b[:, 1] - boxes_b[:, 3]).view(1, -1) + boxes_b_height_max = boxes_b[:, 1].view(1, -1) + + max_of_min = torch.max(boxes_a_height_min, boxes_b_height_min) + min_of_max = torch.min(boxes_a_height_max, boxes_b_height_max) + overlaps_h = torch.clamp(min_of_max - max_of_min, min=0) + + # 3d iou + overlaps_3d = overlaps_bev * overlaps_h + + vol_a = (boxes_a[:, 3] * boxes_a[:, 4] * boxes_a[:, 5]).view(-1, 1) + vol_b = (boxes_b[:, 3] * boxes_b[:, 4] * boxes_b[:, 5]).view(1, -1) + + if mode == 'iou': + # the clamp func is used to avoid division of 0 + iou3d = overlaps_3d / torch.clamp( + vol_a + vol_b - overlaps_3d, min=1e-8) + else: + iou3d = overlaps_3d / torch.clamp(vol_a, min=1e-8) + + return iou3d + + +def nms_gpu(boxes, scores, thresh): + """ + :param boxes: (N, 5) [x1, y1, x2, y2, ry] + :param scores: (N) + :param thresh: + :return: + """ + # areas = (x2 - x1) * (y2 - y1) + order = scores.sort(0, descending=True)[1] + + boxes = boxes[order].contiguous() + + keep = torch.LongTensor(boxes.size(0)) + num_out = iou3d_cuda.nms_gpu(boxes, keep, thresh) + return order[keep[:num_out].cuda()].contiguous() + + +def nms_normal_gpu(boxes, scores, thresh): + """ + :param boxes: (N, 5) [x1, y1, x2, y2, ry] + :param scores: (N) + :param thresh: + :return: + """ + # areas = (x2 - x1) * (y2 - y1) + order = scores.sort(0, descending=True)[1] + + boxes = boxes[order].contiguous() + + keep = torch.LongTensor(boxes.size(0)) + num_out = iou3d_cuda.nms_normal_gpu(boxes, keep, thresh) + return order[keep[:num_out].cuda()].contiguous() + + +def boxes3d_to_bev_torch(boxes3d): + """ + :param boxes3d: (N, 7) [x, y, z, h, w, l, ry] in camera coords + :return: + boxes_bev: (N, 5) [x1, y1, x2, y2, ry] + """ + boxes_bev = boxes3d.new(torch.Size((boxes3d.shape[0], 5))) + cu, cv = boxes3d[:, 0], boxes3d[:, 2] + half_l, half_w = boxes3d[:, 5] / 2, boxes3d[:, 4] / 2 + boxes_bev[:, 0], boxes_bev[:, 1] = cu - half_l, cv - half_w + boxes_bev[:, 2], boxes_bev[:, 3] = cu + half_l, cv + half_w + boxes_bev[:, 4] = boxes3d[:, 6] + return boxes_bev diff --git a/mmdet3d/ops/iou3d/setup.py b/mmdet3d/ops/iou3d/setup.py new file mode 100644 index 0000000000..bd148e6ddc --- /dev/null +++ b/mmdet3d/ops/iou3d/setup.py @@ -0,0 +1,18 @@ +from setuptools import setup + +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name='iou3d', + ext_modules=[ + CUDAExtension( + 'iou3d_cuda', [ + 'src/iou3d.cpp', + 'src/iou3d_kernel.cu', + ], + extra_compile_args={ + 'cxx': ['-g', '-I /usr/local/cuda/include'], + 'nvcc': ['-O2'] + }) + ], + cmdclass={'build_ext': BuildExtension}) diff --git a/mmdet3d/ops/iou3d/src/iou3d.cpp b/mmdet3d/ops/iou3d/src/iou3d.cpp new file mode 100644 index 0000000000..2cf4b650c7 --- /dev/null +++ b/mmdet3d/ops/iou3d/src/iou3d.cpp @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include + +#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") +#define CHECK_INPUT(x) 
CHECK_CUDA(x);CHECK_CONTIGUOUS(x) + +#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) + +#define CHECK_ERROR(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; + + +void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap); +void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou); +void nmsLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh); +void nmsNormalLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh); + +int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_overlap){ + // params boxes_a: (N, 5) [x1, y1, x2, y2, ry] + // params boxes_b: (M, 5) + // params ans_overlap: (N, M) + + CHECK_INPUT(boxes_a); + CHECK_INPUT(boxes_b); + CHECK_INPUT(ans_overlap); + + int num_a = boxes_a.size(0); + int num_b = boxes_b.size(0); + + const float * boxes_a_data = boxes_a.data(); + const float * boxes_b_data = boxes_b.data(); + float * ans_overlap_data = ans_overlap.data(); + + boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_overlap_data); + + return 1; +} + +int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou){ + // params boxes_a: (N, 5) [x1, y1, x2, y2, ry] + // params boxes_b: (M, 5) + // params ans_overlap: (N, M) + + CHECK_INPUT(boxes_a); + CHECK_INPUT(boxes_b); + CHECK_INPUT(ans_iou); + + int num_a = boxes_a.size(0); + int num_b = boxes_b.size(0); + + const float * boxes_a_data = boxes_a.data(); + const float * boxes_b_data = boxes_b.data(); + float * ans_iou_data = ans_iou.data(); + + boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data); + + return 1; +} + +int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){ + // params boxes: (N, 5) [x1, y1, x2, y2, ry] + // params keep: (N) + + CHECK_INPUT(boxes); + CHECK_CONTIGUOUS(keep); + + int boxes_num = boxes.size(0); + const float * boxes_data = boxes.data(); + long * keep_data = keep.data(); + + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + + unsigned long long *mask_data = NULL; + CHECK_ERROR(cudaMalloc((void**)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long))); + nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh); + + // unsigned long long mask_cpu[boxes_num * col_blocks]; + // unsigned long long *mask_cpu = new unsigned long long [boxes_num * col_blocks]; + std::vector mask_cpu(boxes_num * col_blocks); + +// printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks); + CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long), + cudaMemcpyDeviceToHost)); + + cudaFree(mask_data); + + unsigned long long remv_cpu[col_blocks]; + memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long)); + + int num_to_keep = 0; + + for (int i = 0; i < boxes_num; i++){ + int nblock = i / THREADS_PER_BLOCK_NMS; + int inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_cpu[nblock] & (1ULL << inblock))){ + keep_data[num_to_keep++] = i; + unsigned long long *p = &mask_cpu[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++){ + remv_cpu[j] 
|= p[j]; + } + } + } + if ( cudaSuccess != cudaGetLastError() ) printf( "Error!\n" ); + + return num_to_keep; +} + + +int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){ + // params boxes: (N, 5) [x1, y1, x2, y2, ry] + // params keep: (N) + + CHECK_INPUT(boxes); + CHECK_CONTIGUOUS(keep); + + int boxes_num = boxes.size(0); + const float * boxes_data = boxes.data(); + long * keep_data = keep.data(); + + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + + unsigned long long *mask_data = NULL; + CHECK_ERROR(cudaMalloc((void**)&mask_data, boxes_num * col_blocks * sizeof(unsigned long long))); + nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh); + + // unsigned long long mask_cpu[boxes_num * col_blocks]; + // unsigned long long *mask_cpu = new unsigned long long [boxes_num * col_blocks]; + std::vector mask_cpu(boxes_num * col_blocks); + +// printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks); + CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(unsigned long long), + cudaMemcpyDeviceToHost)); + + cudaFree(mask_data); + + unsigned long long remv_cpu[col_blocks]; + memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long)); + + int num_to_keep = 0; + + for (int i = 0; i < boxes_num; i++){ + int nblock = i / THREADS_PER_BLOCK_NMS; + int inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_cpu[nblock] & (1ULL << inblock))){ + keep_data[num_to_keep++] = i; + unsigned long long *p = &mask_cpu[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++){ + remv_cpu[j] |= p[j]; + } + } + } + if ( cudaSuccess != cudaGetLastError() ) printf( "Error!\n" ); + + return num_to_keep; +} + + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu, "oriented boxes overlap"); + m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou"); + m.def("nms_gpu", &nms_gpu, "oriented nms gpu"); + m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu"); +} diff --git a/mmdet3d/ops/iou3d/src/iou3d_kernel.cu b/mmdet3d/ops/iou3d/src/iou3d_kernel.cu new file mode 100644 index 0000000000..7aac72ed03 --- /dev/null +++ b/mmdet3d/ops/iou3d/src/iou3d_kernel.cu @@ -0,0 +1,381 @@ +#include +#define THREADS_PER_BLOCK 16 +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +//#define DEBUG +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; +const float EPS = 1e-8; +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(double _x, double _y){ + x = _x, y = _y; + } + + __device__ void set(float _x, float _y){ + x = _x; y = _y; + } + + __device__ Point operator +(const Point &b)const{ + return Point(x + b.x, y + b.y); + } + + __device__ Point operator -(const Point &b)const{ + return Point(x - b.x, y - b.y); + } +}; + +__device__ inline float cross(const Point &a, const Point &b){ + return a.x * b.y - a.y * b.x; +} + +__device__ inline float cross(const Point &p1, const Point &p2, const Point &p0){ + return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); +} + +__device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){ + int ret = min(p1.x,p2.x) <= max(q1.x,q2.x) && + min(q1.x,q2.x) <= max(p1.x,p2.x) && + min(p1.y,p2.y) <= max(q1.y,q2.y) && + min(q1.y,q2.y) <= max(p1.y,p2.y); + return ret; +} + +__device__ inline int check_in_box2d(const float *box, const Point &p){ + //params: box (5) [x1, y1, x2, y2, angle] + const float MARGIN = 1e-5; + + float center_x = (box[0] + box[2]) / 2; + float 
center_y = (box[1] + box[3]) / 2; + float angle_cos = cos(-box[4]), angle_sin = sin(-box[4]); // rotate the point in the opposite direction of box + float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * angle_sin + center_x; + float rot_y = -(p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y; +#ifdef DEBUG + printf("box: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", box[0], box[1], box[2], box[3], box[4]); + printf("center: (%.3f, %.3f), cossin(%.3f, %.3f), src(%.3f, %.3f), rot(%.3f, %.3f)\n", center_x, center_y, + angle_cos, angle_sin, p.x, p.y, rot_x, rot_y); +#endif + return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN && rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN); +} + +__device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){ + // fast exclusion + if (check_rect_cross(p0, p1, q0, q1) == 0) return 0; + + // check cross standing + float s1 = cross(q0, p1, p0); + float s2 = cross(p1, q1, p0); + float s3 = cross(p0, q1, q0); + float s4 = cross(q1, p1, q0); + + if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; + + // calculate intersection of two lines + float s5 = cross(q1, p1, p0); + if(fabs(s5 - s1) > EPS){ + ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); + ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); + + } + else{ + float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; + float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; + float D = a0 * b1 - a1 * b0; + + ans.x = (b0 * c1 - b1 * c0) / D; + ans.y = (a1 * c0 - a0 * c1) / D; + } + + return 1; +} + +__device__ inline void rotate_around_center(const Point ¢er, const float angle_cos, const float angle_sin, Point &p){ + float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * angle_sin + center.x; + float new_y = -(p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + p.set(new_x, new_y); +} + +__device__ inline int point_cmp(const Point &a, const Point &b, const Point ¢er){ + return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x); +} + +__device__ inline float box_overlap(const float *box_a, const float *box_b){ + // params: box_a (5) [x1, y1, x2, y2, angle] + // params: box_b (5) [x1, y1, x2, y2, angle] + + float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3], a_angle = box_a[4]; + float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3], b_angle = box_b[4]; + + Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2); + Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2); +#ifdef DEBUG + printf("a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", a_x1, a_y1, a_x2, a_y2, a_angle, + b_x1, b_y1, b_x2, b_y2, b_angle); + printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y); +#endif + + Point box_a_corners[5]; + box_a_corners[0].set(a_x1, a_y1); + box_a_corners[1].set(a_x2, a_y1); + box_a_corners[2].set(a_x2, a_y2); + box_a_corners[3].set(a_x1, a_y2); + + Point box_b_corners[5]; + box_b_corners[0].set(b_x1, b_y1); + box_b_corners[1].set(b_x2, b_y1); + box_b_corners[2].set(b_x2, b_y2); + box_b_corners[3].set(b_x1, b_y2); + + // get oriented corners + float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); + float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); + + for (int k = 0; k < 4; k++){ +#ifdef DEBUG + printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); 
+#endif + rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); + rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); +#ifdef DEBUG + printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); +#endif + } + + box_a_corners[4] = box_a_corners[0]; + box_b_corners[4] = box_b_corners[0]; + + // get intersection of lines + Point cross_points[16]; + Point poly_center; + int cnt = 0, flag = 0; + + poly_center.set(0, 0); + for (int i = 0; i < 4; i++){ + for (int j = 0; j < 4; j++){ + flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]); + if (flag){ + poly_center = poly_center + cross_points[cnt]; + cnt++; + } + } + } + + // check corners + for (int k = 0; k < 4; k++){ + if (check_in_box2d(box_a, box_b_corners[k])){ + poly_center = poly_center + box_b_corners[k]; + cross_points[cnt] = box_b_corners[k]; + cnt++; + } + if (check_in_box2d(box_b, box_a_corners[k])){ + poly_center = poly_center + box_a_corners[k]; + cross_points[cnt] = box_a_corners[k]; + cnt++; + } + } + + poly_center.x /= cnt; + poly_center.y /= cnt; + + // sort the points of polygon + Point temp; + for (int j = 0; j < cnt - 1; j++){ + for (int i = 0; i < cnt - j - 1; i++){ + if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){ + temp = cross_points[i]; + cross_points[i] = cross_points[i + 1]; + cross_points[i + 1] = temp; + } + } + } + +#ifdef DEBUG + printf("cnt=%d\n", cnt); + for (int i = 0; i < cnt; i++){ + printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y); + } +#endif + + // get the overlap areas + float area = 0; + for (int k = 0; k < cnt - 1; k++){ + area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]); + } + + return fabs(area) / 2.0; +} + +__device__ inline float iou_bev(const float *box_a, const float *box_b){ + // params: box_a (5) [x1, y1, x2, y2, angle] + // params: box_b (5) [x1, y1, x2, y2, angle] + float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]); + float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]); + float s_overlap = box_overlap(box_a, box_b); + return s_overlap / fmaxf(sa + sb - s_overlap, EPS); +} + +__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){ + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b){ + return; + } + const float * cur_box_a = boxes_a + a_idx * 5; + const float * cur_box_b = boxes_b + b_idx * 5; + float s_overlap = box_overlap(cur_box_a, cur_box_b); + ans_overlap[a_idx * num_b + b_idx] = s_overlap; +} + +__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){ + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b){ + return; + } + + const float * cur_box_a = boxes_a + a_idx * 5; + const float * cur_box_b = boxes_b + b_idx * 5; + float cur_iou_bev = iou_bev(cur_box_a, cur_box_b); + ans_iou[a_idx * num_b + b_idx] = cur_iou_bev; +} + +__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh, + const float *boxes, unsigned long long *mask){ + //params: boxes (N, 5) [x1, y1, x2, y2, ry] + //params: mask 
(N, N/THREADS_PER_BLOCK_NMS) + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 5; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh){ + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + + +__device__ inline float iou_normal(float const * const a, float const * const b) { + float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); + float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); + float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0]) * (a[3] - a[1]); + float Sb = (b[2] - b[0]) * (b[3] - b[1]); + return interS / fmaxf(Sa + Sb - interS, EPS); +} + + +__global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_thresh, + const float *boxes, unsigned long long *mask){ + //params: boxes (N, 5) [x1, y1, x2, y2, ry] + //params: mask (N, N/THREADS_PER_BLOCK_NMS) + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 5; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh){ + t 
|= 1ULL << i; + } + } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + + + + + +void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){ + + dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK); + + boxes_overlap_kernel<<>>(num_a, boxes_a, num_b, boxes_b, ans_overlap); +#ifdef DEBUG + cudaDeviceSynchronize(); // for using printf in kernel function +#endif +} + +void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){ + + dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row) + dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK); + + boxes_iou_bev_kernel<<>>(num_a, boxes_a, num_b, boxes_b, ans_iou); +} + + +void nmsLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh){ + dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), + DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); + dim3 threads(THREADS_PER_BLOCK_NMS); + nms_kernel<<>>(boxes_num, nms_overlap_thresh, boxes, mask); +} + + +void nmsNormalLauncher(const float *boxes, unsigned long long * mask, int boxes_num, float nms_overlap_thresh){ + dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), + DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); + dim3 threads(THREADS_PER_BLOCK_NMS); + nms_normal_kernel<<>>(boxes_num, nms_overlap_thresh, boxes, mask); +} diff --git a/mmdet3d/ops/norm.py b/mmdet3d/ops/norm.py new file mode 100644 index 0000000000..c054e62e45 --- /dev/null +++ b/mmdet3d/ops/norm.py @@ -0,0 +1,10 @@ +import torch.nn as nn + +from mmdet.ops.norm import norm_cfg +from .sync_bn import NaiveSyncBatchNorm1d, NaiveSyncBatchNorm2d + +norm_cfg.update({ + 'BN1d': ('bn', nn.BatchNorm1d), + 'naiveSyncBN2d': ('bn', NaiveSyncBatchNorm2d), + 'naiveSyncBN1d': ('bn', NaiveSyncBatchNorm1d), +}) diff --git a/mmdet3d/ops/spconv/__init__.py b/mmdet3d/ops/spconv/__init__.py new file mode 100644 index 0000000000..20214baa7d --- /dev/null +++ b/mmdet3d/ops/spconv/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2019 Yan Yan +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
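+
+# Illustrative usage sketch for the spconv classes re-exported below. Everything in
+# this comment is an assumption for demonstration only (the import path, tensor
+# shapes, spatial_shape and the .cuda() calls are not defined by this patch):
+#
+#     import torch
+#     from mmdet3d.ops import spconv
+#
+#     features = torch.randn(1000, 4).cuda()                        # (num_voxels, C)
+#     coors = torch.randint(0, 40, (1000, 4), dtype=torch.int32).cuda()
+#     coors[:, 0] = 0                                               # (batch_idx, z, y, x), one sample
+#     x = spconv.SparseConvTensor(features, coors, [41, 1600, 1408], 1)
+#     conv = spconv.SubMConv3d(4, 16, 3, padding=1, indice_key='subm1').cuda()
+#     out = conv(x)   # another SparseConvTensor with the same active sites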
+ +from .conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d, + SparseConvTranspose3d, SparseInverseConv2d, + SparseInverseConv3d, SubMConv2d, SubMConv3d) +from .modules import SparseModule, SparseSequential +from .pool import SparseMaxPool2d, SparseMaxPool3d +from .structure import SparseConvTensor, scatter_nd + +__all__ = [ + 'SparseConv2d', + 'SparseConv3d', + 'SubMConv2d', + 'SubMConv3d', + 'SparseConvTranspose2d', + 'SparseConvTranspose3d', + 'SparseInverseConv2d', + 'SparseInverseConv3d', + 'SparseModule', + 'SparseSequential', + 'SparseMaxPool2d', + 'SparseMaxPool3d', + 'SparseConvTensor', + 'scatter_nd', +] diff --git a/mmdet3d/ops/spconv/conv.py b/mmdet3d/ops/spconv/conv.py new file mode 100644 index 0000000000..3655749fe6 --- /dev/null +++ b/mmdet3d/ops/spconv/conv.py @@ -0,0 +1,446 @@ +# Copyright 2019 Yan Yan +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +import torch +from torch.nn import init +from torch.nn.parameter import Parameter + +from . import functional as Fsp +from . import ops +from .modules import SparseModule +from .structure import SparseConvTensor + + +def _calculate_fan_in_and_fan_out_hwio(tensor): + dimensions = tensor.ndimension() + if dimensions < 2: + raise ValueError('fan in and fan out can not be computed for tensor' + 'with fewer than 2 dimensions') + + if dimensions == 2: # Linear + fan_in = tensor.size(-2) + fan_out = tensor.size(-1) + else: + num_input_fmaps = tensor.size(-2) + num_output_fmaps = tensor.size(-1) + receptive_field_size = 1 + if tensor.dim() > 2: + receptive_field_size = tensor[..., 0, 0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +class SparseConvolution(SparseModule): + + def __init__(self, + ndim, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + subm=False, + output_padding=0, + transposed=False, + inverse=False, + indice_key=None, + fused_bn=False): + super(SparseConvolution, self).__init__() + assert groups == 1 + if not isinstance(kernel_size, (list, tuple)): + kernel_size = [kernel_size] * ndim + if not isinstance(stride, (list, tuple)): + stride = [stride] * ndim + if not isinstance(padding, (list, tuple)): + padding = [padding] * ndim + if not isinstance(dilation, (list, tuple)): + dilation = [dilation] * ndim + if not isinstance(output_padding, (list, tuple)): + output_padding = [output_padding] * ndim + + for d, s in zip(dilation, stride): + assert any([s == 1, d == 1]), "don't support this." 
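+        # The loop above enforces that, for every spatial dimension, stride and
+        # dilation are never both greater than 1; that combination is unsupported.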
+ + self.ndim = ndim + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.conv1x1 = np.prod(kernel_size) == 1 + self.stride = stride + self.padding = padding + self.dilation = dilation + self.transposed = transposed + self.inverse = inverse + self.output_padding = output_padding + self.groups = groups + self.subm = subm + self.indice_key = indice_key + self.fused_bn = fused_bn + + self.weight = Parameter( + torch.Tensor(*kernel_size, in_channels, out_channels)) + if bias: + self.bias = Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + self.reset_parameters() + + def reset_parameters(self): + init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + if self.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight) + bound = 1 / math.sqrt(fan_in) + init.uniform_(self.bias, -bound, bound) + + def forward(self, input): + assert isinstance(input, SparseConvTensor) + features = input.features + device = features.device + indices = input.indices + spatial_shape = input.spatial_shape + batch_size = input.batch_size + if not self.subm: + if self.transposed: + out_spatial_shape = ops.get_deconv_output_size( + spatial_shape, self.kernel_size, self.stride, self.padding, + self.dilation, self.output_padding) + else: + out_spatial_shape = ops.get_conv_output_size( + spatial_shape, self.kernel_size, self.stride, self.padding, + self.dilation) + + else: + out_spatial_shape = spatial_shape + # input.update_grid(out_spatial_shape) + # t = time.time() + if self.conv1x1: + features = torch.mm( + input.features, + self.weight.view(self.in_channels, self.out_channels)) + if self.bias is not None: + features += self.bias + out_tensor = SparseConvTensor(features, input.indices, + input.spatial_shape, + input.batch_size) + out_tensor.indice_dict = input.indice_dict + out_tensor.grid = input.grid + return out_tensor + datas = input.find_indice_pair(self.indice_key) + if self.inverse: + assert datas is not None and self.indice_key is not None + _, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas + assert indice_pairs.shape[0] == np.prod( + self.kernel_size + ), 'inverse conv must have same kernel size as its couple conv' + else: + if self.indice_key is not None and datas is not None: + outids, _, indice_pairs, indice_pair_num, _ = datas + else: + outids, indice_pairs, indice_pair_num = ops.get_indice_pairs( + indices, + batch_size, + spatial_shape, + self.kernel_size, + self.stride, + self.padding, + self.dilation, + self.output_padding, + self.subm, + self.transposed, + grid=input.grid) + input.indice_dict[self.indice_key] = (outids, indices, + indice_pairs, + indice_pair_num, + spatial_shape) + if self.fused_bn: + assert self.bias is not None + out_features = ops.fused_indice_conv(features, self.weight, + self.bias, + indice_pairs.to(device), + indice_pair_num, + outids.shape[0], self.inverse, + self.subm) + else: + if self.subm: + out_features = Fsp.indice_subm_conv(features, self.weight, + indice_pairs.to(device), + indice_pair_num, + outids.shape[0]) + else: + if self.inverse: + out_features = Fsp.indice_inverse_conv( + features, self.weight, indice_pairs.to(device), + indice_pair_num, outids.shape[0]) + else: + out_features = Fsp.indice_conv(features, self.weight, + indice_pairs.to(device), + indice_pair_num, + outids.shape[0]) + + if self.bias is not None: + out_features += self.bias + out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape, + batch_size) + 
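+        # Propagate the cached indice pairs and the dense grid so that later layers
+        # sharing the same indice_key can reuse them instead of recomputing.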
out_tensor.indice_dict = input.indice_dict + out_tensor.grid = input.grid + return out_tensor + + +class SparseConv2d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SparseConv2d, self).__init__( + 2, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + indice_key=indice_key) + + +class SparseConv3d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SparseConv3d, self).__init__( + 3, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + indice_key=indice_key) + + +class SparseConv4d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SparseConv4d, self).__init__( + 4, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + indice_key=indice_key) + + +class SparseConvTranspose2d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SparseConvTranspose2d, self).__init__( + 2, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + transposed=True, + indice_key=indice_key) + + +class SparseConvTranspose3d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SparseConvTranspose3d, self).__init__( + 3, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + transposed=True, + indice_key=indice_key) + + +class SparseInverseConv2d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + indice_key, + bias=True): + super(SparseInverseConv2d, self).__init__( + 2, + in_channels, + out_channels, + kernel_size, + bias=bias, + inverse=True, + indice_key=indice_key) + + +class SparseInverseConv3d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + indice_key, + bias=True): + super(SparseInverseConv3d, self).__init__( + 3, + in_channels, + out_channels, + kernel_size, + bias=bias, + inverse=True, + indice_key=indice_key) + + +class SubMConv2d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SubMConv2d, self).__init__( + 2, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + True, + indice_key=indice_key) + + +class SubMConv3d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SubMConv3d, self).__init__( + 3, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + True, + indice_key=indice_key) + + +class SubMConv4d(SparseConvolution): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + indice_key=None): + super(SubMConv4d, self).__init__( + 4, + 
in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + True, + indice_key=indice_key) diff --git a/mmdet3d/ops/spconv/functional.py b/mmdet3d/ops/spconv/functional.py new file mode 100644 index 0000000000..92daf190dc --- /dev/null +++ b/mmdet3d/ops/spconv/functional.py @@ -0,0 +1,98 @@ +# Copyright 2019 Yan Yan +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from torch.autograd import Function + +from . import ops as ops + + +class SparseConvFunction(Function): + + @staticmethod + def forward(ctx, features, filters, indice_pairs, indice_pair_num, + num_activate_out): + ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters) + return ops.indice_conv(features, filters, indice_pairs, + indice_pair_num, num_activate_out, False) + + @staticmethod + def backward(ctx, grad_output): + indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors + input_bp, filters_bp = ops.indice_conv_backward( + features, filters, grad_output, indice_pairs, indice_pair_num, + False) + + return input_bp, filters_bp, None, None, None + + +class SparseInverseConvFunction(Function): + + @staticmethod + def forward(ctx, features, filters, indice_pairs, indice_pair_num, + num_activate_out): + ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters) + return ops.indice_conv(features, filters, indice_pairs, + indice_pair_num, num_activate_out, True, False) + + @staticmethod + def backward(ctx, grad_output): + indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors + input_bp, filters_bp = ops.indice_conv_backward( + features, filters, grad_output, indice_pairs, indice_pair_num, + True, False) + + return input_bp, filters_bp, None, None, None + + +class SubMConvFunction(Function): + + @staticmethod + def forward(ctx, features, filters, indice_pairs, indice_pair_num, + num_activate_out): + ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters) + return ops.indice_conv(features, filters, indice_pairs, + indice_pair_num, num_activate_out, False, True) + + @staticmethod + def backward(ctx, grad_output): + indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors + input_bp, filters_bp = ops.indice_conv_backward( + features, filters, grad_output, indice_pairs, indice_pair_num, + False, True) + + return input_bp, filters_bp, None, None, None + + +class SparseMaxPoolFunction(Function): + + @staticmethod + def forward(ctx, features, indice_pairs, indice_pair_num, + num_activate_out): + out = ops.indice_maxpool(features, indice_pairs, indice_pair_num, + num_activate_out) + ctx.save_for_backward(indice_pairs, indice_pair_num, features, out) + return out + + @staticmethod + def backward(ctx, grad_output): + indice_pairs, indice_pair_num, features, out = ctx.saved_tensors + input_bp = ops.indice_maxpool_backward(features, out, grad_output, + indice_pairs, indice_pair_num) + return input_bp, None, None, None + + +indice_conv = SparseConvFunction.apply +indice_inverse_conv = SparseInverseConvFunction.apply 
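+# The .apply aliases collected here are the autograd-aware entry points used by
+# SparseConvolution.forward: indice_conv / indice_inverse_conv for regular and
+# inverse sparse convolution, indice_subm_conv for submanifold convolution and
+# indice_maxpool for sparse max pooling.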
+indice_subm_conv = SubMConvFunction.apply +indice_maxpool = SparseMaxPoolFunction.apply diff --git a/mmdet3d/ops/spconv/include/paramsgrid.h b/mmdet3d/ops/spconv/include/paramsgrid.h new file mode 100644 index 0000000000..9dafd417af --- /dev/null +++ b/mmdet3d/ops/spconv/include/paramsgrid.h @@ -0,0 +1,62 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PARAMS_GRID_H_ +#define PARAMS_GRID_H_ +#include +#include + +namespace detail { +template int getTotalSize(std::vector arg) { return arg.size(); } + +template +int getTotalSize(std::vector arg, std::vector... args) { + return arg.size() * getTotalSize(args...); +} +template int getSize(std::vector arg) { return arg.size(); } + +template +void assigner(TT &src, std::vector counter, std::vector &arg) { + std::get(src) = arg[counter[Idx]]; +} + +template +void assigner(TT &src, std::vector counter, std::vector &arg, + std::vector &... args) { + std::get(src) = arg[counter[Idx]]; + assigner(src, counter, args...); +} +} // namespace detail +template +std::vector> paramsGrid(std::vector... args) { + int length = detail::getTotalSize(args...); + std::vector sizes = {detail::getSize(args)...}; + int size = sizes.size(); + + std::vector> params(length); + std::vector counter(size); + for (int i = 0; i < length; ++i) { + detail::assigner<0>(params[i], counter, args...); + counter[size - 1] += 1; + for (int c = size - 1; c >= 0; --c) { + if (counter[c] == sizes[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return params; +} + +#endif diff --git a/mmdet3d/ops/spconv/include/prettyprint.h b/mmdet3d/ops/spconv/include/prettyprint.h new file mode 100644 index 0000000000..0bc06189f3 --- /dev/null +++ b/mmdet3d/ops/spconv/include/prettyprint.h @@ -0,0 +1,445 @@ +// Copyright Louis Delacroix 2010 - 2014. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// A pretty printing library for C++ +// +// Usage: +// Include this header, and operator<< will "just work". + +#ifndef H_PRETTY_PRINT +#define H_PRETTY_PRINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace pretty_print +{ + namespace detail + { + // SFINAE type trait to detect whether T::const_iterator exists. 
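+        // The sizeof(yes)/sizeof(no) trick below distinguishes types that expose a
+        // const_iterator and const begin()/end() members, which is what is_container
+        // further down keys on.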
+ + struct sfinae_base + { + using yes = char; + using no = yes[2]; + }; + + template + struct has_const_iterator : private sfinae_base + { + private: + template static yes & test(typename C::const_iterator*); + template static no & test(...); + public: + static const bool value = sizeof(test(nullptr)) == sizeof(yes); + using type = T; + }; + + template + struct has_begin_end : private sfinae_base + { + private: + template + static yes & f(typename std::enable_if< + std::is_same(&C::begin)), + typename C::const_iterator(C::*)() const>::value>::type *); + + template static no & f(...); + + template + static yes & g(typename std::enable_if< + std::is_same(&C::end)), + typename C::const_iterator(C::*)() const>::value, void>::type*); + + template static no & g(...); + + public: + static bool const beg_value = sizeof(f(nullptr)) == sizeof(yes); + static bool const end_value = sizeof(g(nullptr)) == sizeof(yes); + }; + + } // namespace detail + + + // Holds the delimiter values for a specific character type + + template + struct delimiters_values + { + using char_type = TChar; + const char_type * prefix; + const char_type * delimiter; + const char_type * postfix; + }; + + + // Defines the delimiter values for a specific container and character type + + template + struct delimiters + { + using type = delimiters_values; + static const type values; + }; + + + // Functor to print containers. You can use this directly if you want + // to specificy a non-default delimiters type. The printing logic can + // be customized by specializing the nested template. + + template , + typename TDelimiters = delimiters> + struct print_container_helper + { + using delimiters_type = TDelimiters; + using ostream_type = std::basic_ostream; + + template + struct printer + { + static void print_body(const U & c, ostream_type & stream) + { + using std::begin; + using std::end; + + auto it = begin(c); + const auto the_end = end(c); + + if (it != the_end) + { + for ( ; ; ) + { + stream << *it; + + if (++it == the_end) break; + + if (delimiters_type::values.delimiter != NULL) + stream << delimiters_type::values.delimiter; + } + } + } + }; + + print_container_helper(const T & container) + : container_(container) + { } + + inline void operator()(ostream_type & stream) const + { + if (delimiters_type::values.prefix != NULL) + stream << delimiters_type::values.prefix; + + printer::print_body(container_, stream); + + if (delimiters_type::values.postfix != NULL) + stream << delimiters_type::values.postfix; + } + + private: + const T & container_; + }; + + // Specialization for pairs + + template + template + struct print_container_helper::printer> + { + using ostream_type = typename print_container_helper::ostream_type; + + static void print_body(const std::pair & c, ostream_type & stream) + { + stream << c.first; + if (print_container_helper::delimiters_type::values.delimiter != NULL) + stream << print_container_helper::delimiters_type::values.delimiter; + stream << c.second; + } + }; + + // Specialization for tuples + + template + template + struct print_container_helper::printer> + { + using ostream_type = typename print_container_helper::ostream_type; + using element_type = std::tuple; + + template struct Int { }; + + static void print_body(const element_type & c, ostream_type & stream) + { + tuple_print(c, stream, Int<0>()); + } + + static void tuple_print(const element_type &, ostream_type &, Int) + { + } + + static void tuple_print(const element_type & c, ostream_type & stream, + typename std::conditional, 
std::nullptr_t>::type) + { + stream << std::get<0>(c); + tuple_print(c, stream, Int<1>()); + } + + template + static void tuple_print(const element_type & c, ostream_type & stream, Int) + { + if (print_container_helper::delimiters_type::values.delimiter != NULL) + stream << print_container_helper::delimiters_type::values.delimiter; + + stream << std::get(c); + + tuple_print(c, stream, Int()); + } + }; + + // Prints a print_container_helper to the specified stream. + + template + inline std::basic_ostream & operator<<( + std::basic_ostream & stream, + const print_container_helper & helper) + { + helper(stream); + return stream; + } + + + // Basic is_container template; specialize to derive from std::true_type for all desired container types + + template + struct is_container : public std::integral_constant::value && + detail::has_begin_end::beg_value && + detail::has_begin_end::end_value> { }; + + template + struct is_container : std::true_type { }; + + template + struct is_container : std::false_type { }; + + template + struct is_container> : std::true_type { }; + + template + struct is_container> : std::true_type { }; + + template + struct is_container> : std::true_type { }; + + + // Default delimiters + + template struct delimiters { static const delimiters_values values; }; + template const delimiters_values delimiters::values = { "[", ", ", "]" }; + template struct delimiters { static const delimiters_values values; }; + template const delimiters_values delimiters::values = { L"[", L", ", L"]" }; + + + // Delimiters for (multi)set and unordered_(multi)set + + template + struct delimiters< ::std::set, char> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::set, char>::values = { "{", ", ", "}" }; + + template + struct delimiters< ::std::set, wchar_t> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::set, wchar_t>::values = { L"{", L", ", L"}" }; + + template + struct delimiters< ::std::multiset, char> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::multiset, char>::values = { "{", ", ", "}" }; + + template + struct delimiters< ::std::multiset, wchar_t> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::multiset, wchar_t>::values = { L"{", L", ", L"}" }; + + template + struct delimiters< ::std::unordered_set, char> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::unordered_set, char>::values = { "{", ", ", "}" }; + + template + struct delimiters< ::std::unordered_set, wchar_t> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::unordered_set, wchar_t>::values = { L"{", L", ", L"}" }; + + template + struct delimiters< ::std::unordered_multiset, char> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::unordered_multiset, char>::values = { "{", ", ", "}" }; + + template + struct delimiters< ::std::unordered_multiset, wchar_t> { static const delimiters_values values; }; + + template + const delimiters_values delimiters< ::std::unordered_multiset, wchar_t>::values = { L"{", L", ", L"}" }; + + + // Delimiters for pair and tuple + + template struct delimiters, char> { static const delimiters_values values; }; + template const delimiters_values delimiters, char>::values = { "(", ", ", ")" }; + template struct delimiters< ::std::pair, 
wchar_t> { static const delimiters_values values; }; + template const delimiters_values delimiters< ::std::pair, wchar_t>::values = { L"(", L", ", L")" }; + + template struct delimiters, char> { static const delimiters_values values; }; + template const delimiters_values delimiters, char>::values = { "(", ", ", ")" }; + template struct delimiters< ::std::tuple, wchar_t> { static const delimiters_values values; }; + template const delimiters_values delimiters< ::std::tuple, wchar_t>::values = { L"(", L", ", L")" }; + + + // Type-erasing helper class for easy use of custom delimiters. + // Requires TCharTraits = std::char_traits and TChar = char or wchar_t, and MyDelims needs to be defined for TChar. + // Usage: "cout << pretty_print::custom_delims(x)". + + struct custom_delims_base + { + virtual ~custom_delims_base() { } + virtual std::ostream & stream(::std::ostream &) = 0; + virtual std::wostream & stream(::std::wostream &) = 0; + }; + + template + struct custom_delims_wrapper : custom_delims_base + { + custom_delims_wrapper(const T & t_) : t(t_) { } + + std::ostream & stream(std::ostream & s) + { + return s << print_container_helper, Delims>(t); + } + + std::wostream & stream(std::wostream & s) + { + return s << print_container_helper, Delims>(t); + } + + private: + const T & t; + }; + + template + struct custom_delims + { + template + custom_delims(const Container & c) : base(new custom_delims_wrapper(c)) { } + + std::unique_ptr base; + }; + + template + inline std::basic_ostream & operator<<(std::basic_ostream & s, const custom_delims & p) + { + return p.base->stream(s); + } + + + // A wrapper for a C-style array given as pointer-plus-size. + // Usage: std::cout << pretty_print_array(arr, n) << std::endl; + + template + struct array_wrapper_n + { + typedef const T * const_iterator; + typedef T value_type; + + array_wrapper_n(const T * const a, size_t n) : _array(a), _n(n) { } + inline const_iterator begin() const { return _array; } + inline const_iterator end() const { return _array + _n; } + + private: + const T * const _array; + size_t _n; + }; + + + // A wrapper for hash-table based containers that offer local iterators to each bucket. + // Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket 5 of container m.) + + template + struct bucket_print_wrapper + { + typedef typename T::const_local_iterator const_iterator; + typedef typename T::size_type size_type; + + const_iterator begin() const + { + return m_map.cbegin(n); + } + + const_iterator end() const + { + return m_map.cend(n); + } + + bucket_print_wrapper(const T & m, size_type bucket) : m_map(m), n(bucket) { } + + private: + const T & m_map; + const size_type n; + }; + +} // namespace pretty_print + + +// Global accessor functions for the convenience wrappers + +template +inline pretty_print::array_wrapper_n pretty_print_array(const T * const a, size_t n) +{ + return pretty_print::array_wrapper_n(a, n); +} + +template pretty_print::bucket_print_wrapper +bucket_print(const T & m, typename T::size_type n) +{ + return pretty_print::bucket_print_wrapper(m, n); +} + + +// Main magic entry point: An overload snuck into namespace std. +// Can we do better? 
+ +namespace std +{ + // Prints a container to the stream using default delimiters + + template + inline typename enable_if< ::pretty_print::is_container::value, + basic_ostream &>::type + operator<<(basic_ostream & stream, const T & container) + { + return stream << ::pretty_print::print_container_helper(container); + } +} + + + +#endif // H_PRETTY_PRINT diff --git a/mmdet3d/ops/spconv/include/pybind11_utils.h b/mmdet3d/ops/spconv/include/pybind11_utils.h new file mode 100644 index 0000000000..d23a0f6dbf --- /dev/null +++ b/mmdet3d/ops/spconv/include/pybind11_utils.h @@ -0,0 +1,61 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include // everything needed for embedding +#include +#include +#include +#include + +#include + +namespace py = pybind11; + +template +std::vector array2Vector(TPyObject arr){ + py::array arr_np = arr; + size_t size = arr.attr("size").template cast(); + py::array_t arr_cc = arr_np; + std::vector data(arr_cc.data(), arr_cc.data() + size); + return data; +} + +template +std::vector arrayT2Vector(py::array_t arr) +{ + std::vector data(arr.data(), arr.data() + arr.size()); + return data; +} + +template +tv::TensorView array2TensorView(TPyObject arr){ + py::array arr_np = arr; + py::array_t arr_cc = arr_np; + tv::Shape shape; + for (int i = 0; i < arr_cc.ndim(); ++i){ + shape.push_back(arr_cc.shape(i)); + } + return tv::TensorView(arr_cc.mutable_data(), shape); +} +template +tv::TensorView arrayT2TensorView(py::array_t arr){ + tv::Shape shape; + for (int i = 0; i < arr.ndim(); ++i){ + shape.push_back(arr.shape(i)); + } + return tv::TensorView(arr.mutable_data(), shape); +} diff --git a/mmdet3d/ops/spconv/include/spconv/box_iou.h b/mmdet3d/ops/spconv/include/spconv/box_iou.h new file mode 100644 index 0000000000..937013374b --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/box_iou.h @@ -0,0 +1,157 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef BOX_IOU_H +#define BOX_IOU_H + +#include +// must include pybind11/eigen.h if using eigen matrix as arguments. +#include +#include +#include + +namespace spconv { +// #include "voxelnet/core/cc/pybind11_helper.h" +namespace py = pybind11; +using namespace pybind11::literals; +template +inline py::array_t constant(ShapeContainer shape, DType value) { + // create ROWMAJOR array. 
+ py::array_t array(shape); + std::fill(array.mutable_data(), array.mutable_data() + array.size(), value); + return array; +} + +template +inline py::array_t zeros(std::vector shape) { + return constant>(shape, 0); +} + +template +py::array_t +rbbox_iou(py::array_t box_corners, py::array_t qbox_corners, + py::array_t standup_iou, DType standup_thresh) { + namespace bg = boost::geometry; + typedef bg::model::point point_t; + typedef bg::model::polygon polygon_t; + polygon_t poly, qpoly; + std::vector poly_inter, poly_union; + DType inter_area, union_area; + auto box_corners_r = box_corners.template unchecked<3>(); + auto qbox_corners_r = qbox_corners.template unchecked<3>(); + auto standup_iou_r = standup_iou.template unchecked<2>(); + auto N = box_corners_r.shape(0); + auto K = qbox_corners_r.shape(0); + py::array_t overlaps = zeros({int(N), int(K)}); + auto overlaps_rw = overlaps.template mutable_unchecked<2>(); + if (N == 0 || K == 0) { + return overlaps; + } + for (int k = 0; k < K; ++k) { + for (int n = 0; n < N; ++n) { + if (standup_iou_r(n, k) <= standup_thresh) + continue; + bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1))); + bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1))); + bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1))); + bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1))); + bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); + + bg::intersection(poly, qpoly, poly_inter); + + if (!poly_inter.empty()) { + inter_area = bg::area(poly_inter.front()); + bg::union_(poly, qpoly, poly_union); + if (!poly_union.empty()) { + union_area = bg::area(poly_union.front()); + overlaps_rw(n, k) = inter_area / union_area; + } + poly_union.clear(); + } + poly.clear(); + qpoly.clear(); + poly_inter.clear(); + } + } + return overlaps; +} + +template +py::array_t +rbbox_intersection(py::array_t box_corners, py::array_t qbox_corners, + py::array_t standup_iou, DType standup_thresh) { + namespace bg = boost::geometry; + typedef bg::model::point point_t; + typedef bg::model::polygon polygon_t; + polygon_t poly, qpoly; + std::vector poly_inter, poly_union; + DType inter_area, union_area; + auto box_corners_r = box_corners.template unchecked<3>(); + auto qbox_corners_r = qbox_corners.template unchecked<3>(); + auto standup_iou_r = standup_iou.template unchecked<2>(); + auto N = box_corners_r.shape(0); + auto K = qbox_corners_r.shape(0); + py::array_t overlaps = zeros({int(N), int(K)}); + auto overlaps_rw = overlaps.template mutable_unchecked<2>(); + if (N == 0 || K == 0) { + return overlaps; + } + for (int k = 0; k < K; ++k) { + for (int n = 0; n < N; ++n) { + if (standup_iou_r(n, k) <= standup_thresh) + continue; + bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1))); + bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1))); + bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1))); + bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1))); + bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 
0, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1))); + bg::append(qpoly, + point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); + + bg::intersection(poly, qpoly, poly_inter); + + if (!poly_inter.empty()) { + inter_area = bg::area(poly_inter.front()); + overlaps_rw(n, k) = inter_area; + } + poly.clear(); + qpoly.clear(); + poly_inter.clear(); + } + } + return overlaps; +} + + +} // namespace spconv +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h b/mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h new file mode 100644 index 0000000000..526127d2ac --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/fused_spconv_ops.h @@ -0,0 +1,127 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef FUSED_SPARSE_CONV_OP_H_ +#define FUSED_SPARSE_CONV_OP_H_ + +#include +#include +#include +#include +#include +#include + +namespace spconv { +// torch.jit's doc says only support int64, so we need to convert to int32. + +template +torch::Tensor fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters, torch::Tensor bias, + torch::Tensor indicePairs, torch::Tensor indiceNum, + int64_t numActOut, int64_t _inverse, int64_t _subM) { + bool subM = _subM != 0; + bool inverse = _inverse != 0; + auto device = features.device().type(); + auto ndim = filters.dim() - 2; + auto kernelVolume = indicePairs.size(0); + auto numInPlanes = features.size(1); + auto numOutPlanes = filters.size(ndim + 1); + auto indicePairNumCpu = indiceNum.to({torch::kCPU}); + auto indicePairMaxSizeIter = std::max_element( + indicePairNumCpu.data(), indicePairNumCpu.data() + kernelVolume); + int indicePairMaxOffset = indicePairMaxSizeIter - indicePairNumCpu.data(); + int indicePairMaxSize = *indicePairMaxSizeIter; + + /*if (_subM){ + std::vector indicePairNumVec(indicePairNumCpu.data(), indicePairNumCpu.data() + kernelVolume); + indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset); + + auto indicePairVecMaxSizeIter = std::max_element( + indicePairNumVec.begin(), indicePairNumVec.end()); + indicePairMaxSize = *indicePairVecMaxSizeIter; + }*/ + + auto options = + torch::TensorOptions().dtype(features.dtype()).device(features.device()); + // auto indicePairOptions = + // torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device()); + + torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options).copy_(bias); + torch::Tensor inputBuffer = torch::zeros({indicePairMaxSize, numInPlanes}, options); + torch::Tensor outputBuffer = + torch::zeros({indicePairMaxSize, numOutPlanes}, options); + filters = filters.view({-1, numInPlanes, numOutPlanes}); + if (subM) { // the center index of subm conv don't need gather and scatter + // add. 
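+    // For submanifold convolution the output sites coincide with the input sites,
+    // so the center kernel offset (indicePairMaxOffset) reduces to a dense GEMM over
+    // all features; gather/scatter is only needed for the other offsets handled in
+    // the loop below.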
+ torch::mm_out(output, features, filters[indicePairMaxOffset]); + } + double totalGatherTime = 0; + double totalGEMMTime = 0; + double totalSAddTime = 0; + for (int i = 0; i < kernelVolume; ++i) { + auto nHot = indicePairNumCpu.data()[i]; + if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { + continue; + } + // auto timer = spconv::CudaContextTimer<>(); + auto outputBufferBlob = + torch::from_blob(outputBuffer.data(), {nHot, numOutPlanes}, options); + auto inputBufferBlob = + torch::from_blob(inputBuffer.data(), {nHot, numInPlanes}, options); + + if (device == torch::kCPU) { + functor::SparseGatherFunctor gatherFtor; + gatherFtor(tv::CPU(), tv::torch2tv(inputBuffer), + tv::torch2tv(features), + tv::torch2tv(indicePairs).subview(i, inverse), nHot); + } else { + functor::SparseGatherFunctor gatherFtor; + gatherFtor(tv::TorchGPU(), tv::torch2tv(inputBuffer), + tv::torch2tv(features), + tv::torch2tv(indicePairs).subview(i, inverse), nHot); + TV_CHECK_CUDA_ERR(); + /* slower than SparseGatherFunctor, may due to int->long conversion + auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64); + auto indicePairBlob = torch::from_blob(indicePairLong.data(), {nHot}, + indicePairOptions); + torch::index_select_out(inputBufferBlob, features, 0, + indicePairBlob);*/ + } + // totalGatherTime += timer.report() / 1000.0; + torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]); + // totalGEMMTime += timer.report() / 1000.0; + + if (device == torch::kCPU) { + functor::SparseScatterAddFunctor scatterFtor; + scatterFtor(tv::CPU(), tv::torch2tv(output), + tv::torch2tv(outputBuffer), + tv::torch2tv(indicePairs).subview(i, !inverse), nHot, + true); + } else { + functor::SparseScatterAddFunctor scatterFtor; + scatterFtor(tv::TorchGPU(), tv::torch2tv(output), + tv::torch2tv(outputBuffer), + tv::torch2tv(indicePairs).subview(i, !inverse), nHot, + true); + TV_CHECK_CUDA_ERR(); + } + // totalSAddTime += timer.report() / 1000.0; + } + // std::cout << "gather time " << totalGatherTime << std::endl; + // std::cout << "gemm time " << totalGEMMTime << std::endl; + // std::cout << "scatteradd time " << totalSAddTime << std::endl; + return output; +} +} // namespace spconv + +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/geometry.h b/mmdet3d/ops/spconv/include/spconv/geometry.h new file mode 100644 index 0000000000..e193e037d7 --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/geometry.h @@ -0,0 +1,301 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
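
The fusedIndiceConvBatchNorm routine above follows the usual gather, GEMM, scatter-add dataflow of sparse convolution: for every kernel offset it gathers the active input rows named by the indice pairs, multiplies them by that offset's filter slice, and scatter-adds the products into the output rows, while the submanifold center offset is handled up front with one dense matrix multiply into the bias-initialised output. Below is a dependency-free CPU sketch of the same dataflow; the buffer names, the row-major layout, and the per-pair loop are illustrative assumptions, not the patch's API.

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Reference gather -> GEMM -> scatter-add over indice pairs.
// pairs[k] lists (input_row, output_row) pairs for kernel offset k,
// features/output are dense row-major buffers, filters[k] is the
// [nIn x nOut] weight slice for that offset.
void sparse_conv_reference(
    const std::vector<float>& features,                         // numActIn  * nIn
    const std::vector<std::vector<float>>& filters,              // K x (nIn * nOut)
    std::vector<float>& output,                                  // numActOut * nOut
    const std::vector<std::vector<std::pair<int, int>>>& pairs,  // per-offset pairs
    std::size_t nIn, std::size_t nOut) {
  for (std::size_t k = 0; k < filters.size(); ++k) {
    for (const auto& p : pairs[k]) {
      const float* in = &features[p.first * nIn];  // gather one active input row
      float* out = &output[p.second * nOut];       // row that receives the result
      for (std::size_t o = 0; o < nOut; ++o) {
        float acc = 0.f;
        for (std::size_t i = 0; i < nIn; ++i)
          acc += in[i] * filters[k][i * nOut + o]; // one row of the per-offset GEMM
        out[o] += acc;                             // scatter-add into the output
      }
    }
  }
}
```

The sketch processes one pair at a time for clarity; the patch instead gathers all nHot rows of an offset into inputBuffer so the multiply runs as a single dense GEMM (torch::mm_out) before the scatter-add, and for submanifold convolution the center offset skips the loop entirely because its mapping is the identity.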
+ +#ifndef SPCONV_GEOMETRY_H_ +#define SPCONV_GEOMETRY_H_ + +#include +#include +#include + +namespace spconv { +template +TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos, + const Index *kernelSize, + const Index *stride, const Index *padding, + const Index *dilation, + const Index *outSpatialShape, Index *out) { + Index lowers[NDim]; + Index uppers[NDim]; + Index counter[NDim]; + Index counterSize[NDim]; + Index pointCounter = 0; + Index val; + Index numPoints = 1; + Index m, offset; + bool valid = false; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + + stride[i] + padding[i]) / + stride[i]; + uppers[i] = (input_pos[i] + padding[i]) / stride[i]; + } + +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); + numPoints *= counterSize[i]; + } + +#pragma unroll + for (int i = 0; i < NDim; ++i) { + counter[i] = 0; + } + for (int i = 0; i < numPoints; ++i) { + valid = true; + m = 1; + offset = 0; +#pragma unroll + for (int j = NDim - 1; j >= 0; --j) { + val = uppers[j] - counter[j] * dilation[j]; + out[pointCounter * (NDim + 1) + j] = val; + if (val < 0 || (val > outSpatialShape[j] - 1)) { + valid = false; + // break; + } + offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j]; + m *= kernelSize[j]; + } + + out[pointCounter * (NDim + 1) + NDim] = offset; + if (valid) + ++pointCounter; + counter[NDim - 1] += 1; +#pragma unroll + for (int c = NDim - 1; c >= 0; --c) { + if (counter[c] == counterSize[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return pointCounter; +} + +template +TV_HOST_DEVICE Index getValidOutPosTranspose( + const Index *input_pos, const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, const Index *outSpatialShape, + Index *out) { + Index lowers[NDim]; + Index uppers[NDim]; + Index counter[NDim]; + Index counterSize[NDim]; + Index pointCounter = 0; + Index val; + Index numPoints = 1; + Index m, offset; + bool valid = false; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + lowers[i] = input_pos[i] * stride[i] - padding[i]; + uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i]; + } +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); + numPoints *= counterSize[i]; + } +#pragma unroll + for (int i = 0; i < NDim; ++i) { + counter[i] = 0; + } + for (int i = 0; i < numPoints; ++i) { + valid = true; + m = 1; + offset = 0; +#pragma unroll + for (int j = NDim - 1; j >= 0; --j) { + val = uppers[j] - counter[j] * dilation[j]; + out[pointCounter * (NDim + 1) + j] = val; + if (val < 0 || (val > outSpatialShape[j] - 1)) { + valid = false; + // break; + } + offset += m * (val - lowers[j]) / dilation[j]; + m *= kernelSize[j]; + } + out[pointCounter * (NDim + 1) + NDim] = offset; + if (valid) + ++pointCounter; + counter[NDim - 1] += 1; +#pragma unroll + for (int c = NDim - 1; c >= 0; --c) { + if (counter[c] == counterSize[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return pointCounter; +} + +template +Index getIndicePairsConv(tv::TensorView indicesIn, + tv::TensorView indicesOut, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, + const Index *outSpatialShape) { + // indicesOut: num_active * kernelVolume * (NDim + 1) + Index numAct = 0; + 
auto numActIn = indicesIn.dim(0); + Index batchIdx = 0; + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + std::vector validPoints_(kernelVolume * (NDim + 1)); + Index* validPoints = validPoints_.data(); + Index *pointPtr = nullptr; + for (int j = 0; j < numActIn; ++j) { + batchIdx = indicesIn(j, 0); + numValidPoints = getValidOutPos( + indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, + dilation, outSpatialShape, validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + + spatialVolume * batchIdx; + if (gridsOut[index] == -1) { + for (unsigned k = 1; k < NDim + 1; ++k) { + indicesOut(numAct, k) = pointPtr[k - 1]; + } + indicesOut(numAct, 0) = batchIdx; + gridsOut[index] = numAct++; + } + // indicePairs: [K, 2, L] + indicePairs(offset, 0, indiceNum[offset]) = j; + indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; + } + } + return numAct; +} + +template +Index getIndicePairsDeConv(tv::TensorView indicesIn, + tv::TensorView indicesOut, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, + const Index *outSpatialShape) { + Index numAct = 0; + auto numActIn = indicesIn.dim(0); + Index batchIdx = 0; + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + std::vector validPoints_(kernelVolume * (NDim + 1)); + Index* validPoints = validPoints_.data(); + Index *pointPtr = nullptr; + for (int j = 0; j < numActIn; ++j) { + batchIdx = indicesIn(j, 0); + numValidPoints = getValidOutPosTranspose( + indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, + dilation, outSpatialShape, validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + + spatialVolume * batchIdx; + if (gridsOut[index] == -1) { + for (unsigned k = 1; k < NDim + 1; ++k) { + indicesOut(numAct, k) = pointPtr[k - 1]; + } + indicesOut(numAct, 0) = batchIdx; + gridsOut[index] = numAct++; + } + // indicePairs: [K, 2, L] + indicePairs(offset, 0, indiceNum[offset]) = j; + indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; + } + } + return numAct; +} + +template +Index getIndicePairsSubM(tv::TensorView indicesIn, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const Index *const kernelSize, + const Index *const stride, const Index *const padding, + const Index *dilation, const Index *const outSpatialShape) { + Index numAct = 0; + auto numActIn = indicesIn.dim(0); + Index batchIdx = 0; + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + // Index validPoints[kernelVolume * (NDim + 1)]; + std::vector validPoints_(kernelVolume * (NDim 
+ 1)); + Index* validPoints = validPoints_.data(); + Index *pointPtr = nullptr; + Index index = 0; + for (int j = 0; j < numActIn; ++j) { + index = tv::rowArrayIdx(indicesIn.data() + j * (NDim + 1) + 1, + outSpatialShape) + + spatialVolume * indicesIn(j, 0); + gridsOut[index] = j; + } + for (int j = 0; j < numActIn; ++j) { + numValidPoints = getValidOutPos( + indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, + dilation, outSpatialShape, validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + index = tv::rowArrayIdx(pointPtr, outSpatialShape) + + spatialVolume * indicesIn(j, 0); + if (gridsOut[index] > -1) { + indicePairs(offset, 0, indiceNum[offset]) = j; + indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; + } + } + } + return numActIn; +} + +} // namespace spconv + +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/indice.cu.h b/mmdet3d/ops/spconv/include/spconv/indice.cu.h new file mode 100644 index 0000000000..b9ceaef409 --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/indice.cu.h @@ -0,0 +1,243 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef INDICE_CU_H_ +#define INDICE_CU_H_ +#include +#include +#include + +namespace spconv { +template +__global__ void prepareIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView indicesOut, + tv::TensorView gridsOut, tv::TensorView indicePairs, + tv::TensorView indiceNum, tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + auto indicePairsDim2 = indicePairs.dim(2); + Index index; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPos( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 0, oldNum) = ix; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + indicePairs(offset, 1, oldNum) = index; + indicePairUnique[offset * indicePairsDim2 + oldNum] = index; + } + } +} + +template +__global__ void prepareDeConvIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView indicesOut, + tv::TensorView gridsOut, tv::TensorView indicePairs, + tv::TensorView indiceNum, 
tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + auto indicePairsDim2 = indicePairs.dim(2); + Index index; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPosTranspose( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 0, oldNum) = ix; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + indicePairs(offset, 1, oldNum) = index; + indicePairUnique[offset * indicePairsDim2 + oldNum] = index; + } + } +} + +template +__global__ void assignGridAndIndiceOutKernel( + tv::TensorView indicesOut, tv::TensorView gridsOut, + int numAct, tv::TensorView indicePairs, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape, int batchSize) { + + Index index; + auto indicesOutPtr = indicesOut.data(); + for (int ix : tv::KernelLoopX(numAct)) { + index = indicePairUnique[ix]; + gridsOut[index] = ix; + index = tv::rowArrayIdxInv( + index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data()); + indicesOut[ix * (NDim + 1)] = index % batchSize; + } +} + +template +__global__ void +assignIndicePairsKernel(tv::TensorView indicesOut, + tv::TensorView gridsOut, int numActIn, + tv::TensorView indicePairs, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape) { + + Index index; + int kernelVolume = indicePairs.dim(0); + for (int ix : tv::KernelLoopX(numActIn)) { + for (int i = 0; i < kernelVolume; ++i) { + index = indicePairs(i, 1, ix); + if (index > -1) { + indicePairs(i, 1, ix) = gridsOut[index]; + } + } + } +} + +template +__global__ void +prepareSubMGridKernel(tv::TensorView indicesIn, + tv::TensorView gridsOut, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index index = 0; + for (int ix : tv::KernelLoopX(numActIn)) { + index = tv::rowArrayIdx(indicesIn.data() + ix * (NDim + 1) + 1, + outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + gridsOut[index] = ix; + } +} + +template +__global__ void getSubMIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + Index index = 0; + 
for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPos( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (int i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + if (gridsOut[index] > -1) { + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 1, oldNum) = gridsOut[index]; + indicePairs(offset, 0, oldNum) = ix; + } + } + } +} + +template +__global__ void resetGridKernel(const Index *indicePairUnique, + tv::TensorView gridsOut, + int numAct) { + for (int ix : tv::KernelLoopX(numAct)) { + gridsOut[indicePairUnique[ix]] = -1; + } +} + +template +__global__ void +resetGridSubMKernel(const Index *indices, tv::TensorView gridsOut, + const tv::SimpleVector outSpatialShape, + int numAct) { + int outSpatialShapeReg[NDim]; + for (int i = 0; i < NDim; ++i) { + outSpatialShapeReg[i] = outSpatialShape[i]; + } + Index spatialVolume = 1; + auto indsPtr = indices; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index index; + for (int ix : tv::KernelLoopX(numAct)) { + indsPtr = indices + ix * (NDim + 1); + index = tv::rowArrayIdx(indsPtr + 1, outSpatialShapeReg); + gridsOut[index + spatialVolume * indsPtr[0]] = -1; + } +} + +} // namespace spconv + +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/indice.h b/mmdet3d/ops/spconv/include/spconv/indice.h new file mode 100644 index 0000000000..809c56f734 --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/indice.h @@ -0,0 +1,79 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
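
The CUDA kernels in indice.cu.h above all rely on the same dense lookup grid: an output coordinate is flattened with tv::rowArrayIdx over outSpatialShape and offset by batchIdx * spatialVolume, and gridsOut at that slot holds either the row already assigned to that location or -1. For submanifold convolution the grid is first seeded with each input row's own index, so a candidate output position produces an indice pair only when it lands on an occupied input cell. A small host-side sketch of that flattening and lookup follows, assuming a single batch, 3-D coordinates, and candidates already inside the spatial bounds; all names are illustrative.

```cpp
#include <array>
#include <cstddef>
#include <vector>

// Row-major flattening of a 3-D spatial coordinate, the role that
// tv::rowArrayIdx plays for N dimensions in the patch.
inline int flatten3d(const std::array<int, 3>& c, const std::array<int, 3>& shape) {
  return (c[0] * shape[1] + c[1]) * shape[2] + c[2];
}

struct IndicePair { int inRow; int outRow; };

// Submanifold rule: seed a dense grid with the row index of every occupied
// input voxel, then keep a candidate output position only if it hits an
// occupied cell.
std::vector<IndicePair> subm_pairs_for_offset(
    const std::vector<std::array<int, 3>>& inCoords,    // coordinate per input row
    const std::vector<std::array<int, 3>>& candCoords,  // candidate output per row
    const std::array<int, 3>& spatialShape) {
  const std::size_t volume = static_cast<std::size_t>(spatialShape[0]) *
                             spatialShape[1] * spatialShape[2];
  std::vector<int> grid(volume, -1);
  for (std::size_t row = 0; row < inCoords.size(); ++row)
    grid[flatten3d(inCoords[row], spatialShape)] = static_cast<int>(row);  // seed
  std::vector<IndicePair> pairs;
  for (std::size_t row = 0; row < candCoords.size(); ++row) {
    const int hit = grid[flatten3d(candCoords[row], spatialShape)];
    if (hit > -1) pairs.push_back({static_cast<int>(row), hit});  // occupied cell
  }
  return pairs;
}
```

In the real kernels the batch index is folded into the flattened slot, the pair counters are bumped with atomicAdd so many threads can append safely, and the resetGrid kernels restore the touched slots to -1 so the grid buffer can be reused across calls.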
+ +#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_ +#define SPARSE_CONV_INDICE_FUNCTOR_H_ +#include + +namespace spconv +{ +namespace functor +{ +template +struct CreateConvIndicePairFunctorP1 +{ + Index operator()( + const Device& d, tv::TensorView indicesIn, + tv::TensorView indicesOut, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape, bool transpose); +}; + +template +struct CreateConvIndicePairFunctorP2 +{ + Index operator()( + const Device& d, tv::TensorView indicesIn, + tv::TensorView indicesOut, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape, bool transpose, + bool resetGrid=false); +}; + +template +struct CreateConvIndicePairFunctor +{ + Index operator()( + const Device& d, tv::TensorView indicesIn, + tv::TensorView indicesOut, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape, bool transpose, bool resetGrid=false); +}; + +template +struct CreateSubMIndicePairFunctor +{ + Index operator()( + const Device& d, tv::TensorView indicesIn, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape, bool transpose, bool resetGrid=false); +}; +} // namespace functor +} // namespace spconv + +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/maxpool.h b/mmdet3d/ops/spconv/include/spconv/maxpool.h new file mode 100644 index 0000000000..5ee91353da --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/maxpool.h @@ -0,0 +1,44 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
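
Both indice.h above and maxpool.h below only declare functors templated on a Device tag; the definitions live in the backend translation units, and call sites pick the backend by instantiating the functor with the matching tag, as fusedIndiceConvBatchNorm does earlier with tv::CPU() and tv::TorchGPU(). A minimal, self-contained sketch of that tag-dispatch pattern is shown here; ScaleFunctor, CPUDevice, and GPUDevice are invented names for illustration only.

```cpp
#include <iostream>

// Device tags standing in for tv::CPU / tv::TorchGPU.
struct CPUDevice {};
struct GPUDevice {};

// The header declares only the primary template, exactly like the
// functor declarations in indice.h and maxpool.h.
template <typename Device, typename T>
struct ScaleFunctor;

// CPU backend: a plain loop.
template <typename T>
struct ScaleFunctor<CPUDevice, T> {
  void operator()(const CPUDevice&, T* data, int n, T s) const {
    for (int i = 0; i < n; ++i) data[i] *= s;
  }
};

// Stand-in for the CUDA backend: a real build would launch a kernel here.
template <typename T>
struct ScaleFunctor<GPUDevice, T> {
  void operator()(const GPUDevice&, T* data, int n, T s) const {
    for (int i = 0; i < n; ++i) data[i] *= s;
  }
};

int main() {
  float v[3] = {1.f, 2.f, 3.f};
  ScaleFunctor<CPUDevice, float>()(CPUDevice{}, v, 3, 2.f);
  std::cout << v[0] << " " << v[1] << " " << v[2] << "\n";  // prints: 2 4 6
  return 0;
}
```

Keeping the declarations device-agnostic lets the op code in the .cc files branch once on the tensor's device and reuse the same surrounding logic for CPU and GPU, while each backend's .cc/.cu file supplies its own specialisation.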
+ +#ifndef SPARSE_MAXPOOL_FUNCTOR_H_ +#define SPARSE_MAXPOOL_FUNCTOR_H_ +#include + +namespace spconv +{ +namespace functor +{ +template +struct SparseMaxPoolForwardFunctor +{ + void operator()(const Device& d, tv::TensorView outFeatures, + tv::TensorView inFeatures, + tv::TensorView indices, int size); +}; + +template +struct SparseMaxPoolBackwardFunctor +{ + void operator()(const Device& d, tv::TensorView outFeatures, + tv::TensorView inFeatures, + tv::TensorView dout, + tv::TensorView din, + tv::TensorView indices, int size); +}; + +} // namespace functor +} // namespace spconv + +#endif diff --git a/mmdet3d/ops/spconv/include/spconv/mp_helper.h b/mmdet3d/ops/spconv/include/spconv/mp_helper.h new file mode 100644 index 0000000000..cff8dccffe --- /dev/null +++ b/mmdet3d/ops/spconv/include/spconv/mp_helper.h @@ -0,0 +1,47 @@ +#ifndef MP_HELPER_H_ +#define MP_HELPER_H_ +#include +#include + +namespace spconv { +template struct mp_list {}; + +template +using mp_list_c = mp_list...>; + +namespace detail { + +template +constexpr F mp_for_each_impl(mp_list, F &&f) { + return std::initializer_list{(f(T()), 0)...}, std::forward(f); +} + +template constexpr F mp_for_each_impl(mp_list<>, F &&f) { + return std::forward(f); +} + +} // namespace detail + +namespace detail { + +template class B> struct mp_rename_impl { + // An error "no type named 'type'" here means that the first argument to + // mp_rename is not a list +}; + +template