From 362a90f8bfffe62d5802925944f540ed16b2731e Mon Sep 17 00:00:00 2001 From: Jiazhen Wang <47851024+teamwong111@users.noreply.github.com> Date: Sat, 16 Apr 2022 15:45:00 +0800 Subject: [PATCH] [Feature] Add several MLU ops (#1563) * [Feature] Add roiaware pool3d ops from mmdet3d (#1382) * add ops (roiaware pool3d) in mmdet3d * refactor code * fix typo Co-authored-by: zhouzaida * [Feature] Add iou3d op from mmdet3d (#1356) * add ops (iou3d) in mmdet3d * add unit test * refactor code * refactor code * refactor code * refactor code * refactor code Co-authored-by: zhouzaida * [Fix] Update test data for test_iou3d (#1427) * Update test data for test_iou3d * delete blank lines Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> * [Feature] Add group points ops from mmdet3d (#1415) * add op (group points) and its related ops (ball query and knn) in mmdet3d * refactor code * fix typo * refactor code * fix typo * refactor code * make input contiguous Co-authored-by: zhouzaida * add mmdet3d op (#1425) Co-authored-by: zhouzaida * [Feature] Loading objects from different backends and dumping objects to different backends (#1330) * [Feature] Choose storage backend by the prefix of filepath * refactor FileClient and add unittest * support loading from different backends * polish docstring * fix unittet * rename attribute str_like_obj to is_str_like_obj * add infer_client method * add check_exist method * rename var client to file_client * polish docstring * add join_paths method * remove join_paths and add _format_path * enhance unittest * refactor unittest * singleton pattern * fix test_clientio.py * deprecate CephBackend * enhance docstring * refactor unittest for petrel * refactor unittest for disk backend * update io.md * add concat_paths method * improve docstring * improve docstring * add isdir and copyfile for file backend * delete copyfile and add get_local_path * remove isdir method of petrel * fix typo * add comment and polish docstring * polish docstring * rename _path_mapping to _map_path * polish docstring and fix typo * refactor get_local_path * add list_dir_or_file for FileClient * add list_dir_or_file for PetrelBackend * fix windows ci * Add return docstring * polish docstring * fix typo * fix typo * deprecate the conversion from Path to str * add docs for loading checkpoints with FileClient * refactor map_path * add _ensure_methods to ensure methods have been implemented * fix list_dir_or_file * rename _ensure_method_implemented to has_method * Add CI for pytorch 1.10 (#1431) * [Feature] Upload checkpoints and logs to ceph (#1375) * [Feature] Choose storage backend by the prefix of filepath * refactor FileClient and add unittest * support loading from different backends * polish docstring * fix unittet * rename attribute str_like_obj to is_str_like_obj * [Docs] Upload checkpoint to petrel oss * add infer_client method * Support uploading checkpoint to petrel oss * add check_exist method * refactor CheckpointHook * support uploading logs to ceph * rename var client to file_client * polish docstring * enhance load_from_ceph * refactor load_from_ceph * refactor TextLoggerHook * change the meaning of out_dir argument * fix test_checkpoint_hook.py * add join_paths method * remove join_paths and add _format_path * enhance unittest * refactor unittest * add a unittest for EvalHook when file backend is petrel * singleton pattern * fix test_clientio.py * deprecate CephBackend * add warning in load_from_ceph * fix type of out_suffix * enhance docstring * refactor unittest for 
petrel * refactor unittest for disk backend * update io.md * add concat_paths method * fix CI * mock check_exist * improve docstring * improve docstring * improve docstring * improve docstring * add isdir and copyfile for file backend * delete copyfile and add get_local_path * remove isdir method of petrel * fix typo * rename check_exists to exists * refactor code and polish docstring * fix windows ci * add comment and polish docstring * polish docstring * polish docstring * rename _path_mapping to _map_path * polish docstring and fix typo * refactor get_local_path * add list_dir_or_file for FileClient * add list_dir_or_file for PetrelBackend * fix windows ci * Add return docstring * polish docstring * fix typo * fix typo * fix typo * fix error when mocking PetrelBackend * deprecate the conversion from Path to str * add docs for loading checkpoints with FileClient * rename keep_log to keep_local * refactor map_path * add _ensure_methods to ensure methods have been implemented * fix list_dir_or_file * rename _ensure_method_implemented to has_method * refactor * polish information * format information * bump version to v1.3.16 (#1430) * [Fix]: Update test data of test_tin_shift (#1426) * Update test data of test_tin_shift * Delete tmp.engine * add pytest raises asserterror test * raise valueerror, update test log * add more comment * Apply suggestions from code review Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> * fix the wrong function reference bug in BaseTransformerLayer when batch_first is True (#1418) * [Docs] Add mmcv itself in the docs list (#1441) * Add mmcv itself in the docs list * modify link of docs * [Improve] improve checkpoint loading log (#1446) * [Feature] Support SigmoidFocalLoss with Cambricon MLU backend (#1346) * [Feature] Support SigmoidFocalLoss with Cambricon MLU backend * refactor MMCV_WITH_MLU macro define * refactor NFU_ALIGN_SIZE, PAD_DOWN and split_pipeline_num * delete extra fool proofing in cpp * [Feature] Support SigmoidFocalLossBackward with Cambricon MLU backend * fix macro definition in SigmoidFocalLoss * refactor mlu files into clang-format * refactor sigmoid focal loss test * refactor Sigmoid Focal Loss file structure. * fix python lint error * fix import torch_mlu error type * fix lint * refactor clang format style to google Co-authored-by: zhouzaida * [Feature] Support RoiAlign With Cambricon MLU Backend (#1429) * [Feature] Support NMS with cambricon MLU backend (#1467) * [Feature] Support BBoxOverlaps with cambricon MLU backend (#1507) * [Refactor] Format C++ code * [Refactor] include common_mlu_helper in pytorch_mlu_helper and refactor build condition * [Improve] Improve the performance of roialign, nms and focalloss with MLU backend (#1572) * [Improve] Improve the performance of roialign with MLU backend * replace CHECK_MLU with CHECK_MLU_INPUT * [Improve] Improve the perf of nms and focallosssigmoid with MLU backend * [Improve] Improve the performance of roialign with MLU backend (#1741) * [Feature] Support tin_shift with cambricon MLU backend (#1696) * [Feature] Support tin_shift with cambricon MLU backend * [fix] Add the assertion of batch_size in tin_shift.py * [fix] fix the param check of tin_shift in cambricon code * [fix] Fix lint failure. * [fix] Fix source file lint failure. * Update mmcv/ops/tin_shift.py [Refactor] Modify the code in mmcv/ops/tin_shift.py. 
Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> Co-authored-by: budefei Co-authored-by: budefei Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> * resolve conflicts and fix lint * fix mmcv.utils.__init__ * fix mmcv.utils.__init__ * Fix lints and change FLAG * fix setup and refine * remove a redundant line * remove an unnecessary 'f' * fix compilation error Co-authored-by: dingchang Co-authored-by: zhouzaida Co-authored-by: q.yao Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> Co-authored-by: pc Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> Co-authored-by: q.yao Co-authored-by: Tong Gao Co-authored-by: Yuxin Liu Co-authored-by: zihanchang11 <92860914+zihanchang11@users.noreply.github.com> Co-authored-by: shlrao Co-authored-by: zhouchenyang Co-authored-by: Mrxiaofei <36697723+Mrxiaofei@users.noreply.github.com> Co-authored-by: budefei Co-authored-by: budefei --- mmcv/device/mlu/__init__.py | 4 +- mmcv/device/mlu/utils.py | 2 +- .../common/mlu/bbox_overlaps_mlu_kernel.mlu | 322 +++++ .../ops/csrc/common/mlu/common_mlu_helper.hpp | 36 + .../mlu/focal_loss_sigmoid_mlu_kernel.mlu | 888 +++++++++++++ mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu | 1161 +++++++++++++++++ .../csrc/common/mlu/roi_align_mlu_kernel.mlu | 493 +++++++ .../csrc/common/mlu/tin_shift_mlu_kernel.mlu | 307 +++++ mmcv/ops/csrc/common/pytorch_cpp_helper.hpp | 2 +- .../csrc/pytorch/mlu/bbox_overlaps_mlu.cpp | 100 ++ .../pytorch/mlu/focal_loss_sigmoid_mlu.cpp | 332 +++++ mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp | 130 ++ mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp | 206 +++ mmcv/ops/csrc/pytorch/mlu/tin_shift_mlu.cpp | 203 +++ mmcv/ops/focal_loss.py | 3 +- mmcv/ops/tin_shift.py | 4 + mmcv/runner/dist_utils.py | 4 +- mmcv/utils/__init__.py | 14 +- mmcv/utils/parrots_wrapper.py | 7 + setup.py | 5 +- .../test_device/test_mlu/test_mlu_parallel.py | 9 +- tests/test_ops/test_bbox.py | 51 +- tests/test_ops/test_focal_loss.py | 42 +- tests/test_ops/test_nms.py | 19 +- tests/test_ops/test_roi_align.py | 31 +- tests/test_ops/test_tin_shift.py | 57 +- 26 files changed, 4358 insertions(+), 74 deletions(-) create mode 100644 mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu create mode 100644 mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp create mode 100644 mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu create mode 100644 mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu create mode 100644 mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu create mode 100644 mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu create mode 100644 mmcv/ops/csrc/pytorch/mlu/bbox_overlaps_mlu.cpp create mode 100644 mmcv/ops/csrc/pytorch/mlu/focal_loss_sigmoid_mlu.cpp create mode 100644 mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp create mode 100644 mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp create mode 100644 mmcv/ops/csrc/pytorch/mlu/tin_shift_mlu.cpp mode change 100644 => 100755 mmcv/ops/tin_shift.py mode change 100644 => 100755 tests/test_ops/test_tin_shift.py diff --git a/mmcv/device/mlu/__init__.py b/mmcv/device/mlu/__init__.py index 92681c5ef0..ffec020e94 100644 --- a/mmcv/device/mlu/__init__.py +++ b/mmcv/device/mlu/__init__.py @@ -2,9 +2,9 @@ from .data_parallel import MLUDataParallel from .distributed import MLUDistributedDataParallel from .scatter_gather import scatter, scatter_kwargs -from .utils import IS_MLU +from .utils import IS_MLU_AVAILABLE __all__ = [ 'MLUDataParallel', 'MLUDistributedDataParallel', 'scatter', - 'scatter_kwargs', 'IS_MLU' + 
'scatter_kwargs', 'IS_MLU_AVAILABLE' ] diff --git a/mmcv/device/mlu/utils.py b/mmcv/device/mlu/utils.py index 4158e6d441..ff9ea60c7e 100644 --- a/mmcv/device/mlu/utils.py +++ b/mmcv/device/mlu/utils.py @@ -8,4 +8,4 @@ def is_mlu_available(): return False -IS_MLU = is_mlu_available() +IS_MLU_AVAILABLE = is_mlu_available() diff --git a/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu new file mode 100644 index 0000000000..58e695a015 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu @@ -0,0 +1,322 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include + +#include "common_mlu_helper.hpp" + +#define COORD_NUM 4 + +__nram__ char nmem_buf[MAX_NRAM_SIZE]; + +template +__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, void *nram_src1, + void *nram_addition, const int32_t deal_num) { + __bang_active_reciphp((T *)nram_dst, (T *)nram_src1, deal_num); + __bang_mul((T *)nram_dst, (T *)nram_src0, (T *)nram_dst, deal_num); +} + +template <> +__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, + void *nram_src1, void *nram_addition, + const int32_t deal_num) { + __bang_half2float((float *)nram_addition, (half *)nram_src1, deal_num); + __bang_active_reciphp((float *)nram_addition, (float *)nram_addition, + deal_num); + __bang_float2half_rd((half *)nram_src1, (float *)nram_addition, deal_num); + __bang_mul((half *)nram_dst, (half *)nram_src0, (half *)nram_src1, deal_num); +} + +template +__mlu_func__ void bboxOverlapsWorkflow( + T *vec_b1_x1, T *vec_b1_y1, T *vec_b1_x2, T *vec_b1_y2, T *vec_b2_x1, + T *vec_b2_y1, T *vec_b2_x2, T *vec_b2_y2, T *vec_left, T *vec_right, + T *vec_top, T *vec_bottom, const T *bbox1, const T *bbox2, void *ious, + const int32_t offset, const int32_t mode, const int32_t batches_stride, + const int32_t num_bbox1, const int32_t num_bbox2, const bool aligned) { + int32_t task_batch_stride = (num_bbox1 + taskDim - 1) / taskDim; + int32_t batch_start = taskId * task_batch_stride; + int32_t batch_per_task = batch_start + task_batch_stride < num_bbox1 + ? task_batch_stride + : num_bbox1 - batch_start; + batch_per_task = batch_per_task > 0 ? batch_per_task : (0); + + if (aligned) { + int32_t num_loop_cpy = batch_per_task / batches_stride; + int32_t num_rem_cpy_batches = batch_per_task % batches_stride; + num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy; + for (int32_t i = 0; i < num_loop_cpy; i++) { + int32_t index = batch_start + i * batches_stride; + int32_t handle_batches = index + batches_stride > num_bbox1 + ? 
num_rem_cpy_batches + : batches_stride; + int32_t b1 = index; + int32_t b2 = index; + + int32_t base1 = b1 * COORD_NUM; + __memcpy(vec_b1_x1, &bbox1[base1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b1_y1, &bbox1[base1 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b1_x2, &bbox1[base1 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b1_y2, &bbox1[base1 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + + int32_t base2 = b2 * COORD_NUM; + __memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + // get the width and height + __bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride); + __bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride); + __bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride); + __bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride); + + // right - left + offset ---> left + __bang_sub(vec_left, vec_right, vec_left, batches_stride); + __bang_add_const(vec_left, vec_left, (T)offset, batches_stride); + + // bottom - top + offset ---> right + __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); + __bang_add_const(vec_right, vec_right, (T)offset, batches_stride); + + // zero vector ---> bottom + __nramset(vec_bottom, batches_stride, 0.f); + + // width --> vec_left + __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); + T *width = vec_left; + // height --> vec_right + __bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride); + T *height = vec_right; + + // get the b1_area + // (b1_x2 - b1_x1 + offset) ---> vec_top + __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); + __bang_add_const(vec_top, vec_top, (T)offset, batches_stride); + + // (b1_y2 - b1_y1 + offset) ---> vec_bottom + __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); + __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); + + // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) + // ---> vec_top; + __bang_mul(vec_top, vec_top, vec_bottom, batches_stride); + T *b1_area = vec_top; + + // get the b2_area + // (b2_x2 - b2_x1 + offset) ---> b2_x1 + __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); + __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); + + // (b2_y2 - b2_y1 + offset) ---> b2_y1 + __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); + __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); + + // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) + // ---> b2_x1; + __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride); + T *b2_area = vec_b2_x1; + + // inter_s = width * height + __bang_mul(height, width, height, batches_stride); + T *inter_s = height; + + // offset vector ---> vec_b2_y1 + __nramset(vec_b2_y1, batches_stride, T(offset)); + T *vec_offset = vec_b2_y1; + + if (mode == 0) { + __bang_add(b1_area, b1_area, b2_area, batches_stride); + __bang_sub(b1_area, b1_area, inter_s, batches_stride); + __bang_maxequal(b1_area, 
vec_offset, b1_area, batches_stride); + } else { + __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); + } + T *base_s = b1_area; + + // ious = inter_s / base_s + computeDiv(width, inter_s, base_s, vec_b2_x2, batches_stride); + __memcpy((T *)ious + index, width, handle_batches * sizeof(T), + NRAM2GDRAM); + } + } else { + int32_t num_loop_cpy = num_bbox2 / batches_stride; + int32_t num_rem_cpy_batches = num_bbox2 % batches_stride; + num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy; + for (int32_t i = 0; i < batch_per_task; i++) { + int32_t index1 = batch_start + i; + int32_t b1 = index1; + int32_t base1 = b1 * COORD_NUM; + + // set bbox1 and bbox2 to nram + __nramset(vec_b1_x1, batches_stride, bbox1[base1]); + __nramset(vec_b1_y1, batches_stride, bbox1[base1 + 1]); + __nramset(vec_b1_x2, batches_stride, bbox1[base1 + 2]); + __nramset(vec_b1_y2, batches_stride, bbox1[base1 + 3]); + + for (int32_t j = 0; j < num_loop_cpy; j++) { + int32_t index2 = j * batches_stride; + int32_t handle_batches = index2 + batches_stride > num_bbox2 + ? num_rem_cpy_batches + : batches_stride; + int32_t b2 = index2; + int32_t base2 = b2 * COORD_NUM; + + // copy bbox2 to nram + __memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + + // get the width and height + __bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride); + __bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride); + __bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride); + __bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride); + + // right - left + offset ---> left + __bang_sub(vec_left, vec_right, vec_left, batches_stride); + __bang_add_const(vec_left, vec_left, (T)offset, batches_stride); + // bottom - top + offset ---> right + __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); + __bang_add_const(vec_right, vec_right, (T)offset, batches_stride); + + // zero vector ---> bottom + __nramset(vec_bottom, batches_stride, (T)0); + + // width --> vec_left + __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); + T *width = vec_left; + // height --> vec_right + __bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride); + T *height = vec_right; + + // get the b1_area + // (b1_x2 - b1_x1 + offset) ---> vec_top + __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); + __bang_add_const(vec_top, vec_top, (T)offset, batches_stride); + // (b1_y2 - b1_y1 + offset) ---> vec_bottom + __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); + __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); + // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) + // ---> vec_top; + __bang_mul(vec_top, vec_top, vec_bottom, batches_stride); + T *b1_area = vec_top; + + // get the b2_area + // (b2_x2 - b2_x1 + offset) ---> b2_x1 + __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); + __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); + // (b2_y2 - b2_y1 + offset) ---> b2_y1 + __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); + __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); + // 
b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) + // ---> b2_x1; + __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride); + T *b2_area = vec_b2_x1; + + // inter_s = width * height + __bang_mul(height, width, height, batches_stride); + T *inter_s = height; + + // offset vector ---> vec_b2_y1 + __nramset(vec_b2_y1, batches_stride, T(offset)); + T *vec_offset = vec_b2_y1; + + if (mode == 0) { + __bang_add(b1_area, b1_area, b2_area, batches_stride); + __bang_sub(b1_area, b1_area, inter_s, batches_stride); + __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); + } else { + __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); + } + T *base_s = b1_area; + + // ious = inter_s / base_s + computeDiv(width, inter_s, base_s, vec_b2_x2, batches_stride); + int32_t gdram_offset = index1 * num_bbox2 + index2; + __memcpy((T *)ious + gdram_offset, width, handle_batches * sizeof(T), + NRAM2GDRAM); + } + } + } +} + +template +__mlu_global__ void MLUUnion1KernelBBoxOverlaps( + const void *bbox1, const void *bbox2, void *ious, const int32_t num_bbox1, + const int32_t num_bbox2, const int32_t mode, const bool aligned, + const int32_t offset) { + /* + * NRAM partition + * |-------------------------------------------------------------| + * | vec_b1_x1 | vec_b1_y1 | vec_b1_x2 | vec_b1_y2 | + * |-------------------------------------------------------------| + * | vec_b2_x1 | vec_b2_y1 | vec_b2_x2 | vec_b2_y2 | + * |-------------------------------------------------------------| + * | vec_left | vec_right | vec_top | vec_bottom | + * |-------------------------------------------------------------| + * + */ + const int32_t align_bytes = PAD_DOWN(MAX_NRAM_SIZE, NFU_ALIGN_SIZE); + const int32_t split_nram_num = 12; + const int32_t nram_stride = + align_bytes / NFU_ALIGN_SIZE / split_nram_num * NFU_ALIGN_SIZE; + + void *vec_b1_x1 = nmem_buf; + void *vec_b1_y1 = nmem_buf + nram_stride; + void *vec_b1_x2 = nmem_buf + 2 * nram_stride; + void *vec_b1_y2 = nmem_buf + 3 * nram_stride; + + void *vec_b2_x1 = nmem_buf + 4 * nram_stride; + void *vec_b2_y1 = nmem_buf + 5 * nram_stride; + void *vec_b2_x2 = nmem_buf + 6 * nram_stride; + void *vec_b2_y2 = nmem_buf + 7 * nram_stride; + + void *vec_left = nmem_buf + 8 * nram_stride; + void *vec_right = nmem_buf + 9 * nram_stride; + void *vec_top = nmem_buf + 10 * nram_stride; + void *vec_bottom = nmem_buf + 11 * nram_stride; + + const int32_t vec_length = nram_stride / sizeof(T); + bboxOverlapsWorkflow((T *)vec_b1_x1, (T *)vec_b1_y1, (T *)vec_b1_x2, + (T *)vec_b1_y2, (T *)vec_b2_x1, (T *)vec_b2_y1, + (T *)vec_b2_x2, (T *)vec_b2_y2, (T *)vec_left, + (T *)vec_right, (T *)vec_top, (T *)vec_bottom, + (T *)bbox1, (T *)bbox2, (T *)ious, offset, mode, + vec_length, num_bbox1, num_bbox2, aligned); +} + +void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t d_type, + const void *bbox1, const void *bbox2, void *ious, + const int32_t num_bbox1, const int32_t num_bbox2, + const int32_t mode, const bool aligned, + const int32_t offset) { + if (d_type == CNRT_FLOAT16) { + MLUUnion1KernelBBoxOverlaps<<>>( + bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset); + } else { + MLUUnion1KernelBBoxOverlaps<<>>( + bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset); + } +} diff --git a/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp new file mode 100644 index 0000000000..826c42349f --- /dev/null +++ 
b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp @@ -0,0 +1,36 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef UTILS_H_ +#define UTILS_H_ + +#define NFU_ALIGN_SIZE 128 // Byte +#define REM_FOR_STACK (128 * 1024) // 128KB reserved for cncc + +#ifdef __BANG_ARCH__ +#define MAX_NRAM_SIZE \ + (__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc +#define MAX_SRAM_SIZE \ + (__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc +#else +#define MAX_NRAM_SIZE (384 * 1024) // 384KB, initialization value +#define MAX_SRAM_SIZE (1920 * 1024) // 1920KB, initialization value +#endif + +#ifndef PAD_UP +#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) +#endif + +#ifndef PAD_DOWN +#define PAD_DOWN(x, y) (((x) / (y)) * (y)) +#endif + +#endif // UTILS_H_ diff --git a/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu new file mode 100644 index 0000000000..7624379b68 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu @@ -0,0 +1,888 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
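[Editor's note] The bbox_overlaps kernel above vectorizes a conventional IoU/IoF computation over NRAM tiles, and computeDiv performs the final division as inter_s * recip(base_s), promoting half to float for the reciprocal. As a reading aid only, here is a minimal scalar C++ sketch of the same arithmetic; the `aligned` pairing, the `mode` semantics (0 = IoU over the union, 1 = IoF over the bbox1 area) and the `offset` term follow the comments in the kernel, while the type and function names (Box, overlap) are illustrative and not part of the patch.

// Scalar reference sketch of bbox_overlaps (illustrative only; the MLU kernel
// computes the same quantities on NRAM vectors with __bang_* primitives).
#include <algorithm>
#include <cstdio>
#include <vector>

struct Box { float x1, y1, x2, y2; };

// mode 0: IoU = inter / max(area1 + area2 - inter, offset)
// mode 1: IoF = inter / max(area1, offset)
float overlap(const Box &a, const Box &b, int mode, float offset) {
  float w = std::max(0.f, std::min(a.x2, b.x2) - std::max(a.x1, b.x1) + offset);
  float h = std::max(0.f, std::min(a.y2, b.y2) - std::max(a.y1, b.y1) + offset);
  float inter = w * h;
  float area1 = (a.x2 - a.x1 + offset) * (a.y2 - a.y1 + offset);
  float area2 = (b.x2 - b.x1 + offset) * (b.y2 - b.y1 + offset);
  float base = (mode == 0) ? std::max(area1 + area2 - inter, offset)
                           : std::max(area1, offset);
  return inter / base;
}

int main() {
  std::vector<Box> b1 = {{0, 0, 10, 10}, {5, 5, 15, 15}};
  std::vector<Box> b2 = {{0, 0, 10, 10}, {10, 10, 20, 20}};
  // aligned == true pairs bbox1[i] with bbox2[i]; the unaligned branch of the
  // kernel computes all num_bbox1 x num_bbox2 pairs instead.
  for (size_t i = 0; i < b1.size(); ++i)
    std::printf("aligned iou[%zu] = %.3f\n", i,
                overlap(b1[i], b2[i], /*mode=*/0, /*offset=*/0.f));
  return 0;
}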
+ *************************************************************************/ +#include + +#include "common_mlu_helper.hpp" + +#define PING 0 +#define PONG 1 + +__nram__ char nram_buffer[MAX_NRAM_SIZE]; + +namespace forward { +template +__mlu_func__ void loadInput(char *nram_input, T *dram_input, const int32_t size, + const int32_t dst_stride = 0, + const int32_t src_stride = 0, + const int32_t count = 1) { + if (dst_stride == src_stride) { + __memcpy_async(nram_input, dram_input, size * count, GDRAM2NRAM); + } else { + __memcpy_async(nram_input, dram_input, size, GDRAM2NRAM, dst_stride, + src_stride, count - 1); + } +} + +template +__mlu_func__ void loadWeight(char *nram_input, T *dram_input, const int32_t t, + const int32_t c, const int32_t has_weight, + const int32_t partition_nc) { + if (has_weight && partition_nc && t >= 0 && t < c) { + __memcpy_async(nram_input, (T *)dram_input + t, sizeof(T), GDRAM2NRAM); + } +} + +template +__mlu_func__ void storeOutput(T *dram_output, char *nram_output, + const int32_t size, const int32_t dst_stride = 0, + const int32_t src_stride = 0, + const int32_t count = 1) { + if (dst_stride == src_stride) { + __memcpy_async(dram_output, nram_output, size * count, NRAM2GDRAM); + } else { + __memcpy_async(dram_output, nram_output, size, NRAM2GDRAM, dst_stride, + src_stride, count - 1); + } +} + +template +__mlu_func__ void compute(T *input, const int32_t *target, const T *weight, + const int32_t has_weight, const int32_t partition_nc, + const int32_t deal_num, const int32_t n_seg, + const int32_t c, const int32_t c_seg, + const int32_t c_start_index, const float alpha, + const float gamma, T *compute_a, T *compute_b, + T *output) { + // set params + const int32_t c_num = + has_weight ? PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)) : c_seg; + const int32_t c_end_index = c_start_index + c_seg; + const int32_t half_epsilon = 0x0400; + const T epsilon_f = + sizeof(T) == sizeof(float) ? FLT_MIN : *((half *)&half_epsilon); + + // 0. alpha_t * p_t^r = alpha * (1 - p) ^ gamma if t == c_i + // = (1 - alpha) * p ^ gamma if t != c_i + __nramset((T *)output, deal_num, (T)(1 - alpha)); + __bang_active_sigmoid((T *)compute_b, (T *)input, deal_num); + for (int32_t i = 0; i < n_seg; ++i) { + const int32_t t = *((uint32_t *)target + i); + if (t >= c_start_index && t < c_end_index) { + const uint32_t index = i * c_num + t - c_start_index; + *((T *)input + index) = -1.0 * (*((T *)input + index)); + *((T *)compute_b + index) = 1.0 - (*((T *)compute_b + index)) + epsilon_f; + *((T *)output + index) = alpha; + } + } + if (sizeof(T) == sizeof(half)) { + __bang_half2float((float *)compute_a, (half *)compute_b, deal_num); + __bang_active_loghp((float *)compute_a, (float *)compute_a, deal_num); + __bang_mul_const((float *)compute_a, (float *)compute_a, (float)gamma, + deal_num); + __bang_active_exphp((float *)compute_a, (float *)compute_a, deal_num); + __bang_float2half_rd((half *)compute_a, (float *)compute_a, deal_num); + } else { + __bang_active_loghp((T *)compute_a, (T *)compute_b, deal_num); + __bang_mul_const((T *)compute_a, (T *)compute_a, (T)gamma, deal_num); + __bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num); + } + __bang_mul((T *)output, (T *)compute_a, (T *)output, deal_num); + + // 1. max = max(0, -x) if t == c_i + // = max(0, x) if t != c_i + __nramset((T *)compute_b, deal_num, (T)0); + __bang_maxequal((T *)compute_b, (T *)compute_b, (T *)input, deal_num); + + // 2. 
-log(p_t) = ln(e^(-max)+ e^(-max-x) + max if t == c_i + // = ln(e^(-max)+ e^(-max+x) + max if t != c_i + __bang_mul_const((T *)compute_a, (T *)compute_b, (T)-1.0, deal_num); + __bang_add((T *)input, (T *)compute_a, (T *)input, deal_num); + + __bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num); + __bang_active_exphp((T *)input, (T *)input, deal_num); + __bang_add((T *)compute_a, (T *)compute_a, (T *)input, deal_num); + __bang_active_loghp((T *)compute_a, (T *)compute_a, deal_num); + __bang_add((T *)input, (T *)compute_a, (T *)compute_b, deal_num); + + // 3. output = alpha_t * p_t^r * [-log(p_t)] + __bang_mul((T *)output, (T *)output, (T *)input, deal_num); + + // 4. with weight + if (has_weight) { + for (int32_t i = 0; i < n_seg; ++i) { + int32_t t = *((int32_t *)target + i); + if (t >= 0 && t < c) { + t = partition_nc ? 0 : t; + __bang_mul_const((T *)output + i * c_num, (T *)output + i * c_num, + *((T *)weight + t), c_num); + } + } + } +} + +template +__mlu_func__ void startPipeline( + const T *input, const int32_t *target, const T *weight, + char *nram_compute_a, char *nram_compute_b, char *nram_input, + char *nram_target, char *nram_weight, char *nram_output, + const int32_t has_weight, const int32_t partition_nc, + const int32_t pingpong_offset, const int32_t pingpong_weight_offset, + const int32_t c_offset_num, const int32_t n, const int32_t n_seg, + const int32_t c, const int32_t c_seg, const float alpha, const float gamma, + T *output) { + // with offset + input = (T *)((char *)input + c_offset_num * sizeof(T)); + output = (T *)((char *)output + c_offset_num * sizeof(T)); + + const int32_t c_seg_align_num = PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)); + const int32_t c_num = has_weight ? c_seg_align_num : c_seg; + const int32_t deal_num = PAD_UP(n_seg * c_num, NFU_ALIGN_SIZE / sizeof(T)); + const int32_t load_size = c_seg * sizeof(T); + const int32_t dram_stride = c * sizeof(T); + const int32_t nram_stride = c_num * sizeof(T); + + if (has_weight && !partition_nc) { + loadInput(nram_weight, (T *)weight, load_size, nram_stride, dram_stride, + 1); + __asm__ volatile("sync;\n\t"); + } + const int32_t repeat = n / n_seg; + const int32_t remain = n % n_seg; + + /* + * Pipeline: The pipeline is processed in three stages: Load, Compute, Store. + * The allocated memory space of NRAM is divided into two parts: + * PING and Pong. In a single time slice, PING is used to process + * IO stream and PONG is used for computation. Both of them are + * processed synchronously until finished. 
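[Editor's note] The compute() routine above evaluates, per (sample, class) element, the standard sigmoid focal loss FL = -alpha_t * (1 - p_t)^gamma * log(p_t), with -log(p_t) computed in the numerically stable log-sum-exp form described in steps 1-2 of its comments and an optional per-class weight applied afterwards. A minimal scalar sketch of the same arithmetic follows; the function name and the float-only path are illustrative and make no claim to match the kernel's half-precision handling.

// Scalar reference of sigmoid focal loss forward for one (sample, class) pair,
// mirroring the commented steps of compute() above.
#include <algorithm>
#include <cmath>
#include <cstdio>

float focal_loss_forward(float x, bool is_target_class, float alpha, float gamma) {
  float p = 1.f / (1.f + std::exp(-x));                 // sigmoid(x)
  float alpha_t = is_target_class ? alpha : 1.f - alpha;
  float one_minus_pt = is_target_class ? 1.f - p : p;   // 1 - p_t
  // -log(p_t) = log(e^{-m} + e^{z-m}) + m with z = -x (target class) or x (other),
  // m = max(0, z); this is the stable form from the kernel comments.
  float z = is_target_class ? -x : x;
  float m = std::max(0.f, z);
  float neg_log_pt = std::log(std::exp(-m) + std::exp(z - m)) + m;
  return alpha_t * std::pow(one_minus_pt, gamma) * neg_log_pt;
}

int main() {
  std::printf("%.6f\n", focal_loss_forward(2.0f, true, 0.25f, 2.0f));
  std::printf("%.6f\n", focal_loss_forward(2.0f, false, 0.25f, 2.0f));
  return 0;
}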
+ * + * diagram of PINGPONG: + * |------|-----------------------------------------------------------------| + * | | space | + * |------|-----------------------------------------------------------------| + * | time | Ping | Pong | Ping | Pong | Ping | Pong | + * |------|-----------------------------------------------------------------| + * | 0 | L0 | | | | | | + * | 1 | C0 | L1 | | | | | + * | 2 | S0 | C1 | L2 | | | | + * | 3 | | S1 | C2 | L3 | | | + * | 4 | | | S2 | C3 | L4 | | + * | 5 | | | | S3 | C4 | L5 | + * | 6 | | | | | S4 | C5 | + * | 7 | | | | | | S5 | + * |------|-----------------------------------------------------------------| + */ + + // diagram of PINGPONG: L0 + if (repeat > 0) { + loadInput(nram_input, (T *)input, load_size, nram_stride, dram_stride, + n_seg); + loadInput(nram_target, (int32_t *)target, n_seg * sizeof(int32_t)); + loadWeight(nram_weight, (T *)weight, *((int32_t *)target), c, has_weight, + partition_nc); + __asm__ volatile("sync;\n\t"); + } + + // diagram of PINGPONG: C0 and L1 + if (repeat > 1) { + compute((T *)nram_input, (int32_t *)nram_target, (T *)nram_weight, + has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)nram_output); + loadInput((char *)nram_input + pingpong_offset, (T *)input + c * n_seg, + load_size, nram_stride, dram_stride, n_seg); + loadInput((char *)nram_target + pingpong_offset, + (int32_t *)target + n_seg, n_seg * sizeof(int32_t)); + loadWeight((char *)nram_weight + pingpong_weight_offset, (T *)weight, + *((int32_t *)target + n_seg), c, has_weight, partition_nc); + __asm__ volatile("sync;\n\t"); + } + + for (int32_t i = 0; i < repeat - 2; ++i) { + storeOutput((T *)output + i * c * n_seg, + nram_output + (i % 2) * pingpong_offset, load_size, + dram_stride, nram_stride, n_seg); + loadInput((char *)nram_input + (i % 2) * pingpong_offset, + (T *)(input) + (i + 2) * c * n_seg, load_size, nram_stride, + dram_stride, n_seg); + loadInput((char *)nram_target + (i % 2) * pingpong_offset, + (int32_t *)target + (i + 2) * n_seg, + n_seg * sizeof(int32_t)); + loadWeight((char *)nram_weight + (i % 2) * pingpong_weight_offset, + (T *)weight, *((int32_t *)target + (i + 2) * n_seg), c, + has_weight, partition_nc); + compute((T *)(nram_input + ((i + 1) % 2) * pingpong_offset), + (int32_t *)(nram_target + ((i + 1) % 2) * pingpong_offset), + (T *)(nram_weight + + partition_nc * ((i + 1) % 2) * pingpong_weight_offset), + has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)(nram_output + ((i + 1) % 2) * pingpong_offset)); + __asm__ volatile("sync;\n\t"); + } + + if (repeat > 1) { + storeOutput((T *)output + (repeat - 2) * c * n_seg, + (char *)nram_output + (repeat % 2) * pingpong_offset, + load_size, dram_stride, nram_stride, n_seg); + } + + if (remain > 0) { + loadInput((char *)nram_input + (repeat % 2) * pingpong_offset, + (T *)input + repeat * c * n_seg, load_size, nram_stride, + dram_stride, remain); + loadInput((char *)nram_target + (repeat % 2) * pingpong_offset, + (int32_t *)target + repeat * n_seg, + remain * sizeof(int32_t)); + loadWeight((char *)nram_weight + (repeat % 2) * pingpong_weight_offset, + (T *)weight, *((int32_t *)target + repeat * n_seg), c, + has_weight, partition_nc); + } + + if (repeat > 0) { + compute((T *)(nram_input + ((repeat - 1) % 2) * pingpong_offset), + (int32_t *)(nram_target + ((repeat - 1) % 2) * pingpong_offset), + (T *)(nram_weight + + partition_nc * ((repeat - 1) % 
2) * pingpong_weight_offset), + has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)(nram_output + ((repeat - 1) % 2) * pingpong_offset)); + } + __asm__ volatile("sync;\n\t"); + + if (repeat > 0) { + storeOutput((T *)output + (repeat - 1) * c * n_seg, + (char *)nram_output + ((repeat - 1) % 2) * pingpong_offset, + load_size, dram_stride, nram_stride, n_seg); + } + + if (remain > 0) { + int32_t rem_num = PAD_UP(remain * c_num, NFU_ALIGN_SIZE / sizeof(T)); + compute((T *)(nram_input + (repeat % 2) * pingpong_offset), + (int32_t *)(nram_target + (repeat % 2) * pingpong_offset), + (T *)(nram_weight + + partition_nc * (repeat % 2) * pingpong_weight_offset), + has_weight, partition_nc, rem_num, remain, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)(nram_output + (repeat % 2) * pingpong_offset)); + __asm__ volatile("sync;\n\t"); + + storeOutput((T *)output + repeat * c * n_seg, + (char *)nram_output + (repeat % 2) * pingpong_offset, + load_size, dram_stride, nram_stride, remain); + } + __asm__ volatile("sync;\n\t"); +} + +template +__mlu_func__ void focalLossSigmoidForwardBlock( + const T *input, const int32_t *target, const T *weight, const int32_t n, + const int32_t c, const float alpha, const float gamma, T *output) { + /* + * NRAM partition + * |-----------------------------------------------------------------------| + * | weight | + * |------------------------------- COMPUTE -------------------------------| + * | | | + * | computeA | computeB | + * | | | + * |------------- PING ------------------------------- PONG ---------------| + * | | | + * | input | input | + * | | | + * |-----------------------------------|-----------------------------------| + * | | | + * | output | output | + * | | | + * |-----------------------------------|-----------------------------------| + * | target | target | + * |-----------------------------------|-----------------------------------| + * + * split_pipeline_num is 6: COMPUTE(computeA,computeB), PING(input,output), + * PONG(input,output). + * split_target_num is 2: PING(target), PONG(target). + * weight is not NULL: + * The nram-size of weight is equal to c_align_size when partition input-N. + * The nram-size of weight is equal to NFU_ALIGN_SIZE when partition + * input-NC. 
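[Editor's note] The PINGPONG diagram and NRAM partition comments above describe a classic three-stage software pipeline: while slice i is computed out of one half of NRAM, slice i+1 is loaded into the other half and slice i-1 is stored back, with a sync closing each time step. The following is a schematic host-side sketch of that schedule only; pingpong_pipeline and its load/compute/store callbacks are placeholders, not APIs from the patch.

// Schematic double-buffered pipeline: within one time step, store(i-1),
// compute(i) and load(i+1) overlap. In the real kernel, input and output each
// have their own ping/pong regions, so load and store do not alias.
#include <cstdio>
#include <functional>

void pingpong_pipeline(int num_slices,
                       const std::function<void(int, int)> &load,    // (slice, buffer)
                       const std::function<void(int, int)> &compute, // (slice, buffer)
                       const std::function<void(int, int)> &store) { // (slice, buffer)
  for (int t = 0; t < num_slices + 2; ++t) {
    if (t - 2 >= 0) store(t - 2, (t - 2) % 2);                       // S(i-1)
    if (t - 1 >= 0 && t - 1 < num_slices) compute(t - 1, (t - 1) % 2);  // C(i)
    if (t < num_slices) load(t, t % 2);                              // L(i+1)
    // On the MLU, a sync (e.g. __asm__ volatile("sync;")) ends each time step here.
  }
}

int main() {
  auto trace = [](const char *what) {
    return [what](int slice, int buf) {
      std::printf("%s slice %d in buffer %d\n", what, slice, buf);
    };
  };
  pingpong_pipeline(4, trace("load"), trace("compute"), trace("store"));
  return 0;
}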
+ */ + + // calculate threshold of c + const int32_t split_pipeline_num = 6; + const int32_t split_target_num = 2; + const int32_t has_weight = weight != NULL; + const int32_t threshold_c = + PAD_DOWN((MAX_NRAM_SIZE - split_target_num * sizeof(int32_t)) / + (split_pipeline_num + has_weight), + NFU_ALIGN_SIZE) / + sizeof(T); + const int32_t c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T)); + const int32_t c_align_size = c_align * sizeof(T); + + if (c <= threshold_c) { + // partition inputN + int32_t c_num = c; + int32_t reservered_align_size = + (split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE; + int32_t weight_size = 0; + if (has_weight) { + c_num = c_align; + reservered_align_size = split_target_num * NFU_ALIGN_SIZE; + weight_size = c_align_size; + } + + const int32_t remain_size = + MAX_NRAM_SIZE - weight_size - reservered_align_size; + const int32_t n_seg = + remain_size / (split_pipeline_num * c_num * sizeof(T) + + split_target_num * sizeof(int32_t)); + const int32_t split_pipeline_size = + PAD_UP(c_num * n_seg * sizeof(T), NFU_ALIGN_SIZE); + const int32_t compute_size = 2 * split_pipeline_size; + const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2; + + char *nram_weight = (char *)nram_buffer; + char *nram_compute_a = nram_weight + has_weight * c_align_size; + char *nram_compute_b = nram_compute_a + split_pipeline_size; + char *nram_input = nram_compute_b + split_pipeline_size; + char *nram_output = nram_input + split_pipeline_size; + char *nram_target = nram_output + split_pipeline_size; + + startPipeline(input, target, weight, nram_compute_a, nram_compute_b, + nram_input, nram_target, nram_weight, nram_output, + has_weight, 0, pingpong_offset, 0, 0, n, n_seg, c, c, + alpha, gamma, output); + } else { + // partition inputNC + const int32_t weight_size = has_weight * NFU_ALIGN_SIZE; + const int32_t remain_size = MAX_NRAM_SIZE - weight_size; + const int32_t split_pipeline_size = PAD_DOWN( + (remain_size - split_target_num * NFU_ALIGN_SIZE) / split_pipeline_num, + NFU_ALIGN_SIZE); + const int32_t c_seg = split_pipeline_size / sizeof(T); + const int32_t n_seg = 1; + const int32_t compute_size = 2 * split_pipeline_size; + const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2; + const int32_t pingpong_weight_offset = weight_size / 2; + + char *nram_weight = (char *)nram_buffer; + char *nram_compute_a = nram_weight + weight_size; + char *nram_compute_b = nram_compute_a + split_pipeline_size; + char *nram_input = nram_compute_b + split_pipeline_size; + char *nram_output = nram_input + split_pipeline_size; + char *nram_target = nram_output + split_pipeline_size; + + const int32_t loop_num = (c + c_seg - 1) / c_seg; + const int32_t partition_nc = 1; + for (int32_t i = 0; i < loop_num; ++i) { + const int32_t c_index = i * c_seg; + const int32_t c_seg_curr = i == (loop_num - 1) ? 
c - c_index : c_seg; + startPipeline(input, target, weight, nram_compute_a, nram_compute_b, + nram_input, nram_target, nram_weight, nram_output, + has_weight, partition_nc, pingpong_offset, + pingpong_weight_offset, c_index, n, n_seg, c, c_seg_curr, + alpha, gamma, output); + } + } +} + +template +__mlu_global__ void MLUUnion1KernelFocalLossSigmoidForward( + const void *input, const void *target, const void *weight, const int32_t N, + const int32_t C, const float alpha, const float gamma, void *output) { + const int32_t n_seg = N / taskDim + (taskId == taskDim - 1) * (N % taskDim); + const T *input_offset = (T *)input + N / taskDim * taskId * C; + const int32_t *target_offset = (int32_t *)target + N / taskDim * taskId; + T *output_offset = (T *)output + N / taskDim * taskId * C; + + focalLossSigmoidForwardBlock((T *)input_offset, (int32_t *)target_offset, + (T *)weight, n_seg, C, alpha, gamma, + (T *)output_offset); +} +} // namespace forward + +namespace backward { +template +__mlu_func__ void loadInput(char *nram_input, char *nram_target, + const T *gdram_input, const int32_t *gdram_target, + const int32_t deal_n, const int32_t total_c, + const bool pingping_flag, const bool has_weight, + const int32_t nram_offset, + const int32_t gdram_offset) { + if (pingping_flag == PONG) { + nram_input += nram_offset; + nram_target += nram_offset; + } + + __memcpy_async(nram_target, gdram_target + gdram_offset / total_c, + deal_n * sizeof(int32_t), GDRAM2NRAM); + + char *nram_input_load = nram_input; + int32_t compute_align_size = 2 * NFU_ALIGN_SIZE; + if (has_weight) { + if (sizeof(T) == sizeof(half)) { + int32_t compute_align_num = compute_align_size / sizeof(float); + int32_t align_c = PAD_UP(total_c, compute_align_num); + int32_t compute_size = deal_n * align_c * sizeof(float); + nram_input_load += compute_size / 2; + } + int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T)); + int32_t total_c_size = total_c * sizeof(T); + int32_t align_c_size = align_c * sizeof(T); + __memcpy_async(nram_input_load, gdram_input + gdram_offset, total_c_size, + GDRAM2NRAM, align_c_size, total_c_size, deal_n - 1); + } else { + if (sizeof(T) == sizeof(half)) { + int32_t compute_size = + PAD_UP(deal_n * total_c * sizeof(float), compute_align_size); + nram_input_load += compute_size / 2; + } + int32_t load_size = deal_n * total_c * sizeof(T); + __memcpy_async(nram_input_load, gdram_input + gdram_offset, load_size, + GDRAM2NRAM); + } +} + +template +__mlu_func__ void sigmoid(T *dst_data, const T *src_data, + const int32_t elem_count) { + __bang_mul_const(dst_data, (T *)src_data, T(-1), elem_count); + __bang_active_exphp(dst_data, dst_data, elem_count); + __bang_add_const(dst_data, dst_data, T(1), elem_count); + __bang_active_reciphp(dst_data, dst_data, elem_count); +} + +template +__mlu_func__ void coreCompute(char *nram_input, const T *nram_weight, + const float *nram_flt_min, char *nram_pt, + char *nram_alpha_t, char *nram_temp, + char *nram_target, const float *nram_gamma, + char *nram_output, const float alpha, + const int32_t compute_num, const int32_t deal_n, + const int32_t total_c, const bool pingpong_flag, + const int32_t nram_offset, + const bool has_weight) { + if (pingpong_flag == PONG) { + nram_input += nram_offset; + nram_pt += nram_offset; + nram_alpha_t += nram_offset; + nram_temp += nram_offset; + nram_output += nram_offset; + nram_target += nram_offset; + } + + if (sizeof(T) == sizeof(half)) { + const int32_t compute_size = compute_num * sizeof(float); + char *nram_input_load = nram_input + 
compute_size / 2; + __bang_half2float((float *)nram_input, (half *)nram_input_load, + compute_num); + } + + // 0. alpha_t = alpha - 1 + __nramset((float *)nram_alpha_t, compute_num, (float)(alpha - 1.0)); + + // 1. pt = 1 - sigmoid(x) + sigmoid((float *)nram_pt, (float *)nram_input, compute_num); + __bang_mul_const((float *)nram_pt, (float *)nram_pt, (float)(-1), + compute_num); + __bang_add_const((float *)nram_pt, (float *)nram_pt, (float)1, compute_num); + + // 2. pt = target[n] == c ? sigmoid(x) : 1 - sigmoid(x) + // alpha_t = target[n] == c ? alpha : alpha - 1 + const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(float); + for (int n = 0; n < deal_n; n++) { + const int32_t target_value = ((int32_t *)nram_target)[n]; + if (target_value >= total_c || target_value < 0) continue; + int32_t c_offset = 0; + if (has_weight) { + int32_t c_align_num = nfu_align_num; + if (sizeof(T) == sizeof(half)) { + c_align_num += nfu_align_num; + } + c_offset = PAD_UP(total_c, c_align_num); + } else { + c_offset = total_c; + } + int32_t idx = n * c_offset + target_value; + *((float *)nram_pt + idx) = 1.0 - *((float *)nram_pt + idx); + *((float *)nram_alpha_t + idx) = alpha; + } + + // 3. temp = -alpha_t * e^(gamma * log(max(1 - pt, FLT_MIN)) + __bang_mul_const((float *)nram_temp, (float *)nram_pt, (float)(-1), + compute_num); + __bang_add_const((float *)nram_temp, (float *)nram_temp, (float)(1), + compute_num); + __bang_cycle_maxequal((float *)nram_temp, (float *)nram_temp, + (float *)nram_flt_min, compute_num, nfu_align_num); + __bang_active_loghp((float *)nram_temp, (float *)nram_temp, compute_num); + __bang_cycle_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_gamma, + compute_num, nfu_align_num); + __bang_active_exphp((float *)nram_temp, (float *)nram_temp, compute_num); + __bang_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_alpha_t, + compute_num); + __bang_mul_const((float *)nram_temp, (float *)nram_temp, (float)(-1), + compute_num); + + // 4. output = 1 - pt - gamma * pt * log(max(pt, FLT_MIN)) + __bang_cycle_maxequal((float *)nram_output, (float *)nram_pt, + (float *)nram_flt_min, compute_num, nfu_align_num); + __bang_active_loghp((float *)nram_output, (float *)nram_output, compute_num); + __bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_pt, + compute_num); + __bang_cycle_mul((float *)nram_output, (float *)nram_output, + (float *)nram_gamma, compute_num, nfu_align_num); + __bang_add((float *)nram_output, (float *)nram_output, (float *)nram_pt, + compute_num); + __bang_mul_const((float *)nram_output, (float *)nram_output, (float)(-1), + compute_num); + __bang_add_const((float *)nram_output, (float *)nram_output, (float)(1), + compute_num); + + // 5. 
output = output * temp + __bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_temp, + compute_num); + + if (sizeof(T) == sizeof(half)) { + __bang_float2half_rd((half *)nram_output, (float *)nram_output, + compute_num); + } + + if (has_weight) { + // with weight + for (int n = 0; n < deal_n; n++) { + int32_t c_align_num = nfu_align_num; + if (sizeof(T) == sizeof(half)) { + c_align_num += nfu_align_num; + } + int32_t align_c = PAD_UP(total_c, c_align_num); + int32_t target_value = ((int32_t *)nram_target)[n]; + T weight_value = nram_weight[target_value]; + __bang_mul_const((T *)nram_output + n * align_c, + (T *)nram_output + n * align_c, weight_value, align_c); + } + } +} + +template +__mlu_func__ void storeOutput(T *gdram_output, const char *nram_output, + const int32_t deal_n, const int32_t total_c, + const bool pingpong_flag, const bool has_weight, + const int32_t nram_offset, + const int32_t gdram_offset) { + if (pingpong_flag == PONG) { + nram_output += nram_offset; + } + const int32_t store_size = deal_n * total_c * sizeof(T); + if (has_weight) { + int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T)); + int32_t total_c_size = total_c * sizeof(T); + int32_t align_c_size = align_c * sizeof(T); + __memcpy_async(gdram_output + gdram_offset, nram_output, total_c_size, + NRAM2GDRAM, total_c_size, align_c_size, deal_n - 1); + } else { + __memcpy_async(gdram_output + gdram_offset, nram_output, store_size, + NRAM2GDRAM); + } +} + +template +__mlu_func__ void focalLossSigmoidBackwardBlock( + const T *input, const int32_t *target, const T *weight, const float gamma, + const float alpha, const int32_t total_n, const int32_t deal_n, + const int32_t total_c, T *output) { + // params per time slice + int32_t deal_num = deal_n * total_c; + int32_t deal_size = deal_num * sizeof(float); + int32_t compute_num = 0; + int32_t compute_size = 0; + int32_t compute_align_size = NFU_ALIGN_SIZE; + const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(T); + if (sizeof(T) == sizeof(half)) { + compute_align_size += NFU_ALIGN_SIZE; + } + const int32_t compute_align_num = compute_align_size / sizeof(float); + bool has_weight = false; + if (weight != NULL) { + has_weight = true; + int32_t align_c = PAD_UP(total_c, compute_align_num); + compute_num = deal_n * align_c; + compute_size = compute_num * sizeof(float); + } else { + compute_size = PAD_UP(deal_size, compute_align_size); + compute_num = compute_size / sizeof(float); + } + + // params per core + int32_t total_num = total_n * total_c; + int32_t num_per_core = PAD_DOWN(total_num / taskDim, deal_num); + int32_t loop_per_core = num_per_core / deal_num; + + /* NRAM partition: + * + * |-----------------ping pong--------------------| + * |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight| + * + * split_pipeline_num is 5: input, pt, alpha_t, temp, output. + * nram_reserved_line_num is 2: flt_min, gamma. 
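[Editor's note] coreCompute() above evaluates the analytic gradient of the sigmoid focal loss element-wise: with p_t and alpha_t defined as in its comments (alpha for the target class, alpha - 1 otherwise), the stored value is -alpha_t * (1 - p_t)^gamma * (1 - p_t - gamma * p_t * log(max(p_t, FLT_MIN))), optionally scaled by the per-class weight. A scalar sketch of the same expression (illustrative name, float-only path) is:

// Scalar reference of the backward expression mirrored by coreCompute() above.
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdio>

float focal_loss_backward(float x, bool is_target_class, float alpha, float gamma) {
  float p = 1.f / (1.f + std::exp(-x));                 // sigmoid(x)
  float pt = is_target_class ? p : 1.f - p;
  float alpha_t = is_target_class ? alpha : alpha - 1.f;
  // temp = -alpha_t * (1 - pt)^gamma, computed in the kernel via exp(gamma * log(.))
  float temp = -alpha_t * std::pow(std::max(1.f - pt, FLT_MIN), gamma);
  // out = 1 - pt - gamma * pt * log(max(pt, FLT_MIN))
  float out = 1.f - pt - gamma * pt * std::log(std::max(pt, FLT_MIN));
  return temp * out;  // d(loss)/d(x), before any per-class weight
}

int main() {
  std::printf("%.6f\n", focal_loss_backward(2.0f, true, 0.25f, 2.0f));
  std::printf("%.6f\n", focal_loss_backward(-1.0f, false, 0.25f, 2.0f));
  return 0;
}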
+ */ + const int32_t split_pipeline_num = 5; + const int32_t nram_reserved_line_num = 2; + int32_t target_deal_size = deal_n * sizeof(int32_t); + int32_t target_deal_size_align = PAD_UP(target_deal_size, NFU_ALIGN_SIZE); + // nram PING/PONG offset + int32_t ping_pong_offset = + compute_size * split_pipeline_num + target_deal_size_align; + + // gdram addr + int32_t *base_addr_target = + (int32_t *)target + taskId * loop_per_core * deal_n; + T *base_addr_input = (T *)input + taskId * num_per_core; + T *base_addr_output = output + taskId * num_per_core; + + // nram addr + char *nram_input = (char *)nram_buffer; + char *nram_pt = nram_input + compute_size; + char *nram_alpha_t = nram_pt + compute_size; + char *nram_temp = nram_alpha_t + compute_size; + char *nram_output = nram_temp + compute_size; + char *nram_target = nram_output + compute_size; + float *nram_flt_min = NULL; + float *nram_gamma = NULL; + T *nram_weight = NULL; + + if (!has_weight) { + nram_flt_min = (float *)(nram_buffer + MAX_NRAM_SIZE - + nram_reserved_line_num * NFU_ALIGN_SIZE); + nram_gamma = nram_flt_min + nfu_align_num; + } else { + int32_t weight_space = PAD_UP(total_c * sizeof(T), NFU_ALIGN_SIZE); + nram_flt_min = + (float *)(nram_buffer + MAX_NRAM_SIZE - + nram_reserved_line_num * NFU_ALIGN_SIZE - weight_space); + nram_gamma = nram_flt_min + nfu_align_num; + nram_weight = (T *)(nram_gamma + nfu_align_num); + __memcpy_async(nram_weight, weight, total_c * sizeof(T), GDRAM2NRAM); + } + + // nram set gamma and FLT_MIN + __nramset(nram_gamma, nfu_align_num, gamma); + __nramset(nram_flt_min, nfu_align_num, FLT_MIN); + + /* + * Pipeline: The pipeline is processed in three stages: Load, Compute, Store. + * The allocated memory space of NRAM is divided into two parts: + * PING and Pong. In a single time slice, PING is used to process + * IO stream and PONG is used for computation. Both of them are + * processed synchronously until finished. 
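[Editor's note] Before the pipeline starts, the backward block above splits the N*C elements evenly across the taskDim cores, rounding each core's share down to a multiple of the per-iteration tile (deal_num) and handling the leftover rows in the separate rem_* pass shown later. A hedged host-side sketch of that partitioning arithmetic, with illustrative values (deal_n is chosen by the host launcher, which is not shown in this hunk):

// Illustrative view of the per-core split: each core processes
// PAD_DOWN(total/taskDim, deal_num) elements; the rest goes to a tail pass.
#include <cstdio>

constexpr int pad_down(int x, int y) { return (x / y) * y; }

int main() {
  const int total_n = 1000, total_c = 80, task_dim = 16;
  const int deal_n = 6;                    // rows per pipeline iteration (example value)
  const int deal_num = deal_n * total_c;   // elements per iteration
  const int total_num = total_n * total_c;
  const int num_per_core = pad_down(total_num / task_dim, deal_num);
  const int loop_per_core = num_per_core / deal_num;
  const int remainder = total_num - num_per_core * task_dim;  // handled by the rem_* path
  std::printf("per-core elements: %d (%d iterations), remainder: %d\n",
              num_per_core, loop_per_core, remainder);
  return 0;
}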
+ * + * diagram of PINGPONG: + * |------|-----------------------------------------------------------------| + * | | space | + * |------|-----------------------------------------------------------------| + * | time | Ping | Pong | Ping | Pong | Ping | Pong | + * |------|-----------------------------------------------------------------| + * | 0 | L0 | | | | | | + * | 1 | C0 | L1 | | | | | + * | 2 | S0 | C1 | L2 | | | | + * | 3 | | S1 | C2 | L3 | | | + * | 4 | | | S2 | C3 | L4 | | + * | 5 | | | | S3 | C4 | L5 | + * | 6 | | | | | S4 | C5 | + * | 7 | | | | | | S5 | + * |------|-----------------------------------------------------------------| + */ + + // diagram of PINGPONG: L0 + if (loop_per_core > 0) { + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PING, has_weight, ping_pong_offset, 0); + __asm__ volatile("sync;"); + } + + // diagram of PINGPONG: C0 and L1 + if (loop_per_core > 1) { + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PONG, has_weight, ping_pong_offset, deal_num); + __asm__ volatile("sync;"); + } + + for (int i = 0; i < loop_per_core - 2; ++i) { + if (i % 2 == PING) { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PING, + has_weight, ping_pong_offset, i * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PONG, ping_pong_offset, + has_weight); + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PING, has_weight, ping_pong_offset, + (i + 2) * deal_num); + } else { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, + has_weight, ping_pong_offset, i * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PONG, has_weight, ping_pong_offset, + (i + 2) * deal_num); + } + __asm__ volatile("sync;"); + } + + if (loop_per_core > 1) { + if ((loop_per_core - 2) % 2 == PING) { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PING, + has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PONG, ping_pong_offset, + has_weight); + } else { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, + has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + } + __asm__ volatile("sync;"); + } + + if (loop_per_core > 0) { + if (loop_per_core == 1) { + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + __asm__ volatile("sync;"); + } + if ((loop_per_core - 1) % 2 == PING) { + storeOutput(base_addr_output, nram_output, 
deal_n, total_c, PING, + has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num); + } else { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, + has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num); + } + } + + // process the remaining data which N remainder per core is less than deal_n + int32_t rem_for_all = total_num - num_per_core * taskDim; + if (rem_for_all == 0) return; + int32_t rem_n_for_all = rem_for_all / total_c; + int32_t rem_n_per_core = (rem_n_for_all + taskDim - 1) / taskDim; + int32_t rem_num_per_core = rem_n_per_core * total_c; + int32_t rem_num_per_core_align = 0; + int32_t rem_core_num = rem_for_all / rem_num_per_core; + + int32_t rem_n_for_last = rem_n_for_all % rem_n_per_core; + int32_t rem_num_for_last = rem_n_for_last * total_c; + int32_t rem_num_for_last_align = 0; + + if (has_weight) { + int32_t align_c = PAD_UP(total_c, compute_align_num); + rem_num_per_core_align = rem_n_per_core * align_c; + rem_num_for_last_align = rem_n_for_last * align_c; + } else { + rem_num_per_core_align = PAD_UP(rem_num_per_core, compute_align_num); + rem_num_for_last_align = PAD_UP(rem_num_for_last, compute_align_num); + } + + int32_t rem_addr_base = num_per_core * taskDim; + int32_t rem_target_addr_base = loop_per_core * deal_n * taskDim; + base_addr_target = (int32_t *)target + rem_target_addr_base; + base_addr_input = (T *)input + rem_addr_base; + base_addr_output = output + rem_addr_base; + + if (taskId < rem_core_num) { + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + rem_n_per_core, total_c, PING, has_weight, ping_pong_offset, + taskId * rem_num_per_core); + __asm__ volatile("sync;"); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + rem_num_per_core_align, rem_n_per_core, total_c, PING, + ping_pong_offset, has_weight); + __asm__ volatile("sync;"); + storeOutput(base_addr_output, nram_output, rem_n_per_core, total_c, PING, + has_weight, ping_pong_offset, taskId * rem_num_per_core); + } else if (taskId == rem_core_num) { + if (rem_num_for_last == 0) return; + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + rem_n_for_last, total_c, PING, has_weight, ping_pong_offset, + taskId * rem_num_per_core); + __asm__ volatile("sync;"); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + rem_num_for_last_align, rem_n_for_last, total_c, PING, + ping_pong_offset, has_weight); + __asm__ volatile("sync;"); + storeOutput(base_addr_output, nram_output, rem_n_for_last, total_c, PING, + has_weight, ping_pong_offset, taskId * rem_num_per_core); + } else { + return; + } +} + +template +__mlu_global__ void MLUUnion1KernelFocalLossSigmoidBackward( + const void *input, const void *target, const void *weight, + const float gamma, const float alpha, const int32_t total_n, + const int32_t deal_n, const int32_t total_c, void *output) { + focalLossSigmoidBackwardBlock((T *)input, (int32_t *)target, (T *)weight, + gamma, alpha, total_n, deal_n, total_c, + (T *)output); +} +} // namespace backward + +void KernelFocalLossSigmoidForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, + const cnrtDataType_t d_type, + const void *input, const void *target, + const void *weight, const int32_t N, + const int32_t C, const float alpha, + const float gamma, void *output) { + if (d_type == CNRT_FLOAT16) { + forward::MLUUnion1KernelFocalLossSigmoidForward< + 
half><<>>(input, target, weight, N, C, alpha, + gamma, output); + } else { + forward::MLUUnion1KernelFocalLossSigmoidForward< + float><<>>(input, target, weight, N, C, alpha, + gamma, output); + } +} + +void KernelFocalLossSigmoidBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, + const cnrtDataType_t d_type, + const void *input, const void *target, + const void *weight, const float gamma, + const float alpha, const int32_t dim_n, + const int32_t deal_n, const int32_t dim_c, + void *output) { + if (d_type == CNRT_FLOAT16) { + backward::MLUUnion1KernelFocalLossSigmoidBackward< + half><<>>(input, target, weight, gamma, alpha, + dim_n, deal_n, dim_c, output); + } else { + backward::MLUUnion1KernelFocalLossSigmoidBackward< + float><<>>(input, target, weight, gamma, alpha, + dim_n, deal_n, dim_c, output); + } +} diff --git a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu new file mode 100644 index 0000000000..7cb16bb100 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu @@ -0,0 +1,1161 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "common_mlu_helper.hpp" + +#define NMS_SIZE (64) +#define COORD_DIM (4) +#define MEMORY_CORE (0x80) +#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score +#define REDUCE_NUM \ + (7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input) + +#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024) +#define SIZE_SRAM_BUF (MAX_SRAM_SIZE) + +__nram__ int8_t nram_buffer[SIZE_NRAM_BUF]; +__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF]; + +__mlu_func__ void pvLock() { +#if __BANG_ARCH__ == 270 + if (coreId != MEMORY_CORE) { + __bang_lock(0, 0); + } +#endif +} + +__mlu_func__ void pvUnlock() { +#if __BANG_ARCH__ == 270 + if (coreId != MEMORY_CORE) { + __bang_unlock(0, 0); + } +#endif +} + +enum Addr { SRAM, GDRAM }; + +template +__mlu_func__ void nms_detection( + uint32_t *output_box_num, const int output_mode, const int input_layout, + OUT_DT *output_data, const Addr dst, IN_DT *input_data_score, + const IN_DT *input_data_box, const Addr src, IN_DT *buffer, + const int buffer_size, IN_DT *sram, const int core_limit, + const int input_box_num, const int input_stride, const int output_stride, + const int keepNum, const float thresh_iou, const float thresh_score, + const float offset, const int algo) { + // global value, it is stored in sram with a offset from the begin. + const int flag_offset_size = 28; + int32_t *loop_end_flag = (int32_t *)(sram + flag_offset_size); + loop_end_flag[0] = 0; + // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 + const int nms_buffer_count1 = 9; + // temp nram buffer to store selected target. 
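// Reference-only sketch (shown out of line, not part of the kernel code): the
// greedy NMS that nms_detection below implements with vectorized BANG
// instructions, written as plain scalar C++ for the common thresh_iou > 0
// path. Boxes are assumed to be (x1, y1, x2, y2); `offset` follows the
// algo == 1 convention (pass 0 for algo == 0). The names iou_ref, nms_ref,
// fmin_ref and fmax_ref are illustrative, not kernel symbols.
static inline float fmin_ref(float a, float b) { return a < b ? a : b; }
static inline float fmax_ref(float a, float b) { return a > b ? a : b; }

static inline float iou_ref(const float *a, const float *b, float offset) {
  float iw = fmin_ref(a[2], b[2]) - fmax_ref(a[0], b[0]) + offset;
  float ih = fmin_ref(a[3], b[3]) - fmax_ref(a[1], b[1]) + offset;
  if (iw <= 0.f || ih <= 0.f) return 0.f;
  float inter = iw * ih;
  float area_a = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
  float area_b = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
  return inter / (area_a + area_b - inter);
}

// Scores are zeroed in place, mirroring input_score_ptr in the kernel.
static int nms_ref(const float *boxes, float *scores, int num_boxes,
                   int keep_num, float thresh_iou, float thresh_score,
                   float offset, int *kept /* capacity >= keep_num */) {
  int num_kept = 0;
  for (int k = 0; k < keep_num; ++k) {
    int best = 0;
    for (int i = 1; i < num_boxes; ++i)
      if (scores[i] > scores[best]) best = i;
    if (scores[best] <= thresh_score) break;  // same early-exit as the kernel
    kept[num_kept++] = best;
    scores[best] = 0.f;  // kernel: input_score_ptr[max_index] = 0
    for (int i = 0; i < num_boxes; ++i)
      if (iou_ref(boxes + 4 * i, boxes + 4 * best, offset) > thresh_iou)
        scores[i] = 0.f;  // suppress overlapping boxes
  }
  return num_kept;
}
// Note: the kernel reaches the same decision without a per-box division by
// comparing area_U against area_I * (1 / thresh_iou) via __bang_ge.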
+ const int nram_save_limit_count = 256; + float div_thresh_iou = 1.0 / thresh_iou; + + // input data ptr + IN_DT *input_score_ptr; + const IN_DT *input_x1_ptr; + const IN_DT *input_y1_ptr; + const IN_DT *input_x2_ptr; + const IN_DT *input_y2_ptr; + input_score_ptr = input_data_score; + input_x1_ptr = input_data_box; + if (input_layout == 0) { + // [boxes_num, 4] + input_y1_ptr = input_x1_ptr + 1; + input_x2_ptr = input_x1_ptr + 2; + input_y2_ptr = input_x1_ptr + 3; + } else if (input_layout == 1) { + // [4, boxes_num] + input_y1_ptr = input_x1_ptr + input_stride; + input_x2_ptr = input_y1_ptr + input_stride; + input_y2_ptr = input_x2_ptr + input_stride; + } + + // nram data ptr + IN_DT *x1; + IN_DT *y1; + IN_DT *x2; + IN_DT *y2; + IN_DT *score; + IN_DT *inter_x1; + IN_DT *inter_y1; + IN_DT *inter_x2; + IN_DT *inter_y2; + IN_DT *max_box; // the max score, x1, y1, x2, y2 + IN_DT *x1_mask; + IN_DT *y1_mask; + IN_DT *x2_mask; + IN_DT *y2_mask; + OUT_DT *nram_save; + + int limit = 0; // find limit when GDRAM or SRAM + int len_core = 0; // the length deal by every core + int max_seg_pad = 0; // the max length every repeat + int repeat = 0; + int remain = 0; + int remain_pad = 0; + int input_offset = 0; // offset of input_data for current core + int nram_save_count = 0; + // mask for collect x1, y1, x2, y2. each mask has 128 elements + const int mask_size = 128; + const int total_mask_size = 512; + + if (output_mode == 0) { + limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * sizeof(OUT_DT) - + total_mask_size * sizeof(IN_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } else { + limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * INFO_NUM * sizeof(OUT_DT) - + total_mask_size * sizeof(IN_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } + + if (core_limit == 1) { + len_core = input_box_num; + input_offset = 0; + } else { + int avg_core = input_box_num / core_limit; + int rem = input_box_num % core_limit; + len_core = avg_core + (taskId < rem ? 1 : 0); + input_offset = avg_core * taskId + (taskId <= rem ? 
taskId : rem); + } + max_seg_pad = PAD_DOWN(limit, NMS_SIZE); + repeat = len_core / max_seg_pad; + remain = len_core % max_seg_pad; + remain_pad = PAD_UP(remain, NMS_SIZE); + + // if datatype is half, we should convert it to float when compute the IoU + int max_seg_iou_compute = + PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE); + int repeat_iou_compute = len_core / max_seg_iou_compute; + int remain_iou_compute = len_core % max_seg_iou_compute; + int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE); + // initial the address point + score = buffer; + x1 = score + max_seg_pad; + y1 = x1 + max_seg_pad; + x2 = y1 + max_seg_pad; + y2 = x2 + max_seg_pad; + inter_x1 = y2 + max_seg_pad; + inter_y1 = inter_x1 + max_seg_pad; + inter_x2 = inter_y1 + max_seg_pad; + inter_y2 = inter_x2 + max_seg_pad; + x1_mask = inter_y2 + max_seg_pad; + y1_mask = x1_mask + mask_size; + x2_mask = y1_mask + mask_size; + y2_mask = x2_mask + mask_size; + max_box = y2_mask + mask_size; // the max score, x1, y1, x2, y2 + // offset two line from max_box + nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE); + + // set mask for __bang_collect instruction + if (input_layout == 0) { + __nramset((IN_DT *)x1_mask, total_mask_size, (IN_DT)0); + for (int idx = 0; idx < mask_size; idx++) { + int index = (idx % COORD_DIM) * mask_size + idx; + x1_mask[index] = (IN_DT)1.0; + } + } + + for (int keep = 0; keep < keepNum; keep++) { // loop until the max_score <= 0 + if (core_limit != 1) { + __sync_cluster(); // sync before current loop + } + + /******find max start******/ + int max_index = 0; // the max score index + int global_max_index = 0; // for U1 + float max_area = 0; // the max score area + max_box[0] = 0; // init 0 + + for (int i = 0; i <= repeat; i++) { + if (i == repeat && remain == 0) { + break; + } + int seg_len = 0; // the length every nms compute + int cpy_len = 0; // the length every nms memcpy + i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad; + // check seg_len exceeds the limit of fp16 or not. 65536 is the largest + // num that half data type could express. + if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) { + // seg length exceeds the max num for fp16 datatype! + return; + } + i == repeat ? 
cpy_len = remain : cpy_len = max_seg_pad; + /******nms load start******/ + mluMemcpyDirection_t load_dir = SRAM2NRAM; + if (src == SRAM) { + load_dir = SRAM2NRAM; + } else { + load_dir = GDRAM2NRAM; + } + __nramset(score, seg_len, (IN_DT)0); + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + + /******nms load end******/ + + __bang_max(inter_x1, score, seg_len); + if (inter_x1[0] > max_box[0]) { + max_box[0] = inter_x1[0]; + + if (sizeof(IN_DT) == sizeof(half)) { + max_index = ((uint16_t *)inter_x1)[1] + input_offset + + i * max_seg_pad; // offset start from head of input_data + } else if (sizeof(IN_DT) == sizeof(float)) { + max_index = ((uint32_t *)inter_x1)[1] + input_offset + + i * max_seg_pad; // offset start from head of input_data + } + } + } // for repeat + + int stride = 1; + if (input_layout == 0) { + stride = input_stride; + } else if (input_layout == 1) { + stride = 1; + } + + if (core_limit == 1) { + max_box[1] = input_x1_ptr[max_index * stride]; + max_box[2] = input_y1_ptr[max_index * stride]; + max_box[3] = input_x2_ptr[max_index * stride]; + max_box[4] = input_y2_ptr[max_index * stride]; + if (algo == 0 || offset == 0.0) { + max_area = ((float)max_box[3] - (float)max_box[1]) * + ((float)max_box[4] - (float)max_box[2]); + } else { + max_area = ((float)max_box[3] - (float)max_box[1] + offset) * + ((float)max_box[4] - (float)max_box[2] + offset); + } + input_score_ptr[max_index] = 0; + global_max_index = max_index; + ((uint32_t *)(max_box + INFO_NUM))[0] = max_index; + } else if (core_limit == 4) { + // find the max with sram + // the max box's x1, y1, x2, y2 on every core + if (coreId != MEMORY_CORE) { + max_box[1] = input_x1_ptr[max_index * stride]; + max_box[2] = input_y1_ptr[max_index * stride]; + max_box[3] = input_x2_ptr[max_index * stride]; + max_box[4] = input_y2_ptr[max_index * stride]; + } + ((uint32_t *)(max_box + INFO_NUM))[0] = max_index; + // copy every core's box info to sram, form: score---x1---y1---x2---y2--- + for (int i = 0; i < INFO_NUM; i++) { + __memcpy(sram + i * core_limit + taskId, max_box + i, 1 * sizeof(IN_DT), + NRAM2SRAM); + } + // copy every core's max_index to sram, use 2 half to store max_index + __memcpy(sram + INFO_NUM * core_limit + taskId * 2, max_box + INFO_NUM, + sizeof(uint32_t), + NRAM2SRAM); // int32_t datatype + __sync_cluster(); + + // copy score from sram to nram and find the max + __nramset(inter_x1, NMS_SIZE, (IN_DT)0); + __memcpy(inter_x1, sram, core_limit * sizeof(IN_DT), SRAM2NRAM); + __bang_max(max_box, inter_x1, NMS_SIZE); + int max_core = 0; + if (sizeof(IN_DT) == sizeof(half)) { + max_core = ((uint16_t *)max_box)[1]; + } else if (sizeof(IN_DT) == sizeof(float)) { + max_core = ((uint32_t *)max_box)[1]; + } + + // copy the max box from SRAM to NRAM + __memcpy(max_box + 1, sram + 1 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // x1 + __memcpy(max_box + 2, sram + 2 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // y1 + __memcpy(max_box + 3, sram + 3 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // x2 + __memcpy(max_box + 4, sram + 4 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // y2 + __memcpy(max_box + 5, sram + 5 * core_limit + 2 * max_core, + sizeof(uint32_t), SRAM2NRAM); + if (algo == 0 || offset == 0.0) { + max_area = ((float)max_box[3] - (float)max_box[1]) * + ((float)max_box[4] - (float)max_box[2]); + } else { + max_area = ((float)max_box[3] - (float)max_box[1] + 
offset) * + ((float)max_box[4] - (float)max_box[2] + offset); + } + global_max_index = ((uint32_t *)(max_box + INFO_NUM))[0]; + input_score_ptr[global_max_index] = 0; + } + // by now, we get: max_score|max_index|max_box|max_area + /******find max end******/ + + /******nms store start******/ + // store to nram + if (float(max_box[0]) > thresh_score) { + OUT_DT *save_ptr; + int save_offset = 0; + int save_str_num = 0; + save_ptr = nram_save; + save_offset = nram_save_count; + save_str_num = nram_save_limit_count; + if (coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + __memcpy(save_ptr + save_offset, (uint32_t *)(max_box + INFO_NUM), + 1 * sizeof(uint32_t), NRAM2NRAM, 1 * sizeof(uint32_t), + 1 * sizeof(uint32_t), 0); + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + __memcpy(save_ptr + save_offset * INFO_NUM, max_box, + INFO_NUM * sizeof(IN_DT), NRAM2NRAM, + INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0); + } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- + __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), + NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), + 4); + } + } + nram_save_count++; + (*output_box_num)++; + } + + // store to sram/gdram + if (*output_box_num != 0) { + mluMemcpyDirection_t store_dir = NRAM2GDRAM; + if (dst == SRAM) { + store_dir = NRAM2SRAM; + } else { // dst == GDRAM + store_dir = NRAM2GDRAM; + } + if ((nram_save_count == nram_save_limit_count) || + (float(max_box[0]) <= thresh_score) || keep == keepNum - 1) { + if (nram_save_count != 0) { + if (coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + pvLock(); + __memcpy(output_data, nram_save, + nram_save_count * sizeof(uint32_t), store_dir); + pvUnlock(); + output_data += nram_save_count; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + pvLock(); + __memcpy(output_data, nram_save, + nram_save_count * INFO_NUM * sizeof(IN_DT), store_dir); + pvUnlock(); + output_data += nram_save_count * INFO_NUM; + } else if (output_mode == + 2) { // score---, x1---, y1---, x2---, y2--- + pvLock(); + __memcpy(output_data, nram_save, nram_save_count * sizeof(IN_DT), + store_dir, output_stride * sizeof(IN_DT), + nram_save_limit_count * sizeof(IN_DT), 4); + pvUnlock(); + output_data += nram_save_count; + } + nram_save_count = 0; + } + } + } // if move data nram->sram/gdram + } // if dst + + // if the max score <= 0, end + if (core_limit == 1) { + if (float(max_box[0]) <= thresh_score) { + break; + } + } else { + if (float(max_box[0]) <= thresh_score) { + if (coreId == 0) { + loop_end_flag[0] = 1; + } + } + __sync_cluster(); + if (loop_end_flag[0] == 1) { + break; + } + } + /******nms store end******/ + + // To solve half data accuracy, we convert half to float to calculate IoU. + for (int i = 0; i <= repeat_iou_compute; i++) { + if (i == repeat_iou_compute && remain_iou_compute == 0) { + break; + } + int seg_len = 0; // the length every nms compute + int cpy_len = 0; // the length every nms memcpy + i == repeat_iou_compute ? seg_len = remain_pad_iou_compute + : seg_len = max_seg_iou_compute; + i == repeat_iou_compute ? 
cpy_len = remain_iou_compute + : cpy_len = max_seg_iou_compute; + + /******nms load start******/ + mluMemcpyDirection_t load_dir = SRAM2NRAM; + if (src == SRAM) { + load_dir = SRAM2NRAM; + } else { + load_dir = GDRAM2NRAM; + } + + __nramset((float *)score, seg_len, 0.0f); + int dt_offset = 0; + if (sizeof(IN_DT) == sizeof(float)) { + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + dt_offset = 0; + } else if (sizeof(IN_DT) == sizeof(half)) { + __nramset(x1, seg_len, half(0)); + __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __bang_half2float((float *)score, (half *)x1, seg_len); + dt_offset = max_seg_iou_compute; + } + + if (input_layout == 0) { + // the following number 4 means x1, y1, x2, y2 + __memcpy( + inter_x1, + input_x1_ptr + (input_offset + i * max_seg_iou_compute) * COORD_DIM, + cpy_len * COORD_DIM * sizeof(IN_DT), load_dir, + cpy_len * COORD_DIM * sizeof(IN_DT), + cpy_len * COORD_DIM * sizeof(IN_DT), 0); + // here use collect instruction to transpose the [n, 4] shape into [4, + // n] shape to avoid + // discrete memory accessing. + for (int c_i = 0; c_i < COORD_DIM * seg_len / mask_size; c_i++) { + // the following number 32 means 32 elements will be selected out by + // once operation + __bang_collect(x1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + x1_mask, mask_size); + __bang_collect(y1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + y1_mask, mask_size); + __bang_collect(x2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + x2_mask, mask_size); + __bang_collect(y2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + y2_mask, mask_size); + } + } else if (input_layout == 1) { + __memcpy(x1 + dt_offset, + input_x1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __memcpy(y1 + dt_offset, + input_y1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __memcpy(x2 + dt_offset, + input_x2_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __memcpy(y2 + dt_offset, + input_y2_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + } + /******nms load end******/ + + /******nms compute start******/ + if (sizeof(IN_DT) == sizeof(half)) { + __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, + seg_len); + } + // 1、 compute IOU + // get the area_I + __nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1 + __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, + seg_len); // inter_x1 + __nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2 + __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2, + seg_len); // inter_x2 + __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, 
seg_len); + } + __bang_active_relu((float *)inter_x1, (float *)inter_x1, + seg_len); // inter_w + __nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1 + __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2, + seg_len); // inter_y1 + __nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2 + __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, + seg_len); // inter_y2 + __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + } + __bang_active_relu((float *)inter_y1, (float *)inter_y1, + seg_len); // inter_h + __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, + seg_len); // area_I + // get the area of input_box: area = (x2 - x1) * (y2 - y1); + __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); + __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len); + } + __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, + seg_len); // area + // get the area_U: area + max_area - area_I + __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area), + seg_len); + __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, + seg_len); // area_U + // 2、 select the box + // if IOU greater than thres, set the score to zero, abort it: area_U > + // area_I * (1 / thresh)? + if (thresh_iou > 0.0) { + __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou, + seg_len); + } else { + __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou, + seg_len); + } + __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); + /******nms compute end******/ + + // update the score + mluMemcpyDirection_t update_dir = NRAM2SRAM; + if (dst == SRAM) { + update_dir = NRAM2SRAM; + } else { + update_dir = NRAM2GDRAM; + } + if (sizeof(IN_DT) == sizeof(half)) { + __bang_float2half_rd((half *)score, (float *)score, seg_len); + } + pvLock(); + __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, + cpy_len * sizeof(IN_DT), update_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + pvUnlock(); + } // for repeat + } // for keepNum +} + +__mlu_global__ void MLUUnion1KernelNMS( + const void *input_boxes, const void *input_confidence, + const int input_num_boxes, const int input_stride, + const int max_output_size, const float iou_threshold, + const float confidence_threshold, const int mode, const int input_layout, + void *workspace, void *result_num, void *output, + const cnrtDataType_t data_type_input, const float offset, const int algo) { + if (data_type_input == CNRT_FLOAT16) { + __memcpy(workspace, input_confidence, input_num_boxes * sizeof(half), + GDRAM2GDRAM); + } else if (data_type_input == CNRT_FLOAT32) { + __memcpy(workspace, input_confidence, input_num_boxes * sizeof(float), + GDRAM2GDRAM); + } else { + } + + int output_stride = max_output_size; + uint32_t result_box_num = 0; + if (mode == 0) { + uint32_t *out_data = (uint32_t *)output; + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *boxes_data = (half *)input_boxes; + half *confi_data = (half *)workspace; + half *buffer = (half *)nram_buffer; + half *sram 
= (half *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + case CNRT_FLOAT32: { + float *boxes_data = (float *)input_boxes; + float *confi_data = (float *)workspace; + float *buffer = (float *)nram_buffer; + float *sram = (float *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + } + } else { + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *boxes_data = (half *)input_boxes; + half *confi_data = (half *)workspace; + half *out_data = (half *)output; + half *buffer = (half *)nram_buffer; + half *sram = (half *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + case CNRT_FLOAT32: { + float *boxes_data = (float *)input_boxes; + float *confi_data = (float *)workspace; + float *out_data = (float *)output; + float *buffer = (float *)nram_buffer; + float *sram = (float *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + } + } +} + +template +__mlu_func__ void nms_detection_ux( + int32_t *loop_end_flag, uint32_t &output_box_num, OUT_DT *output_dram, + IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram, + const int input_layout, const int input_num_boxes, const int input_stride, + const int max_output_size, const float thresh_iou, const float thresh_score, + const float offset, const int output_mode, const int algo) { + loop_end_flag[0] = 0; + IN_DT *sram = (IN_DT *)sram_buffer; + + // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 + int nms_buffer_count1 = 9; + // temp nram buffer to store selected target. 
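// Reference-only sketch (not part of the kernel code): the balanced split used
// just below to divide input_num_boxes first over clusters and then over the
// cores of one cluster. The first (total % parts) workers receive one extra
// element, so per-worker lengths differ by at most one. The helper names
// split_len_ref / split_offset_ref are illustrative.
static inline int split_len_ref(int total, int parts, int id) {
  return total / parts + (id < total % parts ? 1 : 0);
}
static inline int split_offset_ref(int total, int parts, int id) {
  int avg = total / parts;
  int rem = total % parts;
  return avg * id + (id < rem ? id : rem);  // min(id, rem) extra slots before id
}
// Example: total = 10, parts = 4 -> lengths {3, 3, 2, 2}, offsets {0, 3, 6, 8}.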
+ int nram_save_limit_count = 256; + float div_thresh_iou = 1.0 / thresh_iou; + + // input data ptr + IN_DT *input_score_ptr; + const IN_DT *input_x1_ptr; + const IN_DT *input_y1_ptr; + const IN_DT *input_x2_ptr; + const IN_DT *input_y2_ptr; + input_score_ptr = score_data; + input_x1_ptr = boxes_data; + input_y1_ptr = input_x1_ptr + input_stride; + input_x2_ptr = input_y1_ptr + input_stride; + input_y2_ptr = input_x2_ptr + input_stride; + + int limit = 0; // find limit when GDRAM or SRAM + int max_seg_pad = 0; // the max length every repeat + int repeat = 0; + int remain = 0; + int remain_pad = 0; + int nram_save_count = 0; + + if (output_mode == 0) { + limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * sizeof(OUT_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } else { + limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * INFO_NUM * sizeof(OUT_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } + + // data split + int avg_cluster = input_num_boxes / clusterDim; + int rem_cluster = input_num_boxes % clusterDim; + int len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0); + int cluster_offset = avg_cluster * clusterId + + (clusterId <= rem_cluster ? clusterId : rem_cluster); + + int avg_core = len_cluster / coreDim; + int rem_core = len_cluster % coreDim; + int len_core = avg_core + (coreId < rem_core ? 1 : 0); + int core_offset = + avg_core * coreId + (coreId <= rem_core ? coreId : rem_core); + int input_offset = cluster_offset + core_offset; + + max_seg_pad = PAD_DOWN(limit, NMS_SIZE); + + // core 0 of each cluster calculate the max score index + int max_index_avg_core = input_num_boxes / clusterDim; + int max_index_rem_core = input_num_boxes % clusterDim; + int max_index_len_core = + max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0); + int max_index_input_offset = + max_index_avg_core * clusterId + + (clusterId <= max_index_rem_core ? clusterId : max_index_rem_core); + repeat = max_index_len_core / max_seg_pad; + remain = max_index_len_core % max_seg_pad; + remain_pad = PAD_UP(remain, NMS_SIZE); + + // if datatype is fp16, we should cvt to fp32 when compute iou + int max_seg_iou_compute = + PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE); + int repeat_iou_compute = len_core / max_seg_iou_compute; + int remain_iou_compute = len_core % max_seg_iou_compute; + int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE); + + // init the nram ptr + IN_DT *score = (IN_DT *)nram_buffer; + IN_DT *x1 = score + max_seg_pad; + IN_DT *y1 = x1 + max_seg_pad; + IN_DT *x2 = y1 + max_seg_pad; + IN_DT *y2 = x2 + max_seg_pad; + IN_DT *inter_x1 = y2 + max_seg_pad; + IN_DT *inter_y1 = inter_x1 + max_seg_pad; + IN_DT *inter_x2 = inter_y1 + max_seg_pad; + IN_DT *inter_y2 = inter_x2 + max_seg_pad; + IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2 + OUT_DT *nram_save = + (OUT_DT *)((char *)max_box + + NFU_ALIGN_SIZE); // offset two line from max_box + + mluMemcpyDirection_t input_load_dir = SRAM2NRAM; + mluMemcpyDirection_t input_store_dir = NRAM2SRAM; + input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; + input_store_dir = (input_ram == SRAM) ? 
NRAM2SRAM : NRAM2GDRAM; + + for (int keep = 0; keep < max_output_size; + keep++) { // loop until the max_score <= 0 + __sync_all(); + + /******FIND MAX START******/ + int max_index = 0; + int global_max_index = 0; // for Ux + float max_area = 0; // the max socre area + max_box[0] = 0; // init 0 + + if (coreId == 0) { + for (int i = 0; i <= repeat; i++) { + if (i == repeat && remain == 0) { + break; + } + + int seg_len = (i == repeat) + ? remain_pad + : max_seg_pad; // the length every nms compute + // check seg_len exceeds the limit of fp16 or not. 65536 is the largest + // num + // that fp16 could express. + if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) { + return; + } + int cpy_len = (i == repeat) + ? remain + : max_seg_pad; // the length every nms memcpy + + /******NMS LOAD START******/ + __bang_write_zero(score, seg_len); + __memcpy(score, + input_score_ptr + max_index_input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), input_load_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + + /******NMS LOAD END******/ + + __bang_max(inter_x1, score, seg_len); + if (inter_x1[0] > max_box[0]) { + max_box[0] = inter_x1[0]; + if (sizeof(IN_DT) == sizeof(half)) { + max_index = + ((uint16_t *)inter_x1)[1] + max_index_input_offset + + i * max_seg_pad; // offset start from head of input_data + } else if (sizeof(IN_DT) == sizeof(float)) { + max_index = + ((uint32_t *)inter_x1)[1] + max_index_input_offset + + i * max_seg_pad; // offset start from head of input_data + } + } + } // for repeat + + // the max box's x1, y1, x2, y2 on every cluster + max_box[1] = input_x1_ptr[max_index]; + max_box[2] = input_y1_ptr[max_index]; + max_box[3] = input_x2_ptr[max_index]; + max_box[4] = input_y2_ptr[max_index]; + ((uint32_t *)(max_box + 5))[0] = max_index; + // copy max box info to sram + __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); + } + __sync_all(); + // copy all partial max to the sram of cluster 0 + if (clusterId != 0) { + __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT), + SRAM2SRAM, 0); + } + __sync_all(); + + // reduce between clusters to get the global max box + if (clusterId == 0) { + if (coreId == 0) { + __bang_write_zero(inter_x1, NMS_SIZE); + __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT), + REDUCE_NUM * sizeof(IN_DT), clusterDim - 1); + __bang_max(max_box, inter_x1, NMS_SIZE); + int max_cluster = (sizeof(IN_DT) == sizeof(half)) + ? 
((uint16_t *)max_box)[1] + : ((uint32_t *)max_box)[1]; + __memcpy(max_box, sram + max_cluster * REDUCE_NUM, + REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); + __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); + } + __sync_cluster(); + if (coreId == 0x80 && clusterDim > 1) { + // broadcast global max box to each cluster's sram + for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) { + __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM, + cluster_idx); + } + } + __sync_cluster(); + } + __sync_all(); + + // copy the global max box to max_box + __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); + if (algo == 0 || offset == 0.0) { + max_area = ((float)max_box[3] - (float)max_box[1]) * + ((float)max_box[4] - (float)max_box[2]); + } else { + max_area = ((float)max_box[3] - (float)max_box[1] + offset) * + ((float)max_box[4] - (float)max_box[2] + offset); + } + global_max_index = ((uint32_t *)(max_box + 5))[0]; + if (coreId != 0x80) { + input_score_ptr[global_max_index] = 0; + } + // by now, we get: max_score|max_index|max_box|max_area + /******FIND MAX END******/ + + /******NMS STORE START******/ + // store to nram + if (float(max_box[0]) > thresh_score) { + OUT_DT *save_ptr; + int save_offset = 0; + int save_str_num = 0; + save_ptr = nram_save; + save_offset = nram_save_count; + save_str_num = nram_save_limit_count; + if (clusterId == 0 && coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0]; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + __memcpy(save_ptr + save_offset * INFO_NUM, max_box, + INFO_NUM * sizeof(IN_DT), NRAM2NRAM, + INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0); + } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- + __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), + NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), + 4); + } + } + nram_save_count++; + output_box_num++; + } + + // store to sram/gdram + if (output_box_num != 0) { + if ((nram_save_count == nram_save_limit_count) || + (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) { + if (nram_save_count != 0) { + if (clusterId == 0 && coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + pvLock(); + __memcpy(output_dram, nram_save, + nram_save_count * sizeof(uint32_t), NRAM2GDRAM); + pvUnlock(); + output_dram += nram_save_count; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + pvLock(); + __memcpy(output_dram, nram_save, + nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM); + pvUnlock(); + output_dram += nram_save_count * INFO_NUM; + } else if (output_mode == + 2) { // score---, x1---, y1---, x2---, y2--- + pvLock(); + __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT), + NRAM2GDRAM, max_output_size * sizeof(IN_DT), + nram_save_limit_count * sizeof(IN_DT), 4); + pvUnlock(); + output_dram += nram_save_count; + } + nram_save_count = 0; + } + } + } // if move data nram->sram/gdram + } // if dst + + if (float(max_box[0]) <= thresh_score) { + if (clusterId == 0 && coreId == 0) { + loop_end_flag[0] = 1; // dram + } + } + __sync_all(); + if (loop_end_flag[0] == 1) { + break; + } + /******NMS STORE END******/ + + // To solve fp16 accuracy, we convert fp16 to fp32 to calculate IoU. + for (int i = 0; i <= repeat_iou_compute; i++) { + if (i == repeat_iou_compute && remain_iou_compute == 0) { + break; + } + int seg_len = (i == repeat_iou_compute) ? 
remain_pad_iou_compute + : max_seg_iou_compute; + int cpy_len = + (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute; + + /******NMS LOAD START******/ + __nramset((float *)score, seg_len, 0.0f); + int dt_offset = 0; + if (sizeof(IN_DT) == sizeof(float)) { + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), input_load_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + dt_offset = 0; + } else if (sizeof(IN_DT) == sizeof(half)) { + __nramset(x1, seg_len, half(0)); + __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), input_load_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + __bang_half2float((float *)score, (half *)x1, seg_len); + dt_offset = max_seg_iou_compute; + } + + __memcpy(x1 + dt_offset, + input_x1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), input_load_dir, + max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3); + /******NMS LOAD END******/ + + /******NMS COMPUTE START******/ + if (sizeof(IN_DT) == sizeof(half)) { + __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, + seg_len); + } + // 1、 compute IOU + // get the area_I + __nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1 + __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, + seg_len); // inter_x1 + __nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2 + __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2, + seg_len); // inter_x2 + __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len); + } + __bang_active_relu((float *)inter_x1, (float *)inter_x1, + seg_len); // inter_w + __nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1 + __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2, + seg_len); // inter_y1 + __nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2 + __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, + seg_len); // inter_y2 + __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + } + __bang_active_relu((float *)inter_y1, (float *)inter_y1, + seg_len); // inter_h + __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, + seg_len); // area_I + // get the area of input_box: area = (x2 - x1) * (y2 - y1); + __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); + __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len); + } + __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, + seg_len); // area + // get the area_U: area + max_area - area_I + __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area), + seg_len); + __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, + seg_len); // area_U + // 2、 select the box + // if IOU greater than thres, set the score to zero, 
abort it: area_U > + // area_I * (1 / thresh)? + if (thresh_iou > 0.0) { + __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou, + seg_len); + } else { + __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou, + seg_len); + } + __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); + /******NMS COMPUTE END******/ + + if (sizeof(IN_DT) == 2) { + __bang_float2half_rd((half *)score, (float *)score, seg_len); + } + pvLock(); + __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, + cpy_len * sizeof(IN_DT), input_store_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + pvUnlock(); + } // for repeat + } // for max_output_size +} + +__mlu_global__ void MLUUionXKernelNMS( + const void *input_boxes, const void *input_confidence, + const int input_num_boxes, const int input_layout, const int input_stride, + const int max_output_size, const float iou_threshold, + const float confidence_threshold, const float offset, + const cnrtDataType_t data_type_input, const int output_mode, const int algo, + void *workspace, void *result_num, void *output) { + int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2; + int32_t *loop_end_flag = + (int32_t *)((char *)workspace + + INFO_NUM * input_num_boxes * input_dwidth); + int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth; + int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size; + + int cluster_score_size = input_num_boxes * input_dwidth; + int cluster_boxes_size = input_num_boxes * 4 * input_dwidth; + char *sram_score = (char *)sram_buffer + reduce_sram_size; + char *sram_boxes = + (char *)sram_buffer + reduce_sram_size + cluster_score_size; + Addr input_ram = GDRAM; + if ((cluster_score_size + cluster_boxes_size) < availbale_sram_size) { + input_ram = SRAM; + __memcpy(sram_score, input_confidence, cluster_score_size, GDRAM2SRAM); + __memcpy(sram_boxes, input_boxes, cluster_boxes_size, GDRAM2SRAM); + } else { + __memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM); + } + __sync_cluster(); + uint32_t output_box_num = 0; + if (output_mode == 0) { + uint32_t *output_dram = (uint32_t *)output; + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *score_data; + half *boxes_data; + score_data = + (input_ram == SRAM) ? (half *)sram_score : (half *)workspace; + boxes_data = + (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + case CNRT_FLOAT32: { + float *score_data; + float *boxes_data; + score_data = + (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; + boxes_data = + (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + } + } else { + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *output_dram = (half *)output; + half *score_data; + half *boxes_data; + score_data = + (input_ram == SRAM) ? 
(half *)sram_score : (half *)workspace; + boxes_data = + (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + case CNRT_FLOAT32: { + float *output_dram = (float *)output; + float *score_data; + float *boxes_data; + score_data = + (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; + boxes_data = + (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + } + } +} + +void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const cnrtDataType_t data_type_input, const void *boxes_ptr, + const void *scores_ptr, const int input_num_boxes, + const int input_stride, const int max_output_boxes, + const float iou_threshold, const float offset, + void *workspace_ptr, void *output_size_ptr, void *output_ptr) { + switch (k_type) { + default: { return; } + case CNRT_FUNC_TYPE_BLOCK: + case CNRT_FUNC_TYPE_UNION1: { + MLUUnion1KernelNMS<<>>( + boxes_ptr, scores_ptr, input_num_boxes, input_stride, + max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, + /*output_mode=*/0, + /*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr, + data_type_input, offset, /*algo=*/1); + }; break; + case CNRT_FUNC_TYPE_UNION2: + case CNRT_FUNC_TYPE_UNION4: + case CNRT_FUNC_TYPE_UNION8: + case CNRT_FUNC_TYPE_UNION16: { + MLUUionXKernelNMS<<>>( + boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1, + input_stride, max_output_boxes, iou_threshold, + /*confidence_threshold=*/0.0, offset, data_type_input, + /*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr, + output_ptr); + }; break; + } +} diff --git a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu new file mode 100644 index 0000000000..f62554d0ef --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu @@ -0,0 +1,493 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "common_mlu_helper.hpp" + +#define ROI_OFFSET 5 + +__nram__ char buffer[MAX_NRAM_SIZE]; + +namespace forward { +template +__mlu_func__ void bilinearInterpolate(const int input_height, + const int input_width, T y, T x, T *w1, + T *w2, T *w3, T *w4, int *x_low, + int *x_high, int *y_low, int *y_high, + bool *empty) { + // deal with cases that inverse elements are of feature map boundary + if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) { + *empty = true; + return; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low_ = int(y); + int x_low_ = int(x); + + if (y_low_ >= input_height - 1) { + *y_high = y_low_ = input_height - 1; + y = (T)y_low_; + } else { + *y_high = y_low_ + 1; + } + + if (x_low_ >= input_width - 1) { + *x_high = x_low_ = input_width - 1; + x = T(x_low_); + } else { + *x_high = x_low_ + 1; + } + + *y_low = y_low_; + *x_low = x_low_; + + T ly = y - y_low_; + T lx = x - x_low_; + T hy = 1.0 - ly; + T hx = 1.0 - lx; + *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; + return; +} + +template +__mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core, + T *nram_out, const int roi_bin_grid_h, + const int roi_bin_grid_w, const T roi_start_h, + const T roi_start_w, const int ph, + const int pw, const T bin_size_h, + const T bin_size_w, const float count, + const int input_height, const int input_width, + const int channels, const int cyc_num, + const int max_elements) { + int cyc_channel = max_elements; + + for (int i = 0; i < cyc_num; i++) { + int real_channel = + (i == cyc_num - 1) ? channels - i * cyc_channel : cyc_channel; + int align_channel = PAD_UP(real_channel, NFU_ALIGN_SIZE / sizeof(T)); + __bang_write_zero(nram_out, align_channel); + uint32_t real_size = real_channel * sizeof(T); + + int iy, ix; + for (iy = 0; iy < roi_bin_grid_h; iy++) { + // 1. compute the coordinates of the y axis in the current roi_bin_grid_h + T y = roi_start_h + ph * bin_size_h + + (T)(iy + 0.5) * bin_size_h / (T)(roi_bin_grid_h); + for (ix = 0; ix < roi_bin_grid_w; ix++) { + // 2. compute the coordinates of the x axis in the current + // roi_bin_grid_w + T x = roi_start_w + pw * bin_size_w + + (T)(ix + 0.5) * bin_size_w / (T)(roi_bin_grid_w); + + // 3. compute the four weights (w1, w2, w3 and w4), the height (y_low + // and y_high) and weight (x_low and x_high) of input feature map in + // the current roi bin grid, and the flag (empty) which shows if x, y + // are out of input feature map ranges + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bool empty = false; + + bilinearInterpolate(input_height, input_width, y, x, &w1, &w2, &w3, &w4, + &x_low, &x_high, &y_low, &y_high, &empty); + + // 4. compute interpolation of the current roi bin grid + // tmp_cyc1, temp_cyc2, tmp_cyc3 and tmp_cyc4 store the input values + // to compute the interpolation, and then reused to compute + // the argmax_x and argmax_y. 
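// Reference-only sketch (shown out of line, not part of the kernel code): the
// scalar equivalent of the vectorized block below for a single channel c. It
// reuses bilinearInterpolate from above; the helper name bilinear_sample_ref
// and the HWC indexing of `feat` are illustrative.
template <typename T>
__mlu_func__ T bilinear_sample_ref(const T *feat, const int height,
                                   const int width, const int channels,
                                   const int c, T y, T x) {
  T w1, w2, w3, w4;
  int x_low, x_high, y_low, y_high;
  bool empty = false;
  bilinearInterpolate(height, width, y, x, &w1, &w2, &w3, &w4, &x_low, &x_high,
                      &y_low, &y_high, &empty);
  if (empty) return (T)0;  // out-of-range samples contribute zero
  const T p1 = feat[(y_low * width + x_low) * channels + c];
  const T p2 = feat[(y_low * width + x_high) * channels + c];
  const T p3 = feat[(y_high * width + x_low) * channels + c];
  const T p4 = feat[(y_high * width + x_high) * channels + c];
  return w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4;  // same w1..w4 as the kernel
}
// The kernel applies the same four weights to whole channel slices with
// __bang_mul_const and accumulates them with __bang_add.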
+ T *tmp_cyc1 = nram_in + cyc_channel; + T *tmp_cyc2 = nram_in + cyc_channel * 2; + T *tmp_cyc3 = nram_in + cyc_channel * 3; + T *tmp_cyc4 = nram_in + cyc_channel * 4; + + if (empty) { // exits abnormal values + __bang_write_zero(nram_in, align_channel); + } else { + __bang_write_zero(nram_in, align_channel); + uint32_t offset1 = (y_low * input_width + x_low) * channels; + uint32_t offset2 = (y_low * input_width + x_high) * channels; + uint32_t offset3 = (y_high * input_width + x_low) * channels; + uint32_t offset4 = (y_high * input_width + x_high) * channels; + T *input1 = (T *)input_core + offset1 + i * cyc_channel; + T *input2 = (T *)input_core + offset2 + i * cyc_channel; + T *input3 = (T *)input_core + offset3 + i * cyc_channel; + T *input4 = (T *)input_core + offset4 + i * cyc_channel; + + // load the four pixels (p1, p2, p3 and p4) of input feature map to + // compute interpolation + __memcpy(tmp_cyc1, input1, real_size, GDRAM2NRAM); + __memcpy(tmp_cyc2, input2, real_size, GDRAM2NRAM); + __memcpy(tmp_cyc3, input3, real_size, GDRAM2NRAM); + __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM); + + // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4 + __bang_mul_const(tmp_cyc1, tmp_cyc1, w1, align_channel); + __bang_mul_const(tmp_cyc2, tmp_cyc2, w2, align_channel); + __bang_mul_const(tmp_cyc3, tmp_cyc3, w3, align_channel); + __bang_mul_const(tmp_cyc4, tmp_cyc4, w4, align_channel); + + __bang_add(nram_in, tmp_cyc1, nram_in, align_channel); + __bang_add(nram_in, tmp_cyc2, nram_in, align_channel); + __bang_add(nram_in, tmp_cyc3, nram_in, align_channel); + __bang_add(nram_in, tmp_cyc4, nram_in, align_channel); + } + // 5. compute sum value and corresponding coordinates of x axis and y + // axis. Update the sum value. + __bang_add(nram_out, nram_in, nram_out, align_channel); + } // loop_roi_grid_w + } // loop_roi_grid_h + T count_value = (T)(1.0 / count); + __bang_mul_const(nram_out, nram_out, count_value, align_channel); + __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM); + } // loop_cyc_num +} + +template +__mlu_func__ void roialignForwardAvg( + T *input, T *rois, T *output, const bool aligned, const int channels, + const int pooled_height, const int pooled_width, const int input_height, + const int input_width, const int sampling_ratio, const T spatial_scale, + const int num_rois) { + // find limit for channel, the nram space is divided to 6 parts that are + // input, 4 weights to compute the interpolation (w1, w2, w3, w4), output + + // max_elements : 300 : float datatype : 27296, half datatype : 54592 + // max_elements : 200 : float datatype : 16384, half datatype : 32768 + int max_elements = (PAD_DOWN(MAX_NRAM_SIZE / 6, NFU_ALIGN_SIZE)) / sizeof(T); + int cyc_num = channels / max_elements + (int)(channels % max_elements != 0); + T offset = aligned ? (T)0.5 : (T)0.0; + int task_num = num_rois * pooled_height * pooled_width; + T *nram_out = (T *)buffer; + T *nram_in = nram_out + max_elements; + if (task_num < taskDim) { + if (taskId >= task_num) { + return; + } + } + + for (int bin_idx = taskId; bin_idx < task_num; bin_idx = bin_idx + taskDim) { + if (bin_idx >= task_num) { + return; + } + + // (n,ph.pw) is a c in the pooled output + int pw = bin_idx % pooled_width; + int ph = (bin_idx / pooled_width) % pooled_height; + int n = bin_idx / pooled_width / pooled_height; + + T *roi_id_tmp = rois + n * ROI_OFFSET; + // 1. compute width and height of roi region. 
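// Reference-only sketch (not part of the kernel code): in scalar form, the
// value produced for one output element (n, ph, pw, c) by the ROI decoding
// below together with computeChannel above. bilinear_sample(...) stands for
// the w1*p1 + w2*p2 + w3*p3 + w4*p4 interpolation of bilinearInterpolate; the
// remaining names mirror the kernel.
//
//   T acc = 0;
//   for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
//     T y = roi_start_h + ph * bin_size_h +
//           (iy + T(0.5)) * bin_size_h / roi_bin_grid_h;
//     for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
//       T x = roi_start_w + pw * bin_size_w +
//             (ix + T(0.5)) * bin_size_w / roi_bin_grid_w;
//       acc += bilinear_sample(input_core, input_height, input_width,
//                              channels, c, y, x);
//     }
//   }
//   int bin_idx = (n * pooled_height + ph) * pooled_width + pw;
//   output[bin_idx * channels + c] = acc / count;  // count = max(grid_h * grid_w, 1)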
+ int batch_idx = (int)roi_id_tmp[0]; + T roi_x1 = roi_id_tmp[1]; + T roi_y1 = roi_id_tmp[2]; + T roi_x2 = roi_id_tmp[3]; + T roi_y2 = roi_id_tmp[4]; + T roi_start_w = roi_x1 * spatial_scale - offset; + T roi_start_h = roi_y1 * spatial_scale - offset; + T roi_end_w = roi_x2 * spatial_scale - offset; + T roi_end_h = roi_y2 * spatial_scale - offset; + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + + if (!aligned) { + roi_width = roi_width > (T)(1.0) ? roi_width : (T)(1.0); + roi_height = roi_height > (T)(1.0) ? roi_height : (T)(1.0); + } + + // 2. compute float-type width and height of roi bin region. + T bin_size_w = (T)roi_width / (T)pooled_width; + T bin_size_h = (T)roi_height / (T)pooled_height; + + // 3. compute int-type width and height of roi bin region. + int roi_bin_grid_h, roi_bin_grid_w; + roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : int(ceilf(roi_height / pooled_height)); + roi_bin_grid_w = (sampling_ratio > 0) + ? sampling_ratio + : int(ceilf(roi_width / pooled_width)); + float count = (float)((roi_bin_grid_h * roi_bin_grid_w) > 1 + ? roi_bin_grid_h * roi_bin_grid_w + : 1.0); + T *input_core = input + batch_idx * channels * input_width * input_height; + T *output_core = output + bin_idx * channels; + // 4. compute avg value and corresponding coordinates of x axis and y axis. + computeChannel(input_core, nram_in, output_core, nram_out, roi_bin_grid_h, + roi_bin_grid_w, roi_start_h, roi_start_w, ph, pw, bin_size_h, + bin_size_w, count, input_height, input_width, channels, + cyc_num, max_elements); + } +} + +__mlu_global__ void MLUUnion1KernelRoiAlignAvg( + const void *input, const void *rois, const int channels, const bool aligned, + const int pooled_height, const int pooled_width, const int input_height, + const int input_width, const int sampling_ratio, const float spatial_scale, + const int num_rois, const cnrtDataType_t data_type, void *output) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + + switch (data_type) { + case CNRT_FLOAT16: { + roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned, + channels, pooled_height, pooled_width, input_height, + input_width, sampling_ratio, + (half)spatial_scale, num_rois); + }; break; + case CNRT_FLOAT32: { + roialignForwardAvg((float *)input, (float *)rois, (float *)output, + aligned, channels, pooled_height, pooled_width, + input_height, input_width, sampling_ratio, + (float)spatial_scale, num_rois); + }; break; + default: + break; + } + + return; +} +} // namespace forward + +namespace backward { +__mlu_func__ void bilinearInterpolateGradient(int height, int width, float y, + float x, float *w1, float *w2, + float *w3, float *w4, int *x_low, + int *x_high, int *y_low, + int *y_high) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + *w1 = 0.0, *w2 = 0.0, *w3 = 0.0, *w4 = 0.0; + *x_low = -1, *x_high = -1, *y_low = -1, *y_high = -1; + return; + } + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + *y_low = (int)y; + *x_low = (int)x; + if (*y_low >= height - 1) { + *y_high = height - 1, *y_low = height - 1; + y = (float)(*y_low); + } else { + *y_high = *y_low + 1; + } + if (*x_low >= width - 1) { + *x_high = width - 1, *x_low = width - 1; + x = (float)(*x_low); + } else { + *x_high = *x_low + 1; + } + float ly = y - *y_low, lx = x - *x_low; + float hy = 1.0 - ly, hx = 1.0 - lx; + *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; + return; +} + +template +__mlu_func__ void unionRoiAlignBp( + T *grads, T *boxes, 
T *grads_image, const int boxes_num, const int hi, + const int wi, const int c, const int no, const int ho, const int wo, + const float spatial_scale, const int sampling_ratio, const bool aligned) { + int c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T)); + int deal_all = boxes_num * hi * wi; + int deal_this_core = deal_all / taskDim + (int)(taskId < deal_all % taskDim); + for (int i = 0; i < deal_this_core; ++i) { + int bhw_id = i * taskDim + taskId; + int box_id = bhw_id / (hi * wi); + int ih = (bhw_id / wi) % hi; + int iw = bhw_id % wi; + T *box = boxes + box_id * 5; + int image_id = (int)box[0]; + T *image_offset = grads_image + image_id * ho * wo * c; + T *grads_ = grads + box_id * hi * wi * c + ih * wi * c + iw * c; + + float offset = aligned ? 0.5 : 0.0; + float x1 = box[1] * spatial_scale - offset; + float y1 = box[2] * spatial_scale - offset; + float x2 = box[3] * spatial_scale - offset; + float y2 = box[4] * spatial_scale - offset; + float roi_width = x2 - x1; + float roi_height = y2 - y1; + if (!aligned) { + roi_width = (roi_width > 1.0) ? roi_width : 1.0; + roi_height = (roi_height > 1.0) ? roi_height : 1.0; + } + float bin_size_h = roi_height / hi; + float bin_size_w = roi_width / wi; + + int roi_grid_h = + (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_height / hi); + int roi_grid_w = + (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_width / wi); + const T count = roi_grid_h * roi_grid_w; + if (c_align * sizeof(T) * 2 <= MAX_NRAM_SIZE) { + for (int iy = 0; iy < roi_grid_h; ++iy) { + const float y = + y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h; + for (int ix = 0; ix < roi_grid_w; ++ix) { + const float x = + x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w; + float w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low, + &x_high, &y_low, &y_high); + if (x_low >= 0 && y_low >= 0) { + __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w1, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_low * wo * c + x_low * c, + (T *)buffer + c_align, c); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w2, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_low * wo * c + x_high * c, + (T *)buffer + c_align, c); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w3, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_high * wo * c + x_low * c, + (T *)buffer + c_align, c); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w4, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_high * wo * c + x_high * c, + (T *)buffer + c_align, c); + } // x_low && y_low + } // ix + } // iy + } else { + for (int iy = 0; iy < roi_grid_h; ++iy) { + const float y = + y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h; + for (int ix = 0; ix < roi_grid_w; ++ix) { + const float x = + x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w; + float w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low, + &x_high, 
&y_low, &y_high); + if (x_low >= 0 && y_low >= 0) { + int deal_once = + PAD_DOWN(MAX_NRAM_SIZE / 2, NFU_ALIGN_SIZE) / sizeof(T); + int c_repeat = c / deal_once + (int)(c % deal_once != 0); + for (int i = 0; i < c_repeat; ++i) { + int deal_c = deal_once; + int align_c = deal_once; + if (i == c_repeat - 1) { + deal_c = c - i * deal_once; + align_c = c_align - i * deal_once; + } + __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T), + GDRAM2NRAM); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w1, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_low * wo * c + x_low * c + i * deal_once, + (T *)buffer + align_c, deal_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w2, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_low * wo * c + x_high * c + i * deal_once, + (T *)buffer + align_c, deal_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w3, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_high * wo * c + x_low * c + i * deal_once, + (T *)buffer + align_c, deal_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w4, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_high * wo * c + x_high * c + i * deal_once, + (T *)buffer + align_c, deal_c); + } // for c_repeat + } // x_low >= 0 && y_low >= 0 + } // ix + } // iy + } // if c + } // i +} + +__mlu_global__ void MLUUnion1KernelRoiAlignBackward( + const void *grads, const void *boxes, void *grads_image, + const cnrtDataType_t dtype, const int boxes_num, const int hi, const int wi, + const int c, const int no, const int ho, const int wo, + const float spatial_scale, const int sampling_ratio, const bool aligned) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (dtype) { + case CNRT_FLOAT16: { + unionRoiAlignBp((half *)grads, (half *)boxes, (half *)grads_image, + boxes_num, hi, wi, c, no, ho, wo, spatial_scale, + sampling_ratio, aligned); + }; break; + case CNRT_FLOAT32: { + unionRoiAlignBp((float *)grads, (float *)boxes, (float *)grads_image, + boxes_num, hi, wi, c, no, ho, wo, spatial_scale, + sampling_ratio, aligned); + }; break; + default: { return; } + } +} +} // namespace backward + +void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t d_type, + const void *input, const void *rois, const int channels, + const bool aligned, const int pooled_height, + const int pooled_width, const int input_height, + const int input_width, const int sampling_ratio, + const float spatial_scale, const int num_rois, + void *output) { + forward::MLUUnion1KernelRoiAlignAvg<<>>( + input, rois, channels, aligned, pooled_height, pooled_width, input_height, + input_width, sampling_ratio, spatial_scale, num_rois, d_type, output); +} + +void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t dtype, + const void *grads, const void *boxes, + void *grads_image, const int boxes_num, + const int hi, const int wi, const int c, + const int no, const int ho, const int wo, + const float spatial_scale, const int sampling_ratio, 
+ const bool aligned) { + backward::MLUUnion1KernelRoiAlignBackward<<>>( + grads, boxes, grads_image, dtype, boxes_num, hi, wi, c, no, ho, wo, + spatial_scale, sampling_ratio, aligned); +} diff --git a/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu new file mode 100644 index 0000000000..7cb6df0e5d --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu @@ -0,0 +1,307 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "common_mlu_helper.hpp" + +__nram__ char data_nram[MAX_NRAM_SIZE]; + +template +__mlu_func__ void mluMultiKernelTinShift( + const T *input, const int *shifts, T *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel) { + for (int cur_channel_index = taskId; + cur_channel_index < batch_size * channel_size; + cur_channel_index += taskDim) { + int n_index = cur_channel_index / channel_size; + int group_id = cur_channel_index % channel_size / group_channel; + int t_shift = shifts[n_index * group_size + group_id]; + int index = cur_channel_index % channel_size * hw_size + + n_index * time_size * channel_size * hw_size; + __nramset(data_nram, MAX_NRAM_SIZE, (char)0); + __asm__ volatile("sync;"); + if (abs(t_shift) >= time_size) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + time_size - 1); + } else { + if (t_shift > 0) { + __memcpy(data_nram + t_shift * hw_size * sizeof(T), input + index, + hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T), + channel_size * hw_size * sizeof(T), time_size - 1 - t_shift); + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + time_size - 1); + } else { + __memcpy(data_nram, input + (index - t_shift * channel_size * hw_size), + hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T), + channel_size * hw_size * sizeof(T), time_size - 1 + t_shift); + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + time_size - 1); + } + } + __asm__ volatile("sync;"); + } +} + +template +__mlu_func__ void mluHwSplit(const T *input, const int t_shift, + const int time_size, const int hw_size, + const int channel_size, const int index, + const int cur_sequence_index, + const int max_length_per_core, T *output) { + for (int cur_index = index; cur_index < index + hw_size; + cur_index += max_length_per_core) { + int memcpy_size = max_length_per_core; + if (cur_index + max_length_per_core > index + hw_size) { + memcpy_size = index + hw_size - cur_index; + } + if (cur_sequence_index - t_shift < 0 || + cur_sequence_index - t_shift >= time_size) { + __memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T), + NRAM2GDRAM); + } else { + 
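+ // the shifted source frame lies inside [0, time_size), so stage this chunk of the
+ // spatial dimension through NRAM (GDRAM -> NRAM -> GDRAM) instead of writing zeros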
__memcpy(data_nram, input + cur_index - t_shift * channel_size * hw_size, + memcpy_size * sizeof(T), GDRAM2NRAM); + __memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T), + NRAM2GDRAM); + } + __asm__ volatile("sync;"); + } +} + +template +__mlu_func__ void mluMultiKernelTinShiftSplitSequence( + const T *input, const int *shifts, T *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const int max_number_hw_per_core, const int max_length_per_core) { + const int tmp_max_number_hw_per_core = + max_number_hw_per_core > 0 ? max_number_hw_per_core : 1; + const int loop_time = time_size / tmp_max_number_hw_per_core + + ((time_size % tmp_max_number_hw_per_core) > 0 ? 1 : 0); + int segmentime_size = tmp_max_number_hw_per_core; + int res_segment = time_size % tmp_max_number_hw_per_core; + + for (int cur_segment_index = taskId; + cur_segment_index < loop_time * batch_size * channel_size; + cur_segment_index += taskDim) { + int n_index = cur_segment_index / loop_time / channel_size; + int group_id = cur_segment_index / loop_time % channel_size / group_channel; + int t_shift = shifts[n_index * group_size + group_id]; + int index = n_index * time_size * channel_size * hw_size + + (cur_segment_index / loop_time % channel_size) * hw_size + + cur_segment_index % loop_time * segmentime_size * hw_size * + channel_size; + char *dst_gdram2nram = data_nram; + const T *src_gdram2nram = input + index; + int count_gdram2nram = -1; + int count_nram2gdram = -1; + int next_sequence_index = + index / hw_size / channel_size % time_size + segmentime_size; + int cur_sequence_index = index / hw_size / channel_size % time_size; + __nramset(data_nram, MAX_NRAM_SIZE, (char)0); + __asm__ volatile("sync;"); + if (max_number_hw_per_core == 0) { + mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index, + cur_sequence_index, max_length_per_core, output); + continue; + } + if (abs(t_shift) >= time_size) { + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + res_segment - 1); + } else { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + segmentime_size - 1); + } + continue; + } + if (t_shift == 0) { + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index; + count_gdram2nram = res_segment - 1; + count_nram2gdram = res_segment - 1; + } else { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index; + count_gdram2nram = segmentime_size - 1; + count_nram2gdram = segmentime_size - 1; + } + } else if (t_shift > 0) { + int first_index_cur_channel = + n_index * time_size * channel_size * hw_size + + (cur_segment_index / loop_time % channel_size) * hw_size; + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + dst_gdram2nram = data_nram; + src_gdram2nram = + input + + (index - t_shift * channel_size * hw_size < first_index_cur_channel + ? 
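+ // clamp the source address so the copy never starts before the first frame of this channel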
first_index_cur_channel + : index - t_shift * channel_size * hw_size); + count_gdram2nram = res_segment - 1; + count_nram2gdram = res_segment - 1; + if (cur_sequence_index < t_shift && t_shift < next_sequence_index) { + dst_gdram2nram = + data_nram + t_shift % segmentime_size * hw_size * sizeof(T); + count_gdram2nram = res_segment - (t_shift - cur_sequence_index) - 1; + } + } else { + if (t_shift >= next_sequence_index) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + segmentime_size - 1); + continue; + } else if (cur_sequence_index < t_shift && + t_shift < next_sequence_index) { + dst_gdram2nram = + data_nram + t_shift % segmentime_size * hw_size * sizeof(T); + src_gdram2nram = input + first_index_cur_channel; + count_gdram2nram = segmentime_size - (t_shift % segmentime_size) - 1; + count_nram2gdram = segmentime_size - 1; + } else { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index - t_shift * channel_size * hw_size; + count_gdram2nram = segmentime_size - 1; + count_nram2gdram = segmentime_size - 1; + } + } + } else { + int offset_index = time_size + t_shift; + if (cur_sequence_index >= offset_index) { + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + res_segment - 1); + continue; + } else { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + segmentime_size - 1); + continue; + } + } else { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index - t_shift * channel_size * hw_size; + if (cur_sequence_index - t_shift + segmentime_size < time_size) { + count_gdram2nram = segmentime_size - 1; + count_nram2gdram = segmentime_size - 1; + } else { + count_gdram2nram = time_size - (cur_sequence_index - t_shift) - 1; + count_nram2gdram = + (segmentime_size - 1) < (time_size - cur_sequence_index - 1) + ? 
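+ // i.e. copy only the smaller of the remaining segment rows and the frames left in the sequence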
(segmentime_size - 1) + : (time_size - cur_sequence_index - 1); + } + } + } + __memcpy(dst_gdram2nram, src_gdram2nram, hw_size * sizeof(T), GDRAM2NRAM, + hw_size * sizeof(T), channel_size * hw_size * sizeof(T), + count_gdram2nram); + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + count_nram2gdram); + __asm__ volatile("sync;"); + } +} + +__mlu_entry__ void MLUUnion1KernelTinShift( + const void *input, const void *shifts, void *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const cnrtDataType_t data_dtype) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (data_dtype) { + case CNRT_FLOAT16: { + mluMultiKernelTinShift((half *)input, (const int *)shifts, (half *)output, + batch_size, time_size, channel_size, hw_size, + group_size, group_channel); + }; break; + case CNRT_FLOAT32: { + mluMultiKernelTinShift((float *)input, (const int *)shifts, + (float *)output, batch_size, time_size, + channel_size, hw_size, group_size, group_channel); + }; break; + default: { return; } + } +} + +__mlu_entry__ void MLUUnion1KernelTinShiftSplitSequence( + const void *input, const void *shifts, void *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const int max_number_hw_per_core, const int max_length_per_core, + const cnrtDataType_t data_dtype) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (data_dtype) { + case CNRT_FLOAT16: { + mluMultiKernelTinShiftSplitSequence( + (half *)input, (const int *)shifts, (half *)output, batch_size, + time_size, channel_size, hw_size, group_size, group_channel, + max_number_hw_per_core, max_length_per_core); + }; break; + case CNRT_FLOAT32: { + mluMultiKernelTinShiftSplitSequence( + (float *)input, (const int *)shifts, (float *)output, batch_size, + time_size, channel_size, hw_size, group_size, group_channel, + max_number_hw_per_core, max_length_per_core); + }; break; + default: { return; } + } +} + +void KernelTinShiftForward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *input, const void *shifts, void *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const cnrtDataType_t data_dtype, const int channel_per_core, + const int max_number_hw_per_core, const int max_length_per_core) { + if (channel_per_core >= 1) { + MLUUnion1KernelTinShift<<>>( + input, shifts, output, batch_size, time_size, channel_size, hw_size, + group_size, group_channel, data_dtype); + } else { + MLUUnion1KernelTinShiftSplitSequence<<>>( + input, shifts, output, batch_size, time_size, channel_size, hw_size, + group_size, group_channel, max_number_hw_per_core, max_length_per_core, + data_dtype); + } +} + +void KernelTinShiftBackward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *grad_output, const void *shifts, void *grad_input, + const int batch_size, const int time_size, const int channel_size, + const int hw_size, const int group_size, const int group_channel, + const cnrtDataType_t data_dtype, const int channel_per_core, + const int max_number_hw_per_core, const int max_length_per_core) { + if (channel_per_core >= 1) { + MLUUnion1KernelTinShift<<>>( + grad_output, shifts, grad_input, 
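+ // the backward pass reuses the forward TIN-shift kernel, reading grad_output and writing grad_input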
batch_size, time_size, channel_size, + hw_size, group_size, group_channel, data_dtype); + } else { + MLUUnion1KernelTinShiftSplitSequence<<>>( + grad_output, shifts, grad_input, batch_size, time_size, channel_size, + hw_size, group_size, group_channel, max_number_hw_per_core, + max_length_per_core, data_dtype); + } +} diff --git a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp index f054c104bd..f68e874056 100644 --- a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp +++ b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp @@ -11,7 +11,7 @@ using namespace at; #define CHECK_MLU(x) \ TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor") #define CHECK_CPU(x) \ - TORCH_CHECK(!x.device().is_cuda(), #x " must be a CPU tensor") + TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor") #define CHECK_CONTIGUOUS(x) \ TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_CUDA_INPUT(x) \ diff --git a/mmcv/ops/csrc/pytorch/mlu/bbox_overlaps_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/bbox_overlaps_mlu.cpp new file mode 100644 index 0000000000..82d55559c5 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/mlu/bbox_overlaps_mlu.cpp @@ -0,0 +1,100 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ + +#include "pytorch_device_registry.hpp" +#include "pytorch_mlu_helper.hpp" + +void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t d_type, + const void *bbox1, const void *bbox2, void *ious, + const int32_t num_bbox1, const int32_t num_bbox2, + const int32_t mode, const bool aligned, + const int32_t offset); + +static void policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type, + const int32_t batch_num_all) { + auto union_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); + auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + auto core_num = union_num * core_dim; + + // Union1 policyFunc + *k_type = CNRT_FUNC_TYPE_UNION1; + k_dim->x = core_dim; + auto need_core_num = PAD_UP(batch_num_all, core_dim); + k_dim->y = + (need_core_num < core_num) ? (need_core_num / core_dim) : union_num; + k_dim->z = 1; + + return; +} + +void BBoxOverlapsMLUKernelLauncher(const Tensor bboxes1, const Tensor bboxes2, + Tensor ious, const int32_t mode, + const bool aligned, const int32_t offset) { + // check dtype + TORCH_CHECK( + bboxes1.scalar_type() == at::kFloat || bboxes1.scalar_type() == at::kHalf, + "Data type of input should be Float or Half. 
But now input type is ", + bboxes1.scalar_type(), "."); + TORCH_CHECK(bboxes1.scalar_type() == bboxes2.scalar_type(), + "bboxes1's dtype should be the same with bboxes2's dtype."); + + // params check + TORCH_CHECK(bboxes1.dim() == 2, "bboxes1 should be a 2d tensor, got ", + bboxes1.dim(), "D"); + TORCH_CHECK(bboxes2.dim() == 2, "bboxes2 should be a 2d tensor, got ", + bboxes2.dim(), "D"); + + auto rows = bboxes1.size(0); + auto cols = bboxes2.size(0); + auto batch_num_all = rows; + + if (rows * cols == 0) { + // return if zero element + return; + } + + // calculate task dimension + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + policyFunc(&k_dim, &k_type, batch_num_all); + + // get compute queue + cnrtQueue_t queue = torch_mlu::getCurQueue(); + + // get dtype of input + cnrtDataType_t d_type = torch_mlu::toCnrtDtype(bboxes1.dtype()); + + // get ptr of tensors + auto bboxes1_impl = torch_mlu::getMluTensorImpl(bboxes1); + auto bboxes1_ptr = bboxes1_impl->cnnlMalloc(); + auto bboxes2_impl = torch_mlu::getMluTensorImpl(bboxes2); + auto bboxes2_ptr = bboxes2_impl->cnnlMalloc(); + auto ious_impl = torch_mlu::getMluTensorImpl(ious); + auto ious_ptr = ious_impl->cnnlMalloc(); + + // launch kernel + CNLOG(INFO) << "Launch Kernel MLUUnion1BboxOverlapsKernel"; + CNLOG(INFO) << "kDim :[ " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z + << " ]"; + KernelBBoxOverlaps(k_dim, k_type, queue, d_type, bboxes1_ptr, bboxes2_ptr, + ious_ptr, rows, cols, mode, aligned, offset); +} + +void bbox_overlaps_mlu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, + const int mode, const bool aligned, const int offset) { + BBoxOverlapsMLUKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset); +} + +void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, + const int mode, const bool aligned, const int offset); +REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MLU, bbox_overlaps_mlu); diff --git a/mmcv/ops/csrc/pytorch/mlu/focal_loss_sigmoid_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/focal_loss_sigmoid_mlu.cpp new file mode 100644 index 0000000000..9242644c89 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/mlu/focal_loss_sigmoid_mlu.cpp @@ -0,0 +1,332 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include +#include + +#include "pytorch_device_registry.hpp" +#include "pytorch_mlu_helper.hpp" + +void KernelFocalLossSigmoidForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, + const cnrtDataType_t d_type, + const void *input, const void *target, + const void *weight, const int32_t N, + const int32_t C, const float alpha, + const float gamma, void *output); + +void KernelFocalLossSigmoidBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, + const cnrtDataType_t d_type, + const void *input, const void *target, + const void *weight, const float gamma, + const float alpha, const int32_t dim_n, + const int32_t deal_n, const int32_t dim_c, + void *output); +// Policy Function for Forward +static void policyFuncForward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type, + const Tensor &input, const Tensor &target, + const Tensor &weight) { + auto N = input.size(0); + auto C = input.size(1); + + const size_t nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore); + const size_t c_align_size = PAD_UP((C * input.itemsize()), NFU_ALIGN_SIZE); + const int split_target_num = 2; + const int split_pipeline_num = 6; + const int has_weight = weight.data_ptr() != nullptr; + const int target_data_width = target.scalar_type() == at::kLong + ? target.itemsize() / 2 + : target.itemsize(); + const int threshold_c = + PAD_DOWN((nram_size - split_target_num * sizeof(int)) / + (split_pipeline_num + has_weight), + NFU_ALIGN_SIZE) / + input.itemsize(); + + int n_seg = 1; + if (C <= threshold_c) { + int c_size = C * input.itemsize(); + int reservered_align_size = + (split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE; + int wegiht_size = 0; + if (has_weight) { + c_size = c_align_size; + reservered_align_size = split_target_num * NFU_ALIGN_SIZE; + wegiht_size = c_align_size; + } + // n_seg * c_size * split_pipeline_num + n_seg * target.itemsize() * + // split_target_num + // + weight_size + reservered_align_size <= nram_size + n_seg = (nram_size - wegiht_size - reservered_align_size) / + (split_pipeline_num * c_size + split_target_num * sizeof(int32_t)); + } + auto seg_num = n_seg == 0 ? N : (N + n_seg - 1) / n_seg; + auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + auto cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); + auto core_num = core_dim * cluster_num; + + k_dim->x = *k_type; + k_dim->y = + seg_num > core_num ? cluster_num : (seg_num + core_dim - 1) / core_dim; + k_dim->z = 1; +} + +// Policy Function for Backward +static void policyFuncBackward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { + // set Union1 Job + *k_type = CNRT_FUNC_TYPE_UNION1; + k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + k_dim->y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); + k_dim->z = 1; +} + +void SigmoidFocalLossForwardMLUKernelLauncher(Tensor input, Tensor target, + Tensor weight, Tensor output, + const float gamma, + const float alpha) { + // params check + TORCH_CHECK(gamma >= 0, "gamma should be greater than or equal to 0. ", + "But now gamma is ", gamma, "."); + + // check dtype + TORCH_CHECK( + input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf, + "Data type of input should be Float or Half. But now input type is ", + input.scalar_type(), "."); + + TORCH_CHECK( + (target.scalar_type() == at::kInt || target.scalar_type() == at::kLong), + "target type should be Int or Long. 
", "But now target type is ", + target.scalar_type(), "."); + + if (weight.data_ptr() != nullptr) { + TORCH_CHECK(weight.scalar_type() == input.scalar_type(), + "Data types of input and weight should be the same. But now " + "input type is ", + input.scalar_type(), ", weight type is ", weight.scalar_type(), + "."); + } else { + CNLOG(INFO) << "weight is a empty tensor."; + } + + // return if zero-element + if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) { + return; + } + + // calculate task dimension + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + policyFuncForward(&k_dim, &k_type, input, target, weight); + auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + + // get compute queue + auto queue = torch_mlu::getCurQueue(); + + // get ptr of tensors + auto input_impl = torch_mlu::getMluTensorImpl(input); + auto input_ptr = input_impl->cnnlMalloc(); + auto target_impl = torch_mlu::getMluTensorImpl(target); + auto target_ptr = target_impl->cnnlMalloc(); + auto weight_impl = torch_mlu::getMluTensorImpl(weight); + auto weight_ptr = weight_impl->cnnlMalloc(); + auto output_impl = torch_mlu::getMluTensorImpl(output); + auto output_ptr = output_impl->cnnlMalloc(); + + // get dtype of input + cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype()); + + CNLOG(INFO) << "Launch Kernel KernelFocalLossSigmoidForward<<>>"; + // launch kernel + KernelFocalLossSigmoidForward(k_dim, k_type, queue, d_type, input_ptr, + target_ptr, weight_ptr, input.size(0), + input.size(1), alpha, gamma, output_ptr); +} + +void getDealNAndThresholdC(const int compute_data_bytes, + const int target_data_bytes, const int total_c, + int *deal_n_ptr, int *threshold_c_ptr, + const bool has_weight, const bool is_half) { + /* NRAM partition: + * + * |-----------------ping pong--------------------| + * |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight| + * + * split_pipeline_num is 5: including input, pt, alpha_t, temp, output. + */ + const int nram_split_num = 5; + const int nram_split_pingpong = 2; + const int max_nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore); + int32_t compute_align_size = NFU_ALIGN_SIZE; + if (is_half) { + compute_align_size += NFU_ALIGN_SIZE; + } + const int32_t compute_align_num = compute_align_size / compute_data_bytes; + // reservered_align_size: including input(ping pong), pt(ping pong), + // alpha_t(ping pong), temp(ping pong), + // output(ping pong), target(ping pong), + // flt_min and gamma. 
+ const int reservered_align_size = + ((nram_split_num + 1) * nram_split_pingpong + 2) * compute_align_size; + int nram_pingpong_size = max_nram_size - reservered_align_size; + + int compute_c = total_c; + int threshold_c = 0; + if (has_weight) { + // reserved space for weight to align + nram_pingpong_size -= NFU_ALIGN_SIZE; + + // threshold_c * nram_split_pingpong * compute_data_bytes * nram_split_num + + // nram_split_pingpong * target_data_bytes + + // threshold_c * compute_data_bytes <= nram_pingpong_size + threshold_c = + (nram_pingpong_size - nram_split_pingpong * target_data_bytes) / + (compute_data_bytes * (nram_split_num * nram_split_pingpong + 1)); + threshold_c = PAD_DOWN(threshold_c, compute_align_num); + int weight_space = PAD_UP(total_c * compute_data_bytes, NFU_ALIGN_SIZE); + + // reserved space for weight + nram_pingpong_size -= weight_space; + compute_c = PAD_UP(total_c, compute_align_num); + } else { + // threshold_c * nram_split_pingpong * compute_data_bytes * nram_split_num + + // nram_split_pingpong * target_data_bytes <= nram_pingpong_size + threshold_c = + (nram_pingpong_size / nram_split_pingpong - target_data_bytes) / + (nram_split_num * compute_data_bytes); + } + // deal_n * compute_c * nram_split_pingpong * compute_data_bytes * + // nram_split_num + deal_n * nram_split_pingpong * target_data_bytes <= + // nram_pingpong_size + *deal_n_ptr = + nram_pingpong_size / + ((nram_split_num * compute_c * compute_data_bytes + target_data_bytes) * + nram_split_pingpong); + *threshold_c_ptr = threshold_c; +} + +void SigmoidFocalLossBackwardMLUKernelLauncher(Tensor input, Tensor target, + Tensor weight, Tensor output, + const float gamma, + const float alpha) { + // params check + TORCH_CHECK(gamma >= 0, "gamma should be greater than or equal to 0. ", + "But now gamma is ", gamma, "."); + // check dtype + TORCH_CHECK( + input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf, + "Data type of input should be Float or Half. But now input type is ", + input.scalar_type(), "."); + + TORCH_CHECK( + (target.scalar_type() == at::kInt || target.scalar_type() == at::kLong), + "target type should be Int or Long. ", "But now target type is ", + target.scalar_type(), "."); + + bool has_weight = false; + if (weight.data_ptr() != nullptr) { + TORCH_CHECK(weight.scalar_type() == input.scalar_type(), + "Data types of input and weight should be the same. But now " + "input type is ", + input.scalar_type(), ", weight type is ", weight.scalar_type(), + "."); + has_weight = true; + } else { + CNLOG(INFO) << "weight is a empty tensor."; + } + + auto dim_c = input.size(1); + const int compute_data_bytes = sizeof(float); + // target supports only INT on MLU device while it keeps LONG on host side, + // so target.itemsize() / 2 + const int target_data_bytes = target.scalar_type() == at::kLong + ? (target.itemsize() / 2) + : target.itemsize(); + int deal_n = 0; + int threshold_c = 0; + bool is_half = false; + if (input.scalar_type() == at::kHalf) { + is_half = true; + } + // calculate deal_n and threshold_c + getDealNAndThresholdC(compute_data_bytes, target_data_bytes, dim_c, &deal_n, + &threshold_c, has_weight, is_half); + + // check C + TORCH_CHECK(threshold_c >= dim_c, + "input.size(1) should be in the range of [0, ", threshold_c, + "]. 
", "But now input.size(1) is ", dim_c, "."); + + if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) { + // return if zero-element + return; + } + + // set task dimension + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + policyFuncBackward(&k_dim, &k_type); + + // get compute queue + auto queue = torch_mlu::getCurQueue(); + + // get ptr of tensors + auto input_impl = torch_mlu::getMluTensorImpl(input); + auto input_ptr = input_impl->cnnlMalloc(); + auto target_impl = torch_mlu::getMluTensorImpl(target); + auto target_ptr = target_impl->cnnlMalloc(); + auto weight_impl = torch_mlu::getMluTensorImpl(weight); + auto weight_ptr = weight_impl->cnnlMalloc(); + auto output_impl = torch_mlu::getMluTensorImpl(output); + auto output_ptr = output_impl->cnnlMalloc(); + + // get dtype of input + cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype()); + auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + auto dim_n = input.size(0); + + CNLOG(INFO) << "Launch Kernel KernelFocalLossSigmoidBackward<<>>"; + + // launch kernel + KernelFocalLossSigmoidBackward(k_dim, k_type, queue, d_type, input_ptr, + target_ptr, weight_ptr, gamma, alpha, dim_n, + deal_n, dim_c, output_ptr); +} + +void sigmoid_focal_loss_forward_mlu(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha) { + SigmoidFocalLossForwardMLUKernelLauncher(input, target, weight, output, gamma, + alpha); +} + +void sigmoid_focal_loss_backward_mlu(Tensor input, Tensor target, Tensor weight, + Tensor grad_input, float gamma, + float alpha) { + SigmoidFocalLossBackwardMLUKernelLauncher(input, target, weight, grad_input, + gamma, alpha); +} + +void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha); + +void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target, + Tensor weight, Tensor grad_input, + float gamma, float alpha); + +REGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, MLU, + sigmoid_focal_loss_forward_mlu); +REGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, MLU, + sigmoid_focal_loss_backward_mlu); diff --git a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp new file mode 100644 index 0000000000..33c4f7de50 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp @@ -0,0 +1,130 @@ +/************************************************************************* + * Copyright (C) 2021 by Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ + +#include "pytorch_device_registry.hpp" +#include "pytorch_mlu_helper.hpp" + +void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const cnrtDataType_t data_type_input, const void *boxes_ptr, + const void *scores_ptr, const int input_num_boxes, + const int input_stride, const int max_output_boxes, + const float iou_threshold, const float offset, + void *workspace_ptr, void *output_size_ptr, void *output_ptr); + +int selectUnionType(uint32_t use_job, int box_num_per_core) { + // the box_num_per_core should be at least 256, otherwise the real IO + // bandwidth would be very low + while (box_num_per_core < 256 && use_job >= 4) { + box_num_per_core *= 2; + use_job /= 2; + } + return use_job; +} + +Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, + int offset) { + // dimension parameters check + TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ", + boxes.dim(), "D"); + TORCH_CHECK(boxes.size(1) == 4, + "boxes should have 4 elements in dimension 1, got ", + boxes.size(1)); + TORCH_CHECK(scores.dim() == 1, "scores should be a 1d tensor, got ", + scores.dim(), "D"); + + // data type check + TORCH_CHECK(boxes.scalar_type() == scores.scalar_type(), + "boxes should have the same type as scores"); + TORCH_CHECK( + boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf, + "data type of boxes should be Float or Half, got ", boxes.scalar_type()); + + if (boxes.numel() == 0) { + return at::empty({0}, boxes.options().dtype(at::kLong)); + } + + int input_num_boxes = boxes.size(0); + int input_stride = boxes.size(0); + int max_output_boxes = boxes.size(0); + + cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype()); + cnrtDim3_t k_dim; + cnrtJobType_t k_type; + uint32_t union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); + uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + uint32_t job_limit = union_number * core_dim; + uint32_t core_number = union_number * core_dim; + int box_num_per_core = (input_num_boxes + core_number - 1) / core_number; + // initiate k_type as Union1 + k_dim.x = core_dim; + k_dim.y = 1; + k_dim.z = 1; + k_type = CNRT_FUNC_TYPE_UNION1; + int use_job = selectUnionType(job_limit, box_num_per_core); + if (use_job < 4) { + k_dim.x = 1; + k_type = CNRT_FUNC_TYPE_BLOCK; + } else if (use_job == 4) { + k_dim.x = core_dim; + k_type = CNRT_FUNC_TYPE_UNION1; + } else { + k_dim.x = use_job; + k_type = (cnrtFunctionType_t)use_job; + } + + // transpose boxes (n, 4) to (4, n) for better performance + auto boxes_t = boxes.transpose(0, 1); + auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t); + auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores); + auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kLong)); + auto output_size = at::empty({1}, scores.options().dtype(at::kInt)); + + // workspace + const int info_num = 5; // x1, x2, y1, y2 and score + size_t space_size = 0; + if (boxes.scalar_type() == at::kHalf) { + space_size = input_num_boxes * sizeof(int16_t) * info_num + sizeof(float); + } else { + space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float); + } + auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte)); + + // get compute queue + auto queue = torch_mlu::getCurQueue(); + + auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_); + auto boxes_ptr = boxes_impl->cnnlMalloc(); + auto scores_impl = 
torch_mlu::getMluTensorImpl(scores_); + auto scores_ptr = scores_impl->cnnlMalloc(); + auto workspace_impl = torch_mlu::getMluTensorImpl(workspace); + auto workspace_ptr = workspace_impl->cnnlMalloc(); + auto output_impl = torch_mlu::getMluTensorImpl(output); + auto output_ptr = output_impl->cnnlMalloc(); + auto output_size_impl = torch_mlu::getMluTensorImpl(output_size); + auto output_size_ptr = output_size_impl->cnnlMalloc(); + + CNLOG(INFO) << "Launch Kernel MLUUnionX NMS<<>>"; + KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr, + input_num_boxes, input_stride, max_output_boxes, iou_threshold, + offset, workspace_ptr, output_size_ptr, output_ptr); + + int output_num = *static_cast(output_size.cpu().data_ptr()); + return output.slice(0, 0, output_num); +} + +Tensor nms_mlu(Tensor boxes, Tensor scores, float iou_threshold, int offset) { + return NMSMLUKernelLauncher(boxes, scores, iou_threshold, offset); +} + +Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset); +REGISTER_DEVICE_IMPL(nms_impl, MLU, nms_mlu); diff --git a/mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp new file mode 100644 index 0000000000..077dbfc51e --- /dev/null +++ b/mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp @@ -0,0 +1,206 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "pytorch_device_registry.hpp" +#include "pytorch_mlu_helper.hpp" + +void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t d_type, + const void *input, const void *rois, const int channels, + const bool aligned, const int pooled_height, + const int pooled_width, const int input_height, + const int input_width, const int sampling_ratio, + const float spatial_scale, const int num_rois, + void *output); + +void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t dtype, + const void *grads, const void *boxes, + void *grads_image, const int boxes_num, + const int hi, const int wi, const int c, + const int no, const int ho, const int wo, + const float spatial_scale, const int sampling_ratio, + const bool aligned); + +void ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned) { + // params check + TORCH_CHECK( + input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf, + "input type should be Float or Half, got ", input.scalar_type()); + TORCH_CHECK(rois.scalar_type() == input.scalar_type(), + "rois should have the same type as input"); + TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ", + input.dim(), "D"); + TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(), + "D"); + TORCH_CHECK(pool_mode == 1, "pool_mode only suppurts 'avg' currently"); + + auto memory_format = + torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim()); + auto input_tensor = + torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + int height = input.size(2); + int width = input.size(3); + + if (output.numel() == 0) { + output = at::zeros({num_rois, channels, aligned_height, aligned_width}, + input.options()); + return; + } + + at::Tensor output_tmp = + at::empty({num_rois, channels, aligned_height, aligned_width}, + input.options(), memory_format); + + // get tensor impl + auto self_impl = torch_mlu::getMluTensorImpl(input_tensor); + auto rois_impl = torch_mlu::getMluTensorImpl(rois); + auto output_impl = torch_mlu::getMluTensorImpl(output_tmp); + + // get compute queue + auto queue = torch_mlu::getCurQueue(); + + // get the mlu ptr + auto self_ptr = self_impl->cnnlMalloc(); + auto rois_ptr = rois_impl->cnnlMalloc(); + auto output_ptr = output_impl->cnnlMalloc(); + + cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtDim3_t k_dim; + k_dim.x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + k_dim.y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); + k_dim.z = 1; + cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input.dtype()); + + KernelRoiAlign(k_dim, k_type, queue, data_type, self_ptr, rois_ptr, channels, + aligned, aligned_height, aligned_width, height, width, + sampling_ratio, spatial_scale, num_rois, output_ptr); + + output.copy_(output_tmp); +} + +static int nearestPower2(int x) { + x--; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + x++; + return x; +} + +void ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois, + Tensor argmax_y, Tensor argmax_x, + Tensor grad_input, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, int pool_mode, + bool 
aligned) { + // params check + TORCH_CHECK( + grad.scalar_type() == at::kFloat || grad.scalar_type() == at::kHalf, + "grad type should be Float or Half, got ", grad.scalar_type()); + TORCH_CHECK(rois.scalar_type() == grad.scalar_type(), + "rois should have the same type as grad"); + TORCH_CHECK(grad.dim() == 4, "grad should be a 4d tensor, got ", grad.dim(), + "D"); + TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(), + "D"); + TORCH_CHECK(pool_mode == 1, "pool_mode only suppurts 'avg' currently"); + + int batch_size = grad_input.size(0); + int channels = grad_input.size(1); + int height = grad_input.size(2); + int width = grad_input.size(3); + auto memory_format = + torch_mlu::cnnl::ops::get_channels_last_memory_format(grad.dim()); + auto grad_ = torch_mlu::cnnl::ops::cnnl_contiguous(grad, memory_format); + auto grad_input_ = at::empty({batch_size, channels, height, width}, + grad.options(), memory_format) + .zero_(); + + int boxes_num = rois.size(0); + int hi = grad.size(2); + int wi = grad.size(3); + int c = grad.size(1); + + int no = grad_input.size(0); + int ho = grad_input.size(2); + int wo = grad_input.size(3); + + // get tensor impl + auto grad_impl = torch_mlu::getMluTensorImpl(grad_); + auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_); + auto rois_impl = torch_mlu::getMluTensorImpl(rois); + + // get compute queue + auto queue = torch_mlu::getCurQueue(); + + // get the mlu ptr + auto grad_ptr = grad_impl->cnnlMalloc(); + auto rois_ptr = rois_impl->cnnlMalloc(); + auto grad_input_ptr = grad_input_impl->cnnlMalloc(); + + cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1; + int need_core = nearestPower2(boxes_num); + int union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); + uint32_t dim_x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + uint32_t dim_y = (need_core - 1) / dim_x + 1; + dim_y = (dim_y > union_number) ? 
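+ // never request more clusters than the device actually has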
union_number : dim_y; + cnrtDim3_t k_dim = {dim_x, dim_y, 1}; + cnrtDataType_t k_dtype = torch_mlu::toCnrtDtype(grad.dtype()); + + KernelRoiAlignBackward(k_dim, k_type, queue, k_dtype, grad_ptr, rois_ptr, + grad_input_ptr, boxes_num, hi, wi, c, no, ho, wo, + spatial_scale, sampling_ratio, aligned); + grad_input.copy_(grad_input_); +} + +void roi_align_forward_mlu(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, int pool_mode, bool aligned) { + ROIAlignForwardMLUKernelLauncher(input, rois, output, argmax_y, argmax_x, + aligned_height, aligned_width, spatial_scale, + sampling_ratio, pool_mode, aligned); +} + +void roi_align_backward_mlu(Tensor grad_output, Tensor rois, Tensor argmax_y, + Tensor argmax_x, Tensor grad_input, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned) { + ROIAlignBackwardMLUKernelLauncher( + grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height, + aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); +} + +void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned); + +void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y, + Tensor argmax_x, Tensor grad_input, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned); + +REGISTER_DEVICE_IMPL(roi_align_forward_impl, MLU, roi_align_forward_mlu); +REGISTER_DEVICE_IMPL(roi_align_backward_impl, MLU, roi_align_backward_mlu); diff --git a/mmcv/ops/csrc/pytorch/mlu/tin_shift_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/tin_shift_mlu.cpp new file mode 100644 index 0000000000..728330795d --- /dev/null +++ b/mmcv/ops/csrc/pytorch/mlu/tin_shift_mlu.cpp @@ -0,0 +1,203 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "pytorch_device_registry.hpp" +#include "pytorch_mlu_helper.hpp" + +void KernelTinShiftForward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *input, const void *shifts, void *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const cnrtDataType_t data_dtype, const int channel_per_core, + const int max_number_hw_per_core, const int max_length_per_core); + +void KernelTinShiftBackward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *grad_output, const void *shifts, void *grad_input, + const int batch_size, const int time_size, const int channel_size, + const int hw_size, const int group_size, const int group_channel, + const cnrtDataType_t data_dtype, const int channel_per_core, + const int max_number_hw_per_core, const int max_length_per_core); + +// policy function +static void policyFunc(const Tensor &input, cnrtDim3_t *k_dim, + cnrtFunctionType_t *k_type, int *channel_per_core, + int *max_number_hw_per_core, int *max_length_per_core) { + const int32_t cluster_limit = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); + const int32_t core_limit = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + auto nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore); + const int core_num = core_limit * cluster_limit; + const int batch_size = input.size(0); + const int time_size = input.size(1); + const int channel_size = input.size(2); + const int hw_size = input.size(3); + + const size_t size_per_channel = time_size * hw_size * input.itemsize(); + *channel_per_core = nram_size / size_per_channel; + int task_dim = 0; + if (*channel_per_core == 0) { + const size_t size_per_hw = hw_size * input.itemsize(); + *max_number_hw_per_core = nram_size / size_per_hw; + if (*max_number_hw_per_core <= 0) { + *max_length_per_core = nram_size / input.itemsize(); + } + int tmp_max_number_hw_per_core = + *max_number_hw_per_core > 0 ? *max_number_hw_per_core : 1; + const int loop_time = + (time_size / (tmp_max_number_hw_per_core)) + + ((time_size % (tmp_max_number_hw_per_core)) > 0 ? 1 : 0); + task_dim = batch_size * channel_size * loop_time < core_num + ? batch_size * channel_size * loop_time + : core_num; + } else { + task_dim = batch_size * channel_size < core_num ? batch_size * channel_size + : core_num; + } + + k_dim->x = core_limit; + k_dim->y = (task_dim / core_limit) > 0 ? 
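+ // launch at least one cluster along y even when task_dim is smaller than core_limit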
(task_dim / core_limit) : 1; + k_dim->z = 1; + *k_type = CNRT_FUNC_TYPE_UNION1; +} + +void TINShiftForwardMLUKernelLauncher(Tensor input, Tensor shift, + Tensor output) { + // params check + TORCH_CHECK( + input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf, + "input type should be Float or Half, got ", input.scalar_type(), "."); + TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ", + input.dim(), "d."); + TORCH_CHECK(shift.dim() == 2, "shift should be a 2d tensor, got ", + shift.dim(), "d."); + TORCH_CHECK( + input.size(0) == shift.size(0), + "input batch size should be the same as shift's, input batch size is ", + input.size(0), " and shift batch size is ", shift.size(0), "."); + TORCH_CHECK(input.size(0) != 0, "Input batch size should not be zero."); + TORCH_CHECK(input.size(3) != 0, + "The last dim size of input should not be zero."); + if (input.size(1) == 0) { + return; + } + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + int channel_per_core = 0; + int max_number_hw_per_core = 0; + int max_length_per_core = 0; + policyFunc(input, &k_dim, &k_type, &channel_per_core, &max_number_hw_per_core, + &max_length_per_core); + + const int batch_size = input.size(0); + const int time_size = input.size(1); + const int channel_size = input.size(2); + const int hw_size = input.size(3); + const int group_size = shift.size(1); + int group_channel = channel_size / group_size; + + // get tensor impl + auto input_impl = torch_mlu::getMluTensorImpl(input); + auto shift_impl = torch_mlu::getMluTensorImpl(shift); + auto output_impl = torch_mlu::getMluTensorImpl(output); + + // get compute queue + auto queue = torch_mlu::getCurQueue(); + + // get the mlu ptr + auto input_ptr = input_impl->cnnlMalloc(); + auto shift_ptr = shift_impl->cnnlMalloc(); + auto output_ptr = output_impl->cnnlMalloc(); + + cnrtDataType_t data_dtype = torch_mlu::toCnrtDtype(input.dtype()); + + KernelTinShiftForward(k_dim, k_type, queue, input_ptr, shift_ptr, output_ptr, + batch_size, time_size, channel_size, hw_size, + group_size, group_channel, data_dtype, channel_per_core, + max_number_hw_per_core, max_length_per_core); +} + +void TINShiftBackwardMLUKernelLauncher(Tensor grad_output, Tensor shift, + Tensor grad_input) { + // params check + TORCH_CHECK(grad_output.scalar_type() == at::kFloat || + grad_output.scalar_type() == at::kHalf, + "grad_output type should be Float or Half, got ", + grad_output.scalar_type(), "."); + TORCH_CHECK(grad_output.dim() == 4, "grad_output should be a 4d tensor, got ", + grad_output.dim(), "d."); + TORCH_CHECK(shift.dim() == 2, "shift should be a 2d tensor, got ", + shift.dim(), "d."); + TORCH_CHECK(grad_output.size(0) == shift.size(0), + "grad_output batch size should be the same as shift's, " + "grad_output batch size is ", + grad_output.size(0), ", shift batch size is ", shift.size(0), + "."); + TORCH_CHECK(grad_output.size(0) != 0, + "grad_output batch size should not be zero."); + TORCH_CHECK(grad_output.size(3) != 0, + "The last dim size of grad_output should not be zero."); + if (grad_output.size(1) == 0) { + return; + } + cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + int channel_per_core = 0; + int max_number_hw_per_core = 0; + int max_length_per_core = 0; + policyFunc(grad_output, &k_dim, &k_type, &channel_per_core, + &max_number_hw_per_core, &max_length_per_core); + + const int batch_size = grad_output.size(0); + const int time_size = grad_output.size(1); + const int channel_size = grad_output.size(2); + const int hw_size = grad_output.size(3); + const 
int group_size = shift.size(1); + int group_channel = channel_size / group_size; + + // get tensor impl + auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output); + auto shift_impl = torch_mlu::getMluTensorImpl(shift); + auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input); + + // get compute queue + auto queue = torch_mlu::getCurQueue(); + + // get the mlu ptr + auto grad_output_ptr = grad_output_impl->cnnlMalloc(); + auto shift_ptr = shift_impl->cnnlMalloc(); + auto grad_input_ptr = grad_input_impl->cnnlMalloc(); + + cnrtDataType_t data_dtype = torch_mlu::toCnrtDtype(grad_output.dtype()); + + KernelTinShiftBackward(k_dim, k_type, queue, grad_output_ptr, shift_ptr, + grad_input_ptr, batch_size, time_size, channel_size, + hw_size, group_size, group_channel, data_dtype, + channel_per_core, max_number_hw_per_core, + max_length_per_core); +} + +void tin_shift_forward_mlu(Tensor input, Tensor shift, Tensor output) { + TINShiftForwardMLUKernelLauncher(input, shift, output); +} + +void tin_shift_backward_mlu(Tensor grad_output, Tensor shift, + Tensor grad_input) { + TINShiftBackwardMLUKernelLauncher(grad_output, shift, grad_input); +} + +void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output); + +void tin_shift_backward_impl(Tensor grad_output, Tensor shift, + Tensor grad_input); + +REGISTER_DEVICE_IMPL(tin_shift_forward_impl, MLU, tin_shift_forward_mlu); +REGISTER_DEVICE_IMPL(tin_shift_backward_impl, MLU, tin_shift_backward_mlu); diff --git a/mmcv/ops/focal_loss.py b/mmcv/ops/focal_loss.py index 763bc93bd2..8058605161 100644 --- a/mmcv/ops/focal_loss.py +++ b/mmcv/ops/focal_loss.py @@ -34,7 +34,8 @@ def forward(ctx, weight=None, reduction='mean'): - assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor)) + assert isinstance( + target, (torch.Tensor, torch.LongTensor, torch.cuda.LongTensor)) assert input.dim() == 2 assert target.dim() == 1 assert input.size(0) == target.size(0) diff --git a/mmcv/ops/tin_shift.py b/mmcv/ops/tin_shift.py old mode 100644 new mode 100755 index 6965c5b34c..473231cc0d --- a/mmcv/ops/tin_shift.py +++ b/mmcv/ops/tin_shift.py @@ -18,6 +18,10 @@ class TINShiftFunction(Function): @staticmethod def forward(ctx, input, shift): + if input.size(0) != shift.size(0): + raise ValueError( + 'The first dim (batch) of `input` and `shift` should be ' + f'same, but got {input.size(0)} and {shift.size(0)}.') C = input.size(2) num_segments = shift.size(1) if C // num_segments <= 0 or C % num_segments != 0: diff --git a/mmcv/runner/dist_utils.py b/mmcv/runner/dist_utils.py index 8a60dcc418..0914dd670d 100644 --- a/mmcv/runner/dist_utils.py +++ b/mmcv/runner/dist_utils.py @@ -12,7 +12,7 @@ from torch._utils import (_flatten_dense_tensors, _take_tensors, _unflatten_dense_tensors) -from mmcv.device.mlu import IS_MLU +from mmcv.device.mlu import IS_MLU_AVAILABLE def _find_free_port(): @@ -49,7 +49,7 @@ def init_dist(launcher, backend='nccl', **kwargs): def _init_dist_pytorch(backend, **kwargs): # TODO: use local_rank instead of rank % num_gpus rank = int(os.environ['RANK']) - if IS_MLU: + if IS_MLU_AVAILABLE: import torch_mlu # noqa: F401 torch.mlu.set_device(rank) dist.init_process_group( diff --git a/mmcv/utils/__init__.py b/mmcv/utils/__init__.py index 8159c6a1a7..774259b4e2 100644 --- a/mmcv/utils/__init__.py +++ b/mmcv/utils/__init__.py @@ -41,11 +41,12 @@ from .logging import get_logger, print_log from .parrots_jit import jit, skip_no_elena # yapf: disable - from .parrots_wrapper import (TORCH_VERSION, BuildExtension, CppExtension, - 
CUDAExtension, DataLoader, PoolDataLoader, - SyncBatchNorm, _AdaptiveAvgPoolNd, - _AdaptiveMaxPoolNd, _AvgPoolNd, _BatchNorm, - _ConvNd, _ConvTransposeMixin, _get_cuda_home, + from .parrots_wrapper import (IS_CUDA_AVAILABLE, TORCH_VERSION, + BuildExtension, CppExtension, CUDAExtension, + DataLoader, PoolDataLoader, SyncBatchNorm, + _AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, + _AvgPoolNd, _BatchNorm, _ConvNd, + _ConvTransposeMixin, _get_cuda_home, _InstanceNorm, _MaxPoolNd, get_build_config, is_rocm_pytorch) # yapf: enable @@ -71,5 +72,6 @@ 'assert_dict_has_keys', 'assert_keys_equal', 'assert_is_norm_layer', 'assert_params_all_zeros', 'check_python_script', 'is_method_overridden', 'is_jit_tracing', 'is_rocm_pytorch', - '_get_cuda_home', 'load_url', 'has_method', 'worker_init_fn' + '_get_cuda_home', 'load_url', 'has_method', 'IS_CUDA_AVAILABLE', + 'worker_init_fn' ] diff --git a/mmcv/utils/parrots_wrapper.py b/mmcv/utils/parrots_wrapper.py index cf6bf9ef86..7e657b5616 100644 --- a/mmcv/utils/parrots_wrapper.py +++ b/mmcv/utils/parrots_wrapper.py @@ -6,6 +6,13 @@ TORCH_VERSION = torch.__version__ +def is_cuda_available() -> bool: + return torch.cuda.is_available() + + +IS_CUDA_AVAILABLE = is_cuda_available() + + def is_rocm_pytorch() -> bool: is_rocm = False if TORCH_VERSION != 'parrots': diff --git a/setup.py b/setup.py index 154be6f318..d34706a267 100644 --- a/setup.py +++ b/setup.py @@ -301,13 +301,12 @@ def get_extensions(): extra_compile_args['cncc'] = [mlu_args] if mlu_args else [] op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \ glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \ - glob.glob('./mmcv/ops/csrc/pytorch/mlu/*.cpp') + \ - glob.glob('./mmcv/ops/csrc/pytorch/mlu/*.mlu') + glob.glob('./mmcv/ops/csrc/pytorch/mlu/*.cpp') extension = MLUExtension include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/mlu')) else: - print(f'Compiling {ext_name} without CUDA') + print(f'Compiling {ext_name} only with CPU') op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \ glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') extension = CppExtension diff --git a/tests/test_device/test_mlu/test_mlu_parallel.py b/tests/test_device/test_mlu/test_mlu_parallel.py index 7db9b495ca..3d51a68f82 100644 --- a/tests/test_device/test_mlu/test_mlu_parallel.py +++ b/tests/test_device/test_mlu/test_mlu_parallel.py @@ -5,7 +5,8 @@ import torch import torch.nn as nn -from mmcv.device.mlu import IS_MLU, MLUDataParallel, MLUDistributedDataParallel +from mmcv.device.mlu import (IS_MLU_AVAILABLE, MLUDataParallel, + MLUDistributedDataParallel) from mmcv.device.mlu._functions import Scatter, scatter from mmcv.parallel import is_module_wrapper @@ -31,7 +32,7 @@ def forward(self, x): model = Model() assert not is_module_wrapper(model) - if IS_MLU: + if IS_MLU_AVAILABLE: mludp = MLUDataParallel(model) assert is_module_wrapper(mludp) @@ -51,7 +52,7 @@ def test_scatter(): assert torch.allclose(input, output) # if the device is MLU, copy the input from CPU to MLU - if IS_MLU: + if IS_MLU_AVAILABLE: input = torch.zeros([1, 3, 3, 3]) output = scatter(input=input, devices=[0]) assert torch.allclose(input.to('mlu'), output) @@ -82,7 +83,7 @@ def test_Scatter(): assert torch.allclose(input, output) # if the device is MLU, copy the input from CPU to MLU - if IS_MLU: + if IS_MLU_AVAILABLE: target_mlus = [0] input = torch.zeros([1, 3, 3, 3]) outputs = Scatter.forward(target_mlus, input) diff --git a/tests/test_ops/test_bbox.py b/tests/test_ops/test_bbox.py index 
2bdb3ce7a9..f75100ccf8 100644 --- a/tests/test_ops/test_bbox.py +++ b/tests/test_ops/test_bbox.py @@ -3,41 +3,60 @@ import pytest import torch +from mmcv.device.mlu import IS_MLU_AVAILABLE +from mmcv.utils import IS_CUDA_AVAILABLE -@pytest.mark.skipif( - not torch.cuda.is_available(), reason='requires CUDA support') -class TestBBox(object): - def _test_bbox_overlaps(self, dtype=torch.float): +class TestBBox(object): + def _test_bbox_overlaps(self, device, dtype=torch.float): from mmcv.ops import bbox_overlaps b1 = torch.tensor([[1.0, 1.0, 3.0, 4.0], [2.0, 2.0, 3.0, 4.0], - [7.0, 7.0, 8.0, 8.0]]).cuda().type(dtype) + [7.0, 7.0, 8.0, 8.0]]).to(device).type(dtype) b2 = torch.tensor([[0.0, 2.0, 2.0, 5.0], [2.0, 1.0, 3.0, - 3.0]]).cuda().type(dtype) + 3.0]]).to(device).type(dtype) should_output = np.array([[0.33333334, 0.5], [0.2, 0.5], [0.0, 0.0]]) out = bbox_overlaps(b1, b2, offset=1) assert np.allclose(out.cpu().numpy(), should_output, 1e-2) b1 = torch.tensor([[1.0, 1.0, 3.0, 4.0], [2.0, 2.0, 3.0, - 4.0]]).cuda().type(dtype) + 4.0]]).to(device).type(dtype) b2 = torch.tensor([[0.0, 2.0, 2.0, 5.0], [2.0, 1.0, 3.0, - 3.0]]).cuda().type(dtype) + 3.0]]).to(device).type(dtype) should_output = np.array([0.33333334, 0.5]) out = bbox_overlaps(b1, b2, aligned=True, offset=1) assert np.allclose(out.cpu().numpy(), should_output, 1e-2) - b1 = torch.tensor([[0.0, 0.0, 3.0, 3.0]]).cuda().type(dtype) - b1 = torch.tensor([[0.0, 0.0, 3.0, 3.0]]).cuda().type(dtype) + b1 = torch.tensor([[0.0, 0.0, 3.0, 3.0]]).to(device).type(dtype) b2 = torch.tensor([[4.0, 0.0, 5.0, 3.0], [3.0, 0.0, 4.0, 3.0], [2.0, 0.0, 3.0, 3.0], [1.0, 0.0, 2.0, - 3.0]]).cuda().type(dtype) + 3.0]]).to(device).type(dtype) should_output = np.array([0, 0.2, 0.5, 0.5]) out = bbox_overlaps(b1, b2, offset=1) assert np.allclose(out.cpu().numpy(), should_output, 1e-2) - def test_bbox_overlaps_float(self): - self._test_bbox_overlaps(torch.float) - - def test_bbox_overlaps_half(self): - self._test_bbox_overlaps(torch.half) + @pytest.mark.parametrize('device', [ + pytest.param( + 'cuda', + marks=pytest.mark.skipif( + not IS_CUDA_AVAILABLE, reason='requires CUDA support')), + pytest.param( + 'mlu', + marks=pytest.mark.skipif( + not IS_MLU_AVAILABLE, reason='requires MLU support')) + ]) + def test_bbox_overlaps_float(self, device): + self._test_bbox_overlaps(device, dtype=torch.float) + + @pytest.mark.parametrize('device', [ + pytest.param( + 'cuda', + marks=pytest.mark.skipif( + not IS_CUDA_AVAILABLE, reason='requires CUDA support')), + pytest.param( + 'mlu', + marks=pytest.mark.skipif( + not IS_MLU_AVAILABLE, reason='requires MLU support')) + ]) + def test_bbox_overlaps_half(self, device): + self._test_bbox_overlaps(device, dtype=torch.half) diff --git a/tests/test_ops/test_focal_loss.py b/tests/test_ops/test_focal_loss.py index a9a10cc2e3..c28e640874 100644 --- a/tests/test_ops/test_focal_loss.py +++ b/tests/test_ops/test_focal_loss.py @@ -1,7 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import numpy as np +import pytest import torch +from mmcv.device.mlu import IS_MLU_AVAILABLE +from mmcv.utils import IS_CUDA_AVAILABLE + _USING_PARROTS = True try: from parrots.autograd import gradcheck @@ -57,9 +61,7 @@ def _test_softmax(self, dtype=torch.float): assert np.allclose(loss.data.cpu().numpy(), output[0], 1e-2) assert np.allclose(x.grad.data.cpu(), np_x_grad, 1e-2) - def _test_sigmoid(self, dtype=torch.float): - if not torch.cuda.is_available(): - return + def _test_sigmoid(self, device, dtype=torch.float): from mmcv.ops import sigmoid_focal_loss alpha = 0.25 gamma = 2.0 @@ -68,9 +70,9 @@ def _test_sigmoid(self, dtype=torch.float): np_y = np.array(case[1]) np_x_grad = np.array(output[1]) - x = torch.from_numpy(np_x).cuda().type(dtype) + x = torch.from_numpy(np_x).to(device).type(dtype) x.requires_grad_() - y = torch.from_numpy(np_y).cuda().long() + y = torch.from_numpy(np_y).to(device).long() loss = sigmoid_focal_loss(x, y, gamma, alpha, None, 'mean') loss.backward() @@ -128,11 +130,31 @@ def test_softmax_float(self): def test_softmax_half(self): self._test_softmax(dtype=torch.half) - def test_sigmoid_float(self): - self._test_sigmoid(dtype=torch.float) - - def test_sigmoid_half(self): - self._test_sigmoid(dtype=torch.half) + @pytest.mark.parametrize('device', [ + pytest.param( + 'cuda', + marks=pytest.mark.skipif( + not IS_CUDA_AVAILABLE, reason='requires CUDA support')), + pytest.param( + 'mlu', + marks=pytest.mark.skipif( + not IS_MLU_AVAILABLE, reason='requires MLU support')) + ]) + def test_sigmoid_float(self, device): + self._test_sigmoid(device=device, dtype=torch.float) + + @pytest.mark.parametrize('device', [ + pytest.param( + 'cuda', + marks=pytest.mark.skipif( + not IS_CUDA_AVAILABLE, reason='requires CUDA support')), + pytest.param( + 'mlu', + marks=pytest.mark.skipif( + not IS_MLU_AVAILABLE, reason='requires MLU support')) + ]) + def test_sigmoid_half(self, device): + self._test_sigmoid(device, dtype=torch.half) def test_grad_softmax_float(self): self._test_grad_softmax(dtype=torch.float) diff --git a/tests/test_ops/test_nms.py b/tests/test_ops/test_nms.py index 44b7dbd74b..d90f0afb17 100644 --- a/tests/test_ops/test_nms.py +++ b/tests/test_ops/test_nms.py @@ -3,12 +3,23 @@ import pytest import torch +from mmcv.device.mlu import IS_MLU_AVAILABLE +from mmcv.utils import IS_CUDA_AVAILABLE + class Testnms(object): - def test_nms_allclose(self): - if not torch.cuda.is_available(): - return + @pytest.mark.parametrize('device', [ + pytest.param( + 'cuda', + marks=pytest.mark.skipif( + not IS_CUDA_AVAILABLE, reason='requires CUDA support')), + pytest.param( + 'mlu', + marks=pytest.mark.skipif( + not IS_MLU_AVAILABLE, reason='requires MLU support')) + ]) + def test_nms_allclose(self, device): from mmcv.ops import nms np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0], [3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]], @@ -24,7 +35,7 @@ def test_nms_allclose(self): assert np.allclose(dets, np_dets) # test cpu assert np.allclose(inds, np_inds) # test cpu dets, inds = nms( - boxes.cuda(), scores.cuda(), iou_threshold=0.3, offset=0) + boxes.to(device), scores.to(device), iou_threshold=0.3, offset=0) assert np.allclose(dets.cpu().numpy(), np_dets) # test gpu assert np.allclose(inds.cpu().numpy(), np_inds) # test gpu diff --git a/tests/test_ops/test_roi_align.py b/tests/test_ops/test_roi_align.py index 70134bd0ac..cc6f92b8cb 100644 --- a/tests/test_ops/test_roi_align.py +++ b/tests/test_ops/test_roi_align.py @@ -3,6 +3,9 @@ import pytest import torch +from 
mmcv.device.mlu import IS_MLU_AVAILABLE +from mmcv.utils import IS_CUDA_AVAILABLE + _USING_PARROTS = True try: from parrots.autograd import gradcheck @@ -11,6 +14,7 @@ _USING_PARROTS = False # yapf:disable + inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0., 0., 1., 1.]]), ([[[[1., 2.], [3., 4.]], @@ -39,8 +43,6 @@ def _test_roialign_gradcheck(device, dtype): - if not torch.cuda.is_available() and device == 'cuda': - pytest.skip('test requires GPU') try: from mmcv.ops import RoIAlign except ModuleNotFoundError: @@ -65,8 +67,6 @@ def _test_roialign_gradcheck(device, dtype): def _test_roialign_allclose(device, dtype): - if not torch.cuda.is_available() and device == 'cuda': - pytest.skip('test requires GPU') try: from mmcv.ops import roi_align except ModuleNotFoundError: @@ -75,7 +75,6 @@ def _test_roialign_allclose(device, dtype): pool_w = 2 spatial_scale = 1.0 sampling_ratio = 2 - for case, output in zip(inputs, outputs): np_input = np.array(case[0]) np_rois = np.array(case[1]) @@ -95,8 +94,26 @@ def _test_roialign_allclose(device, dtype): x.grad.data.type(torch.float).cpu().numpy(), np_grad, atol=1e-3) -@pytest.mark.parametrize('device', ['cuda', 'cpu']) -@pytest.mark.parametrize('dtype', [torch.float, torch.double, torch.half]) +@pytest.mark.parametrize('device', [ + 'cpu', + pytest.param( + 'cuda', + marks=pytest.mark.skipif( + not IS_CUDA_AVAILABLE, reason='requires CUDA support')), + pytest.param( + 'mlu', + marks=pytest.mark.skipif( + not IS_MLU_AVAILABLE, reason='requires MLU support')) +]) +@pytest.mark.parametrize('dtype', [ + torch.float, + pytest.param( + torch.double, + marks=pytest.mark.skipif( + IS_MLU_AVAILABLE, + reason='MLU does not support for 64-bit floating point')), + torch.half +]) def test_roialign(device, dtype): # check double only if dtype is torch.double: diff --git a/tests/test_ops/test_tin_shift.py b/tests/test_ops/test_tin_shift.py old mode 100644 new mode 100755 index dac1490ca6..7a32aed71e --- a/tests/test_ops/test_tin_shift.py +++ b/tests/test_ops/test_tin_shift.py @@ -5,6 +5,9 @@ import pytest import torch +from mmcv.device.mlu import IS_MLU_AVAILABLE +from mmcv.utils import IS_CUDA_AVAILABLE + _USING_PARROTS = True try: from parrots.autograd import gradcheck @@ -131,7 +134,7 @@ ] -def _test_tinshift_gradcheck(dtype): +def _test_tinshift_gradcheck(device, dtype): try: from mmcv.ops import tin_shift except ModuleNotFoundError: @@ -145,15 +148,15 @@ def _test_tinshift_gradcheck(dtype): np_shift = np.array(shift) x = torch.tensor( - np_input, dtype=dtype, device='cuda', requires_grad=True) - shift = torch.tensor(np_shift, device='cuda').int() + np_input, dtype=dtype, device=device, requires_grad=True) + shift = torch.tensor(np_shift, device=device).int() if torch.__version__ == 'parrots': gradcheck(tin_shift, (x, shift)) else: gradcheck(tin_shift, (x, shift), atol=1, rtol=0.1) -def _test_tinshift_allclose(dtype): +def _test_tinshift_allclose(device, dtype): try: from mmcv.ops import tin_shift except ModuleNotFoundError: @@ -166,8 +169,8 @@ def _test_tinshift_allclose(dtype): np_grad = np.array(grad) x = torch.tensor( - np_input, dtype=dtype, device='cuda', requires_grad=True) - shift = torch.tensor(np_shift, device='cuda').int() + np_input, dtype=dtype, device=device, requires_grad=True) + shift = torch.tensor(np_shift, device=device).int() output = tin_shift(x, shift) output.backward(torch.ones_like(output)) @@ -177,28 +180,48 @@ def _test_tinshift_allclose(dtype): x.grad.data.type(torch.float).cpu().numpy(), np_grad, 1e-3) -def _test_tinshift_assert(dtype): 
+def _test_tinshift_assert(device, dtype):
     try:
         from mmcv.ops import tin_shift
     except ModuleNotFoundError:
         pytest.skip('TINShift op is not successfully compiled')
-    inputs = [torch.rand(2, 3, 4, 2), torch.rand(2, 3, 4, 2)]
+    inputs = [
+        torch.rand(2, 3, 4, 2),
+        torch.rand(2, 3, 4, 2),
+        torch.rand(1, 3, 4, 2)
+    ]
     shifts = [torch.rand(2, 3), torch.rand(2, 5)]
     for x, shift in zip(inputs, shifts):
-        x = x.cuda()
-        shift = shift.cuda()
+        x = x.to(device).type(dtype)
+        shift = shift.to(device).type(dtype)
         # A ValueError should be raised if ops get inputs with wrong shapes.
         with pytest.raises(ValueError):
             tin_shift(x, shift)
 
 
-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-@pytest.mark.parametrize('dtype', [torch.float, torch.double, torch.half])
-def test_tinshift(dtype):
-    _test_tinshift_allclose(dtype=dtype)
-    _test_tinshift_gradcheck(dtype=dtype)
-    _test_tinshift_assert(dtype=dtype)
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'mlu',
+        marks=pytest.mark.skipif(
+            not IS_MLU_AVAILABLE, reason='requires MLU support'))
+])
+@pytest.mark.parametrize('dtype', [
+    torch.float,
+    pytest.param(
+        torch.double,
+        marks=pytest.mark.skipif(
+            IS_MLU_AVAILABLE,
+            reason='MLU does not support for 64-bit floating point')),
+    torch.half
+])
+def test_tinshift(device, dtype):
+    _test_tinshift_allclose(device=device, dtype=dtype)
+    _test_tinshift_gradcheck(device=device, dtype=dtype)
+    _test_tinshift_assert(device=device, dtype=dtype)
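The batch-size check added to TINShiftFunction.forward sits next to the existing channel divisibility check, and the extra (1, 3, 4, 2) input appended to _test_tinshift_assert above is what drives it. A standalone sketch of the two checks follows; _check_tin_shift_inputs is a hypothetical helper, not part of mmcv, and the wording of the second error message is not shown in the diff.

    import torch


    def _check_tin_shift_inputs(input: torch.Tensor, shift: torch.Tensor) -> None:
        # Batch sizes of `input` and `shift` must match (the check this diff adds).
        if input.size(0) != shift.size(0):
            raise ValueError(
                'The first dim (batch) of `input` and `shift` should be the '
                f'same, but got {input.size(0)} and {shift.size(0)}.')
        # Channels must split evenly into the shift groups (pre-existing check).
        C = input.size(2)
        num_segments = shift.size(1)
        if C // num_segments <= 0 or C % num_segments != 0:
            raise ValueError('C should be a positive multiple of num_segments.')


    # The mismatched pair exercised by the new test input: batch 1 vs. shift batch 2.
    try:
        _check_tin_shift_inputs(torch.rand(1, 3, 4, 2), torch.rand(2, 3))
    except ValueError as err:
        print(err)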
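Across test_bbox.py, test_focal_loss.py, test_nms.py, test_roi_align.py and test_tin_shift.py the diff applies the same device parametrization idiom: each backend is wrapped in pytest.param with a skipif mark tied to IS_CUDA_AVAILABLE or IS_MLU_AVAILABLE. A minimal sketch of the idiom in isolation is shown below; the op under test (a plain tensor addition) is a stand-in so the snippet runs without the compiled mmcv extensions, and test_add is a hypothetical name, not an mmcv test.

    import pytest
    import torch

    from mmcv.device.mlu import IS_MLU_AVAILABLE
    from mmcv.utils import IS_CUDA_AVAILABLE


    @pytest.mark.parametrize('device', [
        'cpu',
        pytest.param(
            'cuda',
            marks=pytest.mark.skipif(
                not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
        pytest.param(
            'mlu',
            marks=pytest.mark.skipif(
                not IS_MLU_AVAILABLE, reason='requires MLU support'))
    ])
    def test_add(device):
        # One test item is collected per device; unavailable backends are skipped
        # instead of silently returning, as the old torch.cuda.is_available() guards did.
        x = torch.ones(2, 3).to(device)
        assert torch.allclose((x + x).cpu(), torch.full((2, 3), 2.))

The dtype parametrization in test_roi_align.py and test_tin_shift.py follows the same shape, using skipif(IS_MLU_AVAILABLE, ...) to drop torch.double on MLU.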
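parrots_wrapper.py now evaluates IS_CUDA_AVAILABLE once at import time via is_cuda_available(). The MLU counterpart imported by the tests is defined in mmcv.device.mlu rather than in this diff; a sketch of how such a flag can be derived, assuming the torch_mlu plugin patches torch with an is_mlu_available() hook, is:

    import torch


    def is_mlu_available() -> bool:
        # Assumed mirror of is_cuda_available(); the real IS_MLU_AVAILABLE flag
        # lives in mmcv.device.mlu and may be implemented differently.
        return hasattr(torch, 'is_mlu_available') and torch.is_mlu_available()


    IS_CUDA_AVAILABLE = torch.cuda.is_available()
    IS_MLU_AVAILABLE = is_mlu_available()

Evaluating the flags at import time keeps them cheap to reuse inside pytest.mark.skipif, which expects a boolean rather than a callable.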
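The dist_utils.py hunk only renames IS_MLU to IS_MLU_AVAILABLE; for context, the branch it gates looks roughly like the sketch below. Only the flag, torch.mlu.set_device(rank) and the rank handling appear in the diff; the 'cncl' backend string and the CUDA fallback are assumptions about the rest of _init_dist_pytorch.

    import os

    import torch
    import torch.distributed as dist

    from mmcv.device.mlu import IS_MLU_AVAILABLE


    def _init_dist_pytorch_sketch(backend: str, **kwargs) -> None:
        # Condensed, assumption-laden sketch of mmcv.runner.dist_utils._init_dist_pytorch.
        rank = int(os.environ['RANK'])
        if IS_MLU_AVAILABLE:
            import torch_mlu  # noqa: F401
            torch.mlu.set_device(rank)
            dist.init_process_group(backend='cncl', **kwargs)  # 'cncl' is assumed
        else:
            num_gpus = torch.cuda.device_count()
            torch.cuda.set_device(rank % num_gpus)
            dist.init_process_group(backend=backend, **kwargs)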