diff --git a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
new file mode 100644
index 0000000000..1095da870c
--- /dev/null
+++ b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
@@ -0,0 +1,649 @@
+/*************************************************************************
+ * Copyright (C) 2021 by Cambricon.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "common_mlu_helper.hpp"
+
+#define NMS_SIZE (64)
+#define COORD_DIM (4)
+#define MEMORY_CORE (0x80)
+#define INFO_NUM (5)  // 5 means x1, x2, y1, y2 and score
+
+#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
+#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
+
+__nram__ int8_t nram_buffer[SIZE_NRAM_BUF];
+__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF];
+
+__mlu_func__ void pvLock() {
+#if __BANG_ARCH__ == 270
+  if (coreId != MEMORY_CORE) {
+    __bang_lock(0, 0);
+  }
+#endif
+}
+
+__mlu_func__ void pvUnlock() {
+#if __BANG_ARCH__ == 270
+  if (coreId != MEMORY_CORE) {
+    __bang_unlock(0, 0);
+  }
+#endif
+}
+
+enum Addr { SRAM, GDRAM };
+
+template <typename IN_DT, typename OUT_DT>
+__mlu_func__ void nms_detection(
+    uint32_t *output_box_num, const int output_mode, const int input_layout,
+    OUT_DT *output_data, const Addr dst, IN_DT *input_data_score,
+    const IN_DT *input_data_box, const Addr src, IN_DT *buffer,
+    const int buffer_size, IN_DT *sram, const int core_limit,
+    const int input_box_num, const int input_stride, const int output_stride,
+    const int keepNum, const float thresh_iou, const float thresh_score,
+    const float offset, const int algo) {
+  // global value, it is stored in sram with a offset from the begin.
+  const int flag_offset_size = 28;
+  int32_t *loop_end_flag = (int32_t *)(sram + flag_offset_size);
+  loop_end_flag[0] = 0;
+  // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
+  const int nms_buffer_count1 = 9;
+  // temp nram buffer to store selected target.
+  const int nram_save_limit_count = 256;
+  float div_thresh_iou = 1.0 / thresh_iou;
+
+  // input data ptr
+  IN_DT *input_score_ptr;
+  const IN_DT *input_x1_ptr;
+  const IN_DT *input_y1_ptr;
+  const IN_DT *input_x2_ptr;
+  const IN_DT *input_y2_ptr;
+  input_score_ptr = input_data_score;
+  input_x1_ptr = input_data_box;
+  if (input_layout == 0) {
+    // [boxes_num, 4]
+    input_y1_ptr = input_x1_ptr + 1;
+    input_x2_ptr = input_x1_ptr + 2;
+    input_y2_ptr = input_x1_ptr + 3;
+  } else if (input_layout == 1) {
+    // [4, boxes_num]
+    input_y1_ptr = input_x1_ptr + input_stride;
+    input_x2_ptr = input_y1_ptr + input_stride;
+    input_y2_ptr = input_x2_ptr + input_stride;
+  }
+
+  // nram data ptr
+  IN_DT *x1;
+  IN_DT *y1;
+  IN_DT *x2;
+  IN_DT *y2;
+  IN_DT *score;
+  IN_DT *inter_x1;
+  IN_DT *inter_y1;
+  IN_DT *inter_x2;
+  IN_DT *inter_y2;
+  IN_DT *max_box;  // the max score, x1, y1, x2, y2
+  IN_DT *x1_mask;
+  IN_DT *y1_mask;
+  IN_DT *x2_mask;
+  IN_DT *y2_mask;
+  OUT_DT *nram_save;
+
+  int limit = 0;        // find limit when GDRAM or SRAM
+  int len_core = 0;     // the length deal by every core
+  int max_seg_pad = 0;  // the max length every repeat
+  int repeat = 0;
+  int remain = 0;
+  int remain_pad = 0;
+  int input_offset = 0;  // offset of input_data for current core
+  int nram_save_count = 0;
+  // mask for collect x1, y1, x2, y2. each mask has 128 elements
+  const int mask_size = 128;
+  const int total_mask_size = 512;
+
+  if (output_mode == 0) {
+    limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) -
+             nram_save_limit_count * sizeof(OUT_DT) -
+             total_mask_size * sizeof(IN_DT)) /
+            (nms_buffer_count1 * sizeof(IN_DT));
+  } else {
+    limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) -
+             nram_save_limit_count * INFO_NUM * sizeof(OUT_DT) -
+             total_mask_size * sizeof(IN_DT)) /
+            (nms_buffer_count1 * sizeof(IN_DT));
+  }
+
+  if (core_limit == 1) {
+    len_core = input_box_num;
+    input_offset = 0;
+  } else {
+    int avg_core = input_box_num / core_limit;
+    int rem = input_box_num % core_limit;
+    len_core = avg_core + (taskId < rem ? 1 : 0);
+    input_offset = avg_core * taskId + (taskId <= rem ? taskId : rem);
+  }
+  max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
+  repeat = len_core / max_seg_pad;
+  remain = len_core % max_seg_pad;
+  remain_pad = PAD_UP(remain, NMS_SIZE);
+
+  // if datatype is half, we should convert it to float when compute the IoU
+  int max_seg_iou_compute =
+      PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
+  int repeat_iou_compute = len_core / max_seg_iou_compute;
+  int remain_iou_compute = len_core % max_seg_iou_compute;
+  int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
+  // initial the address point
+  score = buffer;
+  x1 = score + max_seg_pad;
+  y1 = x1 + max_seg_pad;
+  x2 = y1 + max_seg_pad;
+  y2 = x2 + max_seg_pad;
+  inter_x1 = y2 + max_seg_pad;
+  inter_y1 = inter_x1 + max_seg_pad;
+  inter_x2 = inter_y1 + max_seg_pad;
+  inter_y2 = inter_x2 + max_seg_pad;
+  x1_mask = inter_y2 + max_seg_pad;
+  y1_mask = x1_mask + mask_size;
+  x2_mask = y1_mask + mask_size;
+  y2_mask = x2_mask + mask_size;
+  max_box = y2_mask + mask_size;  // the max score, x1, y1, x2, y2
+  // offset two line from max_box
+  nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE);
+
+  // set mask for __bang_collect instruction
+  if (input_layout == 0) {
+    __nramset((IN_DT *)x1_mask, total_mask_size, (IN_DT)0);
+    for (int idx = 0; idx < mask_size; idx++) {
+      int index = (idx % COORD_DIM) * mask_size + idx;
+      x1_mask[index] = (IN_DT)1.0;
+    }
+  }
+
+  for (int keep = 0; keep < keepNum;
+       keep++) {  // loop until the max_score <= 0
+    if (core_limit != 1) {
+      __sync_cluster();  // sync before current loop
+    }
+
+    /******find max start******/
+    int max_index = 0;         // the max score index
+    int global_max_index = 0;  // for U1
+    float max_area = 0;        // the max score area
+    max_box[0] = 0;            // init 0
+
+    for (int i = 0; i <= repeat; i++) {
+      if (i == repeat && remain == 0) {
+        break;
+      }
+      int seg_len = 0;  // the length every nms compute
+      int cpy_len = 0;  // the length every nms memcpy
+      i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad;
+      // check seg_len exceeds the limit of fp16 or not. 65536 is the largest
+      // num that half data type could express.
+      if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
+        // seg length exceeds the max num for fp16 datatype!
+        return;
+      }
+      i == repeat ? cpy_len = remain : cpy_len = max_seg_pad;
+      /******nms load start******/
+      mluMemcpyDirection_t load_dir = SRAM2NRAM;
+      if (src == SRAM) {
+        load_dir = SRAM2NRAM;
+      } else {
+        load_dir = GDRAM2NRAM;
+      }
+      __nramset(score, seg_len, (IN_DT)0);
+      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
+               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
+               cpy_len * sizeof(IN_DT), 0);
+
+      /******nms load end******/
+
+      __bang_max(inter_x1, score, seg_len);
+      if (inter_x1[0] > max_box[0]) {
+        max_box[0] = inter_x1[0];
+
+        if (sizeof(IN_DT) == sizeof(half)) {
+          max_index = ((uint16_t *)inter_x1)[1] + input_offset +
+                      i * max_seg_pad;  // offset start from head of input_data
+        } else if (sizeof(IN_DT) == sizeof(float)) {
+          max_index = ((uint32_t *)inter_x1)[1] + input_offset +
+                      i * max_seg_pad;  // offset start from head of input_data
+        }
+      }
+    }  // for repeat
+
+    int stride = 1;
+    if (input_layout == 0) {
+      stride = input_stride;
+    } else if (input_layout == 1) {
+      stride = 1;
+    }
+
+    if (core_limit == 1) {
+      max_box[1] = input_x1_ptr[max_index * stride];
+      max_box[2] = input_y1_ptr[max_index * stride];
+      max_box[3] = input_x2_ptr[max_index * stride];
+      max_box[4] = input_y2_ptr[max_index * stride];
+      if (algo == 0 || offset == 0.0) {
+        max_area = ((float)max_box[3] - (float)max_box[1]) *
+                   ((float)max_box[4] - (float)max_box[2]);
+      } else {
+        max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
+                   ((float)max_box[4] - (float)max_box[2] + offset);
+      }
+      input_score_ptr[max_index] = 0;
+      global_max_index = max_index;
+      ((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
+    } else if (core_limit == 4) {
+      // find the max with sram
+      // the max box's x1, y1, x2, y2 on every core
+      if (coreId != MEMORY_CORE) {
+        max_box[1] = input_x1_ptr[max_index * stride];
+        max_box[2] = input_y1_ptr[max_index * stride];
+        max_box[3] = input_x2_ptr[max_index * stride];
+        max_box[4] = input_y2_ptr[max_index * stride];
+      }
+      ((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
+      // copy every core's box info to sram, form: score---x1---y1---x2---y2---
+      for (int i = 0; i < INFO_NUM; i++) {
+        __memcpy(sram + i * core_limit + taskId, max_box + i,
+                 1 * sizeof(IN_DT), NRAM2SRAM);
+      }
+      // copy every core's max_index to sram, use 2 half to store max_index
+      __memcpy(sram + INFO_NUM * core_limit + taskId * 2, max_box + INFO_NUM,
+               sizeof(uint32_t),
+               NRAM2SRAM);  // int32_t datatype
+      __sync_cluster();
+
+      // copy score from sram to nram and find the max
+      __nramset(inter_x1, NMS_SIZE, (IN_DT)0);
+      __memcpy(inter_x1, sram, core_limit * sizeof(IN_DT), SRAM2NRAM);
+      __bang_max(max_box, inter_x1, NMS_SIZE);
+      int max_core = 0;
+      if (sizeof(IN_DT) == sizeof(half)) {
+        max_core = ((uint16_t *)max_box)[1];
+      } else if (sizeof(IN_DT) == sizeof(float)) {
+        max_core = ((uint32_t *)max_box)[1];
+      }
+
+      // copy the max box from SRAM to NRAM
+      __memcpy(max_box + 1, sram + 1 * core_limit + max_core,
+               1 * sizeof(IN_DT), SRAM2NRAM);  // x1
+      __memcpy(max_box + 2, sram + 2 * core_limit + max_core,
+               1 * sizeof(IN_DT), SRAM2NRAM);  // y1
+      __memcpy(max_box + 3, sram + 3 * core_limit + max_core,
+               1 * sizeof(IN_DT), SRAM2NRAM);  // x2
+      __memcpy(max_box + 4, sram + 4 * core_limit + max_core,
+               1 * sizeof(IN_DT), SRAM2NRAM);  // y2
+      __memcpy(max_box + 5, sram + 5 * core_limit + 2 * max_core,
+               sizeof(uint32_t), SRAM2NRAM);
+      if (algo == 0 || offset == 0.0) {
+        max_area = ((float)max_box[3] - (float)max_box[1]) *
+                   ((float)max_box[4] - (float)max_box[2]);
+      } else {
+        max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
+                   ((float)max_box[4] - (float)max_box[2] + offset);
+      }
+      global_max_index = ((uint32_t *)(max_box + INFO_NUM))[0];
+      input_score_ptr[global_max_index] = 0;
+    }
+    // by now, we get: max_score|max_index|max_box|max_area
+    /******find max end******/
+
+    /******nms store start******/
+    // store to nram
+    if (float(max_box[0]) > thresh_score) {
+      OUT_DT *save_ptr;
+      int save_offset = 0;
+      int save_str_num = 0;
+      save_ptr = nram_save;
+      save_offset = nram_save_count;
+      save_str_num = nram_save_limit_count;
+      if (coreId == 0) {
+        if (output_mode == 0) {  // index1, index2, ...
+          __memcpy(save_ptr + save_offset, (uint32_t *)(max_box + INFO_NUM),
+                   1 * sizeof(uint32_t), NRAM2NRAM, 1 * sizeof(uint32_t),
+                   1 * sizeof(uint32_t), 0);
+        } else if (output_mode == 1) {  // score, x1, y1, x2, y2
+          __memcpy(save_ptr + save_offset * INFO_NUM, max_box,
+                   INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
+                   INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
+        } else if (output_mode == 2) {  // score---, x1---, y1---, x2---, y2---
+          __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
+                   NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
+                   4);
+        }
+      }
+      nram_save_count++;
+      (*output_box_num)++;
+    }
+
+    // store to sram/gdram
+    if (*output_box_num != 0) {
+      mluMemcpyDirection_t store_dir = NRAM2GDRAM;
+      if (dst == SRAM) {
+        store_dir = NRAM2SRAM;
+      } else {  // dst == GDRAM
+        store_dir = NRAM2GDRAM;
+      }
+      if ((nram_save_count == nram_save_limit_count) ||
+          (float(max_box[0]) <= thresh_score) || keep == keepNum - 1) {
+        if (nram_save_count != 0) {
+          if (coreId == 0) {
+            if (output_mode == 0) {  // index1, index2, ...
+              pvLock();
+              __memcpy(output_data, nram_save,
+                       nram_save_count * sizeof(uint32_t), store_dir);
+              pvUnlock();
+              output_data += nram_save_count;
+            } else if (output_mode == 1) {  // score, x1, y1, x2, y2
+              pvLock();
+              __memcpy(output_data, nram_save,
+                       nram_save_count * INFO_NUM * sizeof(IN_DT), store_dir);
+              pvUnlock();
+              output_data += nram_save_count * INFO_NUM;
+            } else if (output_mode ==
+                       2) {  // score---, x1---, y1---, x2---, y2---
+              pvLock();
+              __memcpy(output_data, nram_save, nram_save_count * sizeof(IN_DT),
+                       store_dir, output_stride * sizeof(IN_DT),
+                       nram_save_limit_count * sizeof(IN_DT), 4);
+              pvUnlock();
+              output_data += nram_save_count;
+            }
+            nram_save_count = 0;
+          }
+        }
+      }  // if move data nram->sram/gdram
+    }    // if dst
+
+    // if the max score <= 0, end
+    if (core_limit == 1) {
+      if (float(max_box[0]) <= thresh_score) {
+        break;
+      }
+    } else {
+      if (float(max_box[0]) <= thresh_score) {
+        if (coreId == 0) {
+          loop_end_flag[0] = 1;
+        }
+      }
+      __sync_cluster();
+      if (loop_end_flag[0] == 1) {
+        break;
+      }
+    }
+    /******nms store end******/
+
+    // To solve half data accuracy, we convert half to float to calculate IoU.
+    for (int i = 0; i <= repeat_iou_compute; i++) {
+      if (i == repeat_iou_compute && remain_iou_compute == 0) {
+        break;
+      }
+      int seg_len = 0;  // the length every nms compute
+      int cpy_len = 0;  // the length every nms memcpy
+      i == repeat_iou_compute ? seg_len = remain_pad_iou_compute
+                              : seg_len = max_seg_iou_compute;
+      i == repeat_iou_compute ? cpy_len = remain_iou_compute
+                              : cpy_len = max_seg_iou_compute;
+
+      /******nms load start******/
+      mluMemcpyDirection_t load_dir = SRAM2NRAM;
+      if (src == SRAM) {
+        load_dir = SRAM2NRAM;
+      } else {
+        load_dir = GDRAM2NRAM;
+      }
+
+      __nramset((float *)score, seg_len, 0.0f);
+      int dt_offset = 0;
+      if (sizeof(IN_DT) == sizeof(float)) {
+        __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
+                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
+                 cpy_len * sizeof(IN_DT), 0);
+        dt_offset = 0;
+      } else if (sizeof(IN_DT) == sizeof(half)) {
+        __nramset(x1, seg_len, half(0));
+        __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
+                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
+                 cpy_len * sizeof(IN_DT), 0);
+        __bang_half2float((float *)score, (half *)x1, seg_len);
+        dt_offset = max_seg_iou_compute;
+      }
+
+      if (input_layout == 0) {
+        // the following number 4 means x1, y1, x2, y2
+        __memcpy(
+            inter_x1,
+            input_x1_ptr + (input_offset + i * max_seg_iou_compute) * COORD_DIM,
+            cpy_len * COORD_DIM * sizeof(IN_DT), load_dir,
+            cpy_len * COORD_DIM * sizeof(IN_DT),
+            cpy_len * COORD_DIM * sizeof(IN_DT), 0);
+        // here use collect instruction to transpose the [n, 4] shape into [4,
+        // n] shape to avoid
+        // discrete memory accessing.
+        for (int c_i = 0; c_i < COORD_DIM * seg_len / mask_size; c_i++) {
+          // the following number 32 means 32 elements will be selected out by
+          // once operation
+          __bang_collect(x1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
+                         x1_mask, mask_size);
+          __bang_collect(y1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
+                         y1_mask, mask_size);
+          __bang_collect(x2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
+                         x2_mask, mask_size);
+          __bang_collect(y2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
+                         y2_mask, mask_size);
+        }
+      } else if (input_layout == 1) {
+        __memcpy(x1 + dt_offset,
+                 input_x1_ptr + input_offset + i * max_seg_iou_compute,
+                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
+                 cpy_len * sizeof(IN_DT), 0);
+        __memcpy(y1 + dt_offset,
+                 input_y1_ptr + input_offset + i * max_seg_iou_compute,
+                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
+                 cpy_len * sizeof(IN_DT), 0);
+        __memcpy(x2 + dt_offset,
+                 input_x2_ptr + input_offset + i * max_seg_iou_compute,
+                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
+                 cpy_len * sizeof(IN_DT), 0);
+        __memcpy(y2 + dt_offset,
+                 input_y2_ptr + input_offset + i * max_seg_iou_compute,
+                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
+                 cpy_len * sizeof(IN_DT), 0);
+      }
+      /******nms load end******/
+
+      /******nms compute start******/
+      if (sizeof(IN_DT) == sizeof(half)) {
+        __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
+                          seg_len);
+        __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
+                          seg_len);
+        __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
+                          seg_len);
+        __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
+                          seg_len);
+      }
+      // 1、 compute IOU
+      // get the area_I
+      __nramset((float *)inter_y1, seg_len, float(max_box[1]));  // max_x1
+      __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
+                      seg_len);  // inter_x1
+      __nramset((float *)inter_y2, seg_len, float(max_box[3]));  // max_x2
+      __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
+                      seg_len);  // inter_x2
+      __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
+                 seg_len);
+      if (algo == 1 && offset != 0.0) {
+        __bang_add_const((float *)inter_x1, (float *)inter_x1, offset,
+                         seg_len);
+      }
+      __bang_active_relu((float *)inter_x1, (float *)inter_x1,
+                         seg_len);  // inter_w
+      __nramset((float *)inter_x2, seg_len, float(max_box[2]));  // max_y1
+      __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
+                      seg_len);  // inter_y1
+      __nramset((float *)inter_x2, seg_len, float(max_box[4]));  // max_y2
+      __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
+                      seg_len);  // inter_y2
+      __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
+                 seg_len);
+      if (algo == 1 && offset != 0.0) {
+        __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
+      }
+      __bang_active_relu((float *)inter_y1, (float *)inter_y1,
+                         seg_len);  // inter_h
+      __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
+                 seg_len);  // area_I
+      // get the area of input_box: area = (x2 - x1) * (y2 - y1);
+      __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
+      __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
+      if (algo == 1 && offset != 0.0) {
+        __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
+        __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
+      }
+      __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
+                 seg_len);  // area
+      // get the area_U: area + max_area - area_I
+      __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
+                       seg_len);
+      __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
+                 seg_len);  // area_U
+      // 2、 select the box
+      // if IOU greater than thres, set the score to zero, abort it: area_U >
+      // area_I * (1 / thresh)?
+      if (thresh_iou > 0.0) {
+        __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
+                         seg_len);
+      } else {
+        __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
+                         seg_len);
+      }
+      __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
+                seg_len);
+      __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
+      /******nms compute end******/
+
+      // update the score
+      mluMemcpyDirection_t update_dir = NRAM2SRAM;
+      if (dst == SRAM) {
+        update_dir = NRAM2SRAM;
+      } else {
+        update_dir = NRAM2GDRAM;
+      }
+      if (sizeof(IN_DT) == sizeof(half)) {
+        __bang_float2half_rd((half *)score, (float *)score, seg_len);
+      }
+      pvLock();
+      __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
+               cpy_len * sizeof(IN_DT), update_dir, cpy_len * sizeof(IN_DT),
+               cpy_len * sizeof(IN_DT), 0);
+      pvUnlock();
+    }  // for repeat
+  }    // for keepNum
+}
+
+__mlu_global__ void MLUKernelNMS(
+    const void *input_boxes, const void *input_confidence,
+    const int input_num_boxes, const int input_stride,
+    const int max_output_size, const float iou_threshold,
+    const float confidence_threshold, const int mode, const int input_layout,
+    void *workspace, void *result_num, void *output,
+    const cnrtDataType_t data_type_input, const float offset, const int algo) {
+  if (data_type_input == CNRT_FLOAT16) {
+    __memcpy(workspace, input_confidence, input_num_boxes * sizeof(half),
+             GDRAM2GDRAM);
+  } else if (data_type_input == CNRT_FLOAT32) {
+    __memcpy(workspace, input_confidence, input_num_boxes * sizeof(float),
+             GDRAM2GDRAM);
+  } else {
+  }
+
+  int output_stride = max_output_size;
+  uint32_t result_box_num = 0;
+  if (mode == 0) {
+    uint32_t *out_data = (uint32_t *)output;
+    switch (data_type_input) {
+      default: { return; }
+      case CNRT_FLOAT16: {
+        half *boxes_data = (half *)input_boxes;
+        half *confi_data = (half *)workspace;
+        half *buffer = (half *)nram_buffer;
+        half *sram =
+            (half *)sram_buffer;
+
+        nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
+                      confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
+                      sram, taskDim, input_num_boxes, input_stride,
+                      output_stride, max_output_size, iou_threshold,
+                      confidence_threshold, offset, algo);
+        ((uint32_t *)result_num)[0] = result_box_num;
+      }; break;
+      case CNRT_FLOAT32: {
+        float *boxes_data = (float *)input_boxes;
+        float *confi_data = (float *)workspace;
+        float *buffer = (float *)nram_buffer;
+        float *sram = (float *)sram_buffer;
+
+        nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
+                      confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
+                      sram, taskDim, input_num_boxes, input_stride,
+                      output_stride, max_output_size, iou_threshold,
+                      confidence_threshold, offset, algo);
+        ((uint32_t *)result_num)[0] = result_box_num;
+      }; break;
+    }
+  } else {
+    switch (data_type_input) {
+      default: { return; }
+      case CNRT_FLOAT16: {
+        half *boxes_data = (half *)input_boxes;
+        half *confi_data = (half *)workspace;
+        half *out_data = (half *)output;
+        half *buffer = (half *)nram_buffer;
+        half *sram = (half *)sram_buffer;
+
+        nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
+                      confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
+                      sram, taskDim, input_num_boxes, input_stride,
+                      output_stride, max_output_size, iou_threshold,
+                      confidence_threshold, offset, algo);
+        ((uint32_t *)result_num)[0] = result_box_num;
+      }; break;
+      case CNRT_FLOAT32: {
+        float *boxes_data = (float *)input_boxes;
+        float *confi_data = (float *)workspace;
+        float *out_data = (float *)output;
+        float *buffer = (float *)nram_buffer;
+        float *sram = (float *)sram_buffer;
+
+        nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
+                      confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
+                      sram, taskDim, input_num_boxes, input_stride,
+                      output_stride, max_output_size, iou_threshold,
+                      confidence_threshold, offset, algo);
+        ((uint32_t *)result_num)[0] = result_box_num;
+      }; break;
+    }
+  }
+}
+
+void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
+               const cnrtDataType_t data_type_input, const void *boxes_ptr,
+               const void *scores_ptr, const int input_num_boxes,
+               const int input_stride, const int max_output_boxes,
+               const float iou_threshold, const float offset,
+               void *workspace_ptr, void *output_size_ptr, void *output_ptr) {
+  MLUKernelNMS<<<k_dim, k_type, queue>>>(
+      boxes_ptr, scores_ptr, input_num_boxes, input_stride, max_output_boxes,
+      iou_threshold, /*confidence_threshold=*/0.0, /*output_mode=*/0,
+      /*input_layout=*/0, workspace_ptr, output_size_ptr, output_ptr,
+      data_type_input, offset, /*algo=*/1);
+}
diff --git a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp
index 4f198ac37b..15c5333712 100644
--- a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp
+++ b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp
@@ -19,6 +19,9 @@ using namespace at;
 #define CHECK_CUDA_INPUT(x) \
   CHECK_CUDA(x);            \
   CHECK_CONTIGUOUS(x)
+#define CHECK_MLU_INPUT(x) \
+  CHECK_MLU(x);            \
+  CHECK_CONTIGUOUS(x)
 #define CHECK_CPU_INPUT(x) \
   CHECK_CPU(x);            \
   CHECK_CONTIGUOUS(x)
diff --git a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp
new file mode 100644
index 0000000000..af193fce33
--- /dev/null
+++ b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp
@@ -0,0 +1,96 @@
+/*************************************************************************
+ * Copyright (C) 2021 by Cambricon.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+
+#include "pytorch_mlu_helper.hpp"
+
+void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
+               const cnrtDataType_t data_type_input, const void *boxes_ptr,
+               const void *scores_ptr, const int input_num_boxes,
+               const int input_stride, const int max_output_boxes,
+               const float iou_threshold, const float offset,
+               void *workspace_ptr, void *output_size_ptr, void *output_ptr);
+
+Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
+                            int offset) {
+  // dimension parameters check
+  TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
+              boxes.dim(), "D");
+  TORCH_CHECK(boxes.size(1) == 4,
+              "boxes should have 4 elements in dimension 1, got ",
+              boxes.size(1));
+  TORCH_CHECK(scores.dim() == 1, "scores should be a 1d tensor, got ",
+              scores.dim(), "D");
+
+  // data type check
+  TORCH_CHECK(boxes.scalar_type() == scores.scalar_type(),
+              "boxes should have the same type as scores");
+  TORCH_CHECK(
+      boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
+      "data type of boxes should be Float or Half, got ", boxes.scalar_type());
+
+  if (boxes.numel() == 0) {
+    return at::empty({0}, boxes.options().dtype(at::kLong));
+  }
+
+  int input_num_boxes = boxes.size(0);
+  int input_stride = boxes.size(1);
+  int max_output_boxes = boxes.size(0);
+  cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
+  int core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
+  uint32_t dim_x = core_dim;
+  cnrtDim3_t k_dim = {dim_x, 1, 1};
+  cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
+
+  auto output =
+      at::empty({max_output_boxes}, boxes.options().dtype(at::kLong));
+  auto output_size = at::empty({1}, scores.options().dtype(at::kInt));
+
+  // workspace
+  size_t space_size = 0;
+  if (boxes.scalar_type() == at::kHalf) {
+    space_size = input_num_boxes * sizeof(int16_t);
+  } else {
+    space_size = input_num_boxes * sizeof(float);
+  }
+  auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
+
+  // get compute queue
+  auto queue = torch_mlu::getCurQueue();
+
+  auto boxes_impl = torch_mlu::getMluTensorImpl(boxes);
+  auto boxes_ptr = boxes_impl->cnnlMalloc();
+  auto scores_impl = torch_mlu::getMluTensorImpl(scores);
+  auto scores_ptr = scores_impl->cnnlMalloc();
+  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
+  auto workspace_ptr = workspace_impl->cnnlMalloc();
+  auto output_impl = torch_mlu::getMluTensorImpl(output);
+  auto output_ptr = output_impl->cnnlMalloc();
+  auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);
+  auto output_size_ptr = output_size_impl->cnnlMalloc();
+
+  switch (k_type) {
+    default: {
+      TORCH_CHECK(false, "[nms_mlu]:Failed to choose kernel to launch");
+    }
+    case CNRT_FUNC_TYPE_BLOCK:
+    case CNRT_FUNC_TYPE_UNION1: {
+      CNLOG(INFO) << "Launch Kernel MLUUnion1 or Block NMS<<>>";
+      KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr,
+                input_num_boxes, input_stride, max_output_boxes,
+                iou_threshold, offset, workspace_ptr, output_size_ptr,
+                output_ptr);
+    }; break;
+  }
+
+  int output_num = *static_cast<int *>(output_size.cpu().data_ptr());
+  return output.slice(0, 0, output_num);
+}
diff --git a/mmcv/ops/csrc/pytorch/nms.cpp b/mmcv/ops/csrc/pytorch/nms.cpp
index e88208dc9f..8d6844e9ff 100644
--- a/mmcv/ops/csrc/pytorch/nms.cpp
+++ b/mmcv/ops/csrc/pytorch/nms.cpp
@@ -10,6 +10,15 @@ Tensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
 }
 #endif
+#ifdef MMCV_WITH_MLU
+Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
+                            int offset);
+
+Tensor nms_mlu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
+  return NMSMLUKernelLauncher(boxes, scores, iou_threshold, offset);
+}
+#endif
+
 Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
   if (boxes.numel() == 0) {
     return at::empty({0}, boxes.options().dtype(at::kLong));
   }
@@ -69,6 +78,12 @@ Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
     return nms_cuda(boxes, scores, iou_threshold, offset);
 #else
     AT_ERROR("nms is not compiled with GPU support");
+#endif
+#ifdef MMCV_WITH_MLU
+  } else if (boxes.device().type() == at::kMLU) {
+    CHECK_MLU_INPUT(boxes);
+    CHECK_MLU_INPUT(scores);
+    return nms_mlu(boxes, scores, iou_threshold, offset);
 #endif
   } else {
     CHECK_CPU_INPUT(boxes);
diff --git a/tests/test_ops/test_nms.py b/tests/test_ops/test_nms.py
index 3c59204b1b..4831f6f644 100644
--- a/tests/test_ops/test_nms.py
+++ b/tests/test_ops/test_nms.py
@@ -2,12 +2,22 @@
 import pytest
 import torch
 
+from mmcv.utils import is_cuda, is_mlu
+
 
 class Testnms(object):
 
-    def test_nms_allclose(self):
-        if not torch.cuda.is_available():
-            return
+    @pytest.mark.parametrize('device', [
+        pytest.param(
+            'cuda',
+            marks=pytest.mark.skipif(
+                not is_cuda(), reason='requires CUDA support')),
+        pytest.param(
+            'mlu',
+            marks=pytest.mark.skipif(
+                not is_mlu(), reason='requires MLU support'))
+    ])
+    def test_nms_allclose(self, device):
         from mmcv.ops import nms
         np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],
                              [3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],
@@ -23,7 +33,7 @@ def test_nms_allclose(self):
         assert np.allclose(dets, np_dets)  # test cpu
         assert np.allclose(inds, np_inds)  # test cpu
         dets, inds = nms(
-            boxes.cuda(), scores.cuda(), iou_threshold=0.3, offset=0)
+            boxes.to(device), scores.to(device), iou_threshold=0.3, offset=0)
         assert np.allclose(dets.cpu().numpy(), np_dets)  # test gpu
         assert np.allclose(inds.cpu().numpy(), np_inds)  # test gpu
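
Usage note (editor's addition, not part of the diff): the sketch below shows how the new MLU branch is reached from Python, mirroring the updated test above. It assumes an MLU-enabled MMCV build plus the `torch_mlu` runtime so tensors can be moved to the `'mlu'` device; the box and score values are illustrative only.

```python
# Minimal usage sketch (assumes an MLU-enabled MMCV build and the torch_mlu
# runtime; the 'mlu' device string follows the updated test above).
import numpy as np
import torch

from mmcv.ops import nms

np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],
                     [3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],
                    dtype=np.float32)
np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)  # illustrative

boxes = torch.from_numpy(np_boxes).to('mlu')
scores = torch.from_numpy(np_scores).to('mlu')

# Because the inputs live on an MLU device, nms() dispatches to nms_mlu(),
# which calls NMSMLUKernelLauncher() and ultimately the Bang C kernel above.
# dets is a [k, 5] tensor (x1, y1, x2, y2, score); inds are the kept indices.
dets, inds = nms(boxes, scores, iou_threshold=0.3, offset=0)
print(dets.cpu().numpy(), inds.cpu().numpy())
```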