diff --git a/modules/cudafilters/src/cuda/median_filter.cu b/modules/cudafilters/src/cuda/median_filter.cu index ed46eb4bf94..6776428ae1a 100644 --- a/modules/cudafilters/src/cuda/median_filter.cu +++ b/modules/cudafilters/src/cuda/median_filter.cu @@ -53,6 +53,17 @@ #include "opencv2/core/cuda/saturate_cast.hpp" #include "opencv2/core/cuda/border_interpolate.hpp" + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. +#include "wavelet_matrix_feature_support_checks.h" +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +#include "wavelet_matrix_multi.cuh" +#include "wavelet_matrix_2d.cuh" +#include "wavelet_matrix_float_supporter.cuh" +#endif + + namespace cv { namespace cuda { namespace device { __device__ void histogramAddAndSub8(int* H, const int * hist_colAdd,const int * hist_colSub){ @@ -334,4 +345,72 @@ namespace cv { namespace cuda { namespace device }}} + +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +namespace cv { namespace cuda { namespace device + { + using namespace wavelet_matrix_median; + + template + void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius,cudaStream_t stream){ + + constexpr bool is_float = std::is_same::value; + constexpr static int WORD_SIZE = 32; + constexpr static int ThW = (std::is_same::value ? 8 : 4); + constexpr static int ThH = (std::is_same::value ? 64 : 256); + using XYIdxT = uint32_t; + using XIdxT = uint16_t; + using WM_T = typename std::conditional::type; + using MedianResT = typename std::conditional::type; + using WM2D_IMPL = WaveletMatrix2dCu5C, 512, WORD_SIZE>; + + CV_Assert(src.cols == dst.cols); + CV_Assert(dst.step % sizeof(T) == 0); + + WM2D_IMPL WM_cuda(src.rows, src.cols, is_float, false); + WM_cuda.res_cu = reinterpret_cast(dst.ptr()); + + const size_t line_num = src.cols * CH_NUM; + if (is_float) { + WMMedianFloatSupporter::WMMedianFloatSupporter float_supporter(src.rows, src.cols); + float_supporter.alloc(); + for (int y = 0; y < src.rows; ++y) { + cudaMemcpy(float_supporter.val_in_cu + y * line_num, src.ptr(y), line_num * sizeof(T), cudaMemcpyDeviceToDevice); + } + const auto p = WM_cuda.get_nowcu_and_buf_byte_div32(); + float_supporter.sort_and_set((XYIdxT*)p.first, p.second); + WM_cuda.construct(nullptr, stream, true); + WM_cuda.template median2d(radius, dst.step / sizeof(T), (MedianResT*)float_supporter.get_res_table(), stream); + } else { + for (int y = 0; y < src.rows; ++y) { + cudaMemcpy(WM_cuda.src_cu + y * line_num, src.ptr(y), line_num * sizeof(T), cudaMemcpyDeviceToDevice); + } + WM_cuda.construct(nullptr, stream); + WM_cuda.template median2d(radius, dst.step / sizeof(T), nullptr, stream); + } + WM_cuda.res_cu = nullptr; + if (!stream) { + cudaSafeCall( cudaDeviceSynchronize() ); + } + } + + template + void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream){ + if (num_channels == 1) { + medianFiltering_wavelet_matrix_gpu<1>(src, dst, radius, stream); + } else if (num_channels == 3) { + medianFiltering_wavelet_matrix_gpu<3>(src, dst, radius, stream); + } else if (num_channels == 4) { + medianFiltering_wavelet_matrix_gpu<4>(src, dst, radius, stream); + } else { + CV_Assert(num_channels == 1 || num_channels == 3 || num_channels == 4); + } + } + + template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); + template void 
medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); + template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); +}}} +#endif // __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + #endif diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_2d.cuh b/modules/cudafilters/src/cuda/wavelet_matrix_2d.cuh new file mode 100644 index 00000000000..9c10c223d87 --- /dev/null +++ b/modules/cudafilters/src/cuda/wavelet_matrix_2d.cuh @@ -0,0 +1,1053 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_WAVELET_MATRIX_2D_CUH__ +#define __OPENCV_WAVELET_MATRIX_2D_CUH__ + + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. 
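+//
+// Rough lifecycle sketch (illustrative only, template arguments and placeholder pointer
+// names omitted/assumed), mirroring how medianFiltering_wavelet_matrix_gpu in
+// median_filter.cu drives this class for integer images; the float path additionally
+// routes values through the float supporter:
+//
+//   WaveletMatrix2dCu5C<...> wm(rows, cols, /*use_hw_bit_len=*/false, /*alloc_res=*/false);
+//   wm.res_cu = dst_device_ptr;               // results go into caller-owned device memory
+//   // ... copy the packed input rows into wm.src_cu ...
+//   wm.construct(nullptr, stream);            // build the per-bit-plane rank structures
+//   wm.median2d<...>(radius, dst_step_in_elems, /*res_table=*/nullptr, stream);
+//   wm.res_cu = nullptr;                      // detach before wm's destructor frees its buffers
+//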
+#include "wavelet_matrix_feature_support_checks.h" +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + + +#include +#include +#include "opencv2/core/cuda/warp_shuffle.hpp" + +#include "wavelet_matrix_multi.cuh" + +namespace cv { namespace cuda { namespace device +{ + +namespace wavelet_matrix_median { + using std::vector; + using namespace std; + +template +__global__ void WaveletMatrix2dCu5C_UpSweep_gpu(const SrcT mask, const uint16_t block_pair_num, const XYIdxT size_div_w, const SrcT* __restrict__ src, DstT* __restrict__ dst, BlockT* __restrict__ nbit_bp, const XYIdxT* __restrict__ nsum_buf_test, XYIdxT* __restrict__ nsum_buf_test2, const uint32_t bv_block_byte_div32, const uint32_t buf_byte_div32, const XIdxT* __restrict__ idx_p, const XIdxT inf, XIdxT* __restrict__ wm, XIdxT* __restrict__ nxt_idx, XYIdxT* __restrict__ wm_nsum_scan_buf, const XYIdxT cwm_buf_byte_div32, BlockT* __restrict__ nbit_bp_pre) { + using WordT = decltype(BlockT::nbit); + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + constexpr uint32_t WORD_DIV_WARP = WORD_SIZE / WARP_SIZE; + + static_assert(ThreadsDimY % SRC_CACHE_DIV == 0, ""); + static_assert(ThreadsDimY != SRC_CACHE_DIV, "Warning: It's not efficient."); + + const size_t buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (buf_byte_div32*32ull); + const size_t bv_block_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (bv_block_byte_div32*32ull); + const size_t cwm_buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * cwm_buf_byte_div32 * 32u; + src = (SrcT*)((uint8_t*)src + buf_byte_y_offset); + dst = (DstT*)((uint8_t*)dst + buf_byte_y_offset); + nsum_buf_test = (XYIdxT*)((uint8_t*)nsum_buf_test + buf_byte_y_offset); + nsum_buf_test2 = (XYIdxT*)((uint8_t*)nsum_buf_test2 + buf_byte_y_offset); + nbit_bp = (BlockT*)((uint8_t*)nbit_bp + bv_block_byte_y_offset); + nbit_bp_pre = (BlockT*)((uint8_t*)nbit_bp_pre + bv_block_byte_y_offset); + + idx_p = (XIdxT*)((uint8_t*)idx_p + buf_byte_y_offset); + nxt_idx = (XIdxT*)((uint8_t*)nxt_idx + buf_byte_y_offset); + if (wm != nullptr) wm = (XIdxT*)((uint8_t*)wm + cwm_buf_byte_y_offset); + wm_nsum_scan_buf = (XYIdxT*)((uint8_t*)wm_nsum_scan_buf + cwm_buf_byte_y_offset); + + + using WarpScanX = cub::WarpScan; + using WarpScanY = cub::WarpScan; + using WarpReduce = cub::WarpReduce; + using WarpReduceY = cub::WarpReduce; + + static_assert(SRCB_S < 64 * 1024, ""); + + __shared__ SrcT src_val_cache[ThreadsDimY][(WARP_SIZE/SRC_CACHE_DIV)-1][WARP_SIZE]; + __shared__ XIdxT vidx_val_cache[ThreadsDimY][(WARP_SIZE/SRC_CACHE_DIV)-1][WARP_SIZE]; + + __shared__ uint4 nsum_count_sh[ThreadsDimY]; + __shared__ XYIdxT wm_zero_count_sh[ThreadsDimY]; + __shared__ XYIdxT pre_sum_share[2]; + __shared__ XYIdxT warp_scan_sums[ThreadsDimY]; + __shared__ typename WarpScanX::TempStorage s_scanStorage; + __shared__ typename WarpScanY::TempStorage s_scanStorage2; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ typename WarpReduceY::TempStorage WarpReduceY_temp_storage; + // shmem ------ end ------ + + XYIdxT wm_zero_count = 0; + + const XYIdxT size_div_warp = size_div_w * WORD_DIV_WARP; + const XYIdxT nsum = nbit_bp[size_div_w].nsum; + const XYIdxT nsum_offset = nsum_buf_test[blockIdx.x]; + const XYIdxT nsum_pre = nbit_bp_pre[size_div_w].nsum; + + + XYIdxT 
nsum_idx0_org = nsum_offset; + XYIdxT nsum_idx1_org = (XYIdxT)blockIdx.x * block_pair_num * THREAD_PER_GRID + nsum - nsum_idx0_org; + nsum_idx0_org /= (XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE; + nsum_idx1_org /= (XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE; + const XYIdxT nsum_idx0_bound = (nsum_idx0_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + const XYIdxT nsum_idx1_bound = (nsum_idx1_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + uint4 nsum_count = make_uint4(0, 0, 0, 0); + + const unsigned short th_idx = threadIdx.y * WARP_SIZE + threadIdx.x; + if (th_idx == 0) { + pre_sum_share[0] = nsum_offset; + } + + for (XYIdxT ka = 0; ka < block_pair_num; ka += WARP_SIZE / SRC_CACHE_DIV) { + const XYIdxT ibb = ((XYIdxT)blockIdx.x * block_pair_num + ka) * ThreadsDimY; + if (ibb >= size_div_warp) break; + + WarpWT my_bits = 0; + SrcT first_val; + XIdxT first_idxval; + + for (XYIdxT kb = 0, i = ibb + WARP_SIZE / SRC_CACHE_DIV * threadIdx.y; kb < WARP_SIZE / SRC_CACHE_DIV; ++kb, ++i) { + if (i >= size_div_warp) break; + WarpWT bits; + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + const SrcT v = src[ij]; + const XIdxT wm_idxv = idx_p[ij]; + if (kb == 0) { + first_val = v; + first_idxval = wm_idxv; + } else { + src_val_cache[threadIdx.y][kb - 1][threadIdx.x] = v; + vidx_val_cache[threadIdx.y][kb - 1][threadIdx.x] = wm_idxv; + } + if (v <= mask) { + bits = __activemask(); + } else { + bits = ~__activemask(); + } + if (threadIdx.x == kb) { + my_bits = bits; + } + if (wm != nullptr) { + if (ij < nsum_pre) { + wm[ij] = wm_idxv; + if (wm_idxv * 2 <= inf) { + ++wm_zero_count; + } + } else { + wm[ij] = inf; + } + } + } + + XYIdxT c, t = 0; + if (threadIdx.y < ThreadsDimY) { + c = __popc(my_bits); + + WarpScanX(s_scanStorage).ExclusiveSum(c, t); + if (threadIdx.x == WARP_SIZE / SRC_CACHE_DIV - 1) { + warp_scan_sums[threadIdx.y] = c + t; + } + } + + __syncthreads(); + + XYIdxT pre_sum = pre_sum_share[(ka & (WARP_SIZE / SRC_CACHE_DIV)) > 0 ? 1 : 0]; + XYIdxT s = threadIdx.x < ThreadsDimY ? warp_scan_sums[threadIdx.x] : 0; + WarpScanY(s_scanStorage2).ExclusiveSum(s, s); + + s = cv::cuda::device::shfl(s, threadIdx.y, WARP_SIZE); + s += t + pre_sum; + + if (SRC_CACHE_DIV == 1 || threadIdx.x < WARP_SIZE / SRC_CACHE_DIV) { + if (th_idx == THREAD_PER_GRID - WARP_SIZE + WARP_SIZE / SRC_CACHE_DIV - 1) { + pre_sum_share[(ka & (WARP_SIZE / SRC_CACHE_DIV)) == 0 ? 
1 : 0] = s + c; + } + const XYIdxT bi = ibb + threadIdx.y * WARP_SIZE / SRC_CACHE_DIV + threadIdx.x; + if (bi < size_div_warp) { + static_assert(WORD_SIZE == 32, ""); + nbit_bp[bi] = BlockT{s, my_bits}; + } + } + + if (mask == 0) { + SrcT vo = first_val; + XIdxT idx_v = first_idxval; + for (XYIdxT j = 0, i = ibb + WARP_SIZE / SRC_CACHE_DIV * threadIdx.y; j < WARP_SIZE / SRC_CACHE_DIV; ++j, ++i) { + if (i >= size_div_warp) break; + const WarpWT e_nbit = cv::cuda::device::shfl(my_bits, j, WARP_SIZE); + const XYIdxT e_nsum = cv::cuda::device::shfl(s, j, WARP_SIZE); + XYIdxT rank = __popc(e_nbit << (WARP_SIZE - threadIdx.x)); + const XYIdxT idx0 = e_nsum + rank; + XYIdxT idx = idx0; + if (vo > mask) { // 1 + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + idx = ij + nsum - idx; + } + if (idx < size_div_warp * WARP_SIZE) { + nxt_idx[idx] = idx_v; + } + if (j == WARP_SIZE / SRC_CACHE_DIV - 1) break; + vo = src_val_cache[threadIdx.y][j][threadIdx.x]; + idx_v = vidx_val_cache[threadIdx.y][j][threadIdx.x]; + } + continue; + } + const SrcT mask_2 = mask >> 1; + SrcT vo = first_val; + XIdxT idx_v = first_idxval; + for (XYIdxT j = 0, i = ibb + WARP_SIZE / SRC_CACHE_DIV * threadIdx.y; j < WARP_SIZE / SRC_CACHE_DIV; ++j, ++i) { + if (i >= size_div_warp) break; + const WarpWT e_nbit = cv::cuda::device::shfl(my_bits, j, WARP_SIZE); + const XYIdxT e_nsum = cv::cuda::device::shfl(s, j, WARP_SIZE); + XYIdxT rank = __popc(e_nbit << (WARP_SIZE - threadIdx.x)); + const XYIdxT idx0 = e_nsum + rank; + + DstT v = (DstT)vo; + XYIdxT idx = idx0; + if (vo > mask) { // 1 + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + idx = ij + nsum - idx; + v &= mask; + } + if (idx < size_div_warp * WARP_SIZE) { + if (mask != 0) { + dst[idx] = v; + } + nxt_idx[idx] = idx_v; + } + + if (v <= mask_2) { + if (vo <= mask) { + if (idx < nsum_idx0_bound) { + nsum_count.x++; + } else { + nsum_count.y++; + } + } else { + if (idx < nsum_idx1_bound) { + nsum_count.z++; + } else { + nsum_count.w++; + } + } + } + if (j == WARP_SIZE / SRC_CACHE_DIV - 1) break; + vo = src_val_cache[threadIdx.y][j][threadIdx.x]; + idx_v = vidx_val_cache[threadIdx.y][j][threadIdx.x]; + } + } + if (blockIdx.x == gridDim.x - 1 && th_idx == 0) { + nbit_bp[size_div_warp / WORD_DIV_WARP].nsum = nsum; + } + + nsum_count.x = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.x); + nsum_count.y = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.y); + nsum_count.z = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.z); + nsum_count.w = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.w); + wm_zero_count = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(wm_zero_count); + if (threadIdx.x == 0) { + nsum_count_sh[threadIdx.y] = nsum_count; + wm_zero_count_sh[threadIdx.y] = wm_zero_count; + } + __syncthreads(); + if (threadIdx.x < ThreadsDimY) { + nsum_count = nsum_count_sh[threadIdx.x]; + nsum_count.x = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.x); + nsum_count.y = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.y); + nsum_count.z = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.z); + nsum_count.w = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.w); + wm_zero_count = WarpReduceY(WarpReduceY_temp_storage).Sum(wm_zero_count_sh[threadIdx.x]); + if (th_idx == 0) { + const XYIdxT nsum_idx0_org = nsum_idx0_bound / ((XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE); + const XYIdxT nsum_idx1_org = nsum_idx1_bound / ((XYIdxT)block_pair_num * ThreadsDimY * WARP_SIZE); + if (mask != 0) { + if 
(nsum_count.x > 0) atomicAdd(nsum_buf_test2 + nsum_idx0_org - 1, nsum_count.x); + if (nsum_count.y > 0) atomicAdd(nsum_buf_test2 + nsum_idx0_org - 0, nsum_count.y); + if (nsum_count.z > 0) atomicAdd(nsum_buf_test2 + nsum_idx1_org - 1, nsum_count.z); + if (nsum_count.w > 0) atomicAdd(nsum_buf_test2 + nsum_idx1_org - 0, nsum_count.w); + } + if (wm != nullptr) { + wm_nsum_scan_buf[blockIdx.x] = wm_zero_count; + } + } + } +} + +template +__global__ void WaveletMatrix2dCu5C_last_gpu(const uint16_t block_pair_num, const XYIdxT size_div_w, const uint32_t buf_byte_div32, const XIdxT* __restrict__ idx_p, const XIdxT inf, XIdxT* __restrict__ wm, XYIdxT* __restrict__ wm_nsum_scan_buf, const XYIdxT cwm_buf_byte_div32, BlockT* __restrict__ nbit_bp_pre, const uint32_t bv_block_byte_div32) { + using WordT = decltype(BlockT::nbit); + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + constexpr uint32_t WORD_DIV_WARP = WORD_SIZE / WARP_SIZE; + + const size_t buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (buf_byte_div32*32ull); + const size_t bv_block_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * (bv_block_byte_div32*32ull); + const size_t cwm_buf_byte_y_offset = (size_t)(CH_NUM==1?0:blockIdx.y) * cwm_buf_byte_div32 * 32u; + + idx_p = (XIdxT*)((uint8_t*)idx_p + buf_byte_y_offset); + wm = (XIdxT*)((uint8_t*)wm + cwm_buf_byte_y_offset); + wm_nsum_scan_buf = (XYIdxT*)((uint8_t*)wm_nsum_scan_buf + cwm_buf_byte_y_offset); + nbit_bp_pre = (BlockT*)((uint8_t*)nbit_bp_pre + bv_block_byte_y_offset); + const XYIdxT nsum_pre = nbit_bp_pre[size_div_w].nsum; + + using WarpReduce = cub::WarpReduce; + using WarpReduceY = cub::WarpReduce; + + __shared__ XYIdxT wm_zero_count_sh[ThreadsDimY]; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ typename WarpReduceY::TempStorage WarpReduceY_temp_storage; + // shmem ------ end ------ + + XYIdxT wm_zero_count = 0; + const XYIdxT size_div_warp = size_div_w * WORD_DIV_WARP; + const unsigned short th_idx = threadIdx.y * WARP_SIZE + threadIdx.x; + + const int block_num = block_pair_num/WARP_SIZE; + for (XYIdxT ka = 0; ka < block_num; ++ka) { + const XYIdxT ibb = ((XYIdxT)blockIdx.x * block_num + ka) * THREAD_PER_GRID + WARP_SIZE * threadIdx.y; + if (ibb >= size_div_warp) break; + + for (XYIdxT kb = 0; kb < WARP_SIZE; ++kb) { + XYIdxT i = ibb + kb; + if (i >= size_div_warp) break; + + const XYIdxT ij = i * WARP_SIZE + threadIdx.x; + + if (ij < nsum_pre) { + const XIdxT wm_idxv = idx_p[ij]; + wm[ij] = wm_idxv; + if (wm_idxv * 2 <= inf) { + ++wm_zero_count; + } + } else { + wm[ij] = inf; + } + } + } + wm_zero_count = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(wm_zero_count); + if (threadIdx.x == 0) { + wm_zero_count_sh[threadIdx.y] = wm_zero_count; + } + __syncthreads(); + if (threadIdx.x < ThreadsDimY) { + wm_zero_count = WarpReduceY(WarpReduceY_temp_storage).Sum(wm_zero_count_sh[threadIdx.x]); + if (th_idx == 0) { + wm_nsum_scan_buf[blockIdx.x] = wm_zero_count; + } + } +} + +template +__global__ void WaveletMatrix2dCu5C_ExclusiveSum(XYIdxT* __restrict__ nsum_scan_buf, XYIdxT* __restrict__ nsum_buf_test2, BlockT* __restrict__ nsum_p, const uint32_t buf_byte_div32, const uint32_t bv_block_byte_div32) { + + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage 
temp_storage; + + XYIdxT thread_data1; + XYIdxT thread_data2; + + nsum_scan_buf = (XYIdxT*)((uint8_t*)nsum_scan_buf + (size_t)blockIdx.x * (buf_byte_div32*32ull)); + nsum_buf_test2 = (XYIdxT*)((uint8_t*)nsum_buf_test2 + (size_t)blockIdx.x * (buf_byte_div32*32ull)); + + thread_data1 = nsum_scan_buf[threadIdx.x]; + BlockScan(temp_storage).ExclusiveSum(thread_data1, thread_data2); + + nsum_scan_buf[threadIdx.x] = thread_data2; + nsum_buf_test2[threadIdx.x] = 0; + + if (threadIdx.x == blockDim.x - 1) { + thread_data2 += thread_data1; + nsum_p = (BlockT*)((uint8_t*)nsum_p + (size_t)blockIdx.x * (bv_block_byte_div32*32ull)); + nsum_p->nsum = thread_data2; + } +} + + +template +__global__ void WaveletMatrix2dCu5C_first_gpu_multi(const SrcT mask, uint16_t block_pair_num, const XYIdxT size_div_warp, SrcT* __restrict__ src, SrcT* __restrict__ dst, XYIdxT* __restrict__ nsum_scan_buf, const uint32_t buf_byte_div32, XIdxT* __restrict__ buf_idx, const int W, const XYIdxT WH, const SrcT* __restrict__ src_const) { + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + + SrcT* __restrict__ dsts[CH_NUM]; + XIdxT* __restrict__ buf_idxs[CH_NUM]; + XYIdxT cs[CH_NUM]; + + __shared__ SrcT src_vbuf_org[ThreadsDimY][CH_NUM * WARP_SIZE]; + SrcT* __restrict__ src_vbuf = src_vbuf_org[threadIdx.y]; + + for (int c = 0; c < CH_NUM; ++c) { + if (CH_NUM > 1) { // constexpr + dsts[c] = (SrcT*)((uint8_t*)dst + (size_t)c * (buf_byte_div32*32ull)); + } + buf_idxs[c] = (XIdxT*)((uint8_t*)buf_idx + (size_t)c * (buf_byte_div32*32ull)); + cs[c] = 0; + } + + XYIdxT ibb = (XYIdxT)blockIdx.x * block_pair_num * ThreadsDimY; + for (XYIdxT ka = 0; ka < block_pair_num; ka += WARP_SIZE, ibb += THREAD_PER_GRID) { + for (XYIdxT kb = 0, i = ibb + WARP_SIZE * threadIdx.y; kb < WARP_SIZE; ++kb, ++i) { + if (i >= size_div_warp) break; + const XYIdxT iw = i * WARP_SIZE; + const XYIdxT idx = iw + threadIdx.x; + const XIdxT idx_v = (idx >= WH ? 0 : idx % W); + + for (int c = 0; c < CH_NUM; ++c) { + if (MOV_SRC) { // constexpr + const XYIdxT s_idx = iw * CH_NUM + threadIdx.x + c * WARP_SIZE; + src_vbuf[threadIdx.x + c * WARP_SIZE] = src[s_idx] = src_const[s_idx]; + } else { + src_vbuf[threadIdx.x + c * WARP_SIZE] = src[iw * CH_NUM + threadIdx.x + c * WARP_SIZE]; + } + } + __syncwarp(); + for (int c = 0; c < CH_NUM; ++c) { + const SrcT v = src_vbuf[threadIdx.x * CH_NUM + c]; + if (v <= mask) { + ++cs[c]; + } + buf_idxs[c][idx] = idx_v; + if constexpr(CH_NUM > 1) { + dsts[c][idx] = v; + } + } + } + } + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ XYIdxT cs_sum_sh[CH_NUM][ThreadsDimY]; + for (int c = 0; c < CH_NUM; ++c) { + cs[c] = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(cs[c]); + } + if (threadIdx.x == 0) { + for (int c = 0; c < CH_NUM; ++c) { + cs_sum_sh[c][threadIdx.y] = cs[c]; + } + } + __syncthreads(); + if (threadIdx.y != 0) return; + for (int c = 0; c < CH_NUM; ++c) { + XYIdxT cs_bsum = (threadIdx.x < ThreadsDimY ? 
cs_sum_sh[c][threadIdx.x] : 0); + cs_bsum = WarpReduce(WarpReduce_temp_storage[0]).Sum(cs_bsum); + if (threadIdx.x == 0) { + nsum_scan_buf[blockIdx.x] = cs_bsum; + nsum_scan_buf = (XYIdxT*)((uint8_t*)nsum_scan_buf + (buf_byte_div32*32ull)); + } + } + +} + + +template +__global__ void WaveletMatrix2dCu5C_first_gpu_multi_srcunpacked(const SrcT mask, uint16_t block_pair_num, const XYIdxT size_div_warp, const SrcT* __restrict__ src, XYIdxT* __restrict__ nsum_scan_buf, const uint32_t buf_byte_div32, XIdxT* __restrict__ buf_idx, const int W, const XYIdxT WH) { + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + + XYIdxT cs = 0; + + const int c = blockIdx.y; + buf_idx = buf_idx + c * (buf_byte_div32*32ull / sizeof(XIdxT)); + src = src + c * (buf_byte_div32*32ull / sizeof(SrcT)); + + + XYIdxT i = (XYIdxT)blockIdx.x * block_pair_num * ThreadsDimY + threadIdx.y; + + XIdxT x_idx = (i * WARP_SIZE + threadIdx.x) % W; + const XIdxT x_diff = THREAD_PER_GRID % W; + + for (XYIdxT k = 0; k < block_pair_num; ++k, i += ThreadsDimY) { + if (i >= size_div_warp) break; + const XYIdxT idx = i * WARP_SIZE + threadIdx.x; + + if (idx >= WH) x_idx = 0; + + const SrcT v = src[idx]; + if (v <= mask) { + ++cs; + } + buf_idx[idx] = x_idx; + + x_idx += x_diff; + if (x_idx >= W) x_idx -= W; + } + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ XYIdxT cs_sum_sh[ThreadsDimY]; + cs = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(cs); + if (threadIdx.x == 0) { + cs_sum_sh[threadIdx.y] = cs; + } + __syncthreads(); + if (threadIdx.y != 0) return; + XYIdxT cs_bsum = (threadIdx.x < ThreadsDimY ? cs_sum_sh[threadIdx.x] : 0); + cs_bsum = WarpReduce(WarpReduce_temp_storage[0]).Sum(cs_bsum); + if (threadIdx.x == 0) { + nsum_scan_buf += (buf_byte_div32*32ull / sizeof(XYIdxT)) * (blockIdx.y); + nsum_scan_buf[blockIdx.x] = cs_bsum; + } +} + +template +__device__ +inline IdxType WaveletMatrix2dCu5C_median2d_rank0(const IdxType i, const BlockT* __restrict__ nbit_bp) { + using WordT = decltype(BlockT::nbit); + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + + const IdxType bi = i / WORD_SIZE; + + const int ai = i % WORD_SIZE; + const BlockT block = nbit_bp[bi]; + if constexpr(WORD_SIZE == 32) { + return block.nsum + __popc(block.nbit & ((1u << ai) - 1)); + } + if constexpr(WORD_SIZE == 64) { + return block.nsum + __popcll(block.nbit & ((1ull << ai) - 1ull)); + } +} + + +template +__global__ void WaveletMatrix2dCu5C_median2d_cu( + const int H, const int W, const int res_step_num, const int r, ResT* __restrict__ res_cu, const BlockT* __restrict__ wm_nbit_bp, const uint32_t nsum_pos, + const uint32_t bv_block_h_byte_div32, const uint32_t bv_block_len, + const BlockT* __restrict__ bv_nbit_bp, const uint8_t w_bit_len, const uint8_t val_bit_len, + const ResTableT* __restrict__ res_table +) { + + const int y = blockIdx.y * THREADS_NUM_H + threadIdx.y; + if (y >= H) return; + const int x = blockIdx.x * THREADS_NUM_W + threadIdx.x; + if (x >= W) return; + + if (CH_NUM >= 2) { // constexpr + bv_nbit_bp = (BlockT*)((uint8_t*)bv_nbit_bp + bv_block_h_byte_div32 * 32ull * blockIdx.z * (VAL_BIT_LEN >= 0 ? 
VAL_BIT_LEN : val_bit_len)); // TODO + wm_nbit_bp = (BlockT*)((uint8_t*)wm_nbit_bp + bv_block_h_byte_div32 * 32ull * blockIdx.z * w_bit_len); + } + + XYIdxT ya, yb, k; + XIdxT xa, xb; + if (CUT_BORDER) { // constexpr + ya = y; + xa = x; + yb = y + r * 2 + 1; + xb = x + r * 2 + 1; + k = (r * 2 + 1) * (r * 2 + 1) / 2; + } else { + ya = (y < r ? 0 : y - r); + xa = (x < r ? 0 : x - r); + yb = y + r + 1; if (yb > H) yb = H; + xb = x + r + 1; if (xb > W) xb = W; + k = XYIdxT(yb - ya) * (xb - xa) / 2; + } + ValT res = 0; + ya *= (CUT_BORDER ? W + 2 * r : W); + yb *= (CUT_BORDER ? W + 2 * r : W); + + for (int8_t h = (VAL_BIT_LEN >= 0 ? VAL_BIT_LEN : val_bit_len); h--; ) { + const XYIdxT top0 = WaveletMatrix2dCu5C_median2d_rank0(ya, bv_nbit_bp); + const XYIdxT bot0 = WaveletMatrix2dCu5C_median2d_rank0(yb, bv_nbit_bp); + XYIdxT l_ya_xa = top0; + XYIdxT l_yb_xa = bot0; + XYIdxT l_ya_xb = top0; + XYIdxT l_yb_xb = bot0; + XYIdxT d = 0; + for (int8_t j = w_bit_len; j--; ) { + const XYIdxT zeros = wm_nbit_bp[nsum_pos].nsum; + const XYIdxT l_ya_xa_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_ya_xa, wm_nbit_bp); + const XYIdxT l_ya_xb_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_ya_xb, wm_nbit_bp); + const XYIdxT l_yb_xb_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_yb_xb, wm_nbit_bp); + const XYIdxT l_yb_xa_rank0 = WaveletMatrix2dCu5C_median2d_rank0(l_yb_xa, wm_nbit_bp); + + if (((xa >> j) & 1) == 0) { + l_ya_xa = l_ya_xa_rank0; + l_yb_xa = l_yb_xa_rank0; + } else { + d += l_ya_xa_rank0; l_ya_xa += zeros - l_ya_xa_rank0; + d -= l_yb_xa_rank0; l_yb_xa += zeros - l_yb_xa_rank0; + } + if (((xb >> j) & 1) == 0) { + l_ya_xb = l_ya_xb_rank0; + l_yb_xb = l_yb_xb_rank0; + } else { + d -= l_ya_xb_rank0; l_ya_xb += zeros - l_ya_xb_rank0; + d += l_yb_xb_rank0; l_yb_xb += zeros - l_yb_xb_rank0; + } + wm_nbit_bp = (BlockT*)((uint8_t*)wm_nbit_bp - bv_block_h_byte_div32 * 32ull); + } + if (CH_NUM >= 2) { + wm_nbit_bp = (BlockT*)((uint8_t*)wm_nbit_bp - bv_block_h_byte_div32 * 32ull * w_bit_len * (CH_NUM - 1)); + } + const XYIdxT bv_h_zeros = bv_nbit_bp[nsum_pos].nsum; + if (k < d) { + ya = top0; + yb = bot0; + } else { + k -= d; + res |= (ValT)1 << h; + ya += bv_h_zeros - top0; + yb += bv_h_zeros - bot0; + } + bv_nbit_bp = (BlockT*)((uint8_t*)bv_nbit_bp - bv_block_h_byte_div32 * 32ull); + } + + + + + if constexpr(is_same::value) { + res_cu[(XYIdxT)y * res_step_num + x * CH_NUM + blockIdx.z] = res; + } else if (CH_NUM == 1){ + res_cu[(XYIdxT)y * res_step_num + x * CH_NUM] = res_table[res]; + } else { + const size_t offset = size_t(CUT_BORDER ? W + 2 * r : W) * (CUT_BORDER ? 
H + 2 * r : H) * blockIdx.z; + res_cu[(XYIdxT)y * res_step_num + x * CH_NUM + blockIdx.z] = res_table[res + offset]; + } +} + + + + +template +struct WaveletMatrix2dCu5C { + static_assert(is_same() || is_same() || is_same(), "Supports 32, 16, or 8 bits only"); + static constexpr int MAX_BIT_LEN = 8 * sizeof(ValT); + + static constexpr uint32_t WSIZE = WORD_SIZE; + static constexpr int WARP_SIZE = 32; + using T_Type = ValT; + static constexpr int THREAD_PER_GRID = TH_NUM; + static constexpr int SRC_CACHE_DIV = 2; + static constexpr int MAX_BLOCK_X = MultiWaveletMatrixImpl::MAX_BLOCK_X; + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, "WORD_SIZE must be 32 or 64"); + using WordT = typename std::conditional::type; + + static_assert(MAX_BLOCK_X <= 1024, ""); + static_assert(TH_NUM == 1024 || TH_NUM == 512 || TH_NUM == 256 || TH_NUM == 128 || TH_NUM == 64 || TH_NUM == 32, ""); + static_assert(THREAD_PER_GRID == MultiWaveletMatrixImpl::THREAD_PER_GRID, ""); + + using BlockT = typename MultiWaveletMatrixImpl::BlockT; + using WarpWT = uint32_t; + using XIdxT = uint16_t; + using YIdxT = uint16_t; + using XYIdxT = uint32_t; + static constexpr int BLOCK_TYPE = 2; + using MultiWaveletMatrixImplClass = MultiWaveletMatrixImpl; + static_assert(is_same::value, ""); + static_assert(8 * sizeof(WarpWT) == WARP_SIZE, ""); + + int H, W; + XYIdxT size = 0; + MultiWaveletMatrixImpl WM; + XYIdxT bv_zeros[MAX_BIT_LEN]; + + int w_bit_len = 0; + int val_bit_len = 0; + static constexpr int wm_num = CH_NUM; + +private: + uint8_t* bv_block_nbit_and_nsum_base_cu = nullptr; // GPU mem + uint32_t bv_block_byte_div32; + uint32_t buf_byte_div32; + uint32_t nsum_scan_buf_len; + size_t input_buf_byte; +public: + ValT* src_cu = nullptr; // GPU mem + ValT* res_cu = nullptr; + size_t bv_block_len = 0; + size_t bv_chunk_len = 0; + +#if _MSC_VER >= 1920 || __INTEL_COMPILER + inline static int bitCount64(uint64_t bits) { + return (int)_mm_popcnt_u64(bits); + } +#else + inline static int bitCount64(uint64_t bits) { + return __builtin_popcountll(bits); + } +#endif + static constexpr int get_bit_len(uint64_t val) { + return ( + (val |= val >> 1), + (val |= val >> 2), + (val |= val >> 4), + (val |= val >> 8), + (val |= val >> 16), + (val |= val >> 32), + bitCount64(val)); + // val |= val >> 1; + // val |= val >> 2; + // val |= val >> 4; + // val |= val >> 8; + // val |= val >> 16; + // val |= val >> 32; + // return bitCount64(val); + } + + WaveletMatrix2dCu5C() { + reset(0, 0); + } + WaveletMatrix2dCu5C(const int rows, const int cols, const bool use_hw_bit_len = false, const bool alloc_res = true) { + reset(rows, cols, use_hw_bit_len, alloc_res); + } + + void reset(const int rows, const int cols, const bool use_hw_bit_len = false, const bool alloc_res = true) { + H = rows; + W = cols; + if (rows == 0 || cols == 0) return; + val_bit_len = (use_hw_bit_len ? get_bit_len((uint64_t)H * W - 1) : MAX_BIT_LEN); + assert(size == 0 && src_cu == nullptr); + + size = div_ceil((uint64_t)H * W, WORD_SIZE) * WORD_SIZE; + assert(W < 65535); // That is, less than 65534. 
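+        // Added note (worked example): get_bit_len() returns the number of bits needed to
+        // represent its argument, e.g. get_bit_len(7) == 3 and get_bit_len(8) == 4. With the
+        // assert above, W is at most 65534, so x coordinates always fit in the 16-bit XIdxT.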
+ w_bit_len = get_bit_len(W); // w=7 [0,6] bl=3; w=8 [0,7] bl=4 + WM.reset(size, w_bit_len, val_bit_len * wm_num); + if (val_bit_len == 0) return; + + + bv_block_len = div_ceil(size, THREAD_PER_GRID) * THREAD_PER_GRID / WORD_SIZE + 1; + bv_block_len = div_ceil(bv_block_len, 8*2) * 8*2; + const size_t bv_block_byte = (sizeof(BlockT)) * val_bit_len * bv_block_len; + bv_block_byte_div32 = div_ceil(bv_block_byte, 32); + + cudaMalloc(&bv_block_nbit_and_nsum_base_cu, (size_t)(bv_block_byte_div32*32ull) * CH_NUM); + if (bv_block_nbit_and_nsum_base_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d\n", __FILE__, __LINE__); release(); return; } + + const uint16_t block_pair_num = get_block_pair_num(); + nsum_scan_buf_len = div_ceil(size, (size_t)THREAD_PER_GRID * block_pair_num); + nsum_scan_buf_len = div_ceil(nsum_scan_buf_len, 4) * 4; + + const size_t buf_byte = + sizeof(XYIdxT) * 2 * nsum_scan_buf_len + + sizeof(XIdxT) * 2 * size + + sizeof(ValT) * (CH_NUM == 1 ? 1 : 2) * size; + buf_byte_div32 = div_ceil(buf_byte, 32); + + + input_buf_byte = sizeof(ValT) * size * CH_NUM; + cudaMalloc(&src_cu, (size_t)(buf_byte_div32*32ull) * CH_NUM + input_buf_byte); + if (src_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d\n", __FILE__, __LINE__); release(); return; } + + if (alloc_res) { + cudaMalloc(&res_cu, sizeof(ValT) * size * CH_NUM); + if (res_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d\n", __FILE__, __LINE__); release(); return; } + } + } + void release() { + size = 0; + if (src_cu != nullptr) cudaFree(src_cu); + if (bv_block_nbit_and_nsum_base_cu != nullptr) cudaFree(bv_block_nbit_and_nsum_base_cu); + if (res_cu != nullptr) cudaFree(res_cu); + src_cu = nullptr; + bv_block_nbit_and_nsum_base_cu = nullptr; + res_cu = nullptr; + } + ~WaveletMatrix2dCu5C() { release(); } + + BlockT* get_bv_block_cu(int h) const { return (BlockT*)(bv_block_nbit_and_nsum_base_cu + (bv_block_len * (sizeof(BlockT))) * h); } + + BlockT* get_bv_block_cu(int h, int c) const { return (BlockT*)((uint8_t*)get_bv_block_cu(h) + (size_t)c * (bv_block_byte_div32*32ull)); } + + + uint16_t get_block_pair_num() const { + return WM.get_block_pair_num() * MultiWaveletMatrixImpl::THREAD_PER_GRID / THREAD_PER_GRID; + } + std::pair get_nowcu_and_buf_byte_div32() { + ValT *now_cu = src_cu + (CH_NUM == 1 ? 0ull : size * (size_t)CH_NUM); + return make_pair(now_cu, buf_byte_div32); + } + + // Set data in src_cu before calling (data will be destroyed). Or set src_cu_const. 
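+    //
+    // Input modes as exercised by median_filter.cu (sketch of the contract, not exhaustive):
+    //   - integer images: pack the pixels into src_cu, then call construct(nullptr, stream);
+    //   - float images:   WMMedianFloatSupporter::sort_and_set() first writes per-pixel ranks
+    //                     into the working buffer, then construct(nullptr, stream, /*src_unpacked=*/true);
+    //   - alternatively, pass the input via src_cu_const (it is copied into the working
+    //     buffers by the first kernel); src_cu_const combined with src_unpacked is rejected.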
+ void construct(const ValT *src_cu_const = nullptr, const cudaStream_t main_stream = 0, const bool src_unpacked = false) { + assert(size > 0 && src_cu != nullptr); + if (val_bit_len == 0) return; + if (src_cu == nullptr) { printf("Build Error: memory not alloced."); return;} + + const XIdxT inf = ((XIdxT)1u << w_bit_len) - 1; + assert(W <= inf); + assert(size % WORD_SIZE == 0); + + ValT mask = ((ValT)1u << val_bit_len) - 1; + + const uint16_t block_pair_num = get_block_pair_num(); + const int grid_x = div_ceil(size, THREAD_PER_GRID * block_pair_num); + if (grid_x > MAX_BLOCK_X) { printf("over grid_x %d\n", grid_x); exit(1); } + + const dim3 grid(grid_x, wm_num); + const dim3 thread(WARP_SIZE, THREAD_PER_GRID / WARP_SIZE); + const XYIdxT size_div_w = size / WORD_SIZE; + const XYIdxT size_div_warp = size / WARP_SIZE; + assert(size % WARP_SIZE == 0); + constexpr int ThreadsDimY = THREAD_PER_GRID / WARP_SIZE; + + +#define CALC_SRCB_SIZE(SrcT) (0) + constexpr int SRCB_S_8 = CALC_SRCB_SIZE(uint8_t); + constexpr int SRCB_S_16 = CALC_SRCB_SIZE(uint16_t); + constexpr int SRCB_S_32 = CALC_SRCB_SIZE(uint32_t); +#undef CALC_SRCB_SIZE + { using SrcT = uint8_t; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_8); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_16); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint16_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_16); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint16_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_32); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint32_t; + cudaFuncSetAttribute(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_32); + } + + { using SrcT = uint8_t; using DstT = uint8_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint8_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (!is_same::value) { using SrcT = uint16_t; using DstT = uint16_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint16_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } if (is_same::value) { using SrcT = uint32_t; using DstT = uint32_t; + cudaFuncSetCacheConfig(&WaveletMatrix2dCu5C_UpSweep_gpu, cudaFuncCachePreferShared); + } + + const uint32_t nsum_pos = get_nsum_pos(); + + ValT *now_cu = src_cu + (CH_NUM == 1 ? 0ull : size * (size_t)CH_NUM); + ValT *nxt_cu = now_cu + size; + XYIdxT *nsum_buf_test = (XYIdxT*)(nxt_cu + size); + XYIdxT *nsum_buf_test2 = nsum_buf_test + nsum_scan_buf_len; + XIdxT *buf_idx = (XIdxT*)(nsum_buf_test2 + nsum_scan_buf_len); + XIdxT *nxt_idx = (XIdxT*)(buf_idx + size); + + + const int val_bit_len_m1 = val_bit_len - 1; + int h = val_bit_len_m1; + if (src_unpacked == true) { + if (src_cu_const != nullptr) { + printf("[Error!] not support. 
%s:%d\n", __FILE__, __LINE__); + exit(-1); + } + WaveletMatrix2dCu5C_first_gpu_multi_srcunpacked <<>> (ValT(mask / 2),block_pair_num, size_div_warp, now_cu, nsum_buf_test, buf_byte_div32, buf_idx, W, (XYIdxT)W * H); + } else if (src_cu_const == nullptr) { + WaveletMatrix2dCu5C_first_gpu_multi <<>> (ValT(mask / 2),block_pair_num, size_div_warp, src_cu, now_cu, nsum_buf_test, buf_byte_div32, buf_idx, W, (XYIdxT)W * H, src_cu_const); + } else { + WaveletMatrix2dCu5C_first_gpu_multi <<>> (ValT(mask / 2),block_pair_num, size_div_warp, src_cu, now_cu, nsum_buf_test, buf_byte_div32, buf_idx, W, (XYIdxT)W * H, src_cu_const); + } + BlockT * nsum_p = get_bv_block_cu(h) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test, nsum_buf_test2, nsum_p, buf_byte_div32, bv_block_byte_div32); + + const XYIdxT cwm_buf_byte_div32 = WM.get_buf_byte_div32(); + + if constexpr (sizeof(ValT) >= 4) for (; h > 16; --h) { + using SrcT = uint32_t; + using DstT = uint32_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + } + if constexpr (sizeof(ValT) >= 4) if (h == 16 || (is_same::value && h >= 0)) do { + using SrcT = uint32_t; + using DstT = uint16_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + --h; + } while(0); + if constexpr (sizeof(ValT) >= 2) for (; h > 8; --h) { + using SrcT = uint16_t; + using DstT = uint16_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? 
nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + } + if constexpr (sizeof(ValT) >= 2) if (h == 8 || (is_same::value && h >= 0)) do { + using SrcT = uint16_t; + using DstT = uint8_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + --h; + } while(0); + for (; h >= 0; --h) { + using SrcT = uint8_t; + using DstT = uint8_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + const int hp1 = std::min(val_bit_len - 1, h+1); + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(hp1 * CH_NUM); + XIdxT* cwm = (h + 1 == val_bit_len ? 
nullptr : WM.get_src_p(hp1 * CH_NUM)); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(hp1); + WaveletMatrix2dCu5C_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_buf_test, nsum_buf_test2, bv_block_byte_div32, buf_byte_div32, buf_idx, inf, cwm, nxt_idx, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h); + + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrix2dCu5C_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_buf_test2, nsum_buf_test, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_buf_test, nsum_buf_test2); + swap(now_cu, nxt_cu); + swap(buf_idx, nxt_idx); + } + { + const int h = 0; + XYIdxT* WM_nsum_buf = WM.get_nsum_buf(h * CH_NUM); + XIdxT* cwm = WM.get_src_p(h * CH_NUM); + BlockT* bv_block_nbit_pre_cu_h = get_bv_block_cu(h); + + WaveletMatrix2dCu5C_last_gpu <<>> (block_pair_num, size_div_w, buf_byte_div32, nxt_idx, inf, cwm, WM_nsum_buf, cwm_buf_byte_div32, bv_block_nbit_pre_cu_h, bv_block_byte_div32); + } + WM.construct(main_stream, false); + } + + XYIdxT get_nsum_pos() const { + const XYIdxT size_div_w = size / WORD_SIZE; + return size_div_w; + } + + template + void median2d(const int r, const ResTableT* res_table = nullptr) { + median2d(r, -1, res_table); + } + + template + void median2d(const int r, int res_step_num = -1, const ResTableT* res_table = nullptr, const cudaStream_t main_stream = 0) { + if (bv_block_nbit_and_nsum_base_cu == nullptr) { printf("Median2d Error: memory not alloced."); return;} + if (is_same::value == false && res_table == nullptr) {printf("Median2d Error: res_table is null."); return;} + static_assert(is_same::value || (sizeof(ResTableT) <= sizeof(ValT)), ""); + + static_assert(TH_W * TH_H <= 1024, "max number of threads in block"); + + if (res_step_num < 0) res_step_num = W * CH_NUM; + + constexpr int THREADS_NUM_W = TH_W; + const dim3 thread(THREADS_NUM_W, TH_H); + const dim3 grid(div_ceil((CUT_BORDER ? W - 2 * r: W), THREADS_NUM_W), div_ceil((CUT_BORDER ? H - 2 * r : H), TH_H), CH_NUM); + + + const uint32_t bv_nsum_pos = get_nsum_pos(); + const BlockT* bv_bv_block_nbit_cu_first = get_bv_block_cu(val_bit_len - 1); + + const BlockT* wm_bv_block_nbit_cu_first = WM.get_bv_block_cu(w_bit_len - 1, (val_bit_len - 1) * CH_NUM); // + const uint32_t nsum_pos = WM.get_nsum_pos(); + const uint64_t wm_bv_block_byte = WM.get_bv_block_byte(); + + if (bv_nsum_pos != nsum_pos) { printf("err! line %d", __LINE__); exit(-1); } + if (WM.get_bv_block_byte() != WM.get_bv_block_h_byte_div32() * 32ull * w_bit_len) { printf("err! line %d", __LINE__); exit(-1); } + + if (bv_block_len != WM.bv_block_len) {printf("bv_block_len error!\n"); exit(1);} + + using ResT = typename std::conditional::value, ValT, ResTableT>::type; + + const int Wc = (CUT_BORDER ? W - 2 * r : W); + const int Hc = (CUT_BORDER ? H - 2 * r : H); + + constexpr int VAL_BIT_LEN = (sizeof(ValT) < 4) ? 
MAX_BIT_LEN : -1; + WaveletMatrix2dCu5C_median2d_cu <<>> + (Hc, Wc, res_step_num, r, (ResT*)res_cu, wm_bv_block_nbit_cu_first, nsum_pos, WM.get_bv_block_h_byte_div32(), bv_block_len, + bv_bv_block_nbit_cu_first, w_bit_len, val_bit_len, res_table); + } + + template + vector> get_res() { + static_assert(sizeof(ResTableT) <= sizeof(ValT), ""); + auto res = vector>(H, vector(W)); + if (res_cu == nullptr) { printf("get_res Error: memory not alloced."); return res;} + + for (int i = 0; i < H; ++i) { + cudaMemcpy(res[i].data(), res_cu + (XYIdxT)W * i, W * sizeof(ResTableT), cudaMemcpyDeviceToHost); + } + return res; + } +}; + +} // end namespace wavelet_median +}}} //end namespace cv::cuda::device + +#endif +#endif // __OPENCV_WAVELET_MATRIX_2D_CUH__ diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_feature_support_checks.h b/modules/cudafilters/src/cuda/wavelet_matrix_feature_support_checks.h new file mode 100644 index 00000000000..6c47cc5e1a4 --- /dev/null +++ b/modules/cudafilters/src/cuda/wavelet_matrix_feature_support_checks.h @@ -0,0 +1,82 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_WAVELET_MATRIX_FEATURE_SUPPORT_CHECKS_H__ +#define __OPENCV_WAVELET_MATRIX_FEATURE_SUPPORT_CHECKS_H__ + +#ifdef HAVE_CUDA +#include +#include +#endif // HAVE_CUDA + + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. 
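+//
+// Intended usage sketch (this mirrors how median_filter.cu consumes this header):
+//
+//   #include "wavelet_matrix_feature_support_checks.h"
+//   #ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+//       // wavelet-matrix median filter code path (requires CUB and `if constexpr`)
+//   #else
+//       // only the pre-existing histogram-based median filter gets compiled
+//   #endif
+//
+// The macro is defined below only when CUDA >= 11 and a compiler known to support
+// `if constexpr` are both detected.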
+#if CUDA_VERSION >= 11000 || CUDART_VERSION >= 11000 + + +// Check `if constexpr` is available. + +// GCC has been supported since 7.1 +#if defined(__GNUC__) && (__GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ >= 1)) +#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +#endif + +// clang has been supported since 5.0 +#if defined(__clang__) && (__clang_major__ >= 5) +#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +#endif + + +// Visual Studio has been supported since Visual Studio 2019 (16.1.2) +#if defined(_MSC_VER) && _MSC_VER >= 1921 +#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +#endif + + +// I confirmed that it works with Intel C++ Compiler 2021.1.2. It did not work with icc 19.0.1. +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER > 202101 || (__INTEL_COMPILER == 202101 && __INTEL_COMPILER_UPDATE >= 2)) +#define __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ +#endif + +#endif // CUDA_VERSION +#endif // __OPENCV_WAVELET_MATRIX_FEATURE_SUPPORT_CHECKS_H__ diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_float_supporter.cuh b/modules/cudafilters/src/cuda/wavelet_matrix_float_supporter.cuh new file mode 100644 index 00000000000..3a21a3c7082 --- /dev/null +++ b/modules/cudafilters/src/cuda/wavelet_matrix_float_supporter.cuh @@ -0,0 +1,227 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_WAVELET_MATRIX_FLOAT_SUPPORTER_CUH__ +#define __OPENCV_WAVELET_MATRIX_FLOAT_SUPPORTER_CUH__ + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. +#include "wavelet_matrix_feature_support_checks.h" +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + + +namespace cv { namespace cuda { namespace device +{ + +namespace wavelet_matrix_median { +namespace WMMedianFloatSupporter { + +template +__global__ void iota_idx1(IdxT *idx_in_cu, const IdxT hw) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + idx_in_cu[i] = i; +} + +template +__global__ void split_and_iota_idx(IdxT *idx_in_cu, const ValT* val_in_cu, ValT* val_out_cu, const IdxT hw) { + const size_t i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + + static_assert(is_same::value, ""); + // static_assert(2 <= CH_NUM && CH_NUM <= 4); + using SrcTU = std::conditional_t>; + + const SrcTU *src_u = (SrcTU*)val_in_cu; + const SrcTU src_uv = src_u[i]; + + if (CH_NUM >= 1) { // constexpr + val_out_cu[i] = src_uv.x; + idx_in_cu[i] = i; + } + if constexpr (CH_NUM >= 2) { + val_out_cu += hw; idx_in_cu += hw; + val_out_cu[i] = src_uv.y; + idx_in_cu[i] = i; + } + if constexpr (CH_NUM >= 3) { + val_out_cu += hw; idx_in_cu += hw; + val_out_cu[i] = src_uv.z; + idx_in_cu[i] = i; + } + if constexpr (CH_NUM >= 4) { + val_out_cu += hw; idx_in_cu += hw; + val_out_cu[i] = src_uv.w; + idx_in_cu[i] = i; + } +} + +template +__global__ void set_wm_val_1(IdxT *wm_src_p, const IdxT *idx_out_cu, const IdxT hw) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + const IdxT j = idx_out_cu[i]; + wm_src_p[j] = i; +} + +template +__global__ void set_wm_val(IdxT *wm_src_p, const IdxT *idx_out_cu, const IdxT hw, const IdxT buf_byte_div32) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + const size_t hwc = size_t(hw) * blockIdx.y; + const IdxT j = idx_out_cu[i + hwc]; + const size_t src_offset = buf_byte_div32 * 32ull / sizeof(IdxT) * blockIdx.y; + wm_src_p[src_offset + j] = i; +} + +template +__global__ void conv_res_cu(ValT *dst, const ValT *val_out_cu, const IdxT *res_cu, const IdxT hw) { + const IdxT i = blockIdx.x * blockDim + threadIdx.x; + if (i >= hw) return; + + const IdxT r = res_cu[i]; + dst[i] = val_out_cu[r]; +} + +template +struct WMMedianFloatSupporter { + constexpr static int blockDim = 512; + int h = 0, w = 0; + int hw_bit_len = -1; + WMMedianFloatSupporter(){}; + WMMedianFloatSupporter(int h, int w) { reset(h, w); } + ~WMMedianFloatSupporter(){ + free(); + } + ValT *val_in_cu = nullptr; + IdxT *idx_in_cu = nullptr; + ValT *val_out_cu = nullptr; + IdxT *idx_out_cu = nullptr; + void *cub_temp_storage = nullptr; + size_t cub_temp_storage_bytes; + // set: val_in + // get: val_out + // 1ch + // [val_in][idx_in][val_out][idx_out][cub_temp] + // C ch + + // [val_in][......][......][......] 
+ // AaBbCcDd + // [^^^^^^][......][valin2][idxin2] + // ABCDabcd01230123 + + // [val0in][val1in][idx0in][idx1in][val0out][val1out][idx0out][idx1out][cub_temp][d_offsets] + + void reset(const int H, const int W) { + h = H; w = W; + free(); + } + void alloc(){ + const size_t hwc = size_t(CH_NUM) * h * w; + if (CH_NUM == 1) { // constexpr + cub::DeviceRadixSort::SortPairs( + nullptr, cub_temp_storage_bytes, val_in_cu, val_out_cu, idx_in_cu, idx_out_cu, hwc); + cudaMalloc(&val_in_cu, 2ull * hwc * (sizeof(ValT) + sizeof(IdxT)) + cub_temp_storage_bytes); + } else { + cub::DeviceSegmentedRadixSort::SortPairs( + nullptr, cub_temp_storage_bytes, val_in_cu, val_out_cu, idx_in_cu, idx_out_cu, hwc, CH_NUM, (int*)nullptr, (int*)nullptr); + const size_t offsets_arr_size = (CH_NUM + 1) * sizeof(int); + cudaMalloc(&val_in_cu, 2ull * hwc * (sizeof(ValT) + sizeof(IdxT)) + cub_temp_storage_bytes + offsets_arr_size); + } + idx_in_cu = (IdxT*)(val_in_cu + hwc); + val_out_cu = (ValT*)(idx_in_cu + hwc); + idx_out_cu = (IdxT*)(val_out_cu + hwc); + int *d_offsets = (int*)(idx_out_cu + hwc); + cub_temp_storage = d_offsets + (CH_NUM + 1); + } + void free() { + if (val_in_cu != nullptr) { + cudaFree(val_in_cu); + } + } + void sort_and_set(IdxT *wm_src_p, const IdxT buf_byte_div32 = 0){ + const IdxT hw = h * w; + const size_t hwc = size_t(CH_NUM) * hw; + const dim3 gridDim((hw + blockDim - 1) / blockDim, CH_NUM); + + if (CH_NUM == 1) { // constexpr + iota_idx1<<>>(idx_in_cu, hw); + cub::DeviceRadixSort::SortPairs( + cub_temp_storage, cub_temp_storage_bytes, val_in_cu, val_out_cu, idx_in_cu, idx_out_cu, hw); + set_wm_val_1<<>>(wm_src_p, idx_out_cu, hw); + } else { + auto idx2 = idx_out_cu; + auto val2 = val_out_cu; + auto idx3 = idx_in_cu; + auto val3 = val_in_cu; + split_and_iota_idx<<>>(idx2, val_in_cu, val2, hw); + + int h_offsets[CH_NUM + 1]; + for (size_t i = 0; i <= CH_NUM; ++i) h_offsets[i] = i * hw; + int *d_offsets = (int*)(idx_out_cu + hwc); + cudaMemcpy(d_offsets, h_offsets, (CH_NUM + 1) * sizeof(int), cudaMemcpyHostToDevice); + + + cub::DeviceSegmentedRadixSort::SortPairs( + cub_temp_storage, cub_temp_storage_bytes, val2, val3, idx2, idx3, hwc, CH_NUM, d_offsets, d_offsets + 1); + set_wm_val<<>>(wm_src_p, idx3, hw, buf_byte_div32); + } + for(hw_bit_len = 1; ; ++hw_bit_len) { + if ((1ull << hw_bit_len) >= hw) { + break; + } + } + } + const ValT* get_res_table() const { + if (CH_NUM == 1) { // constexpr + return val_out_cu; + } else { + return val_in_cu; + } + } +}; +} // end namespace WMMedianFloatSupporter +} // end namespace wavelet_matrix_median + +}}} //end namespace cv::cuda::device +#endif +#endif // __OPENCV_WAVELET_MATRIX_FLOAT_SUPPORTER_CUH__ diff --git a/modules/cudafilters/src/cuda/wavelet_matrix_multi.cuh b/modules/cudafilters/src/cuda/wavelet_matrix_multi.cuh new file mode 100644 index 00000000000..de59d75993a --- /dev/null +++ b/modules/cudafilters/src/cuda/wavelet_matrix_multi.cuh @@ -0,0 +1,636 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 
+// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_WAVELET_MATRIX_MULTI_CUH__ +#define __OPENCV_WAVELET_MATRIX_MULTI_CUH__ + +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. 
+#include "wavelet_matrix_feature_support_checks.h" +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + + +#include +#include +#include +#include +#include +#include "opencv2/core/cuda/warp_shuffle.hpp" + + +#include + +namespace cv { namespace cuda { namespace device +{ + +namespace wavelet_matrix_median { + using std::vector; + using namespace std; + + template + constexpr T div_ceil(T a, T b) { + return (a + b - 1) / b; + } + + +template +__global__ void WaveletMatrixMultiCu4G_UpSweep_gpu(const SrcT mask, const uint16_t block_pair_num, const IdxType size_div_w, const SrcT* __restrict__ src, DstT* __restrict__ dst, BlockT* __restrict__ nbit_bp, const IdxType* __restrict__ nsum_zeros_buf, IdxType* __restrict__ nsum_zeros_buf2, const uint32_t bv_block_byte_div32, const uint32_t buf_byte_div32) { + using WordT = decltype(BlockT::nbit); + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + constexpr int WORD_SIZE = 8 * sizeof(WordT); + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, ""); + constexpr uint32_t WORD_DIV_WARP = WORD_SIZE / WARP_SIZE; + + src = (SrcT*)((uint8_t*)src + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + dst = (DstT*)((uint8_t*)dst + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nsum_zeros_buf = (IdxType*)((uint8_t*)nsum_zeros_buf + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nsum_zeros_buf2 = (IdxType*)((uint8_t*)nsum_zeros_buf2 + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nbit_bp = (BlockT*)((uint8_t*)nbit_bp + (size_t)blockIdx.y * (bv_block_byte_div32*32ull)); // TODO: rename + + using WarpScan = cub::WarpScan; + using WarpScanY = cub::WarpScan; + using WarpReduce = cub::WarpReduce; + using WarpReduceY = cub::WarpReduce; + + constexpr size_t shmem_size = sizeof(SrcT) * (ThreadsDimY * (WARP_SIZE - 1) * WARP_SIZE); + static_assert(SRCB_S == shmem_size, ""); + static_assert(SRCB_S + DSTB_S < 64 * 1024, ""); + + constexpr int DST_BUF_SIZE = DSTB_S; + constexpr int DST_BUF_NUM_PER_WARP = DST_BUF_SIZE / (ThreadsDimY * sizeof(DstT)); // [32k/32/2=512] [48k/8/1=6114] + constexpr int DST_BUF_NUM_PER_THREAD = DST_BUF_NUM_PER_WARP / WARP_SIZE; + static_assert(DST_BUF_NUM_PER_THREAD <= WARP_SIZE, ""); + + + extern __shared__ uint8_t shmem_base[]; + SrcT* __restrict__ src_val_cache = (SrcT*)shmem_base; + DstT* __restrict__ dst_buf = (DstT*)&src_val_cache[SRCB_S] + threadIdx.y * DST_BUF_NUM_PER_WARP; //[ThreadsDimY][DST_BUF_NUM_PER_WARP]; + + __shared__ uint4 nsum_count_sh[ThreadsDimY]; + __shared__ IdxType pre_sum_share[2]; + __shared__ IdxType warp_scan_sums[WARP_SIZE]; + __shared__ typename WarpScan::TempStorage s_scanStorage; + __shared__ typename WarpScanY::TempStorage s_scanStorage2; + __shared__ typename WarpReduce::TempStorage WarpReduce_temp_storage[ThreadsDimY]; + __shared__ typename WarpReduceY::TempStorage WarpReduceY_temp_storage; + // shmem ------ end ------ + + const IdxType size_div_warp = size_div_w * WORD_DIV_WARP; + const IdxType nsum = nbit_bp[size_div_w].nsum; + const IdxType nsum_offset = nsum_zeros_buf[blockIdx.x]; + + + IdxType nsum_idx0_org = nsum_offset; + IdxType nsum_idx1_org = (IdxType)blockIdx.x * block_pair_num * THREAD_PER_GRID + nsum - nsum_idx0_org; + nsum_idx0_org /= (IdxType)block_pair_num * ThreadsDimY * WARP_SIZE; + nsum_idx1_org /= (IdxType)block_pair_num * ThreadsDimY * WARP_SIZE; + const IdxType nsum_idx0_bound = (nsum_idx0_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + const IdxType 
nsum_idx1_bound = (nsum_idx1_org + 1) * block_pair_num * ThreadsDimY * WARP_SIZE; + uint4 nsum_count = make_uint4(0, 0, 0, 0); + + const unsigned short th_idx = threadIdx.y * WARP_SIZE + threadIdx.x; + if (th_idx == 0) { + pre_sum_share[0] = nsum_offset; + } + + + for (IdxType ka = 0; ka < block_pair_num; ka += WARP_SIZE) { + const IdxType ibb = ((IdxType)blockIdx.x * block_pair_num + ka) * ThreadsDimY; + if (ibb >= size_div_warp) break; + WarpWT my_bits = 0; + SrcT first_val; + + const IdxType src_val_cache_offset = IdxType(threadIdx.y * (WARP_SIZE - 1) - 1) * WARP_SIZE + threadIdx.x; + for (IdxType kb = 0, i = ibb + WARP_SIZE * threadIdx.y; kb < WARP_SIZE; ++kb, ++i) { + if (i >= size_div_warp) break; + WarpWT bits; + const SrcT v = src[i * WARP_SIZE + threadIdx.x]; + if (kb == 0) { + first_val = v; + } else { + src_val_cache[src_val_cache_offset + kb * WARP_SIZE] = v; + } + if (v <= mask) { + bits = __activemask(); + } else { + bits = ~__activemask(); + } + if (threadIdx.x == kb) { + my_bits = bits; + } + } + IdxType t, c = __popc(my_bits); + WarpScan(s_scanStorage).ExclusiveSum(c, t); + + if (threadIdx.x == WARP_SIZE - 1) { + warp_scan_sums[threadIdx.y] = c + t; + } + __syncthreads(); + IdxType pre_sum = pre_sum_share[(ka & WARP_SIZE) > 0]; + IdxType s = threadIdx.x < ThreadsDimY ? warp_scan_sums[threadIdx.x] : 0; + WarpScanY(s_scanStorage2).ExclusiveSum(s, s); + + s = cv::cuda::device::shfl(s, threadIdx.y, WARP_SIZE); + + s += t + pre_sum; + if (th_idx == THREAD_PER_GRID - 1) { + pre_sum_share[(ka & WARP_SIZE) == 0] = s + c; + } + const IdxType bi = ibb + th_idx; + + if (bi < size_div_warp) { + static_assert(WORD_SIZE == 32, ""); + nbit_bp[bi] = BlockT{s, my_bits}; + } + if (mask == 0) continue; + + const SrcT mask_2 = mask >> 1; + SrcT vo = first_val; + for (IdxType j = 0, i = ibb + WARP_SIZE * threadIdx.y; j < WARP_SIZE;) { + if (i >= size_div_warp) break; + + + IdxType idx0_begin, idx0_num, idx1_offset, idx01_num, idx1_offset0; + if (DST_BUF_SIZE > 0) { // constexpr + IdxType idx1_begin, idx0_end; + const IdxType ib = ::min(size_div_warp, i + DST_BUF_NUM_PER_THREAD); + const IdxType jb = j + ib - i - 1; + idx0_begin = cv::cuda::device::shfl(s, j, WARP_SIZE); + idx1_begin = i * WARP_SIZE + nsum - idx0_begin; + idx0_end = cv::cuda::device::shfl(s + c, jb, WARP_SIZE); + + idx0_num = idx0_end - idx0_begin; + idx1_offset = idx1_begin - idx0_num; + idx01_num = (ib - i) * WARP_SIZE; + idx1_offset0 = nsum - idx1_begin + idx0_num; + } + constexpr int DST_LOOP_NUM = (DST_BUF_SIZE == 0 ? 
1 : DST_BUF_NUM_PER_THREAD); + for (IdxType kb = 0; kb < DST_LOOP_NUM; ++kb, ++j, ++i) { + if (i >= size_div_warp) break; + + const WarpWT e_nbit = cv::cuda::device::shfl(my_bits, j, WARP_SIZE); + const IdxType e_nsum = cv::cuda::device::shfl(s, j, WARP_SIZE); + IdxType rank = __popc(e_nbit << (WARP_SIZE - threadIdx.x)); + const IdxType idx0 = e_nsum + rank; + + DstT v = (DstT)vo; + IdxType idx; + IdxType buf_idx; + if (vo > mask) { // 1 + const IdxType ij = i * WARP_SIZE + threadIdx.x; + idx = ij + nsum - idx0; + v &= mask; + buf_idx = ij - idx0 + idx1_offset0; + } else { + idx = idx0; + buf_idx = idx0 - idx0_begin; + } + if (DST_BUF_SIZE == 0) { + dst[idx] = v; + } else { + dst_buf[buf_idx] = (DstT)v; + } + + if (v <= mask_2) { + if (vo <= mask) { + if (idx < nsum_idx0_bound) { + nsum_count.x++; + } else { + assert(idx < nsum_idx0_bound + block_pair_num * ThreadsDimY * WARP_SIZE); + nsum_count.y++; + } + } else { + if (idx < nsum_idx1_bound) { + nsum_count.z++; + } else { + assert(idx < nsum_idx1_bound + block_pair_num * ThreadsDimY * WARP_SIZE); + nsum_count.w++; + } + } + } + if (j == WARP_SIZE - 1) { j = WARP_SIZE; break; } + vo = src_val_cache[(threadIdx.y * (WARP_SIZE - 1) + j) * WARP_SIZE + threadIdx.x]; + } + if (DST_BUF_SIZE > 0) { // constexpr + for (IdxType j = threadIdx.x; (int)j < DST_BUF_NUM_PER_WARP; j += WARP_SIZE) { + if (j >= idx01_num) break; + IdxType idx; + if (j < idx0_num) { // 0 + idx = j + idx0_begin; + } else { // 1 + idx = j + idx1_offset; + } + dst[idx] = dst_buf[j]; + } + } + } + } + + if (blockIdx.x == gridDim.x - 1 && th_idx == 0) { + nbit_bp[size_div_warp / WORD_DIV_WARP].nsum = nsum; + } + if (mask == 0) return; + + nsum_count.x = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.x); + nsum_count.y = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.y); + nsum_count.z = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.z); + nsum_count.w = WarpReduce(WarpReduce_temp_storage[threadIdx.y]).Sum(nsum_count.w); + if (threadIdx.x == 0) { + nsum_count_sh[threadIdx.y] = nsum_count; + } + __syncthreads(); + + + if (threadIdx.x < ThreadsDimY) { + nsum_count = nsum_count_sh[threadIdx.x]; + nsum_count.x = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.x); + nsum_count.y = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.y); + nsum_count.z = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.z); + nsum_count.w = WarpReduceY(WarpReduceY_temp_storage).Sum(nsum_count.w); + if (th_idx == 0) { + const IdxType nsum_idx0_org = nsum_idx0_bound / ((IdxType)block_pair_num * ThreadsDimY * WARP_SIZE); + const IdxType nsum_idx1_org = nsum_idx1_bound / ((IdxType)block_pair_num * ThreadsDimY * WARP_SIZE); + if (nsum_count.x > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx0_org - 1, nsum_count.x); + if (nsum_count.y > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx0_org - 0, nsum_count.y); + if (nsum_count.z > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx1_org - 1, nsum_count.z); + if (nsum_count.w > 0) atomicAdd(nsum_zeros_buf2 + nsum_idx1_org - 0, nsum_count.w); + } + } +} + + +template +__global__ void WaveletMatrixMultiCu4G_ExclusiveSum(IdxType* __restrict__ nsum_scan_buf, IdxType* __restrict__ nsum_zeros_buf2, BlockT* __restrict__ nsum_p, const uint32_t buf_byte_div32, const uint32_t bv_block_byte_div32) { + + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + IdxType thread_data1; + IdxType thread_data2; + nsum_scan_buf = (IdxType*)((uint8_t*)nsum_scan_buf + (size_t)blockIdx.x * 
(buf_byte_div32*32ull)); + nsum_zeros_buf2 = (IdxType*)((uint8_t*)nsum_zeros_buf2 + (size_t)blockIdx.x * (buf_byte_div32*32ull)); + + thread_data1 = nsum_scan_buf[threadIdx.x]; + BlockScan(temp_storage).ExclusiveSum(thread_data1, thread_data2); + + nsum_scan_buf[threadIdx.x] = thread_data2; + nsum_zeros_buf2[threadIdx.x] = 0; + + if (threadIdx.x == blockDim.x - 1) { + thread_data2 += thread_data1; + nsum_p = (BlockT*)((uint8_t*)nsum_p + (size_t)blockIdx.x * (bv_block_byte_div32*32ull)); + nsum_p->nsum = thread_data2; + } +} + + +template +__global__ void WaveletMatrixMultiCu4G_first_gpu(const SrcT mask, uint16_t block_pair_num, const IdxType size_div_warp, const SrcT* __restrict__ src, IdxType* __restrict__ nsum_scan_buf, const uint32_t buf_byte_div32) { + using WarpWT = uint32_t; + constexpr int WARP_SIZE = 8 * sizeof(WarpWT); + static_assert(WARP_SIZE == 32, ""); + static constexpr int THREAD_PER_GRID = ThreadsDimY * WARP_SIZE; + + src = (SrcT*)((uint8_t*)src + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + IdxType cs = 0; + IdxType ibb = (IdxType)blockIdx.x * block_pair_num * ThreadsDimY; + for (IdxType ka = 0; ka < block_pair_num; ka += WARP_SIZE, ibb += THREAD_PER_GRID) { + for (IdxType kb = 0, i = ibb + WARP_SIZE * threadIdx.y; kb < WARP_SIZE; ++kb, ++i) { + if (i >= size_div_warp) break; + const SrcT v = src[i * WARP_SIZE + threadIdx.x]; + if (v <= mask) { + ++cs; + } + } + } + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage s_reduceStorage; + IdxType reducedValue = BlockReduce(s_reduceStorage).Sum(cs); + + if (threadIdx.y == 0 && threadIdx.x == 0) { + nsum_scan_buf = (IdxType*)((uint8_t*)nsum_scan_buf + (size_t)blockIdx.y * (buf_byte_div32*32ull)); + nsum_scan_buf[blockIdx.x] = reducedValue; + } +} + + + +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_Z(uint32_t r) { + return (r >= MIN_DSTBUF_KB) ? r * 1024 : 0; +} + +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_B(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_Z((r + 1) / 2); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A16(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_B(r | (r >> 16)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A8(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A16(r | (r >> 8)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A4(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A8(r | (r >> 4)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A2(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A4(r | (r >> 2)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A1(uint32_t r) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A2(r | (r >> 1)); +} +template +constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb(int SRCB_S) { + return WaveletMatrixMultiCu4G_get_dstbuf_kb_internal_A1(SHMEM_USE_KB - (SRCB_S + 1023) / 1024); +} + +// template +// constexpr int WaveletMatrixMultiCu4G_get_dstbuf_kb(int SRCB_S) { +// uint32_t r = (SHMEM_USE_KB - (SRCB_S + 1023) / 1024); +// r |= r >> 1; +// r |= r >> 2; +// r |= r >> 4; +// r |= r >> 8; +// r |= r >> 16; +// r = (r + 1) / 2; +// return (r >= MIN_DSTBUF_KB) ? 
r * 1024 : 0; +// } + + +template +struct WaveletMatrixMultiCu4G { + static constexpr int MAX_BIT_LEN = 8 * sizeof(T); + + static constexpr uint32_t WSIZE = WORD_SIZE; + using T_Type = T; + static constexpr int WARP_SIZE = 32; + static constexpr int THREAD_PER_GRID = TH_NUM; + static constexpr int MAX_BLOCK_X = 1024; + static_assert(WORD_SIZE == 32 || WORD_SIZE == 64, "WORD SIZE must be 32 or 64"); + using WordT = typename std::conditional::type; + + static constexpr int SHMEM_USE_KB = 64; + + struct __align__(8) BLOCK32_T { uint32_t nsum; union { uint32_t nbit; uint32_t nbit_a[1];}; }; + struct __align__(4) BLOCK64_T { uint32_t nsum; union { uint64_t nbit; uint32_t nbit_a[2];}; }; + using BlockT = typename std::conditional::type; + + static constexpr int MIN_DSTBUF_KB = 4; + static constexpr int BLOCK_TYPE = 2; + using WarpWT = uint32_t; + static_assert(8 * sizeof(WarpWT) == WARP_SIZE, "bits of WarpWT must be WARP_SIZE"); + + IdxType size = 0; + int wm_num = 0; + +private: + T* src_cu = nullptr; + uint8_t* bv_block_nbit_and_nsum_base_cu = nullptr; + uint32_t bv_block_byte_div32; + uint32_t buf_byte_div32; +public: + size_t bv_block_len = 0; + IdxType bv_zeros[MAX_BIT_LEN]; + int bit_len = 0; + + WaveletMatrixMultiCu4G(IdxType _n = 0, int _bit_len = 0, int num = 0) { + reset(_n, _bit_len, num); + } + void reset(IdxType _n, int _bit_len, int _num) { + cudaError_t err; + assert(size == 0 && src_cu == nullptr && 0 <= _bit_len && _bit_len <= MAX_BIT_LEN); + bit_len = _bit_len; + wm_num = _num; + if (_n == 0 || wm_num == 0) return; + size = div_ceil(_n, WORD_SIZE) * WORD_SIZE; + bv_block_len = div_ceil(size, THREAD_PER_GRID) * THREAD_PER_GRID / WORD_SIZE + 1; + bv_block_len = div_ceil(bv_block_len, 8*2) * 8*2; + + const size_t bv_block_byte = (sizeof(BlockT)) * bit_len * bv_block_len; + if (bv_block_byte % 32 != 0) { printf("bv_block_byte not 32n!"); exit(-1); } + bv_block_byte_div32 = div_ceil(bv_block_byte, 32); + + err = cudaMalloc(&bv_block_nbit_and_nsum_base_cu, (size_t)(bv_block_byte_div32*32ull) * _num); + if (bv_block_nbit_and_nsum_base_cu == nullptr) { printf("GPU Memory Alloc Error! %s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); release(); return; } + + const uint16_t block_pair_num = get_block_pair_num(); + const IdxType nsum_scan_buf_len = div_ceil(size, (size_t)THREAD_PER_GRID * block_pair_num); + + const size_t buf_byte = sizeof(IdxType) * 2 * nsum_scan_buf_len + sizeof(T) * size * 2; + buf_byte_div32 = div_ceil(buf_byte, 32); + err = cudaMalloc(&src_cu, (size_t)(buf_byte_div32*32ull) * _num); + if (src_cu == nullptr) { printf("GPU Memory Alloc Error! 
%s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); release(); return; } + + } + void release() { + size = 0; + if (src_cu != nullptr) cudaFree(src_cu); + if (bv_block_nbit_and_nsum_base_cu != nullptr) cudaFree(bv_block_nbit_and_nsum_base_cu); + src_cu = nullptr; + bv_block_nbit_and_nsum_base_cu = nullptr; + } + ~WaveletMatrixMultiCu4G() { release(); } + + BlockT* get_bv_block_cu(int h) const { return (BlockT*)(bv_block_nbit_and_nsum_base_cu + (sizeof(BlockT)) * bv_block_len * h); } + BlockT* get_bv_block_cu(int h, int c) const { return (BlockT*)((uint8_t*)get_bv_block_cu(h) + (size_t)c * (bv_block_byte_div32*32ull)); } + uint64_t get_bv_block_byte() const { return (bv_block_byte_div32*32ull); } + + + T* get_src_p(int c) const { return src_cu + (size_t)(buf_byte_div32*32ull) / (sizeof(T)) * c; } + + uint16_t get_block_pair_num() const { + constexpr int x_chunk = 65536 / THREAD_PER_GRID / WARP_SIZE; // To make the pixels assigned per grid a multiple of 65536. + static_assert(x_chunk > 0, ""); + const uint64_t total_gridx = div_ceil(size, THREAD_PER_GRID * WARP_SIZE); + uint64_t block_pair_num_org = div_ceil(total_gridx, MAX_BLOCK_X); + if (block_pair_num_org <= x_chunk) { + block_pair_num_org--; + block_pair_num_org |= block_pair_num_org >> 1; + block_pair_num_org |= block_pair_num_org >> 2; + block_pair_num_org |= block_pair_num_org >> 4; + block_pair_num_org |= block_pair_num_org >> 8; + block_pair_num_org++; + } else { + block_pair_num_org = div_ceil(block_pair_num_org, x_chunk) * x_chunk; + } + block_pair_num_org *= WARP_SIZE; + + if (block_pair_num_org >= (1LL << (8 * sizeof(uint16_t)))) { printf("over block_pair_num %ld\n", block_pair_num_org); exit(1); } + return (uint16_t)block_pair_num_org; + } + std::pair get_nsum_buf_and_buf_byte() const { + IdxType* nsum_buf = (IdxType*)(src_cu + 2ull * size); + return { nsum_buf, (buf_byte_div32*32ull) }; + } + IdxType* get_nsum_buf(int c) const { + IdxType* nsum_buf = (IdxType*)(src_cu + 2ull * size); + return (IdxType*)((uint8_t*)nsum_buf + (size_t)(buf_byte_div32*32ull) * c); + } + uint64_t get_buf_byte() const { return (buf_byte_div32*32ull); } + uint32_t get_buf_byte_div32() const { return buf_byte_div32; } + + IdxType get_nsum_scan_buf_len(uint16_t block_pair_num) const { + return div_ceil(size, THREAD_PER_GRID * block_pair_num); + } + IdxType get_nsum_scan_buf_len() const { + const uint16_t block_pair_num = get_block_pair_num(); + return get_nsum_scan_buf_len(block_pair_num); + } + + // Set data in src_cu before calling (data will be destroyed). 
+ void construct(const cudaStream_t main_stream = 0, const bool run_first = true) { + assert(size > 0 && src_cu != nullptr); + if (size == 0 || wm_num == 0) return; + if (src_cu == nullptr) { printf("Build Error: memory not alloced."); return;} + + T mask = ((T)1 << bit_len) - 1; + + const uint16_t block_pair_num = get_block_pair_num(); + const int grid_x = div_ceil(size, THREAD_PER_GRID * block_pair_num); + if (grid_x > MAX_BLOCK_X) { printf("over grid_x %d\n", grid_x); exit(1); } + + + const dim3 grid(grid_x, wm_num); + const dim3 thread(WARP_SIZE, THREAD_PER_GRID / WARP_SIZE); + const IdxType size_div_w = size / WORD_SIZE; + const IdxType size_div_warp = size / WARP_SIZE; + assert(size % WARP_SIZE == 0); + constexpr int ThreadsDimY = THREAD_PER_GRID / WARP_SIZE; + + + const int nsum_scan_buf_len = get_nsum_scan_buf_len(block_pair_num); // same grid_x + + +#define CALC_SRCB_SIZE(SrcT) (sizeof(SrcT) * (ThreadsDimY * (WARP_SIZE - 1) * WARP_SIZE)) + constexpr int SRCB_S_T = CALC_SRCB_SIZE(T); + constexpr int SRCB_S_8 = CALC_SRCB_SIZE(uint8_t); +#undef CALC_SRCB_SIZE + constexpr int BLOCK_SHMEM_KB = SHMEM_USE_KB * THREAD_PER_GRID / 1024; + constexpr int DSTB_S_T = WaveletMatrixMultiCu4G_get_dstbuf_kb(SRCB_S_T); + constexpr int DSTB_S_8 = WaveletMatrixMultiCu4G_get_dstbuf_kb(SRCB_S_8); + static_assert(SHMEM_USE_KB >= 64 || THREAD_PER_GRID == 1024, "if SHMEM_USE_KB < 64, THREAD_PER_GRID must 1024"); + static_assert(SRCB_S_T + DSTB_S_T <= BLOCK_SHMEM_KB * 1024 && ((DSTB_S_T == 0 && SRCB_S_T + MIN_DSTBUF_KB * 1024> BLOCK_SHMEM_KB * 1024) || (DSTB_S_T >= MIN_DSTBUF_KB * 1024 && SRCB_S_T + DSTB_S_T * 2 > BLOCK_SHMEM_KB * 1024)), ""); + static_assert(SRCB_S_8 + DSTB_S_8 <= BLOCK_SHMEM_KB * 1024 && ((DSTB_S_8 == 0 && SRCB_S_8 + MIN_DSTBUF_KB * 1024> BLOCK_SHMEM_KB * 1024) || (DSTB_S_8 >= MIN_DSTBUF_KB * 1024 && SRCB_S_8 + DSTB_S_8 * 2 > BLOCK_SHMEM_KB * 1024)), ""); + + { using SrcT = T; using DstT = T; + cudaFuncSetAttribute(&WaveletMatrixMultiCu4G_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_T + DSTB_S_T); + } { using SrcT = T; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrixMultiCu4G_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_T + DSTB_S_T); + } { using SrcT = uint8_t; using DstT = uint8_t; + cudaFuncSetAttribute(&WaveletMatrixMultiCu4G_UpSweep_gpu, cudaFuncAttributeMaxDynamicSharedMemorySize, SRCB_S_8 + DSTB_S_8); + } + + T* now_cu = src_cu; + T* nxt_cu = src_cu + size; + IdxType* nsum_zeros_buf = (IdxType*)(nxt_cu + size); + IdxType* nsum_zeros_buf2 = nsum_zeros_buf + nsum_scan_buf_len; + + const uint32_t nsum_pos = get_nsum_pos(); + + int h = bit_len - 1; + if (run_first) { + WaveletMatrixMultiCu4G_first_gpu <<>> (T(mask / 2), block_pair_num, size_div_warp, src_cu, nsum_zeros_buf, buf_byte_div32); + } + WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf, nsum_zeros_buf2, get_bv_block_cu(h) + nsum_pos, buf_byte_div32, bv_block_byte_div32); + + for (; h > 8; --h) { + using SrcT = T; + using DstT = T; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + WaveletMatrixMultiCu4G_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_zeros_buf, nsum_zeros_buf2, bv_block_byte_div32, buf_byte_div32); + + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf2, nsum_zeros_buf, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_zeros_buf, 
nsum_zeros_buf2); + swap(now_cu, nxt_cu); + } + if (h == 8 || (is_same::value && bit_len <= 8 && h == bit_len - 1)) { + using SrcT = T; + using DstT = uint8_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + WaveletMatrixMultiCu4G_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_zeros_buf, nsum_zeros_buf2, bv_block_byte_div32, buf_byte_div32); + if (h == 0) return; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf2, nsum_zeros_buf, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_zeros_buf, nsum_zeros_buf2); + swap(now_cu, nxt_cu); + --h; + } + + for (; h >= 0; --h) { + using SrcT = uint8_t; + using DstT = uint8_t; + mask >>= 1; + BlockT* bv_block_nbit_cu_h = get_bv_block_cu(h); + WaveletMatrixMultiCu4G_UpSweep_gpu <<>> ((SrcT)mask, block_pair_num, size_div_w, (SrcT*)now_cu, (DstT*)nxt_cu, bv_block_nbit_cu_h, nsum_zeros_buf, nsum_zeros_buf2, bv_block_byte_div32, buf_byte_div32); + if (h == 0) break; + BlockT* nsum_p = get_bv_block_cu(h - 1) + nsum_pos; + WaveletMatrixMultiCu4G_ExclusiveSum <<< wm_num, grid_x, 0, main_stream >>> (nsum_zeros_buf2, nsum_zeros_buf, nsum_p, buf_byte_div32, bv_block_byte_div32); + swap(nsum_zeros_buf, nsum_zeros_buf2); + swap(now_cu, nxt_cu); + } + } + + IdxType get_nsum_pos() const { + const IdxType size_div_w = size / WORD_SIZE; + return size_div_w; + } + IdxType get_bv_block_h_byte_div32() const { + return (bv_block_len * (sizeof(WordT) + sizeof(IdxType))) / 32u; + } +}; + +} // end namespace wavelet_median +}}} //end namespace cv::cuda::device + +#endif +#endif // __OPENCV_WAVELET_MATRIX_MULTI_CUH__ diff --git a/modules/cudafilters/src/filtering.cpp b/modules/cudafilters/src/filtering.cpp index daab3acde10..2ae789c856d 100644 --- a/modules/cudafilters/src/filtering.cpp +++ b/modules/cudafilters/src/filtering.cpp @@ -72,6 +72,8 @@ Ptr cv::cuda::createColumnSumFilter(int, int, int, int, int, Scalar) { t Ptr cv::cuda::createMedianFilter(int srcType, int _windowSize, int _partitions){ throw_no_cuda(); return Ptr();} #else +#include +#include namespace { @@ -1047,12 +1049,20 @@ Ptr cv::cuda::createColumnSumFilter(int srcType, int dstType, int ksize, //////////////////////////////////////////////////////////////////////////////////////////////////// // Median Filter +// The CUB library is used for the Median Filter with Wavelet Matrix, +// which has become a standard library since CUDA 11. 
+#include "cuda/wavelet_matrix_feature_support_checks.h" namespace cv { namespace cuda { namespace device { void medianFiltering_gpu(const PtrStepSzb src, PtrStepSzb dst, PtrStepSzi devHist, PtrStepSzi devCoarseHist,int kernel, int partitions, cudaStream_t stream); + +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + template + void medianFiltering_wavelet_matrix_gpu(const PtrStepSz src, PtrStepSz dst, int radius, const int num_channels, cudaStream_t stream); +#endif }}} namespace @@ -1074,7 +1084,15 @@ namespace MedianFilter::MedianFilter(int srcType, int _windowSize, int _partitions) : windowSize(_windowSize),partitions(_partitions) { - CV_Assert( srcType == CV_8UC1 ); +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 + || srcType == CV_16UC1 || srcType == CV_16UC3 || srcType == CV_16UC4 + || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4); +#else + if (srcType != CV_8UC1) { + CV_Error(Error::StsNotImplemented, "If CUDA version is below 10, only implementations that support CV_8UC1 are available"); + } +#endif CV_Assert(windowSize>=3); CV_Assert(_partitions>=1); @@ -1094,6 +1112,18 @@ namespace // Kernel needs to be half window size int kernel=windowSize/2; +#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ + const int depth = src.depth(); + if (depth == CV_8U) { + medianFiltering_wavelet_matrix_gpu(src, dst, kernel, src.channels(), StreamAccessor::getStream(_stream)); + } else if (depth == CV_16U) { + medianFiltering_wavelet_matrix_gpu(src, dst, kernel, src.channels(), StreamAccessor::getStream(_stream)); + } else if (depth == CV_32F) { + medianFiltering_wavelet_matrix_gpu(src, dst, kernel, src.channels(), StreamAccessor::getStream(_stream)); + } else { + CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32F); + } +#else CV_Assert(kernel < src.rows); CV_Assert(kernel < src.cols); @@ -1107,6 +1137,7 @@ namespace devCoarseHist.setTo(0, _stream); medianFiltering_gpu(src,dst,devHist, devCoarseHist,kernel,partitions,StreamAccessor::getStream(_stream)); +# endif } } diff --git a/modules/cudafilters/test/test_filters.cpp b/modules/cudafilters/test/test_filters.cpp index bb235dad093..432b5d2a5ac 100644 --- a/modules/cudafilters/test/test_filters.cpp +++ b/modules/cudafilters/test/test_filters.cpp @@ -43,6 +43,7 @@ #include "test_precomp.hpp" #ifdef HAVE_CUDA +#include "../src/cuda/wavelet_matrix_feature_support_checks.h" namespace opencv_test { namespace { @@ -647,7 +648,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Filters, MorphEx, testing::Combine( // Median -PARAM_TEST_CASE(Median, cv::cuda::DeviceInfo, cv::Size, MatDepth, KernelSize, UseRoi) +PARAM_TEST_CASE(Median, cv::cuda::DeviceInfo, cv::Size, MatType, KernelSize, UseRoi) { cv::cuda::DeviceInfo devInfo; cv::Size size; @@ -681,7 +682,7 @@ CUDA_TEST_P(Median, Accuracy) cv::Mat dst_gold; cv::medianBlur(src,dst_gold,kernel); - cv::Rect rect(kernel+1,0,src.cols-(2*kernel+1),src.rows); + cv::Rect rect(kernel/2, kernel/2, src.cols-(kernel-1), src.rows-(kernel-1)); cv::Mat dst_gold_no_border = dst_gold(rect); cv::cuda::GpuMat dst_no_border = cv::cuda::GpuMat(dst, rect); @@ -703,6 +704,17 @@ INSTANTIATE_TEST_CASE_P(CUDA_Filters, Median, testing::Combine( WHOLE_SUBMAT) ); -}} // namespace +INSTANTIATE_TEST_CASE_P(CUDA_Filters_Median_HDR, Median, testing::Combine( + ALL_DEVICES, + DIFFERENT_SIZES, + testing::Values( + MatType(CV_8UC3), MatType(CV_8UC4), + MatType(CV_16U), MatType(CV_16UC3), MatType(CV_16UC4), + 
MatType(CV_32F), MatType(CV_32FC3), MatType(CV_32FC4)), + testing::Values(KernelSize(3), KernelSize(5)), + WHOLE_SUBMAT) + ); + +}} // namespace #endif // HAVE_CUDA
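
For reference, the quantity that median2d produces for every pixel and channel is the ordinary window median. The host-side sketch below states that definition explicitly; the replicate border handling and the helper name naiveMedianAt are illustrative assumptions, not part of the patch. The accuracy test above only compares against cv::medianBlur on the interior rect, so differences in border policy are deliberately not asserted.

#include <algorithm>
#include <vector>
#include <opencv2/core.hpp>

// Naive per-channel reference: median of the (2r+1)x(2r+1) window centred on (y, x).
static float naiveMedianAt(const cv::Mat& plane, int y, int x, int r)
{
    CV_Assert(plane.type() == CV_32FC1);
    std::vector<float> win;
    win.reserve((2 * r + 1) * (2 * r + 1));
    for (int dy = -r; dy <= r; ++dy)
        for (int dx = -r; dx <= r; ++dx)
        {
            // Replicate the border (an assumption for this sketch only).
            const int yy = std::min(std::max(y + dy, 0), plane.rows - 1);
            const int xx = std::min(std::max(x + dx, 0), plane.cols - 1);
            win.push_back(plane.at<float>(yy, xx));
        }
    std::nth_element(win.begin(), win.begin() + win.size() / 2, win.end());
    return win[win.size() / 2]; // odd window size, so this element is the median
}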
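
The float path in wavelet_matrix_float_supporter.cuh works in rank space: the values are radix-sorted once, the wavelet matrix is built over each pixel's rank, and the median rank is translated back to a float through the sorted table returned by get_res_table(). A minimal single-channel host sketch of that transformation follows; buildRankImage is a hypothetical helper name used only for illustration.

#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch: turn a float image into (sorted value table, rank image).
// Any median computed on the ranks maps back to a float via sortedTable[rank].
static std::vector<float> buildRankImage(const std::vector<float>& values,
                                         std::vector<uint32_t>& ranks)
{
    std::vector<uint32_t> order(values.size());
    for (uint32_t i = 0; i < order.size(); ++i)
        order[i] = i;
    // Stable sort by value, mirroring the CUB radix sort over (value, index) pairs.
    std::stable_sort(order.begin(), order.end(),
                     [&](uint32_t a, uint32_t b) { return values[a] < values[b]; });

    std::vector<float> sortedTable(values.size());
    ranks.assign(values.size(), 0);
    for (uint32_t r = 0; r < order.size(); ++r)
    {
        sortedTable[r] = values[order[r]]; // r-th smallest value
        ranks[order[r]] = r;               // pixel -> its global rank
    }
    return sortedTable;
}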
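
With the wavelet-matrix path compiled in (__OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__ defined by wavelet_matrix_feature_support_checks.h), createMedianFilter accepts 1-, 3- and 4-channel 8U, 16U and 32F inputs, as asserted in MedianFilter::MedianFilter above. A minimal usage sketch; the input file name is a placeholder.

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudafilters.hpp>

int main()
{
    cv::Mat h_src = cv::imread("input.png", cv::IMREAD_COLOR); // CV_8UC3
    cv::Mat h_src_f;
    h_src.convertTo(h_src_f, CV_32F);                          // 3-channel float input

    cv::cuda::GpuMat d_src(h_src_f), d_dst;

    // windowSize = 5; any of the supported srcType values listed above may be used.
    cv::Ptr<cv::cuda::Filter> median = cv::cuda::createMedianFilter(d_src.type(), 5);
    median->apply(d_src, d_dst);

    cv::Mat h_dst;
    d_dst.download(h_dst);
    return 0;
}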