Merge pull request #3627 from TumoiYorozu:wavelet_matrix_median_filte…

…r_cuda Implemented fast median filter for CUDA using Wavelet Matrix, a constant-time, HDR-compatible method #3627 I replaced the existing CUDA implementation of the histogram-based median filter with an implementation of a new wavelet matrix-based median filter algorithm, which I presented at SIGGRAPH Asia 2022. This paper won the Best Paper Award in the journal track of technical papers (ACM Transactions on Graphics). This new algorithm, like the histogram method, has the property that the window radius does not affect the computation time, and is several times faster than the histogram method. Furthermore, while the histogram method does not support HDR and only supports 8U images, this new algorithm supports HDR and also supports 16U and 32F images. I (the author) have published the implementation on my personal GitHub and made some modifications for OpenCV to make it accessible from OpenCV. I used the CUB library, which is part of the standard toolkit since CUDA 11.0. Therefore, depending on the CUDA_VERSION, the code is written to use the new algorithm for versions 11.0 and above, and the existing histogram method for versions 10 and below. Regarding the old histogram-based code, the CPU version of the median filter supports 16U and 32F for window sizes up to 5, but it seems that the histogram CUDA version of the median filter does not. Also, the number of channels supported is different: the CPU version supports 1, 3, and 4 channels, while the CUDA version supports only 1 channel. In addition, for the CUDA version of the histogram method, pixels at the edges of the image, i.e. where the window is insufficient, were set to zero. For example, if the window size is 7, the width of the 3 pixels at the top, bottom, left, and right were not calculated correctly. When checking the tests, it was found that they compared with the CPU version by cropping the edges with rect, and also the cropping area was too wide, with 8 pixels cropped from the top, bottom, left, and right when the window size was 7. In this PR, I first corrected the rect range for the tests so that both the old histogram method and the new wavelet matrix method can pass. Also, the CUDA version now supports 16U, 32F, and multi-channel formats such as 3 and 4 channels. In addition, while the CPU version only supports window sizes up to 5 for HDR, the new CUDA Wavelet Matrix method supports sizes of 7 and above. Additionally, I have added new tests for 16U, 32F, and multi-channel formats, specifically 3 and 4 channels. Paper’s project page: [Constant Time Median Filter using 2D Wavelet Matrix | Interactive Graphics & Engineering Lab](https://cgenglab.github.io/en/publication/sigga22_wmatrix_median/) My implementation (as author): [GitHub - TumoiYorozu/WMatrixMedian](https://github.com/TumoiYorozu/WMatrixMedian) ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch ~~- [ ] There is a reference to the original bug report and related work~~ - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
opencv · May 22, 2024 · e46ba34 · e46ba34
1 parent 345371e
commit e46ba34
Show file tree

Hide file tree

Showing 7 changed files with 2,124 additions and 4 deletions.
diff --git a/modules/cudafilters/src/cuda/median_filter.cu b/modules/cudafilters/src/cuda/median_filter.cu
@@ -53,6 +53,17 @@
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
 
+
+// The CUB library is used for the Median Filter with Wavelet Matrix,
+// which has become a standard library since CUDA 11.
+#include "wavelet_matrix_feature_support_checks.h"
+#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+#include "wavelet_matrix_multi.cuh"
+#include "wavelet_matrix_2d.cuh"
+#include "wavelet_matrix_float_supporter.cuh"
+#endif
+
+
 namespace cv { namespace cuda { namespace device
 {
         __device__ void histogramAddAndSub8(int* H, const int * hist_colAdd,const int * hist_colSub){
@@ -334,4 +345,72 @@ namespace cv { namespace cuda { namespace device
 
 }}}
 
+
+#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+namespace cv { namespace cuda { namespace device
+    {
+        using namespace wavelet_matrix_median;
+
+        template<int CH_NUM, typename T>
+        void medianFiltering_wavelet_matrix_gpu(const PtrStepSz<T> src, PtrStepSz<T> dst, int radius,cudaStream_t stream){
+
+            constexpr bool is_float = std::is_same<T, float>::value;
+            constexpr static int WORD_SIZE = 32;
+            constexpr static int ThW = (std::is_same<T, uint8_t>::value ?  8 : 4);
+            constexpr static int ThH = (std::is_same<T, uint8_t>::value ? 64 : 256);
+            using XYIdxT = uint32_t;
+            using XIdxT = uint16_t;
+            using WM_T = typename std::conditional<is_float, uint32_t, T>::type;
+            using MedianResT = typename std::conditional<is_float, T, std::nullptr_t>::type;
+            using WM2D_IMPL = WaveletMatrix2dCu5C<WM_T, CH_NUM, WaveletMatrixMultiCu4G<XIdxT, 512>, 512, WORD_SIZE>;
+
+            CV_Assert(src.cols == dst.cols);
+            CV_Assert(dst.step % sizeof(T) == 0);
+
+            WM2D_IMPL WM_cuda(src.rows, src.cols, is_float, false);
+            WM_cuda.res_cu =  reinterpret_cast<WM_T*>(dst.ptr());
+
+            const size_t line_num = src.cols * CH_NUM;
+            if (is_float) {
+                WMMedianFloatSupporter::WMMedianFloatSupporter<float, CH_NUM, XYIdxT> float_supporter(src.rows, src.cols);
+                float_supporter.alloc();
+                for (int y = 0; y < src.rows; ++y) {
+                    cudaMemcpy(float_supporter.val_in_cu + y * line_num, src.ptr(y), line_num * sizeof(T), cudaMemcpyDeviceToDevice);
+                }
+                const auto p = WM_cuda.get_nowcu_and_buf_byte_div32();
+                float_supporter.sort_and_set((XYIdxT*)p.first, p.second);
+                WM_cuda.construct(nullptr, stream, true);
+                WM_cuda.template median2d<ThW, ThH, MedianResT, false>(radius, dst.step / sizeof(T), (MedianResT*)float_supporter.get_res_table(), stream);
+            } else {
+                for (int y = 0; y < src.rows; ++y) {
+                    cudaMemcpy(WM_cuda.src_cu + y * line_num, src.ptr(y), line_num * sizeof(T), cudaMemcpyDeviceToDevice);
+                }
+                WM_cuda.construct(nullptr, stream);
+                WM_cuda.template median2d<ThW, ThH, MedianResT, false>(radius, dst.step / sizeof(T), nullptr, stream);
+            }
+            WM_cuda.res_cu = nullptr;
+            if (!stream) {
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        }
+
+        template<typename T>
+        void medianFiltering_wavelet_matrix_gpu(const PtrStepSz<T> src, PtrStepSz<T> dst, int radius, const int num_channels, cudaStream_t stream){
+            if (num_channels == 1) {
+                medianFiltering_wavelet_matrix_gpu<1>(src, dst, radius, stream);
+            } else if (num_channels == 3) {
+                medianFiltering_wavelet_matrix_gpu<3>(src, dst, radius, stream);
+            } else if (num_channels == 4) {
+                medianFiltering_wavelet_matrix_gpu<4>(src, dst, radius, stream);
+            } else {
+                CV_Assert(num_channels == 1 || num_channels == 3 || num_channels == 4);
+            }
+        }
+
+        template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz<uint8_t>  src, PtrStepSz<uint8_t>  dst, int radius, const int num_channels, cudaStream_t stream);
+        template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz<uint16_t> src, PtrStepSz<uint16_t> dst, int radius, const int num_channels, cudaStream_t stream);
+        template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz<float>    src, PtrStepSz<float>    dst, int radius, const int num_channels, cudaStream_t stream);
+}}}
+#endif // __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+
 #endif