opencv · asmorkalov · May 31, 2023 · Mar 30, 2022 · Mar 30, 2022 · Apr 1, 2022
diff --git a/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
@@ -62,7 +62,7 @@ namespace opencv_test
     class InRangePerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {};
     class Split3PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
     class Split4PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
-    class Merge3PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+    class Merge3PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {}; // MatType: type the test creates its input mats with
     class Merge4PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
     class RemapPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {};
     class FlipPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};

diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@@ -1577,11 +1577,12 @@ PERF_TEST_P_(Merge3PerfTest, TestPerformance)
 {
     compare_f cmpF;
     cv::Size sz;
+    MatType type = -1;
     cv::GCompileArgs compile_args;
-    std::tie(cmpF, sz, compile_args) = GetParam();
+    std::tie(cmpF, sz, type, compile_args) = GetParam();
 
-    initMatsRandU(CV_8UC1, sz, CV_8UC3);
-    cv::Mat in_mat3(sz, CV_8UC1);
+    initMatsRandU(type, sz, CV_MAKETYPE(type, 3));
+    cv::Mat in_mat3(sz, type);
     cv::Scalar mean = cv::Scalar::all(127);
     cv::Scalar stddev = cv::Scalar::all(40.f);
     cv::randn(in_mat3, mean, stddev);

diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
@@ -252,6 +252,7 @@ INSTANTIATE_TEST_CASE_P(Split4PerfTestCPU, Split4PerfTest,
 INSTANTIATE_TEST_CASE_P(Merge3PerfTestCPU, Merge3PerfTest,
     Combine(Values(AbsExact().to_compare_f()),
             Values(szSmall128, szVGA, sz720p, sz1080p),
+            Values(CV_8U),  // MatType parameter of Merge3PerfTest
             Values(cv::compile_args(CORE_CPU))));
 
 INSTANTIATE_TEST_CASE_P(Merge4PerfTestCPU, Merge4PerfTest,

diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@@ -253,6 +253,7 @@ INSTANTIATE_TEST_CASE_P(Split4PerfTestFluid, Split4PerfTest,
 INSTANTIATE_TEST_CASE_P(Merge3PerfTestFluid, Merge3PerfTest,
     Combine(Values(AbsExact().to_compare_f()),
             Values(szSmall128, szVGA, sz720p, sz1080p),
+            Values(CV_8U, CV_16S, CV_16U, CV_32F),  // MatType parameter of Merge3PerfTest
             Values(cv::compile_args(CORE_FLUID))));
 
 INSTANTIATE_TEST_CASE_P(Merge4PerfTestFluid, Merge4PerfTest,

diff --git a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
@@ -242,6 +242,7 @@ INSTANTIATE_TEST_CASE_P(Split4PerfTestGPU, Split4PerfTest,
 INSTANTIATE_TEST_CASE_P(Merge3PerfTestGPU, Merge3PerfTest,
     Combine(Values(AbsExact().to_compare_f()),
             Values( szSmall128, szVGA, sz720p, sz1080p ),
+            Values(CV_8U),  // MatType parameter of Merge3PerfTest
             Values(cv::compile_args(CORE_GPU))));
 
 INSTANTIATE_TEST_CASE_P(Merge4PerfTestGPU, Merge4PerfTest,

diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -2320,12 +2320,15 @@ GAPI_FLUID_KERNEL(GFluidSplit3, cv::gapi::core::GSplit3, false)
 
     static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3)
     {
+        GAPI_Assert((src.meta().depth == CV_8U) && (dst1.meta().depth == CV_8U) &&
+                    (dst2.meta().depth == CV_8U) && (dst3.meta().depth == CV_8U) &&
+                    (3 == src.meta().chan));
+
         const auto *in   = src.InLine<uchar>(0);
               auto *out1 = dst1.OutLine<uchar>();
               auto *out2 = dst2.OutLine<uchar>();
               auto *out3 = dst3.OutLine<uchar>();
 
-        GAPI_Assert(3 == src.meta().chan);
         int width = src.length();
         int w = 0;
 
@@ -2348,13 +2351,16 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
 
     static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3, Buffer &dst4)
     {
+        GAPI_Assert((src.meta().depth == CV_8U) && (dst1.meta().depth == CV_8U) &&
+                    (dst2.meta().depth == CV_8U) && (dst3.meta().depth == CV_8U) &&
+                    (dst4.meta().depth == CV_8U) && (4 == src.meta().chan));
+
         const auto *in   = src.InLine<uchar>(0);
               auto *out1 = dst1.OutLine<uchar>();
               auto *out2 = dst2.OutLine<uchar>();
               auto *out3 = dst3.OutLine<uchar>();
               auto *out4 = dst4.OutLine<uchar>();
 
-        GAPI_Assert(4 == src.meta().chan);
         int width = src.length();
         int w = 0;
 
@@ -2372,31 +2378,46 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
     }
 };
 
+template<typename T>
+CV_ALWAYS_INLINE void run_merge3(Buffer& dst, const View& src1, const View& src2, const View& src3)
+{
+    // One line from each of the three inputs, and the interleaved output line.
+    const auto* plane0 = src1.InLine<T>(0);
+    const auto* plane1 = src2.InLine<T>(0);
+    const auto* plane2 = src3.InLine<T>(0);
+    auto* packed = dst.OutLine<T>();
+
+    const int length = dst.length();
+    int x = 0;  // index of the first pixel that still has to be written
+#if CV_SIMD
+    // merge3_simd reports how many leading pixels it has already interleaved.
+    x = merge3_simd(plane0, plane1, plane2, packed, length);
+#endif
+    for (auto* px = packed + 3 * x; x < length; ++x, px += 3)
+    {
+        px[0] = plane0[x];
+        px[1] = plane1[x];
+        px[2] = plane2[x];
+    }
+}
+
 GAPI_FLUID_KERNEL(GFluidMerge3, cv::gapi::core::GMerge3, false)
 {
     static const int Window = 1;
 
-    static void run(const View &src1, const View &src2, const View &src3, Buffer &dst)
+    static void run(const View& src1, const View& src2, const View& src3, Buffer& dst)
     {
-        const auto *in1 = src1.InLine<uchar>(0);
-        const auto *in2 = src2.InLine<uchar>(0);
-        const auto *in3 = src3.InLine<uchar>(0);
-              auto *out = dst.OutLine<uchar>();
-
-        GAPI_Assert(3 == dst.meta().chan);
-        int width = dst.length();
-        int w = 0;
+        GAPI_Assert((3 == dst.meta().chan) && (src1.meta().depth == dst.meta().depth) &&
+                    (src1.meta().depth == src2.meta().depth) &&
+                    (src1.meta().depth == src3.meta().depth));  // channel check stays on in release builds
 
-    #if CV_SIMD
-        w = merge3_simd(in1, in2, in3, out, width);
-    #endif
+        // MERGE3_(T, OP, args...): when the depths equal T's, calls OP<T>(args...) and returns.
+        MERGE3_(uchar,  run_merge3, dst, src1, src2, src3);
+        MERGE3_(ushort, run_merge3, dst, src1, src2, src3);
+        MERGE3_(short,  run_merge3, dst, src1, src2, src3);
+        MERGE3_(float,  run_merge3, dst, src1, src2, src3);
 
-        for (; w < width; w++)
-        {
-            out[3*w    ] = in1[w];
-            out[3*w + 1] = in2[w];
-            out[3*w + 2] = in3[w];
-        }
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");  // no MERGE3_ branch matched
     }
 };
 
@@ -2407,13 +2428,16 @@ GAPI_FLUID_KERNEL(GFluidMerge4, cv::gapi::core::GMerge4, false)
     static void run(const View &src1, const View &src2, const View &src3, const View &src4,
                     Buffer &dst)
     {
+        GAPI_Assert((dst.meta().depth == CV_8U) && (src1.meta().depth == CV_8U) &&
+                    (src2.meta().depth == CV_8U) && (src3.meta().depth == CV_8U) &&
+                    (src4.meta().depth == CV_8U) && (4 == dst.meta().chan));  // src4 is read as uchar too
+
         const auto *in1 = src1.InLine<uchar>(0);
         const auto *in2 = src2.InLine<uchar>(0);
         const auto *in3 = src3.InLine<uchar>(0);
         const auto *in4 = src4.InLine<uchar>(0);
               auto *out = dst.OutLine<uchar>();
 
-        GAPI_Assert(4 == dst.meta().chan);
         int width = dst.length();
 
         int w = 0; // cycle counter

diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -276,13 +276,21 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[],
                     CV_CPU_DISPATCH_MODES_ALL);
 }
 
-int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
-                uchar out[], const int width)
-{
-    CV_CPU_DISPATCH(merge3_simd, (in1, in2, in3, out, width),
-                    CV_CPU_DISPATCH_MODES_ALL);
+#define MERGE3_SIMD(T) /* defines merge3_simd for T via CV_CPU_DISPATCH */ \
+int merge3_simd(const T in1[], const T in2[], const T in3[],        \
+                T out[], const int width)                           \
+{                                                                   \
+    CV_CPU_DISPATCH(merge3_simd, (in1, in2, in3, out, width),       \
+                    CV_CPU_DISPATCH_MODES_ALL);                     \
 }
 
+MERGE3_SIMD(uchar)
+MERGE3_SIMD(short)
+MERGE3_SIMD(ushort)
+MERGE3_SIMD(float)
+
+#undef MERGE3_SIMD  // helper macro is not needed past this point
+
 int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
                 const uchar in4[], uchar out[], const int width)
 {

diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -216,8 +216,16 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
 int split4_simd(const uchar in[], uchar out1[], uchar out2[],
                 uchar out3[], uchar out4[], const int width);
 
-int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
-               uchar out[], const int width);
+#define MERGE3_SIMD(T) /* declares the merge3_simd overload for T */ \
+int merge3_simd(const T in1[], const T in2[], const T in3[],    \
+                T out[], const int width);
+
+MERGE3_SIMD(uchar)
+MERGE3_SIMD(short)
+MERGE3_SIMD(ushort)
+MERGE3_SIMD(float)
+
+#undef MERGE3_SIMD  // helper macro is not needed past this point
 
 int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
                 const uchar in4[], uchar out[], const int width);

diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -322,12 +322,21 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
 int split4_simd(const uchar in[], uchar out1[], uchar out2[],
                 uchar out3[], uchar out4[], const int width);
 
-int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
-                uchar out[], const int width);
+#define MERGE3_SIMD(T) /* declares the merge3_simd overload for T */ \
+int merge3_simd(const T in1[], const T in2[], const T in3[],    \
+                T out[], const int width);
+
+MERGE3_SIMD(uchar)
+MERGE3_SIMD(short)
+MERGE3_SIMD(ushort)
+MERGE3_SIMD(float)
+
+#undef MERGE3_SIMD  // helper macro is not needed past this point
 
 int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
                 const uchar in4[], uchar out[], const int width);
 
+
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 #define SRC_SHORT_OR_USHORT std::is_same<SRC, short>::value || std::is_same<SRC, ushort>::value
@@ -2530,33 +2539,41 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[],
 //
 //-------------------------
 
-int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
-                uchar out[], const int width)
-{
-    constexpr int nlanes = v_uint8::nlanes;
-    if (width < nlanes)
-        return 0;
-
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= width - nlanes; x += nlanes)
-        {
-            v_uint8 a, b, c;
-            a = vx_load(&in1[x]);
-            b = vx_load(&in2[x]);
-            c = vx_load(&in3[x]);
-            v_store_interleave(&out[3 * x], a, b, c);
-        }
-        if (x < width)
-        {
-            x = width - nlanes;
-            continue;
-        }
-        break;
-    }
-    return x;
-}
+#define MERGE3_SIMD(T) /* defines merge3_simd for element type T */ \
+int merge3_simd(const T in1[], const T in2[], const T in3[],        \
+                T out[], const int width)                           \
+{                                                                   \
+    constexpr int nlanes = vector_type_of_t<T>::nlanes;             \
+    if (width < nlanes) /* fewer elements than one vector holds */  \
+        return 0;       /* report that nothing was processed */     \
+                                                                    \
+    int x = 0;                                                      \
+    for (;;)                                                        \
+    {                                                               \
+        for (; x <= width - nlanes; x += nlanes)                    \
+        {                                                           \
+            vector_type_of_t<T> a, b, c;                            \
+            a = vx_load(&in1[x]);                                   \
+            b = vx_load(&in2[x]);                                   \
+            c = vx_load(&in3[x]);                                   \
+            v_store_interleave(&out[3 * x], a, b, c);               \
+        }                                                           \
+        if (x < width) /* a tail shorter than nlanes is left */     \
+        {                                                           \
+            x = width - nlanes; /* step back: next vector ends at width */ \
+            continue;                                               \
+        }                                                           \
+        break;                                                      \
+    }                                                               \
+    return x;                                                       \
+}
+
+MERGE3_SIMD(uchar)
+MERGE3_SIMD(short)
+MERGE3_SIMD(ushort)
+MERGE3_SIMD(float)
+
+#undef MERGE3_SIMD  // helper macro is not needed past this point
 
 //-------------------------
 //
@@ -2926,6 +2943,8 @@ CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const SRC* inx, float* outx)
 int convertto_simd(const SRC in[], DST out[], const int length)    \
 {                                                                  \
     constexpr int nlanes = vector_type_of_t<DST>::nlanes;          \
+    if (length < nlanes)                                           \
+        return 0;                                                  \
                                                                    \
     int x = 0;                                                     \
     for (;;)                                                       \
@@ -3093,6 +3112,9 @@ int convertto_scaled_simd(const SRC in[], DST out[], const float alpha,     \
                           const float beta, const int length)               \
 {                                                                           \
     constexpr int nlanes = vector_type_of_t<DST>::nlanes;                   \
+    if (length < nlanes)                                                    \
+        return 0;                                                           \
+                                                                            \
     v_float32 v_alpha = vx_setall_f32(alpha);                               \
     v_float32 v_beta = vx_setall_f32(beta);                                 \
                                                                             \

diff --git a/modules/gapi/src/backends/fluid/gfluidutils.hpp b/modules/gapi/src/backends/fluid/gfluidutils.hpp
@@ -86,6 +86,23 @@ using cv::gapi::own::rintd;
         return;                                            \
     }
 
+#define MERGE3_(T, OP, ...) /* reads dst, src1, src2, src3 by name from the caller's scope */ \
+    if (cv::DataType<T>::depth == dst.meta().depth &&      \
+        cv::DataType<T>::depth == src1.meta().depth)       \
+    {                                                      \
+        GAPI_DbgAssert(dst.length() == src1.length());     \
+        GAPI_DbgAssert(dst.length() == src2.length());     \
+        GAPI_DbgAssert(dst.length() == src3.length());     \
+                                                           \
+        GAPI_DbgAssert(1 == src1.meta().chan);             \
+        GAPI_DbgAssert(1 == src2.meta().chan);             \
+        GAPI_DbgAssert(1 == src3.meta().chan);             \
+        GAPI_DbgAssert(3 == dst.meta().chan);              \
+                                                           \
+        OP<T>(__VA_ARGS__);                                \
+        return; /* leaves the function that used the macro */ \
+    }
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv