From d02e8baa24e5c8fa307bf7e3af1ca2594205b173 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Mon, 29 Apr 2024 03:47:31 +0200
Subject: [PATCH 01/13] HAL mul8x8to16 added

---
 modules/core/src/arithm.cpp          | 51 +++++++++++++++++++++++-----
 modules/core/src/hal_replacement.hpp |  2 ++
 2 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index f4ca2d7da966..a3eb2911b37e 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -587,7 +587,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
 static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                       InputArray _mask, int dtype, BinaryFuncC* tab, bool muldiv=false,
-                      void* usrdata=0, int oclop=-1 )
+                      void* usrdata=0, int oclop=-1, bool skipConversion = false )
 {
     const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
     _InputArray::KindFlag kind1 = psrc1->kind(), kind2 = psrc2->kind();
@@ -717,9 +717,9 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
                usrdata, oclop, haveScalar))
 
-    BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
-    BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
-    BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
+    BinaryFunc cvtsrc1 = type1 == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(type1, wtype));
+    BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(type2, wtype));
+    BinaryFunc cvtdst = dtype == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(wtype, dtype));
 
     size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
     size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
@@ -949,7 +949,16 @@ void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask)
 namespace cv
 {
 
-static BinaryFuncC* getMulTab()
+static void mul8uExtendWrapper(const uchar* src1, size_t step1,
+                               const uchar* src2, size_t step2,
+                               uchar* dst, size_t step, int width, int height,
+                               void* usrdata)
+{
+    double scale = *((double*)usrdata);
+    cv_hal_mul8uExtend(src1, step1, src2, step2, (ushort*)dst, step, width, height, scale);
+}
+
+static BinaryFuncC* getMulTab(bool extendMul)
 {
     static BinaryFuncC mulTab[CV_DEPTH_MAX] =
     {
@@ -958,6 +967,28 @@ static BinaryFuncC* getMulTab()
         (BinaryFuncC)cv::hal::mul64f, 0
     };
 
+    if (extendMul)
+    {
+        static BinaryFuncC extendMulTab[] =
+        {
+            (BinaryFuncC)mul8uExtendWrapper,
+        };
+
+        // check that HAL function works properly
+        uchar a = 0, b = 0;
+        ushort c = 0;
+        int res = cv_hal_mul8uExtend(/* src1_data */ &a, /* src1_step */ 1, /* src2_data */ &b, /* src2_step */ 1,
+                                     /* dst_data */ &c, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1);
+        if (res == 0)
+        {
+            return extendMulTab;
+        }
+        else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED)
+        {
+            CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8Extend returned %d (0x%08x)", res, res));
+        }
+    }
+
     return mulTab;
 }
 
@@ -986,12 +1017,16 @@ static BinaryFuncC* getRecipTab()
 }
 
 void multiply(InputArray src1, InputArray src2,
-                  OutputArray dst, double scale, int dtype)
+              OutputArray dst, double scale, int dtype)
 {
     CV_INSTRUMENT_REGION();
 
-    arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
-              true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
+    bool extendMul = ((src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) ||
+                     ((src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S));
+
+    arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(extendMul),
+              /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,
+              /*skipConversion*/ extendMul);
 }
 
 void divide(InputArray src1, InputArray src2,
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index bbdfc1e180ec..a27abdf05a22 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -324,6 +324,7 @@ inline int hal_ni_mul16s(const short *src1_data, size_t src1_step, const short *
 inline int hal_ni_mul32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul8uExtend(const uchar* src1_data, size_t src1_step, const uchar* src2_data, size_t src2_step, ushort* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 //! @}
 
 /**
@@ -378,6 +379,7 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_
 #define cv_hal_mul32s hal_ni_mul32s
 #define cv_hal_mul32f hal_ni_mul32f
 #define cv_hal_mul64f hal_ni_mul64f
+#define cv_hal_mul8uExtend hal_ni_mul8uExtend
 #define cv_hal_div8u hal_ni_div8u
 #define cv_hal_div8s hal_ni_div8s
 #define cv_hal_div16u hal_ni_div16u

From d9a108b296a5af0fc94696432d4b7855caeea489 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Mon, 29 Apr 2024 04:07:11 +0200
Subject: [PATCH 02/13] HAL mul8sExtend (8s -> 16s) added

---
 modules/core/src/arithm.cpp          | 72 ++++++++++++++++++++--------
 modules/core/src/hal_replacement.hpp |  2 +
 2 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index a3eb2911b37e..a172008afd6c 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -958,6 +958,50 @@ static void mul8uExtendWrapper(const uchar* src1, size_t step1,
     cv_hal_mul8uExtend(src1, step1, src2, step2, (ushort*)dst, step, width, height, scale);
 }
 
+static void mul8sExtendWrapper(const uchar* src1, size_t step1,
+                               const uchar* src2, size_t step2,
+                               uchar* dst, size_t step, int width, int height,
+                               void* usrdata)
+{
+    double scale = *((double*)usrdata);
+    cv_hal_mul8sExtend((schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale);
+}
+
+static bool checkHalMulExtend()
+{
+    bool works = true;
+
+    // check that HAL functions are presented and work properly
+    uchar ua = 0, ub = 0;
+    ushort uc = 0;
+    int res;
+    res = cv_hal_mul8uExtend(/* src1_data */ &ua, /* src1_step */ 1, /* src2_data */ &ub, /* src2_step */ 1,
+                             /* dst_data */ &uc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1);
+    if (res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+    {
+        works = false;
+    }
+    else if (res != 0)
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8uExtend returned %d (0x%08x)", res, res));
+    }
+
+    schar sa = 0, sb = 0;
+    short sc = 0;
+    res = cv_hal_mul8sExtend(/* src1_data */ &sa, /* src1_step */ 1, /* src2_data */ &sb, /* src2_step */ 1,
+                             /* dst_data */ &sc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1);
+    if (res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+    {
+        works = false;
+    }
+    else if (res != 0)
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8sExtend returned %d (0x%08x)", res, res));
+    }
+
+    return works;
+}
+
 static BinaryFuncC* getMulTab(bool extendMul)
 {
     static BinaryFuncC mulTab[CV_DEPTH_MAX] =
@@ -967,29 +1011,12 @@ static BinaryFuncC* getMulTab(bool extendMul)
         (BinaryFuncC)cv::hal::mul64f, 0
     };
 
-    if (extendMul)
+    static BinaryFuncC extendMulTab[] =
     {
-        static BinaryFuncC extendMulTab[] =
-        {
-            (BinaryFuncC)mul8uExtendWrapper,
-        };
-
-        // check that HAL function works properly
-        uchar a = 0, b = 0;
-        ushort c = 0;
-        int res = cv_hal_mul8uExtend(/* src1_data */ &a, /* src1_step */ 1, /* src2_data */ &b, /* src2_step */ 1,
-                                     /* dst_data */ &c, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1);
-        if (res == 0)
-        {
-            return extendMulTab;
-        }
-        else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED)
-        {
-            CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8Extend returned %d (0x%08x)", res, res));
-        }
-    }
+        (BinaryFuncC)mul8uExtendWrapper, (BinaryFuncC)mul8sExtendWrapper,
+    };
 
-    return mulTab;
+    return extendMul ? extendMulTab : mulTab;
 }
 
 static BinaryFuncC* getDivTab()
@@ -1021,8 +1048,11 @@ void multiply(InputArray src1, InputArray src2,
 {
     CV_INSTRUMENT_REGION();
 
+    static bool halMulExtendWorks = checkHalMulExtend();
+
     bool extendMul = ((src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) ||
                      ((src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S));
+    extendMul = extendMul && halMulExtendWorks;
 
     arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(extendMul),
               /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index a27abdf05a22..7d7c9f1b0743 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -325,6 +325,7 @@ inline int hal_ni_mul32s(const int *src1_data, size_t src1_step, const int *src2
 inline int hal_ni_mul32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul8uExtend(const uchar* src1_data, size_t src1_step, const uchar* src2_data, size_t src2_step, ushort* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul8sExtend(const schar* src1_data, size_t src1_step, const schar* src2_data, size_t src2_step, short* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 //! @}
 
 /**
@@ -380,6 +381,7 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_
 #define cv_hal_mul32f hal_ni_mul32f
 #define cv_hal_mul64f hal_ni_mul64f
 #define cv_hal_mul8uExtend hal_ni_mul8uExtend
+#define cv_hal_mul8sExtend hal_ni_mul8sExtend
 #define cv_hal_div8u hal_ni_div8u
 #define cv_hal_div8s hal_ni_div8s
 #define cv_hal_div16u hal_ni_div16u

From 4c263a41f6d60c58198e158c20f3e3b183878e23 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov <alexander.smorkalov@xperience.ai>
Date: Thu, 2 May 2024 17:47:30 +0300
Subject: [PATCH 03/13] Code review fixes.

---
 modules/core/src/arithm.cpp          | 18 +++++++++---------
 modules/core/src/hal_replacement.hpp |  8 ++++----
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index a172008afd6c..d14a2c524ae9 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -949,22 +949,22 @@ void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask)
 namespace cv
 {
 
-static void mul8uExtendWrapper(const uchar* src1, size_t step1,
+static void mul8u16uWrapper(const uchar* src1, size_t step1,
                                const uchar* src2, size_t step2,
                                uchar* dst, size_t step, int width, int height,
                                void* usrdata)
 {
     double scale = *((double*)usrdata);
-    cv_hal_mul8uExtend(src1, step1, src2, step2, (ushort*)dst, step, width, height, scale);
+    cv_hal_mul8u16u(src1, step1, src2, step2, (ushort*)dst, step, width, height, scale);
 }
 
-static void mul8sExtendWrapper(const uchar* src1, size_t step1,
+static void mul8s16sWrapper(const uchar* src1, size_t step1,
                                const uchar* src2, size_t step2,
                                uchar* dst, size_t step, int width, int height,
                                void* usrdata)
 {
     double scale = *((double*)usrdata);
-    cv_hal_mul8sExtend((schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale);
+    cv_hal_mul8s16s((schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale);
 }
 
 static bool checkHalMulExtend()
@@ -975,7 +975,7 @@ static bool checkHalMulExtend()
     uchar ua = 0, ub = 0;
     ushort uc = 0;
     int res;
-    res = cv_hal_mul8uExtend(/* src1_data */ &ua, /* src1_step */ 1, /* src2_data */ &ub, /* src2_step */ 1,
+    res = cv_hal_mul8u16u(/* src1_data */ &ua, /* src1_step */ 1, /* src2_data */ &ub, /* src2_step */ 1,
                              /* dst_data */ &uc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1);
     if (res == CV_HAL_ERROR_NOT_IMPLEMENTED)
     {
@@ -983,12 +983,12 @@ static bool checkHalMulExtend()
     }
     else if (res != 0)
     {
-        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8uExtend returned %d (0x%08x)", res, res));
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8u16s returned %d (0x%08x)", res, res));
     }
 
     schar sa = 0, sb = 0;
     short sc = 0;
-    res = cv_hal_mul8sExtend(/* src1_data */ &sa, /* src1_step */ 1, /* src2_data */ &sb, /* src2_step */ 1,
+    res = cv_hal_mul8s16s(/* src1_data */ &sa, /* src1_step */ 1, /* src2_data */ &sb, /* src2_step */ 1,
                              /* dst_data */ &sc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1);
     if (res == CV_HAL_ERROR_NOT_IMPLEMENTED)
     {
@@ -996,7 +996,7 @@ static bool checkHalMulExtend()
     }
     else if (res != 0)
     {
-        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8sExtend returned %d (0x%08x)", res, res));
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8s16s returned %d (0x%08x)", res, res));
     }
 
     return works;
@@ -1013,7 +1013,7 @@ static BinaryFuncC* getMulTab(bool extendMul)
 
     static BinaryFuncC extendMulTab[] =
     {
-        (BinaryFuncC)mul8uExtendWrapper, (BinaryFuncC)mul8sExtendWrapper,
+        (BinaryFuncC)mul8u16uWrapper, (BinaryFuncC)mul8s16sWrapper,
     };
 
     return extendMul ? extendMulTab : mulTab;
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index 7d7c9f1b0743..d73c0e2db8a1 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -324,8 +324,8 @@ inline int hal_ni_mul16s(const short *src1_data, size_t src1_step, const short *
 inline int hal_ni_mul32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_mul8uExtend(const uchar* src1_data, size_t src1_step, const uchar* src2_data, size_t src2_step, ushort* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
-inline int hal_ni_mul8sExtend(const schar* src1_data, size_t src1_step, const schar* src2_data, size_t src2_step, short* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul8u16u(const uchar* src1_data, size_t src1_step, const uchar* src2_data, size_t src2_step, ushort* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul8s16s(const schar* src1_data, size_t src1_step, const schar* src2_data, size_t src2_step, short* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 //! @}
 
 /**
@@ -380,8 +380,8 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_
 #define cv_hal_mul32s hal_ni_mul32s
 #define cv_hal_mul32f hal_ni_mul32f
 #define cv_hal_mul64f hal_ni_mul64f
-#define cv_hal_mul8uExtend hal_ni_mul8uExtend
-#define cv_hal_mul8sExtend hal_ni_mul8sExtend
+#define cv_hal_mul8u16u hal_ni_mul8u16u
+#define cv_hal_mul8s16s hal_ni_mul8s16s
 #define cv_hal_div8u hal_ni_div8u
 #define cv_hal_div8s hal_ni_div8s
 #define cv_hal_div16u hal_ni_div16u

From 64ad518b6050c4c79cf242fc8a190f6c89945992 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Mon, 6 May 2024 04:31:41 +0200
Subject: [PATCH 04/13] trying to fix HAL search

---
 modules/core/src/arithm.cpp | 39 ++-----------------------------------
 1 file changed, 2 insertions(+), 37 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index d14a2c524ae9..bcb41c751583 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -967,41 +967,6 @@ static void mul8s16sWrapper(const uchar* src1, size_t step1,
     cv_hal_mul8s16s((schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale);
 }
 
-static bool checkHalMulExtend()
-{
-    bool works = true;
-
-    // check that HAL functions are presented and work properly
-    uchar ua = 0, ub = 0;
-    ushort uc = 0;
-    int res;
-    res = cv_hal_mul8u16u(/* src1_data */ &ua, /* src1_step */ 1, /* src2_data */ &ub, /* src2_step */ 1,
-                             /* dst_data */ &uc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1);
-    if (res == CV_HAL_ERROR_NOT_IMPLEMENTED)
-    {
-        works = false;
-    }
-    else if (res != 0)
-    {
-        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8u16s returned %d (0x%08x)", res, res));
-    }
-
-    schar sa = 0, sb = 0;
-    short sc = 0;
-    res = cv_hal_mul8s16s(/* src1_data */ &sa, /* src1_step */ 1, /* src2_data */ &sb, /* src2_step */ 1,
-                             /* dst_data */ &sc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1);
-    if (res == CV_HAL_ERROR_NOT_IMPLEMENTED)
-    {
-        works = false;
-    }
-    else if (res != 0)
-    {
-        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8s16s returned %d (0x%08x)", res, res));
-    }
-
-    return works;
-}
-
 static BinaryFuncC* getMulTab(bool extendMul)
 {
     static BinaryFuncC mulTab[CV_DEPTH_MAX] =
@@ -1048,11 +1013,11 @@ void multiply(InputArray src1, InputArray src2,
 {
     CV_INSTRUMENT_REGION();
 
-    static bool halMulExtendWorks = checkHalMulExtend();
+    static bool halMul8to16available = (cv_hal_mul8u16u != hal_ni_mul8u16u) && (cv_hal_mul8s16s != hal_ni_mul8s16s);
 
     bool extendMul = ((src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) ||
                      ((src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S));
-    extendMul = extendMul && halMulExtendWorks;
+    extendMul = extendMul && halMul8to16available;
 
     arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(extendMul),
               /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,

From d26f4f31a13306951abf9845795bdb05447c3045 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Tue, 7 May 2024 11:14:51 +0200
Subject: [PATCH 05/13] ugly fix for HAL: call via CALL_HAL, fall back to arithm_op

---
 modules/core/src/arithm.cpp | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index bcb41c751583..66e96cff72c1 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -949,22 +949,40 @@ void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask)
 namespace cv
 {
 
+static BinaryFuncC* getMulTab(bool extendMul);
+
 static void mul8u16uWrapper(const uchar* src1, size_t step1,
-                               const uchar* src2, size_t step2,
-                               uchar* dst, size_t step, int width, int height,
-                               void* usrdata)
+                            const uchar* src2, size_t step2,
+                            uchar* dst, size_t step, int width, int height,
+                            void* usrdata)
 {
     double scale = *((double*)usrdata);
-    cv_hal_mul8u16u(src1, step1, src2, step2, (ushort*)dst, step, width, height, scale);
+    CALL_HAL(mul8u16u, cv_hal_mul8u16u, src1, step1, src2, step2, (ushort*)dst, step, width, height, scale);
+
+    // fallback if HAL does not work
+    Mat src1Arr(height, width, CV_8UC1, const_cast<uchar*>(src1), step1);
+    Mat src2Arr(height, width, CV_8UC1, const_cast<uchar*>(src2), step2);
+    Mat dstArr(height, width, CV_16UC1, dst, step);
+    arithm_op(src1Arr, src2Arr, dstArr, noArray(), CV_16U, getMulTab(false),
+              /* muldiv */ true, usrdata, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,
+              /*skipConversion*/ false);
 }
 
 static void mul8s16sWrapper(const uchar* src1, size_t step1,
-                               const uchar* src2, size_t step2,
-                               uchar* dst, size_t step, int width, int height,
-                               void* usrdata)
+                            const uchar* src2, size_t step2,
+                            uchar* dst, size_t step, int width, int height,
+                            void* usrdata)
 {
     double scale = *((double*)usrdata);
-    cv_hal_mul8s16s((schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale);
+    CALL_HAL(mul8s16s, cv_hal_mul8s16s, (schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale);
+
+    // fallback if HAL does not work
+    Mat src1Arr(height, width, CV_8SC1, const_cast<uchar*>(src1), step1);
+    Mat src2Arr(height, width, CV_8SC1, const_cast<uchar*>(src2), step2);
+    Mat dstArr(height, width, CV_16SC1, dst, step);
+    arithm_op(src1Arr, src2Arr, dstArr, noArray(), CV_16S, getMulTab(false),
+              /* muldiv */ true, usrdata, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,
+              /*skipConversion*/ false);
 }
 
 static BinaryFuncC* getMulTab(bool extendMul)

From 3fd4a015e86816431ccb74031f1a7be0bceec020 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Tue, 7 May 2024 11:19:25 +0200
Subject: [PATCH 06/13] separate 8u16u and 8s16s HAL availability checks

---
 modules/core/src/arithm.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 66e96cff72c1..134b89e5bce3 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -1031,11 +1031,11 @@ void multiply(InputArray src1, InputArray src2,
 {
     CV_INSTRUMENT_REGION();
 
-    static bool halMul8to16available = (cv_hal_mul8u16u != hal_ni_mul8u16u) && (cv_hal_mul8s16s != hal_ni_mul8s16s);
+    static bool hal8u16uAvailable = cv_hal_mul8u16u != hal_ni_mul8u16u;
+    static bool hal8s16sAvailable = cv_hal_mul8s16s != hal_ni_mul8s16s;
 
-    bool extendMul = ((src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) ||
-                     ((src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S));
-    extendMul = extendMul && halMul8to16available;
+    bool extendMul = (hal8u16uAvailable && (src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) ||
+                     (hal8s16sAvailable && (src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S));
 
     arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(extendMul),
               /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,

From 43be67b121f30fef66d8ef247e67e6bf2e744cb3 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Thu, 16 May 2024 01:28:55 +0200
Subject: [PATCH 07/13] extendedFunc implemented

---
 modules/core/src/arithm.cpp | 159 ++++++++++++++++++++----------------
 1 file changed, 88 insertions(+), 71 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 134b89e5bce3..1a66937be299 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -585,9 +585,14 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
 #endif
 
+typedef int (*ExtendedTypeFunc)(const uchar* src1, size_t step1,
+                                const uchar* src2, size_t step2,
+                                uchar* dst, size_t step, int width, int height,
+                                void*);
+
 static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                       InputArray _mask, int dtype, BinaryFuncC* tab, bool muldiv=false,
-                      void* usrdata=0, int oclop=-1, bool skipConversion = false )
+                      void* usrdata=0, int oclop=-1, ExtendedTypeFunc extendedFunc = nullptr )
 {
     const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
     _InputArray::KindFlag kind1 = psrc1->kind(), kind2 = psrc2->kind();
@@ -617,9 +622,13 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
         Size sz = getContinuousSize2D(src1, src2, dst, src1.channels());
-        BinaryFuncC func = tab[depth1];
-        CV_Assert(func);
-        func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata);
+        if (extendedFunc(src1.ptr(), src1.step, src2.ptr(), src2.step,
+                         dst.ptr(), dst.step, sz.width, sz.height, usrdata) != 0)
+        {
+            BinaryFuncC func = tab[depth1];
+            CV_Assert(func);
+            func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata);
+        }
         return;
     }
 
@@ -717,9 +726,9 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
                usrdata, oclop, haveScalar))
 
-    BinaryFunc cvtsrc1 = type1 == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(type1, wtype));
-    BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(type2, wtype));
-    BinaryFunc cvtdst = dtype == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(wtype, dtype));
+    BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
+    BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
+    BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
 
     size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
     size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
@@ -750,14 +759,22 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         _buf.allocate(bufesz*blocksize + 64);
         buf = _buf.data();
         if( cvtsrc1 )
+        {
             buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         if( cvtsrc2 )
+        {
             buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         wbuf = maskbuf = buf;
         if( cvtdst )
+        {
             buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         if( haveMask )
+        {
             maskbuf = buf;
+        }
 
         for( size_t i = 0; i < it.nplanes; i++, ++it )
         {
@@ -767,38 +784,44 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                 Size bszn(bsz*cn, 1);
                 const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1];
                 uchar* dptr = ptrs[2];
-                if( cvtsrc1 )
+                // try to perform operation with conversion in one call
+                // if fail, use converter functions
+                uchar* opconverted = haveMask ? maskbuf : dptr;
+                if (!extendedFunc || extendedFunc(sptr1, 1, sptr2, 1, opconverted, (!haveMask),
+                                                  bszn.width, bszn.height, usrdata) != 0)
                 {
-                    cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
-                    sptr1 = buf1;
-                }
-                if( ptrs[0] == ptrs[1] )
-                    sptr2 = sptr1;
-                else if( cvtsrc2 )
-                {
-                    cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
-                    sptr2 = buf2;
-                }
-
-                if( !haveMask && !cvtdst )
-                    func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata );
-                else
-                {
-                    func( sptr1, 1, sptr2, 1, wbuf, 0, bszn.width, bszn.height, usrdata );
-                    if( !haveMask )
-                        cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
-                    else if( !cvtdst )
+                    if( cvtsrc1 )
                     {
-                        copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[3] += bsz;
+                        cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
+                        sptr1 = buf1;
                     }
-                    else
+                    if( ptrs[0] == ptrs[1] )
                     {
-                        cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
-                        copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[3] += bsz;
+                        sptr2 = sptr1;
+                    }
+                    else if( cvtsrc2 )
+                    {
+                        cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
+                        sptr2 = buf2;
+                    }
+
+                    uchar* fdst = (haveMask || cvtdst) ? wbuf : dptr;
+                    func(sptr1, 1, sptr2, 1, fdst, (!haveMask && !cvtdst), bszn.width, bszn.height, usrdata);
+
+                    if (cvtdst)
+                    {
+                        uchar* cdst = haveMask ? maskbuf : dptr;
+                        cvtdst( wbuf, 1, 0, 1, cdst, 1, bszn, 0 );
                     }
+                    opconverted = cvtdst ? maskbuf : wbuf;
                 }
+
+                if (haveMask)
+                {
+                    copymask(opconverted, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz);
+                    ptrs[3] += bsz;
+                }
+
                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz;
             }
         }
@@ -949,43 +972,31 @@ void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask)
 namespace cv
 {
 
-static BinaryFuncC* getMulTab(bool extendMul);
-
-static void mul8u16uWrapper(const uchar* src1, size_t step1,
-                            const uchar* src2, size_t step2,
-                            uchar* dst, size_t step, int width, int height,
-                            void* usrdata)
+static int mul8u16uWrapper(const uchar* src1, size_t step1,
+                           const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height,
+                           void* usrdata)
 {
     double scale = *((double*)usrdata);
     CALL_HAL(mul8u16u, cv_hal_mul8u16u, src1, step1, src2, step2, (ushort*)dst, step, width, height, scale);
 
-    // fallback if HAL does not work
-    Mat src1Arr(height, width, CV_8UC1, const_cast<uchar*>(src1), step1);
-    Mat src2Arr(height, width, CV_8UC1, const_cast<uchar*>(src2), step2);
-    Mat dstArr(height, width, CV_16UC1, dst, step);
-    arithm_op(src1Arr, src2Arr, dstArr, noArray(), CV_16U, getMulTab(false),
-              /* muldiv */ true, usrdata, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,
-              /*skipConversion*/ false);
+    // the fallback implementation should be used then
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
 }
 
-static void mul8s16sWrapper(const uchar* src1, size_t step1,
-                            const uchar* src2, size_t step2,
-                            uchar* dst, size_t step, int width, int height,
-                            void* usrdata)
+static int mul8s16sWrapper(const uchar* src1, size_t step1,
+                           const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height,
+                           void* usrdata)
 {
     double scale = *((double*)usrdata);
     CALL_HAL(mul8s16s, cv_hal_mul8s16s, (schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale);
 
-    // fallback if HAL does not work
-    Mat src1Arr(height, width, CV_8SC1, const_cast<uchar*>(src1), step1);
-    Mat src2Arr(height, width, CV_8SC1, const_cast<uchar*>(src2), step2);
-    Mat dstArr(height, width, CV_16SC1, dst, step);
-    arithm_op(src1Arr, src2Arr, dstArr, noArray(), CV_16S, getMulTab(false),
-              /* muldiv */ true, usrdata, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,
-              /*skipConversion*/ false);
+    // the fallback implementation should be used then
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
 }
 
-static BinaryFuncC* getMulTab(bool extendMul)
+static BinaryFuncC* getMulTab()
 {
     static BinaryFuncC mulTab[CV_DEPTH_MAX] =
     {
@@ -994,12 +1005,23 @@ static BinaryFuncC* getMulTab(bool extendMul)
         (BinaryFuncC)cv::hal::mul64f, 0
     };
 
-    static BinaryFuncC extendMulTab[] =
-    {
-        (BinaryFuncC)mul8u16uWrapper, (BinaryFuncC)mul8s16sWrapper,
-    };
+    return mulTab;
+}
 
-    return extendMul ? extendMulTab : mulTab;
+// Picks the fused multiply-with-widening HAL wrapper matching the given
+// source/destination depths; returns nullptr when there is no such wrapper.
+static ExtendedTypeFunc getMulExtFunc(int src1Type, int src2Type, int dstType)
+{
+    if (src1Type == CV_8U && src2Type == CV_8U && dstType == CV_16U)
+    {
+        return mul8u16uWrapper;
+    }
+    // mul8s16sWrapper reinterprets both inputs as schar, so both must be CV_8S
+    else if (src1Type == CV_8S && src2Type == CV_8S && dstType == CV_16S)
+    {
+        return mul8s16sWrapper;
+    }
+    return nullptr;
 }
 
 static BinaryFuncC* getDivTab()
@@ -1031,15 +1053,10 @@ void multiply(InputArray src1, InputArray src2,
 {
     CV_INSTRUMENT_REGION();
 
-    static bool hal8u16uAvailable = cv_hal_mul8u16u != hal_ni_mul8u16u;
-    static bool hal8s16sAvailable = cv_hal_mul8s16s != hal_ni_mul8s16s;
-
-    bool extendMul = (hal8u16uAvailable && (src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) ||
-                     (hal8s16sAvailable && (src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S));
-
-    arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(extendMul),
+    ExtendedTypeFunc mulExtFunc = getMulExtFunc(src1.depth(), src2.depth(), dtype);
+    arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
               /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,
-              /*skipConversion*/ extendMul);
+              /* extendedFunc */ mulExtFunc );
 }
 
 void divide(InputArray src1, InputArray src2,

From 703fb46c0edfcddf88b2af2bd4ad9666cf80caff Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Thu, 16 May 2024 01:40:35 +0200
Subject: [PATCH 08/13] compile fix

---
 modules/core/src/arithm.cpp | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 1a66937be299..f12e30efd4ba 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -977,11 +977,15 @@ static int mul8u16uWrapper(const uchar* src1, size_t step1,
                            uchar* dst, size_t step, int width, int height,
                            void* usrdata)
 {
-    double scale = *((double*)usrdata);
-    CALL_HAL(mul8u16u, cv_hal_mul8u16u, src1, step1, src2, step2, (ushort*)dst, step, width, height, scale);
-
-    // the fallback implementation should be used then
-    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    double scale = *((double *)usrdata);
+    int res = cv_hal_mul8u16u(src1, step1, src2, step2, (ushort *)dst, step, width, height, scale);
+    if (res == 0 || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+        return res;
+    else
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8u16u ==> " CVAUX_STR(cv_hal_mul8u16u)
+                                           " returned %d (0x%08x)", res, res));
+    }
 }
 
 static int mul8s16sWrapper(const uchar* src1, size_t step1,
@@ -990,10 +994,14 @@ static int mul8s16sWrapper(const uchar* src1, size_t step1,
                            void* usrdata)
 {
     double scale = *((double*)usrdata);
-    CALL_HAL(mul8s16s, cv_hal_mul8s16s, (schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale);
-
-    // the fallback implementation should be used then
-    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    int res = cv_hal_mul8s16s((schar *)src1, step1, (schar *)src2, step2, (short *)dst, step, width, height, scale);
+    if (res == 0 || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+        return res;
+    else
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8s16s ==> " CVAUX_STR(cv_hal_mul8s16s)
+                                           " returned %d (0x%08x)", res, res));
+    }
 }
 
 static BinaryFuncC* getMulTab()

From 7db6d65068d08e4ec1fa344b4def450025c58c11 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Thu, 16 May 2024 01:41:11 +0200
Subject: [PATCH 09/13] minor

---
 modules/core/src/arithm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index f12e30efd4ba..c13b8c84edee 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -977,7 +977,7 @@ static int mul8u16uWrapper(const uchar* src1, size_t step1,
                            uchar* dst, size_t step, int width, int height,
                            void* usrdata)
 {
-    double scale = *((double *)usrdata);
+    double scale = *((double*)usrdata);
     int res = cv_hal_mul8u16u(src1, step1, src2, step2, (ushort *)dst, step, width, height, scale);
     if (res == 0 || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
         return res;

From 59e41eee3fa5533102ebb29a00ad6c14712189a7 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Thu, 16 May 2024 10:13:44 +0200
Subject: [PATCH 10/13] fixes

---
 modules/core/src/arithm.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index c13b8c84edee..7251f34da688 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -622,8 +622,8 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
         Size sz = getContinuousSize2D(src1, src2, dst, src1.channels());
-        if (extendedFunc(src1.ptr(), src1.step, src2.ptr(), src2.step,
-                         dst.ptr(), dst.step, sz.width, sz.height, usrdata) != 0)
+        if (!extendedFunc || extendedFunc(src1.ptr(), src1.step, src2.ptr(), src2.step,
+                                          dst.ptr(), dst.step, sz.width, sz.height, usrdata) != 0)
         {
             BinaryFuncC func = tab[depth1];
             CV_Assert(func);
@@ -979,7 +979,7 @@ static int mul8u16uWrapper(const uchar* src1, size_t step1,
 {
     double scale = *((double*)usrdata);
     int res = cv_hal_mul8u16u(src1, step1, src2, step2, (ushort *)dst, step, width, height, scale);
-    if (res == 0 || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
         return res;
     else
     {
@@ -995,7 +995,7 @@ static int mul8s16sWrapper(const uchar* src1, size_t step1,
 {
     double scale = *((double*)usrdata);
     int res = cv_hal_mul8s16s((schar *)src1, step1, (schar *)src2, step2, (short *)dst, step, width, height, scale);
-    if (res == 0 || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
         return res;
     else
     {

From 17ae223de280dba9ac8e5babe60634eaac101399 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Thu, 16 May 2024 17:27:50 +0200
Subject: [PATCH 11/13] fixed scalar mode

---
 modules/core/src/arithm.cpp | 63 +++++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 24 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 7251f34da688..8de5276c18aa 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -811,7 +811,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                     if (cvtdst)
                     {
                         uchar* cdst = haveMask ? maskbuf : dptr;
-                        cvtdst( wbuf, 1, 0, 1, cdst, 1, bszn, 0 );
+                        cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0);
                     }
                     opconverted = cvtdst ? maskbuf : wbuf;
                 }
@@ -837,13 +837,19 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         _buf.allocate(bufesz*blocksize + 64);
         buf = _buf.data();
         if( cvtsrc1 )
-            buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        {
+            buf1 = buf, buf = alignPtr(buf + blocksize * wsz, 16);
+        }
         buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16);
         wbuf = maskbuf = buf;
         if( cvtdst )
-            buf = alignPtr(buf + blocksize*wsz, 16);
+        {
+            buf = alignPtr(buf + blocksize * wsz, 16);
+        }
         if( haveMask )
+        {
             maskbuf = buf;
+        }
 
         convertAndUnrollScalar( src2, wtype, buf2, blocksize);
 
@@ -857,34 +863,43 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                 const uchar* sptr2 = buf2;
                 uchar* dptr = ptrs[1];
 
-                if( cvtsrc1 )
-                {
-                    cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
-                    sptr1 = buf1;
-                }
-
+                const uchar* extSptr1 = sptr1;
+                const uchar* extSptr2 = sptr2;
                 if( swapped12 )
-                    std::swap(sptr1, sptr2);
-
-                if( !haveMask && !cvtdst )
-                    func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata );
-                else
+                    std::swap(extSptr1, extSptr2);
+                
+                // try to perform operation with conversion in one call
+                // if fail, use converter functions
+                uchar* opconverted = haveMask ? maskbuf : dptr;
+                if (!extendedFunc || extendedFunc(extSptr1, 1, extSptr2, 1, opconverted, 1,
+                                                  bszn.width, bszn.height, usrdata) != 0)
                 {
-                    func( sptr1, 1, sptr2, 1, wbuf, 1, bszn.width, bszn.height, usrdata );
-                    if( !haveMask )
-                        cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
-                    else if( !cvtdst )
+                    if( cvtsrc1 )
                     {
-                        copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[2] += bsz;
+                        cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
+                        sptr1 = buf1;
                     }
-                    else
+
+                    if( swapped12 )
+                        std::swap(sptr1, sptr2);
+
+                    uchar* fdst = ( haveMask || cvtdst ) ? wbuf : dptr;
+                    func( sptr1, 1, sptr2, 1, fdst, 1, bszn.width, bszn.height, usrdata );
+
+                    if (cvtdst)
                     {
-                        cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
-                        copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[2] += bsz;
+                        uchar* cdst = haveMask ? maskbuf : dptr;
+                        cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0);
                     }
+                    opconverted = cvtdst ? maskbuf : wbuf;
+                }
+
+                if (haveMask)
+                {
+                    copymask(opconverted, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz);
+                    ptrs[2] += bsz;
                 }
+                
                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
             }
         }

From 104ff32782723c9d28b1c522cb36ac14c1af89ef Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Thu, 16 May 2024 18:12:50 +0200
Subject: [PATCH 12/13] minor

---
 modules/core/src/arithm.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 8de5276c18aa..fbd425aacf49 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -867,7 +867,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                 const uchar* extSptr2 = sptr2;
                 if( swapped12 )
                     std::swap(extSptr1, extSptr2);
-                
+
                 // try to perform operation with conversion in one call
                 // if fail, use converter functions
                 uchar* opconverted = haveMask ? maskbuf : dptr;
@@ -899,7 +899,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                     copymask(opconverted, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz);
                     ptrs[2] += bsz;
                 }
-                
+
                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
             }
         }

From 8545784f088f26c15a7a60495748cd2d9764bf12 Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Thu, 16 May 2024 18:13:42 +0200
Subject: [PATCH 13/13] default dtype fix

---
 modules/core/src/arithm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index fbd425aacf49..5a189867c2ea 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -1076,7 +1076,7 @@ void multiply(InputArray src1, InputArray src2,
 {
     CV_INSTRUMENT_REGION();
 
-    ExtendedTypeFunc mulExtFunc = getMulExtFunc(src1.depth(), src2.depth(), dtype);
+    ExtendedTypeFunc mulExtFunc = getMulExtFunc(src1.depth(), src2.depth(), dtype < 0 ? dst.depth() : dtype);
     arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
               /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,
               /* extendedFunc */ mulExtFunc );