From 69af621ef63a508ab9d1abbfa259a6876622e71e Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <savuor@gmail.com>
Date: Mon, 20 May 2024 09:43:18 +0200
Subject: [PATCH] Merge pull request #25506 from savuor:rv/hal_mul16

HAL mul8x8to16 added #25506

Fixes #25034

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
---
 modules/core/src/arithm.cpp          | 196 +++++++++++++++++++--------
 modules/core/src/hal_replacement.hpp |   4 +
 2 files changed, 146 insertions(+), 54 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index f4ca2d7da966..5a189867c2ea 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -585,9 +585,14 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
 #endif
 
+typedef int (*ExtendedTypeFunc)(const uchar* src1, size_t step1,  // optional kernel that arithm_op tries before its generic compute/convert path
+                                const uchar* src2, size_t step2,
+                                uchar* dst, size_t step, int width, int height,
+                                void*);  // receives arithm_op's usrdata; return 0 = handled, non-zero = arithm_op falls back to its regular path
+
 static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                       InputArray _mask, int dtype, BinaryFuncC* tab, bool muldiv=false,
-                      void* usrdata=0, int oclop=-1 )
+                      void* usrdata=0, int oclop=-1, ExtendedTypeFunc extendedFunc = nullptr )
 {
     const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
     _InputArray::KindFlag kind1 = psrc1->kind(), kind2 = psrc2->kind();
@@ -617,9 +622,13 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
         Size sz = getContinuousSize2D(src1, src2, dst, src1.channels());
-        BinaryFuncC func = tab[depth1];
-        CV_Assert(func);
-        func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata);
+        if (!extendedFunc || extendedFunc(src1.ptr(), src1.step, src2.ptr(), src2.step,
+                                          dst.ptr(), dst.step, sz.width, sz.height, usrdata) != 0)
+        {
+            BinaryFuncC func = tab[depth1];
+            CV_Assert(func);
+            func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata);
+        }
         return;
     }
 
@@ -750,14 +759,22 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         _buf.allocate(bufesz*blocksize + 64);
         buf = _buf.data();
         if( cvtsrc1 )
+        {
             buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         if( cvtsrc2 )
+        {
             buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         wbuf = maskbuf = buf;
         if( cvtdst )
+        {
             buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         if( haveMask )
+        {
             maskbuf = buf;
+        }
 
         for( size_t i = 0; i < it.nplanes; i++, ++it )
         {
@@ -767,38 +784,44 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                 Size bszn(bsz*cn, 1);
                 const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1];
                 uchar* dptr = ptrs[2];
-                if( cvtsrc1 )
-                {
-                    cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
-                    sptr1 = buf1;
-                }
-                if( ptrs[0] == ptrs[1] )
-                    sptr2 = sptr1;
-                else if( cvtsrc2 )
-                {
-                    cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
-                    sptr2 = buf2;
-                }
-
-                if( !haveMask && !cvtdst )
-                    func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata );
-                else
+                // try to perform operation with conversion in one call
+                // if fail, use converter functions
+                uchar* opconverted = haveMask ? maskbuf : dptr;
+                if (!extendedFunc || extendedFunc(sptr1, 1, sptr2, 1, opconverted, (!haveMask),
+                                                  bszn.width, bszn.height, usrdata) != 0)
                 {
-                    func( sptr1, 1, sptr2, 1, wbuf, 0, bszn.width, bszn.height, usrdata );
-                    if( !haveMask )
-                        cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
-                    else if( !cvtdst )
+                    if( cvtsrc1 )
                     {
-                        copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[3] += bsz;
+                        cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
+                        sptr1 = buf1;
                     }
-                    else
+                    if( ptrs[0] == ptrs[1] )
                     {
-                        cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
-                        copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[3] += bsz;
+                        sptr2 = sptr1;
                     }
+                    else if( cvtsrc2 )
+                    {
+                        cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
+                        sptr2 = buf2;
+                    }
+
+                    uchar* fdst = (haveMask || cvtdst) ? wbuf : dptr;
+                    func(sptr1, 1, sptr2, 1, fdst, (!haveMask && !cvtdst), bszn.width, bszn.height, usrdata);
+
+                    if (cvtdst)
+                    {
+                        uchar* cdst = haveMask ? maskbuf : dptr;
+                        cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0);
+                    }
+                    opconverted = cvtdst ? maskbuf : wbuf;
+                }
+
+                if (haveMask)
+                {
+                    copymask(opconverted, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz);
+                    ptrs[3] += bsz;
                 }
+
                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz;
             }
         }
@@ -814,13 +837,19 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         _buf.allocate(bufesz*blocksize + 64);
         buf = _buf.data();
         if( cvtsrc1 )
-            buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        {
+            buf1 = buf, buf = alignPtr(buf + blocksize * wsz, 16);
+        }
         buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16);
         wbuf = maskbuf = buf;
         if( cvtdst )
-            buf = alignPtr(buf + blocksize*wsz, 16);
+        {
+            buf = alignPtr(buf + blocksize * wsz, 16);
+        }
         if( haveMask )
+        {
             maskbuf = buf;
+        }
 
         convertAndUnrollScalar( src2, wtype, buf2, blocksize);
 
@@ -834,34 +863,43 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                 const uchar* sptr2 = buf2;
                 uchar* dptr = ptrs[1];
 
-                if( cvtsrc1 )
-                {
-                    cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
-                    sptr1 = buf1;
-                }
-
+                const uchar* extSptr1 = sptr1;  // operands for extendedFunc, captured before the cvtsrc1 conversion done in the fallback path
+                const uchar* extSptr2 = sptr2;  // NOTE(review): sptr2 is buf2 (scalar already unrolled as wtype) - confirm extendedFunc can consume it
                 if( swapped12 )
-                    std::swap(sptr1, sptr2);
+                    std::swap(extSptr1, extSptr2);  // exchange both operands, mirroring the sptr1/sptr2 swap used by the fallback path
 
-                if( !haveMask && !cvtdst )
-                    func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata );
-                else
+                // try to perform operation with conversion in one call
+                // if fail, use converter functions
+                uchar* opconverted = haveMask ? maskbuf : dptr;
+                if (!extendedFunc || extendedFunc(extSptr1, 1, extSptr2, 1, opconverted, 1,
+                                                  bszn.width, bszn.height, usrdata) != 0)
                 {
-                    func( sptr1, 1, sptr2, 1, wbuf, 1, bszn.width, bszn.height, usrdata );
-                    if( !haveMask )
-                        cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
-                    else if( !cvtdst )
+                    if( cvtsrc1 )
                     {
-                        copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[2] += bsz;
+                        cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
+                        sptr1 = buf1;
                     }
-                    else
+
+                    if( swapped12 )
+                        std::swap(sptr1, sptr2);
+
+                    uchar* fdst = ( haveMask || cvtdst ) ? wbuf : dptr;
+                    func( sptr1, 1, sptr2, 1, fdst, 1, bszn.width, bszn.height, usrdata );
+
+                    if (cvtdst)
                     {
-                        cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
-                        copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[2] += bsz;
+                        uchar* cdst = haveMask ? maskbuf : dptr;
+                        cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0);
                     }
+                    opconverted = cvtdst ? maskbuf : wbuf;
+                }
+
+                if (haveMask)
+                {
+                    copymask(opconverted, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz);
+                    ptrs[2] += bsz;
                 }
+
                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
             }
         }
@@ -949,6 +987,38 @@ void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask)
 namespace cv
 {
 
+// Adapts the HAL hook cv_hal_mul8u16u to the ExtendedTypeFunc signature.
+// usrdata must point to the double scale factor; dst is written as ushort.
+// CV_HAL_ERROR_OK and CV_HAL_ERROR_NOT_IMPLEMENTED are returned as-is; any other HAL status raises StsInternal.
+static int mul8u16uWrapper(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height, void* usrdata)
+{
+    const double scale = *static_cast<const double*>(usrdata);
+    const int res = cv_hal_mul8u16u(src1, step1, src2, step2, reinterpret_cast<ushort*>(dst), step, width, height, scale);
+    if (res != CV_HAL_ERROR_OK && res != CV_HAL_ERROR_NOT_IMPLEMENTED)
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8u16u ==> " CVAUX_STR(cv_hal_mul8u16u)
+                                           " returned %d (0x%08x)", res, res));
+    }
+    return res;
+}
+
+// Adapts the HAL hook cv_hal_mul8s16s to the ExtendedTypeFunc signature: sources are read as schar, dst is written as short, usrdata points to the double scale.
+// CV_HAL_ERROR_OK and CV_HAL_ERROR_NOT_IMPLEMENTED are returned as-is; any other HAL status raises StsInternal.
+static int mul8s16sWrapper(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height, void* usrdata)
+{
+    const double scale = *static_cast<const double*>(usrdata);
+    const int res = cv_hal_mul8s16s(reinterpret_cast<const schar*>(src1), step1, reinterpret_cast<const schar*>(src2), step2,
+                                    reinterpret_cast<short*>(dst), step, width, height, scale);
+    if (res != CV_HAL_ERROR_OK && res != CV_HAL_ERROR_NOT_IMPLEMENTED)
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8s16s ==> " CVAUX_STR(cv_hal_mul8s16s)
+                                           " returned %d (0x%08x)", res, res));
+    }
+    return res;
+}
+
 static BinaryFuncC* getMulTab()
 {
     static BinaryFuncC mulTab[CV_DEPTH_MAX] =
@@ -961,6 +1031,22 @@ static BinaryFuncC* getMulTab()
     return mulTab;
 }
 
+// Picks the fused "multiply and widen" kernel for the given source and destination depths:
+//   CV_8U * CV_8U -> CV_16U : mul8u16uWrapper
+//   CV_8S * CV_8S -> CV_16S : mul8s16sWrapper
+// Both sources must be CV_8S for the signed kernel because mul8s16sWrapper reinterprets
+// both input buffers as schar; a CV_8U first operand would be misread.
+// Returns nullptr for every other combination; arithm_op treats a null
+// extendedFunc as "use the generic conversion path".
+static ExtendedTypeFunc getMulExtFunc(int src1Type, int src2Type, int dstType)
+{
+    if (src1Type == CV_8U && src2Type == CV_8U && dstType == CV_16U)
+        return mul8u16uWrapper;
+    if (src1Type == CV_8S && src2Type == CV_8S && dstType == CV_16S)
+        return mul8s16sWrapper;
+    return nullptr;
+}
+
 static BinaryFuncC* getDivTab()
 {
     static BinaryFuncC divTab[CV_DEPTH_MAX] =
@@ -986,12 +1072,14 @@ static BinaryFuncC* getRecipTab()
 }
 
 void multiply(InputArray src1, InputArray src2,
-                  OutputArray dst, double scale, int dtype)
+              OutputArray dst, double scale, int dtype)
 {
     CV_INSTRUMENT_REGION();
 
+    // The widening kernels write 16-bit results, so key the lookup on the depth dst will really have (fixed-type dst: its own depth; else dtype without channel bits; else the input depth). NOTE(review): assumes arithm_op gives dst the inputs' type when dtype < 0 - confirm.
+    ExtendedTypeFunc mulExtFunc = getMulExtFunc(src1.depth(), src2.depth(), dst.fixedType() ? dst.depth() : (dtype < 0 ? src1.depth() : CV_MAT_DEPTH(dtype)));
     arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
-              true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
+              /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE, /* extendedFunc */ mulExtFunc);
 }
 
 void divide(InputArray src1, InputArray src2,
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index bbdfc1e180ec..d73c0e2db8a1 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -324,6 +324,8 @@ inline int hal_ni_mul16s(const short *src1_data, size_t src1_step, const short *
 inline int hal_ni_mul32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul8u16u(const uchar* src1_data, size_t src1_step, const uchar* src2_data, size_t src2_step, ushort* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // default stub for the uchar,uchar -> ushort multiply hook: ignores its arguments and returns CV_HAL_ERROR_NOT_IMPLEMENTED
+inline int hal_ni_mul8s16s(const schar* src1_data, size_t src1_step, const schar* src2_data, size_t src2_step, short* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // default stub for the schar,schar -> short multiply hook: ignores its arguments and returns CV_HAL_ERROR_NOT_IMPLEMENTED
 //! @}
 
 /**
@@ -378,6 +380,8 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_
 #define cv_hal_mul32s hal_ni_mul32s
 #define cv_hal_mul32f hal_ni_mul32f
 #define cv_hal_mul64f hal_ni_mul64f
+#define cv_hal_mul8u16u hal_ni_mul8u16u // binds the cv_hal_mul8u16u hook name to the hal_ni_mul8u16u stub
+#define cv_hal_mul8s16s hal_ni_mul8s16s // binds the cv_hal_mul8s16s hook name to the hal_ni_mul8s16s stub
 #define cv_hal_div8u hal_ni_div8u
 #define cv_hal_div8s hal_ni_div8s
 #define cv_hal_div16u hal_ni_div16u