From eb783fea4dbcf1469b7672aafa96f3827ccdbe0d Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin <rostislav.vasilikhin@opencv.ai>
Date: Thu, 16 May 2024 18:11:58 +0200
Subject: [PATCH] HAL added for sub8x32f

---
 modules/core/src/arithm.cpp | 83 ++++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 39 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index bae8fd0cde90..fffffa3d251f 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -584,6 +584,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 }
 
 #endif
+
 typedef int (*ExtendedTypeFunc)(const uchar* src1, size_t step1,
                                 const uchar* src2, size_t step2,
                                 uchar* dst, size_t step, int width, int height,
@@ -866,7 +867,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                 const uchar* extSptr2 = sptr2;
                 if( swapped12 )
-                    std::swap(extSptr1, extSptr1);
-                
+                    std::swap(extSptr1, extSptr2); // exchange the two operand pointers when swapped12 is set
+
                 // try to perform operation with conversion in one call
                 // if fail, use converter functions
                 uchar* opconverted = haveMask ? maskbuf : dptr;
@@ -898,7 +899,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                     copymask(opconverted, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz);
                     ptrs[2] += bsz;
                 }
-                
+
                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
             }
         }
@@ -919,35 +920,33 @@ static BinaryFuncC* getAddTab()
     return addTab;
 }
 
-static BinaryFuncC* getSubTab(bool extendSub);
-
-static void sub8u32fWrapper(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
-                            uchar* dst, size_t step, int width, int height, void* )
+static int sub8u32fWrapper(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height, void* )
 {
-    CALL_HAL(sub8u32f, cv_hal_sub8u32f, src1, step1, src2, step2, (float*)dst, step, width, height);
-
-    // fallback if HAL does not work
-    Mat src1Arr(height, width, CV_8UC1, const_cast<uchar*>(src1), step1);
-    Mat src2Arr(height, width, CV_8UC1, const_cast<uchar*>(src2), step2);
-    Mat dstArr(height, width, CV_32FC1, dst, step);
-    arithm_op(src1Arr, src2Arr, dstArr, noArray(), CV_32F, getSubTab(false),
-              /* muldiv */ false, 0, OCL_OP_SUB, /* skipConversion */ false);
+    // Call cv_hal_sub8u32f with dst viewed as float*; OK / NOT_IMPLEMENTED are returned, any other code raises.
+    const int status = cv_hal_sub8u32f(src1, step1, src2, step2, (float *)dst, step, width, height);
+    if (status != CV_HAL_ERROR_OK && status != CV_HAL_ERROR_NOT_IMPLEMENTED)
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation sub8u32f ==> " CVAUX_STR(cv_hal_sub8u32f)
+                                           " returned %d (0x%08x)", status, status));
+    }
+    return status;
 }
 
-static void sub8s32fWrapper(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
-                            uchar* dst, size_t step, int width, int height, void* )
+static int sub8s32fWrapper(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height, void* )
 {
-    CALL_HAL(sub8s32f, cv_hal_sub8s32f, (schar*)src1, step1, (schar*)src2, step2, (float*)dst, step, width, height);
-
-    // fallback if HAL does not work
-    Mat src1Arr(height, width, CV_8SC1, const_cast<uchar*>(src1), step1);
-    Mat src2Arr(height, width, CV_8SC1, const_cast<uchar*>(src2), step2);
-    Mat dstArr(height, width, CV_32FC1, dst, step);
-    arithm_op(src1Arr, src2Arr, dstArr, noArray(), CV_32F, getSubTab(false),
-              /* muldiv */ false, 0, OCL_OP_SUB, /* skipConversion */ false);
+    // Call cv_hal_sub8s32f with sources viewed as schar* and dst as float*; unexpected codes raise.
+    const int status = cv_hal_sub8s32f((schar*)src1, step1, (schar*)src2, step2, (float *)dst, step, width, height);
+    if (status != CV_HAL_ERROR_OK && status != CV_HAL_ERROR_NOT_IMPLEMENTED)
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation sub8s32f ==> " CVAUX_STR(cv_hal_sub8s32f)
+                                           " returned %d (0x%08x)", status, status));
+    }
+    return status;
 }
 
-static BinaryFuncC* getSubTab(bool extendSub)
+static BinaryFuncC* getSubTab()
 {
     static BinaryFuncC subTab[CV_DEPTH_MAX] =
     {
@@ -958,12 +957,23 @@ static BinaryFuncC* getSubTab(bool extendSub)
         0
     };
 
-    static BinaryFuncC extendSubTab[] =
-    {
-        (BinaryFuncC)sub8u32fWrapper, (BinaryFuncC)sub8s32fWrapper,
-    };
+    return subTab;
+}
 
-    return extendSub ? extendSubTab : subTab;
+// Maps a depth triple to a wrapper: (8U, 8U -> 32F) gives sub8u32fWrapper,
+// (8S, 8S -> 32F) gives sub8s32fWrapper, every other combination gives nullptr.
+static ExtendedTypeFunc getSubExtFunc(int src1Type, int src2Type, int dstType)
+{
+    // both wrappers need equal source depths and a CV_32F destination
+    if (dstType != CV_32F || src1Type != src2Type)
+        return nullptr;
+
+    switch (src1Type)
+    {
+    case CV_8U: return sub8u32fWrapper;
+    case CV_8S: return sub8s32fWrapper;
+    default:    return nullptr;
+    }
 }
 
 static BinaryFuncC* getAbsDiffTab()
@@ -991,18 +1001,13 @@ void cv::add( InputArray src1, InputArray src2, OutputArray dst,
 }
 
 void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst,
-               InputArray mask, int dtype )
+                   InputArray mask, int dtype )
 {
     CV_INSTRUMENT_REGION();
 
-    static bool hal8u32fAvailable = cv_hal_sub8u32f != hal_ni_sub8u32f;
-    static bool hal8s32fAvailable = cv_hal_sub8s32f != hal_ni_sub8s32f;
-
-    bool extendSub = (hal8u32fAvailable && (_src1.depth() == CV_8U) && (_src2.depth() == CV_8U) && (dtype == CV_32F)) ||
-                     (hal8s32fAvailable && (_src1.depth() == CV_8S) && (_src2.depth() == CV_8S) && (dtype == CV_32F));
-
-    arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(extendSub), false, 0, OCL_OP_SUB,
-              /* skipConversion */ extendSub);
+    const int wantedDepth = dtype >= 0 ? dtype : _dst.depth(); // negative dtype: use the depth _dst reports
+    ExtendedTypeFunc extFn = getSubExtFunc(_src1.depth(), _src2.depth(), wantedDepth);
+    arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB, /* extendedFunc */ extFn);
 }
 
 void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )