From d02e8baa24e5c8fa307bf7e3af1ca2594205b173 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Mon, 29 Apr 2024 03:47:31 +0200 Subject: [PATCH 01/13] HAL mul8x8to16 added --- modules/core/src/arithm.cpp | 51 +++++++++++++++++++++++----- modules/core/src/hal_replacement.hpp | 2 ++ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index f4ca2d7da966..a3eb2911b37e 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -587,7 +587,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, int dtype, BinaryFuncC* tab, bool muldiv=false, - void* usrdata=0, int oclop=-1 ) + void* usrdata=0, int oclop=-1, bool skipConversion = false ) { const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; _InputArray::KindFlag kind1 = psrc1->kind(), kind2 = psrc2->kind(); @@ -717,9 +717,9 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype, usrdata, oclop, haveScalar)) - BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype); - BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype); - BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype); + BinaryFunc cvtsrc1 = type1 == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(type1, wtype)); + BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(type2, wtype)); + BinaryFunc cvtdst = dtype == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(wtype, dtype)); size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2); size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype); @@ -949,7 +949,16 @@ void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask) namespace cv { -static BinaryFuncC* getMulTab() +static void mul8uExtendWrapper(const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void* usrdata) +{ + double scale = *((double*)usrdata); + cv_hal_mul8uExtend(src1, step1, src2, step2, (ushort*)dst, step, width, height, scale); +} + +static BinaryFuncC* getMulTab(bool extendMul) { static BinaryFuncC mulTab[CV_DEPTH_MAX] = { @@ -958,6 +967,28 @@ static BinaryFuncC* getMulTab() (BinaryFuncC)cv::hal::mul64f, 0 }; + if (extendMul) + { + static BinaryFuncC extendMulTab[] = + { + (BinaryFuncC)mul8uExtendWrapper, + }; + + // check that HAL function works properly + uchar a = 0, b = 0; + ushort c = 0; + int res = cv_hal_mul8uExtend(/* src1_data */ &a, /* src1_step */ 1, /* src2_data */ &b, /* src2_step */ 1, + /* dst_data */ &c, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1); + if (res == 0) + { + return extendMulTab; + } + else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) + { + CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8Extend returned %d (0x%08x)", res, res)); + } + } + return mulTab; } @@ -986,12 +1017,16 @@ static BinaryFuncC* getRecipTab() } void multiply(InputArray src1, InputArray src2, - OutputArray dst, double scale, int dtype) + OutputArray dst, double scale, int dtype) { CV_INSTRUMENT_REGION(); - arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), - true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE); + bool extendMul = ((src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) || + ((src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S)); + + arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(extendMul), + /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE, + /*skipConversion*/ extendMul); } void divide(InputArray src1, InputArray src2, diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp index bbdfc1e180ec..a27abdf05a22 100644 --- a/modules/core/src/hal_replacement.hpp +++ b/modules/core/src/hal_replacement.hpp @@ -324,6 +324,7 @@ inline int hal_ni_mul16s(const short *src1_data, size_t src1_step, const short * inline int hal_ni_mul32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } inline int hal_ni_mul32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_mul8uExtend(const uchar* src1_data, size_t src1_step, const uchar* src2_data, size_t src2_step, ushort* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } //! @} /** @@ -378,6 +379,7 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_ #define cv_hal_mul32s hal_ni_mul32s #define cv_hal_mul32f hal_ni_mul32f #define cv_hal_mul64f hal_ni_mul64f +#define cv_hal_mul8uExtend hal_ni_mul8uExtend #define cv_hal_div8u hal_ni_div8u #define cv_hal_div8s hal_ni_div8s #define cv_hal_div16u hal_ni_div16u From d9a108b296a5af0fc94696432d4b7855caeea489 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Mon, 29 Apr 2024 04:07:11 +0200 Subject: [PATCH 02/13] 8s added --- modules/core/src/arithm.cpp | 72 ++++++++++++++++++++-------- modules/core/src/hal_replacement.hpp | 2 + 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index a3eb2911b37e..a172008afd6c 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -958,6 +958,50 @@ static void mul8uExtendWrapper(const uchar* src1, size_t step1, cv_hal_mul8uExtend(src1, step1, src2, step2, (ushort*)dst, step, width, height, scale); } +static void mul8sExtendWrapper(const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void* usrdata) +{ + double scale = *((double*)usrdata); + cv_hal_mul8sExtend((schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale); +} + +static bool checkHalMulExtend() +{ + bool works = true; + + // check that HAL functions are presented and work properly + uchar ua = 0, ub = 0; + ushort uc = 0; + int res; + res = cv_hal_mul8uExtend(/* src1_data */ &ua, /* src1_step */ 1, /* src2_data */ &ub, /* src2_step */ 1, + /* dst_data */ &uc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1); + if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) + { + works = false; + } + else if (res != 0) + { + CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8uExtend returned %d (0x%08x)", res, res)); + } + + schar sa = 0, sb = 0; + short sc = 0; + res = cv_hal_mul8sExtend(/* src1_data */ &sa, /* src1_step */ 1, /* src2_data */ &sb, /* src2_step */ 1, + /* dst_data */ &sc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1); + if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) + { + works = false; + } + else if (res != 0) + { + CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8sExtend returned %d (0x%08x)", res, res)); + } + + return works; +} + static BinaryFuncC* getMulTab(bool extendMul) { static BinaryFuncC mulTab[CV_DEPTH_MAX] = @@ -967,29 +1011,12 @@ static BinaryFuncC* getMulTab(bool extendMul) (BinaryFuncC)cv::hal::mul64f, 0 }; - if (extendMul) + static BinaryFuncC extendMulTab[] = { - static BinaryFuncC extendMulTab[] = - { - (BinaryFuncC)mul8uExtendWrapper, - }; - - // check that HAL function works properly - uchar a = 0, b = 0; - ushort c = 0; - int res = cv_hal_mul8uExtend(/* src1_data */ &a, /* src1_step */ 1, /* src2_data */ &b, /* src2_step */ 1, - /* dst_data */ &c, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1); - if (res == 0) - { - return extendMulTab; - } - else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) - { - CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8Extend returned %d (0x%08x)", res, res)); - } - } + (BinaryFuncC)mul8uExtendWrapper, (BinaryFuncC)mul8sExtendWrapper, + }; - return mulTab; + return extendMul ? extendMulTab : mulTab; } static BinaryFuncC* getDivTab() @@ -1021,8 +1048,11 @@ void multiply(InputArray src1, InputArray src2, { CV_INSTRUMENT_REGION(); + static bool halMulExtendWorks = checkHalMulExtend(); + bool extendMul = ((src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) || ((src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S)); + extendMul = extendMul && halMulExtendWorks; arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(extendMul), /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE, diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp index a27abdf05a22..7d7c9f1b0743 100644 --- a/modules/core/src/hal_replacement.hpp +++ b/modules/core/src/hal_replacement.hpp @@ -325,6 +325,7 @@ inline int hal_ni_mul32s(const int *src1_data, size_t src1_step, const int *src2 inline int hal_ni_mul32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } inline int hal_ni_mul8uExtend(const uchar* src1_data, size_t src1_step, const uchar* src2_data, size_t src2_step, ushort* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_mul8sExtend(const schar* src1_data, size_t src1_step, const schar* src2_data, size_t src2_step, short* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } //! @} /** @@ -380,6 +381,7 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_ #define cv_hal_mul32f hal_ni_mul32f #define cv_hal_mul64f hal_ni_mul64f #define cv_hal_mul8uExtend hal_ni_mul8uExtend +#define cv_hal_mul8sExtend hal_ni_mul8sExtend #define cv_hal_div8u hal_ni_div8u #define cv_hal_div8s hal_ni_div8s #define cv_hal_div16u hal_ni_div16u From 4c263a41f6d60c58198e158c20f3e3b183878e23 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 2 May 2024 17:47:30 +0300 Subject: [PATCH 03/13] Code review fixes. --- modules/core/src/arithm.cpp | 18 +++++++++--------- modules/core/src/hal_replacement.hpp | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index a172008afd6c..d14a2c524ae9 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -949,22 +949,22 @@ void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask) namespace cv { -static void mul8uExtendWrapper(const uchar* src1, size_t step1, +static void mul8u16uWrapper(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* usrdata) { double scale = *((double*)usrdata); - cv_hal_mul8uExtend(src1, step1, src2, step2, (ushort*)dst, step, width, height, scale); + cv_hal_mul8u16u(src1, step1, src2, step2, (ushort*)dst, step, width, height, scale); } -static void mul8sExtendWrapper(const uchar* src1, size_t step1, +static void mul8s16sWrapper(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* usrdata) { double scale = *((double*)usrdata); - cv_hal_mul8sExtend((schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale); + cv_hal_mul8s16s((schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale); } static bool checkHalMulExtend() @@ -975,7 +975,7 @@ static bool checkHalMulExtend() uchar ua = 0, ub = 0; ushort uc = 0; int res; - res = cv_hal_mul8uExtend(/* src1_data */ &ua, /* src1_step */ 1, /* src2_data */ &ub, /* src2_step */ 1, + res = cv_hal_mul8u16u(/* src1_data */ &ua, /* src1_step */ 1, /* src2_data */ &ub, /* src2_step */ 1, /* dst_data */ &uc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1); if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) { @@ -983,12 +983,12 @@ static bool checkHalMulExtend() } else if (res != 0) { - CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8uExtend returned %d (0x%08x)", res, res)); + CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8u16s returned %d (0x%08x)", res, res)); } schar sa = 0, sb = 0; short sc = 0; - res = cv_hal_mul8sExtend(/* src1_data */ &sa, /* src1_step */ 1, /* src2_data */ &sb, /* src2_step */ 1, + res = cv_hal_mul8s16s(/* src1_data */ &sa, /* src1_step */ 1, /* src2_data */ &sb, /* src2_step */ 1, /* dst_data */ &sc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1); if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) { @@ -996,7 +996,7 @@ static bool checkHalMulExtend() } else if (res != 0) { - CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8sExtend returned %d (0x%08x)", res, res)); + CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8s16s returned %d (0x%08x)", res, res)); } return works; @@ -1013,7 +1013,7 @@ static BinaryFuncC* getMulTab(bool extendMul) static BinaryFuncC extendMulTab[] = { - (BinaryFuncC)mul8uExtendWrapper, (BinaryFuncC)mul8sExtendWrapper, + (BinaryFuncC)mul8u16uWrapper, (BinaryFuncC)mul8s16sWrapper, }; return extendMul ? extendMulTab : mulTab; diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp index 7d7c9f1b0743..d73c0e2db8a1 100644 --- a/modules/core/src/hal_replacement.hpp +++ b/modules/core/src/hal_replacement.hpp @@ -324,8 +324,8 @@ inline int hal_ni_mul16s(const short *src1_data, size_t src1_step, const short * inline int hal_ni_mul32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } inline int hal_ni_mul32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int hal_ni_mul8uExtend(const uchar* src1_data, size_t src1_step, const uchar* src2_data, size_t src2_step, ushort* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int hal_ni_mul8sExtend(const schar* src1_data, size_t src1_step, const schar* src2_data, size_t src2_step, short* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_mul8u16u(const uchar* src1_data, size_t src1_step, const uchar* src2_data, size_t src2_step, ushort* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_mul8s16s(const schar* src1_data, size_t src1_step, const schar* src2_data, size_t src2_step, short* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } //! @} /** @@ -380,8 +380,8 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_ #define cv_hal_mul32s hal_ni_mul32s #define cv_hal_mul32f hal_ni_mul32f #define cv_hal_mul64f hal_ni_mul64f -#define cv_hal_mul8uExtend hal_ni_mul8uExtend -#define cv_hal_mul8sExtend hal_ni_mul8sExtend +#define cv_hal_mul8u16u hal_ni_mul8u16u +#define cv_hal_mul8s16s hal_ni_mul8s16s #define cv_hal_div8u hal_ni_div8u #define cv_hal_div8s hal_ni_div8s #define cv_hal_div16u hal_ni_div16u From 64ad518b6050c4c79cf242fc8a190f6c89945992 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Mon, 6 May 2024 04:31:41 +0200 Subject: [PATCH 04/13] trying to fix HAL search --- modules/core/src/arithm.cpp | 39 ++----------------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index d14a2c524ae9..bcb41c751583 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -967,41 +967,6 @@ static void mul8s16sWrapper(const uchar* src1, size_t step1, cv_hal_mul8s16s((schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale); } -static bool checkHalMulExtend() -{ - bool works = true; - - // check that HAL functions are presented and work properly - uchar ua = 0, ub = 0; - ushort uc = 0; - int res; - res = cv_hal_mul8u16u(/* src1_data */ &ua, /* src1_step */ 1, /* src2_data */ &ub, /* src2_step */ 1, - /* dst_data */ &uc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1); - if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) - { - works = false; - } - else if (res != 0) - { - CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8u16s returned %d (0x%08x)", res, res)); - } - - schar sa = 0, sb = 0; - short sc = 0; - res = cv_hal_mul8s16s(/* src1_data */ &sa, /* src1_step */ 1, /* src2_data */ &sb, /* src2_step */ 1, - /* dst_data */ &sc, /* dst_step */ 1, /* width */ 1, /* height */ 1, /* scale */ 1); - if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) - { - works = false; - } - else if (res != 0) - { - CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8s16s returned %d (0x%08x)", res, res)); - } - - return works; -} - static BinaryFuncC* getMulTab(bool extendMul) { static BinaryFuncC mulTab[CV_DEPTH_MAX] = @@ -1048,11 +1013,11 @@ void multiply(InputArray src1, InputArray src2, { CV_INSTRUMENT_REGION(); - static bool halMulExtendWorks = checkHalMulExtend(); + static bool halMul8to16available = (cv_hal_mul8u16u != hal_ni_mul8u16u) && (cv_hal_mul8s16s != hal_ni_mul8s16s); bool extendMul = ((src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) || ((src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S)); - extendMul = extendMul && halMulExtendWorks; + extendMul = extendMul && halMul8to16available; arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(extendMul), /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE, From d26f4f31a13306951abf9845795bdb05447c3045 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Tue, 7 May 2024 11:14:51 +0200 Subject: [PATCH 05/13] ugly fix for HAL --- modules/core/src/arithm.cpp | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index bcb41c751583..66e96cff72c1 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -949,22 +949,40 @@ void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask) namespace cv { +static BinaryFuncC* getMulTab(bool extendMul); + static void mul8u16uWrapper(const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, - void* usrdata) + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void* usrdata) { double scale = *((double*)usrdata); - cv_hal_mul8u16u(src1, step1, src2, step2, (ushort*)dst, step, width, height, scale); + CALL_HAL(mul8u16u, cv_hal_mul8u16u, src1, step1, src2, step2, (ushort*)dst, step, width, height, scale); + + // fallback if HAL does not work + Mat src1Arr(height, width, CV_8UC1, const_cast(src1), step1); + Mat src2Arr(height, width, CV_8UC1, const_cast(src2), step2); + Mat dstArr(height, width, CV_16UC1, dst, step); + arithm_op(src1Arr, src2Arr, dstArr, noArray(), CV_16U, getMulTab(false), + /* muldiv */ true, usrdata, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE, + /*skipConversion*/ false); } static void mul8s16sWrapper(const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, - void* usrdata) + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void* usrdata) { double scale = *((double*)usrdata); - cv_hal_mul8s16s((schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale); + CALL_HAL(mul8s16s, cv_hal_mul8s16s, (schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale); + + // fallback if HAL does not work + Mat src1Arr(height, width, CV_8SC1, const_cast(src1), step1); + Mat src2Arr(height, width, CV_8SC1, const_cast(src2), step2); + Mat dstArr(height, width, CV_16SC1, dst, step); + arithm_op(src1Arr, src2Arr, dstArr, noArray(), CV_16S, getMulTab(false), + /* muldiv */ true, usrdata, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE, + /*skipConversion*/ false); } static BinaryFuncC* getMulTab(bool extendMul) From 3fd4a015e86816431ccb74031f1a7be0bceec020 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Tue, 7 May 2024 11:19:25 +0200 Subject: [PATCH 06/13] separate 8u and 16u --- modules/core/src/arithm.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 66e96cff72c1..134b89e5bce3 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -1031,11 +1031,11 @@ void multiply(InputArray src1, InputArray src2, { CV_INSTRUMENT_REGION(); - static bool halMul8to16available = (cv_hal_mul8u16u != hal_ni_mul8u16u) && (cv_hal_mul8s16s != hal_ni_mul8s16s); + static bool hal8u16uAvailable = cv_hal_mul8u16u != hal_ni_mul8u16u; + static bool hal8s16sAvailable = cv_hal_mul8s16s != hal_ni_mul8s16s; - bool extendMul = ((src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) || - ((src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S)); - extendMul = extendMul && halMul8to16available; + bool extendMul = (hal8u16uAvailable && (src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) || + (hal8s16sAvailable && (src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S)); arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(extendMul), /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE, From 43be67b121f30fef66d8ef247e67e6bf2e744cb3 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Thu, 16 May 2024 01:28:55 +0200 Subject: [PATCH 07/13] extendedFunc implemented --- modules/core/src/arithm.cpp | 159 ++++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 71 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 134b89e5bce3..1a66937be299 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -585,9 +585,14 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, #endif +typedef int (*ExtendedTypeFunc)(const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void*); + static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, int dtype, BinaryFuncC* tab, bool muldiv=false, - void* usrdata=0, int oclop=-1, bool skipConversion = false ) + void* usrdata=0, int oclop=-1, ExtendedTypeFunc extendedFunc = nullptr ) { const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; _InputArray::KindFlag kind1 = psrc1->kind(), kind2 = psrc2->kind(); @@ -617,9 +622,13 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); Size sz = getContinuousSize2D(src1, src2, dst, src1.channels()); - BinaryFuncC func = tab[depth1]; - CV_Assert(func); - func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata); + if (extendedFunc(src1.ptr(), src1.step, src2.ptr(), src2.step, + dst.ptr(), dst.step, sz.width, sz.height, usrdata) != 0) + { + BinaryFuncC func = tab[depth1]; + CV_Assert(func); + func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata); + } return; } @@ -717,9 +726,9 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype, usrdata, oclop, haveScalar)) - BinaryFunc cvtsrc1 = type1 == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(type1, wtype)); - BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(type2, wtype)); - BinaryFunc cvtdst = dtype == wtype ? 0 : (skipConversion ? nullptr : getConvertFunc(wtype, dtype)); + BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype); + BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype); + BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype); size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2); size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype); @@ -750,14 +759,22 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, _buf.allocate(bufesz*blocksize + 64); buf = _buf.data(); if( cvtsrc1 ) + { buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); + } if( cvtsrc2 ) + { buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16); + } wbuf = maskbuf = buf; if( cvtdst ) + { buf = alignPtr(buf + blocksize*wsz, 16); + } if( haveMask ) + { maskbuf = buf; + } for( size_t i = 0; i < it.nplanes; i++, ++it ) { @@ -767,38 +784,44 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, Size bszn(bsz*cn, 1); const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1]; uchar* dptr = ptrs[2]; - if( cvtsrc1 ) + // try to perform operation with conversion in one call + // if fail, use converter functions + uchar* opconverted = haveMask ? maskbuf : dptr; + if (!extendedFunc || extendedFunc(sptr1, 1, sptr2, 1, opconverted, (!haveMask), + bszn.width, bszn.height, usrdata) != 0) { - cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); - sptr1 = buf1; - } - if( ptrs[0] == ptrs[1] ) - sptr2 = sptr1; - else if( cvtsrc2 ) - { - cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 ); - sptr2 = buf2; - } - - if( !haveMask && !cvtdst ) - func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata ); - else - { - func( sptr1, 1, sptr2, 1, wbuf, 0, bszn.width, bszn.height, usrdata ); - if( !haveMask ) - cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); - else if( !cvtdst ) + if( cvtsrc1 ) { - copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[3] += bsz; + cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); + sptr1 = buf1; } - else + if( ptrs[0] == ptrs[1] ) { - cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); - copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[3] += bsz; + sptr2 = sptr1; + } + else if( cvtsrc2 ) + { + cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 ); + sptr2 = buf2; + } + + uchar* fdst = (haveMask || cvtdst) ? wbuf : dptr; + func(sptr1, 1, sptr2, 1, fdst, (!haveMask && !cvtdst), bszn.width, bszn.height, usrdata); + + if (cvtdst) + { + uchar* cdst = haveMask ? maskbuf : dptr; + cvtdst( wbuf, 1, 0, 1, cdst, 1, bszn, 0 ); } + opconverted = cvtdst ? maskbuf : wbuf; } + + if (haveMask) + { + copymask(opconverted, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz); + ptrs[3] += bsz; + } + ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz; } } @@ -949,43 +972,31 @@ void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask) namespace cv { -static BinaryFuncC* getMulTab(bool extendMul); - -static void mul8u16uWrapper(const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, - void* usrdata) +static int mul8u16uWrapper(const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void* usrdata) { double scale = *((double*)usrdata); CALL_HAL(mul8u16u, cv_hal_mul8u16u, src1, step1, src2, step2, (ushort*)dst, step, width, height, scale); - // fallback if HAL does not work - Mat src1Arr(height, width, CV_8UC1, const_cast(src1), step1); - Mat src2Arr(height, width, CV_8UC1, const_cast(src2), step2); - Mat dstArr(height, width, CV_16UC1, dst, step); - arithm_op(src1Arr, src2Arr, dstArr, noArray(), CV_16U, getMulTab(false), - /* muldiv */ true, usrdata, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE, - /*skipConversion*/ false); + // the fallback implementation should be used then + return CV_HAL_ERROR_NOT_IMPLEMENTED; } -static void mul8s16sWrapper(const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, int width, int height, - void* usrdata) +static int mul8s16sWrapper(const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void* usrdata) { double scale = *((double*)usrdata); CALL_HAL(mul8s16s, cv_hal_mul8s16s, (schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale); - // fallback if HAL does not work - Mat src1Arr(height, width, CV_8SC1, const_cast(src1), step1); - Mat src2Arr(height, width, CV_8SC1, const_cast(src2), step2); - Mat dstArr(height, width, CV_16SC1, dst, step); - arithm_op(src1Arr, src2Arr, dstArr, noArray(), CV_16S, getMulTab(false), - /* muldiv */ true, usrdata, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE, - /*skipConversion*/ false); + // the fallback implementation should be used then + return CV_HAL_ERROR_NOT_IMPLEMENTED; } -static BinaryFuncC* getMulTab(bool extendMul) +static BinaryFuncC* getMulTab() { static BinaryFuncC mulTab[CV_DEPTH_MAX] = { @@ -994,12 +1005,23 @@ static BinaryFuncC* getMulTab(bool extendMul) (BinaryFuncC)cv::hal::mul64f, 0 }; - static BinaryFuncC extendMulTab[] = - { - (BinaryFuncC)mul8u16uWrapper, (BinaryFuncC)mul8s16sWrapper, - }; + return mulTab; +} - return extendMul ? extendMulTab : mulTab; +static ExtendedTypeFunc getMulExtFunc(int src1Type, int src2Type, int dstType) +{ + if (src1Type == CV_8U && src2Type == CV_8U && dstType == CV_16U) + { + return mul8u16uWrapper; + } + else if (src1Type == CV_8U && src2Type == CV_8S && dstType == CV_16S) + { + return mul8s16sWrapper; + } + else + { + return nullptr; + } } static BinaryFuncC* getDivTab() @@ -1031,15 +1053,10 @@ void multiply(InputArray src1, InputArray src2, { CV_INSTRUMENT_REGION(); - static bool hal8u16uAvailable = cv_hal_mul8u16u != hal_ni_mul8u16u; - static bool hal8s16sAvailable = cv_hal_mul8s16s != hal_ni_mul8s16s; - - bool extendMul = (hal8u16uAvailable && (src1.depth() == CV_8U) && (src2.depth() == CV_8U) && (dtype == CV_16U)) || - (hal8s16sAvailable && (src1.depth() == CV_8S) && (src2.depth() == CV_8S) && (dtype == CV_16S)); - - arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(extendMul), + ExtendedTypeFunc mulExtFunc = getMulExtFunc(src1.depth(), src2.depth(), dtype); + arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE, - /*skipConversion*/ extendMul); + /* extendedFunc */ mulExtFunc ); } void divide(InputArray src1, InputArray src2, From 703fb46c0edfcddf88b2af2bd4ad9666cf80caff Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Thu, 16 May 2024 01:40:35 +0200 Subject: [PATCH 08/13] compile fix --- modules/core/src/arithm.cpp | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 1a66937be299..f12e30efd4ba 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -977,11 +977,15 @@ static int mul8u16uWrapper(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height, void* usrdata) { - double scale = *((double*)usrdata); - CALL_HAL(mul8u16u, cv_hal_mul8u16u, src1, step1, src2, step2, (ushort*)dst, step, width, height, scale); - - // the fallback implementation should be used then - return CV_HAL_ERROR_NOT_IMPLEMENTED; + double scale = *((double *)usrdata); + int res = cv_hal_mul8u16u(src1, step1, src2, step2, (ushort *)dst, step, width, height, scale); + if (res == 0 || res == CV_HAL_ERROR_NOT_IMPLEMENTED) + return res; + else + { + CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8u16u ==> " CVAUX_STR(cv_hal_mul8u16u) + " returned %d (0x%08x)", res, res)); + } } static int mul8s16sWrapper(const uchar* src1, size_t step1, @@ -990,10 +994,14 @@ static int mul8s16sWrapper(const uchar* src1, size_t step1, void* usrdata) { double scale = *((double*)usrdata); - CALL_HAL(mul8s16s, cv_hal_mul8s16s, (schar*)src1, step1, (schar*)src2, step2, (short*)dst, step, width, height, scale); - - // the fallback implementation should be used then - return CV_HAL_ERROR_NOT_IMPLEMENTED; + int res = cv_hal_mul8s16s((schar *)src1, step1, (schar *)src2, step2, (short *)dst, step, width, height, scale); + if (res == 0 || res == CV_HAL_ERROR_NOT_IMPLEMENTED) + return res; + else + { + CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8s16s ==> " CVAUX_STR(cv_hal_mul8s16s) + " returned %d (0x%08x)", res, res)); + } } static BinaryFuncC* getMulTab() From 7db6d65068d08e4ec1fa344b4def450025c58c11 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Thu, 16 May 2024 01:41:11 +0200 Subject: [PATCH 09/13] minor --- modules/core/src/arithm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index f12e30efd4ba..c13b8c84edee 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -977,7 +977,7 @@ static int mul8u16uWrapper(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height, void* usrdata) { - double scale = *((double *)usrdata); + double scale = *((double*)usrdata); int res = cv_hal_mul8u16u(src1, step1, src2, step2, (ushort *)dst, step, width, height, scale); if (res == 0 || res == CV_HAL_ERROR_NOT_IMPLEMENTED) return res; From 59e41eee3fa5533102ebb29a00ad6c14712189a7 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Thu, 16 May 2024 10:13:44 +0200 Subject: [PATCH 10/13] fixes --- modules/core/src/arithm.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index c13b8c84edee..7251f34da688 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -622,8 +622,8 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); Size sz = getContinuousSize2D(src1, src2, dst, src1.channels()); - if (extendedFunc(src1.ptr(), src1.step, src2.ptr(), src2.step, - dst.ptr(), dst.step, sz.width, sz.height, usrdata) != 0) + if (!extendedFunc || extendedFunc(src1.ptr(), src1.step, src2.ptr(), src2.step, + dst.ptr(), dst.step, sz.width, sz.height, usrdata) != 0) { BinaryFuncC func = tab[depth1]; CV_Assert(func); @@ -979,7 +979,7 @@ static int mul8u16uWrapper(const uchar* src1, size_t step1, { double scale = *((double*)usrdata); int res = cv_hal_mul8u16u(src1, step1, src2, step2, (ushort *)dst, step, width, height, scale); - if (res == 0 || res == CV_HAL_ERROR_NOT_IMPLEMENTED) + if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED) return res; else { @@ -995,7 +995,7 @@ static int mul8s16sWrapper(const uchar* src1, size_t step1, { double scale = *((double*)usrdata); int res = cv_hal_mul8s16s((schar *)src1, step1, (schar *)src2, step2, (short *)dst, step, width, height, scale); - if (res == 0 || res == CV_HAL_ERROR_NOT_IMPLEMENTED) + if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED) return res; else { From 17ae223de280dba9ac8e5babe60634eaac101399 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Thu, 16 May 2024 17:27:50 +0200 Subject: [PATCH 11/13] fixed scalar mode --- modules/core/src/arithm.cpp | 63 +++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 7251f34da688..8de5276c18aa 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -811,7 +811,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, if (cvtdst) { uchar* cdst = haveMask ? maskbuf : dptr; - cvtdst( wbuf, 1, 0, 1, cdst, 1, bszn, 0 ); + cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0); } opconverted = cvtdst ? maskbuf : wbuf; } @@ -837,13 +837,19 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, _buf.allocate(bufesz*blocksize + 64); buf = _buf.data(); if( cvtsrc1 ) - buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); + { + buf1 = buf, buf = alignPtr(buf + blocksize * wsz, 16); + } buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16); wbuf = maskbuf = buf; if( cvtdst ) - buf = alignPtr(buf + blocksize*wsz, 16); + { + buf = alignPtr(buf + blocksize * wsz, 16); + } if( haveMask ) + { maskbuf = buf; + } convertAndUnrollScalar( src2, wtype, buf2, blocksize); @@ -857,34 +863,43 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, const uchar* sptr2 = buf2; uchar* dptr = ptrs[1]; - if( cvtsrc1 ) - { - cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); - sptr1 = buf1; - } - + const uchar* extSptr1 = sptr1; + const uchar* extSptr2 = sptr2; if( swapped12 ) - std::swap(sptr1, sptr2); - - if( !haveMask && !cvtdst ) - func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata ); - else + std::swap(extSptr1, extSptr1); + + // try to perform operation with conversion in one call + // if fail, use converter functions + uchar* opconverted = haveMask ? maskbuf : dptr; + if (!extendedFunc || extendedFunc(extSptr1, 1, extSptr2, 1, opconverted, 1, + bszn.width, bszn.height, usrdata) != 0) { - func( sptr1, 1, sptr2, 1, wbuf, 1, bszn.width, bszn.height, usrdata ); - if( !haveMask ) - cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); - else if( !cvtdst ) + if( cvtsrc1 ) { - copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[2] += bsz; + cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); + sptr1 = buf1; } - else + + if( swapped12 ) + std::swap(sptr1, sptr2); + + uchar* fdst = ( haveMask || cvtdst ) ? wbuf : dptr; + func( sptr1, 1, sptr2, 1, fdst, 1, bszn.width, bszn.height, usrdata ); + + if (cvtdst) { - cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); - copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[2] += bsz; + uchar* cdst = haveMask ? maskbuf : dptr; + cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0); } + opconverted = cvtdst ? maskbuf : wbuf; + } + + if (haveMask) + { + copymask(opconverted, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz); + ptrs[2] += bsz; } + ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz; } } From 104ff32782723c9d28b1c522cb36ac14c1af89ef Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Thu, 16 May 2024 18:12:50 +0200 Subject: [PATCH 12/13] minor --- modules/core/src/arithm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 8de5276c18aa..fbd425aacf49 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -867,7 +867,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, const uchar* extSptr2 = sptr2; if( swapped12 ) std::swap(extSptr1, extSptr1); - + // try to perform operation with conversion in one call // if fail, use converter functions uchar* opconverted = haveMask ? maskbuf : dptr; @@ -899,7 +899,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, copymask(opconverted, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz); ptrs[2] += bsz; } - + ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz; } } From 8545784f088f26c15a7a60495748cd2d9764bf12 Mon Sep 17 00:00:00 2001 From: Rostislav Vasilikhin Date: Thu, 16 May 2024 18:13:42 +0200 Subject: [PATCH 13/13] default dtype fix --- modules/core/src/arithm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index fbd425aacf49..5a189867c2ea 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -1076,7 +1076,7 @@ void multiply(InputArray src1, InputArray src2, { CV_INSTRUMENT_REGION(); - ExtendedTypeFunc mulExtFunc = getMulExtFunc(src1.depth(), src2.depth(), dtype); + ExtendedTypeFunc mulExtFunc = getMulExtFunc(src1.depth(), src2.depth(), dtype < 0 ? dst.depth() : dtype); arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE, /* extendedFunc */ mulExtFunc );