From 69af621ef63a508ab9d1abbfa259a6876622e71e Mon Sep 17 00:00:00 2001
From: Rostislav Vasilikhin
Date: Mon, 20 May 2024 09:43:18 +0200
Subject: [PATCH] Merge pull request #25506 from savuor:rv/hal_mul16

HAL mul8x8to16 added #25506

Fixes #25034

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
---
Reviewer notes (this area is not part of the commit message):
 * ExtendedTypeFunc takes the same argument list as the BinaryFuncC calls in
   arithm_op() and returns an int. arithm_op() treats a return value of 0 as
   "result already written" and any other value as a request to fall back to
   tab[depth] plus the cvtsrc1/cvtsrc2/cvtdst converter functions.
 * In the array-op-scalar path the two operands handed to extendedFunc are
   exchanged with each other when swapped12 is set (extSptr1 <-> extSptr2),
   mirroring the std::swap(sptr1, sptr2) done on the fallback path.
 * mul8s16sWrapper casts both inputs to schar*, so getMulExtFunc() selects it
   only when both sources are CV_8S and the destination is CV_16S.

 modules/core/src/arithm.cpp          | 196 +++++++++++++++++++--------
 modules/core/src/hal_replacement.hpp |   4 +
 2 files changed, 146 insertions(+), 54 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index f4ca2d7da966..5a189867c2ea 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -585,9 +585,14 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
 #endif
 
+typedef int (*ExtendedTypeFunc)(const uchar* src1, size_t step1,
+                                const uchar* src2, size_t step2,
+                                uchar* dst, size_t step, int width, int height,
+                                void*);
+
 static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                       InputArray _mask, int dtype, BinaryFuncC* tab, bool muldiv=false,
-                      void* usrdata=0, int oclop=-1 )
+                      void* usrdata=0, int oclop=-1, ExtendedTypeFunc extendedFunc = nullptr )
 {
     const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
     _InputArray::KindFlag kind1 = psrc1->kind(), kind2 = psrc2->kind();
@@ -617,9 +622,13 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
         Size sz = getContinuousSize2D(src1, src2, dst, src1.channels());
 
-        BinaryFuncC func = tab[depth1];
-        CV_Assert(func);
-        func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata);
+        if (!extendedFunc || extendedFunc(src1.ptr(), src1.step, src2.ptr(), src2.step,
+                                          dst.ptr(), dst.step, sz.width, sz.height, usrdata) != 0)
+        {
+            BinaryFuncC func = tab[depth1];
+            CV_Assert(func);
+            func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata);
+        }
         return;
     }
 
@@ -750,14 +759,22 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         _buf.allocate(bufesz*blocksize + 64);
         buf = _buf.data();
         if( cvtsrc1 )
+        {
             buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         if( cvtsrc2 )
+        {
             buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         wbuf = maskbuf = buf;
         if( cvtdst )
+        {
             buf = alignPtr(buf + blocksize*wsz, 16);
+        }
         if( haveMask )
+        {
             maskbuf = buf;
+        }
 
         for( size_t i = 0; i < it.nplanes; i++, ++it )
         {
@@ -767,38 +784,44 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                 Size bszn(bsz*cn, 1);
                 const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1];
                 uchar* dptr = ptrs[2];
-                if( cvtsrc1 )
-                {
-                    cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
-                    sptr1 = buf1;
-                }
-                if( ptrs[0] == ptrs[1] )
-                    sptr2 = sptr1;
-                else if( cvtsrc2 )
-                {
-                    cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
-                    sptr2 = buf2;
-                }
-
-                if( !haveMask && !cvtdst )
-                    func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata );
-                else
+                // try to perform operation with conversion in one call
+                // if fail, use converter functions
+                uchar* opconverted = haveMask ? maskbuf : dptr;
+                if (!extendedFunc || extendedFunc(sptr1, 1, sptr2, 1, opconverted, (!haveMask),
+                                                  bszn.width, bszn.height, usrdata) != 0)
                 {
-                    func( sptr1, 1, sptr2, 1, wbuf, 0, bszn.width, bszn.height, usrdata );
-                    if( !haveMask )
-                        cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
-                    else if( !cvtdst )
+                    if( cvtsrc1 )
                     {
-                        copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[3] += bsz;
+                        cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
+                        sptr1 = buf1;
                     }
-                    else
+                    if( ptrs[0] == ptrs[1] )
                     {
-                        cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
-                        copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[3] += bsz;
+                        sptr2 = sptr1;
                     }
+                    else if( cvtsrc2 )
+                    {
+                        cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
+                        sptr2 = buf2;
+                    }
+
+                    uchar* fdst = (haveMask || cvtdst) ? wbuf : dptr;
+                    func(sptr1, 1, sptr2, 1, fdst, (!haveMask && !cvtdst), bszn.width, bszn.height, usrdata);
+
+                    if (cvtdst)
+                    {
+                        uchar* cdst = haveMask ? maskbuf : dptr;
+                        cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0);
+                    }
+                    opconverted = cvtdst ? maskbuf : wbuf;
+                }
+
+                if (haveMask)
+                {
+                    copymask(opconverted, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz);
+                    ptrs[3] += bsz;
                 }
+
                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz;
             }
         }
@@ -814,13 +837,19 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         _buf.allocate(bufesz*blocksize + 64);
         buf = _buf.data();
         if( cvtsrc1 )
-            buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
+        {
+            buf1 = buf, buf = alignPtr(buf + blocksize * wsz, 16);
+        }
         buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16);
         wbuf = maskbuf = buf;
         if( cvtdst )
-            buf = alignPtr(buf + blocksize*wsz, 16);
+        {
+            buf = alignPtr(buf + blocksize * wsz, 16);
+        }
         if( haveMask )
+        {
             maskbuf = buf;
+        }
 
         convertAndUnrollScalar( src2, wtype, buf2, blocksize);
 
@@ -834,34 +863,43 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                 const uchar* sptr2 = buf2;
                 uchar* dptr = ptrs[1];
 
-                if( cvtsrc1 )
-                {
-                    cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
-                    sptr1 = buf1;
-                }
-
+                const uchar* extSptr1 = sptr1;
+                const uchar* extSptr2 = sptr2;
                 if( swapped12 )
-                    std::swap(sptr1, sptr2);
+                    std::swap(extSptr1, extSptr2);
 
-                if( !haveMask && !cvtdst )
-                    func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata );
-                else
+                // try to perform operation with conversion in one call
+                // if fail, use converter functions
+                uchar* opconverted = haveMask ? maskbuf : dptr;
+                if (!extendedFunc || extendedFunc(extSptr1, 1, extSptr2, 1, opconverted, 1,
+                                                  bszn.width, bszn.height, usrdata) != 0)
                 {
-                    func( sptr1, 1, sptr2, 1, wbuf, 1, bszn.width, bszn.height, usrdata );
-                    if( !haveMask )
-                        cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
-                    else if( !cvtdst )
+                    if( cvtsrc1 )
                     {
-                        copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[2] += bsz;
+                        cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
+                        sptr1 = buf1;
                     }
-                    else
+
+                    if( swapped12 )
+                        std::swap(sptr1, sptr2);
+
+                    uchar* fdst = ( haveMask || cvtdst ) ? wbuf : dptr;
+                    func( sptr1, 1, sptr2, 1, fdst, 1, bszn.width, bszn.height, usrdata );
+
+                    if (cvtdst)
                     {
-                        cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
-                        copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
-                        ptrs[2] += bsz;
+                        uchar* cdst = haveMask ? maskbuf : dptr;
+                        cvtdst(wbuf, 1, 0, 1, cdst, 1, bszn, 0);
                     }
+                    opconverted = cvtdst ? maskbuf : wbuf;
+                }
+
+                if (haveMask)
+                {
+                    copymask(opconverted, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz);
+                    ptrs[2] += bsz;
                 }
+
                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
             }
         }
@@ -949,6 +987,38 @@ void cv::copyTo(InputArray _src, OutputArray _dst, InputArray _mask)
 namespace cv
 {
 
+static int mul8u16uWrapper(const uchar* src1, size_t step1,
+                           const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height,
+                           void* usrdata)
+{
+    double scale = *((double*)usrdata);
+    int res = cv_hal_mul8u16u(src1, step1, src2, step2, (ushort *)dst, step, width, height, scale);
+    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+        return res;
+    else
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8u16u ==> " CVAUX_STR(cv_hal_mul8u16u)
+            " returned %d (0x%08x)", res, res));
+    }
+}
+
+static int mul8s16sWrapper(const uchar* src1, size_t step1,
+                           const uchar* src2, size_t step2,
+                           uchar* dst, size_t step, int width, int height,
+                           void* usrdata)
+{
+    double scale = *((double*)usrdata);
+    int res = cv_hal_mul8s16s((schar *)src1, step1, (schar *)src2, step2, (short *)dst, step, width, height, scale);
+    if (res == CV_HAL_ERROR_OK || res == CV_HAL_ERROR_NOT_IMPLEMENTED)
+        return res;
+    else
+    {
+        CV_Error_(cv::Error::StsInternal, ("HAL implementation mul8s16s ==> " CVAUX_STR(cv_hal_mul8s16s)
+            " returned %d (0x%08x)", res, res));
+    }
+}
+
 static BinaryFuncC* getMulTab()
 {
     static BinaryFuncC mulTab[CV_DEPTH_MAX] =
@@ -961,6 +1031,22 @@ static BinaryFuncC* getMulTab()
     return mulTab;
 }
 
+static ExtendedTypeFunc getMulExtFunc(int src1Type, int src2Type, int dstType)
+{
+    if (src1Type == CV_8U && src2Type == CV_8U && dstType == CV_16U)
+    {
+        return mul8u16uWrapper;
+    }
+    else if (src1Type == CV_8S && src2Type == CV_8S && dstType == CV_16S)
+    {
+        return mul8s16sWrapper;
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
 static BinaryFuncC* getDivTab()
 {
     static BinaryFuncC divTab[CV_DEPTH_MAX] =
@@ -986,12 +1072,14 @@ static BinaryFuncC* getRecipTab()
 }
 
 void multiply(InputArray src1, InputArray src2,
-              OutputArray dst, double scale, int dtype)
+               OutputArray dst, double scale, int dtype)
 {
     CV_INSTRUMENT_REGION();
 
+    ExtendedTypeFunc mulExtFunc = getMulExtFunc(src1.depth(), src2.depth(), dtype < 0 ? dst.depth() : dtype);
     arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
-              true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
+              /* muldiv */ true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE,
+              /* extendedFunc */ mulExtFunc );
 }
 
 void divide(InputArray src1, InputArray src2,
diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp
index bbdfc1e180ec..d73c0e2db8a1 100644
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -324,6 +324,8 @@ inline int hal_ni_mul16s(const short *src1_data, size_t src1_step, const short *
 inline int hal_ni_mul32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 inline int hal_ni_mul64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul8u16u(const uchar* src1_data, size_t src1_step, const uchar* src2_data, size_t src2_step, ushort* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_mul8s16s(const schar* src1_data, size_t src1_step, const schar* src2_data, size_t src2_step, short* dst_data, size_t dst_step, int width, int height, double scale) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 //! @}
 
 /**
@@ -378,6 +380,8 @@ inline int hal_ni_recip64f(const double *src_data, size_t src_step, double *dst_
 #define cv_hal_mul32s hal_ni_mul32s
 #define cv_hal_mul32f hal_ni_mul32f
 #define cv_hal_mul64f hal_ni_mul64f
+#define cv_hal_mul8u16u hal_ni_mul8u16u
+#define cv_hal_mul8s16s hal_ni_mul8s16s
 #define cv_hal_div8u hal_ni_div8u
 #define cv_hal_div8s hal_ni_div8s
 #define cv_hal_div16u hal_ni_div16u