Skip to content

Commit

Permalink
Merge pull request #24271 from Kumataro:fix24163
Browse files Browse the repository at this point in the history
Fix to convert float32 to int32/uint32 with rounding to nearest (ties to even). #24271

Fix #24163

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under the GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is an accuracy test, performance test and test data in the opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake

(carotene is BSD)
  • Loading branch information
Kumataro committed Dec 25, 2023
1 parent d9d4029 commit dba7186
Show file tree
Hide file tree
Showing 13 changed files with 323 additions and 164 deletions.
8 changes: 8 additions & 0 deletions 3rdparty/carotene/CMakeLists.txt
Expand Up @@ -42,6 +42,14 @@ endif()

if(WITH_NEON)
  # Enable the NEON code paths for the carotene object library.
  target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")

  # Forward an explicitly requested NEON architecture level to the sources.
  # When CAROTENE_NEON_ARCH is not defined, no definition is added here.
  if(DEFINED CAROTENE_NEON_ARCH)
    if(CAROTENE_NEON_ARCH EQUAL 8)
      target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=8")
    elseif(CAROTENE_NEON_ARCH EQUAL 7)
      target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=7")
    else()
      # Any other requested value is passed on as 0.
      target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=0")
    endif()
  endif()
endif()

# we add dummy file to fix XCode build
Expand Down
13 changes: 7 additions & 6 deletions 3rdparty/carotene/src/add_weighted.cpp
Expand Up @@ -39,6 +39,7 @@

#include "common.hpp"
#include "vtransform.hpp"
#include "vround_helper.hpp"

namespace CAROTENE_NS {

Expand Down Expand Up @@ -106,7 +107,7 @@ template <> struct wAdd<s32>
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma + 0.5);
vgamma = vdupq_n_f32(_gamma);
}

void operator() (const VecTraits<s32>::vec128 & v_src0,
Expand All @@ -118,7 +119,7 @@ template <> struct wAdd<s32>

vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vcvtq_s32_f32(vs1);
v_dst = vroundq_s32_f32(vs1);
}

void operator() (const VecTraits<s32>::vec64 & v_src0,
Expand All @@ -130,7 +131,7 @@ template <> struct wAdd<s32>

vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vcvt_s32_f32(vs1);
v_dst = vround_s32_f32(vs1);
}

void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
Expand All @@ -150,7 +151,7 @@ template <> struct wAdd<u32>
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma + 0.5);
vgamma = vdupq_n_f32(_gamma);
}

void operator() (const VecTraits<u32>::vec128 & v_src0,
Expand All @@ -162,7 +163,7 @@ template <> struct wAdd<u32>

vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vcvtq_u32_f32(vs1);
v_dst = vroundq_u32_f32(vs1);
}

void operator() (const VecTraits<u32>::vec64 & v_src0,
Expand All @@ -174,7 +175,7 @@ template <> struct wAdd<u32>

vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vcvt_u32_f32(vs1);
v_dst = vround_u32_f32(vs1);
}

void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
Expand Down
15 changes: 7 additions & 8 deletions 3rdparty/carotene/src/blur.cpp
Expand Up @@ -41,6 +41,7 @@

#include "common.hpp"
#include "saturate_cast.hpp"
#include "vround_helper.hpp"

namespace CAROTENE_NS {

Expand Down Expand Up @@ -198,7 +199,6 @@ void blur3x3(const Size2D &size, s32 cn,
//#define FLOAT_VARIANT_1_9
#ifdef FLOAT_VARIANT_1_9
float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
float32x4_t v0_5 = vdupq_n_f32 (.5);
#else
const int16x8_t vScale = vmovq_n_s16(3640);
#endif
Expand Down Expand Up @@ -283,8 +283,8 @@ void blur3x3(const Size2D &size, s32 cn,
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
tres1 = internal::vroundq_u32_f32(vf1);
tres2 = internal::vroundq_u32_f32(vf2);
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
vst1_u8(drow + x - 8, vmovn_u16(t0));
#else
Expand Down Expand Up @@ -445,8 +445,8 @@ void blur3x3(const Size2D &size, s32 cn,
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
tres1 = internal::vroundq_u32_f32(vf1);
tres2 = internal::vroundq_u32_f32(vf2);
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
vst1_u8(drow + x - 8, vmovn_u16(t0));
#else
Expand Down Expand Up @@ -508,7 +508,6 @@ void blur5x5(const Size2D &size, s32 cn,
#define FLOAT_VARIANT_1_25
#ifdef FLOAT_VARIANT_1_25
float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
float32x4_t v0_5 = vdupq_n_f32 (.5f);
#else
const int16x8_t vScale = vmovq_n_s16(1310);
#endif
Expand Down Expand Up @@ -752,8 +751,8 @@ void blur5x5(const Size2D &size, s32 cn,
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
tres1 = internal::vroundq_u32_f32(vf1);
tres2 = internal::vroundq_u32_f32(vf2);
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
vst1_u8(drow + x - 8, vmovn_u16(t0));
#else
Expand Down
16 changes: 5 additions & 11 deletions 3rdparty/carotene/src/colorconvert.cpp
Expand Up @@ -40,6 +40,7 @@
#include "common.hpp"

#include "saturate_cast.hpp"
#include "vround_helper.hpp"

namespace CAROTENE_NS {

Expand Down Expand Up @@ -1166,17 +1167,10 @@ inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const ui
vSt3 = vmulq_f32(vHF1, vDivTab);
vSt4 = vmulq_f32(vHF2, vDivTab);

float32x4_t bias = vdupq_n_f32(0.5f);

vSt1 = vaddq_f32(vSt1, bias);
vSt2 = vaddq_f32(vSt2, bias);
vSt3 = vaddq_f32(vSt3, bias);
vSt4 = vaddq_f32(vSt4, bias);

uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);
uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);
uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);
uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);
uint32x4_t vRes1 = internal::vroundq_u32_f32(vSt1);
uint32x4_t vRes2 = internal::vroundq_u32_f32(vSt2);
uint32x4_t vRes3 = internal::vroundq_u32_f32(vSt3);
uint32x4_t vRes4 = internal::vroundq_u32_f32(vSt4);

int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));
Expand Down
11 changes: 11 additions & 0 deletions 3rdparty/carotene/src/common.hpp
Expand Up @@ -58,6 +58,17 @@

namespace CAROTENE_NS { namespace internal {

// Select the NEON architecture level when the build did not define
// CAROTENE_NEON_ARCH itself: 8 if the compiler defines __aarch64__ or
// __aarch32__, 7 otherwise.
#ifndef CAROTENE_NEON_ARCH
# if defined(__aarch64__) || defined(__aarch32__)
#  define CAROTENE_NEON_ARCH 8
# else
#  define CAROTENE_NEON_ARCH 7
# endif
#endif
// Fail the build early if level 8 was requested explicitly while neither
// __aarch64__ nor __aarch32__ is defined by the compiler.
#if ( !defined(__aarch64__) && !defined(__aarch32__) ) && (CAROTENE_NEON_ARCH == 8 )
# error("ARMv7 doesn't support A32/A64 Neon instructions")
#endif

inline void prefetch(const void *ptr, size_t offset = 32*10)
{
#if defined __GNUC__
Expand Down

0 comments on commit dba7186

Please sign in to comment.