Merge pull request #24271 from Kumataro:fix24163

Fix to convert float32 to int32/uint32 with rounding to nearest (ties to even). #24271

Fix #24163

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable. Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake

(carotene is BSD)
opencv · Dec 25, 2023 · dba7186 · dba7186
1 parent d9d4029
commit dba7186
Show file tree

Hide file tree

Showing 13 changed files with 323 additions and 164 deletions.
diff --git a/3rdparty/carotene/CMakeLists.txt b/3rdparty/carotene/CMakeLists.txt
@@ -42,6 +42,14 @@ endif()
 
 if(WITH_NEON)
     target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
+    # Forward an explicitly set CAROTENE_NEON_ARCH to the sources: 8 or 7 as-is, anything else as 0.
+    if(DEFINED CAROTENE_NEON_ARCH AND CAROTENE_NEON_ARCH EQUAL 8)
+        target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=8")
+    elseif(DEFINED CAROTENE_NEON_ARCH AND CAROTENE_NEON_ARCH EQUAL 7)
+        target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=7")
+    elseif(DEFINED CAROTENE_NEON_ARCH)
+        target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=0")
+    endif()
 endif()
 
 # we add dummy file to fix XCode build

diff --git a/3rdparty/carotene/src/add_weighted.cpp b/3rdparty/carotene/src/add_weighted.cpp
@@ -39,6 +39,7 @@
 
 #include "common.hpp"
 #include "vtransform.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -106,7 +107,7 @@ template <> struct wAdd<s32>
     {
         valpha = vdupq_n_f32(_alpha);
         vbeta = vdupq_n_f32(_beta);
-        vgamma = vdupq_n_f32(_gamma + 0.5);
+        vgamma = vdupq_n_f32(_gamma);
     }
 
     void operator() (const VecTraits<s32>::vec128 & v_src0,
@@ -118,7 +119,7 @@ template <> struct wAdd<s32>
 
         vs1 = vmlaq_f32(vgamma, vs1, valpha);
         vs1 = vmlaq_f32(vs1, vs2, vbeta);
-        v_dst = vcvtq_s32_f32(vs1);
+        v_dst = vroundq_s32_f32(vs1);
     }
 
     void operator() (const VecTraits<s32>::vec64 & v_src0,
@@ -130,7 +131,7 @@ template <> struct wAdd<s32>
 
         vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
         vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
-        v_dst = vcvt_s32_f32(vs1);
+        v_dst = vround_s32_f32(vs1);
     }
 
     void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
@@ -150,7 +151,7 @@ template <> struct wAdd<u32>
     {
         valpha = vdupq_n_f32(_alpha);
         vbeta = vdupq_n_f32(_beta);
-        vgamma = vdupq_n_f32(_gamma + 0.5);
+        vgamma = vdupq_n_f32(_gamma);
     }
 
     void operator() (const VecTraits<u32>::vec128 & v_src0,
@@ -162,7 +163,7 @@ template <> struct wAdd<u32>
 
         vs1 = vmlaq_f32(vgamma, vs1, valpha);
         vs1 = vmlaq_f32(vs1, vs2, vbeta);
-        v_dst = vcvtq_u32_f32(vs1);
+        v_dst = vroundq_u32_f32(vs1);
     }
 
     void operator() (const VecTraits<u32>::vec64 & v_src0,
@@ -174,7 +175,7 @@ template <> struct wAdd<u32>
 
         vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
         vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
-        v_dst = vcvt_u32_f32(vs1);
+        v_dst = vround_u32_f32(vs1);
     }
 
     void operator() (const u32 * src0, const u32 * src1, u32 * dst) const

diff --git a/3rdparty/carotene/src/blur.cpp b/3rdparty/carotene/src/blur.cpp
@@ -41,6 +41,7 @@
 
 #include "common.hpp"
 #include "saturate_cast.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -198,7 +199,6 @@ void blur3x3(const Size2D &size, s32 cn,
 //#define FLOAT_VARIANT_1_9
 #ifdef FLOAT_VARIANT_1_9
     float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
-    float32x4_t v0_5 = vdupq_n_f32 (.5);
 #else
     const int16x8_t vScale = vmovq_n_s16(3640);
 #endif
@@ -283,8 +283,8 @@ void blur3x3(const Size2D &size, s32 cn,
                 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
                 float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
                 float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
-                tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
-                tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
+                tres1 = internal::vroundq_u32_f32(vf1);
+                tres2 = internal::vroundq_u32_f32(vf2);
                 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
                 vst1_u8(drow + x - 8, vmovn_u16(t0));
 #else
@@ -445,8 +445,8 @@ void blur3x3(const Size2D &size, s32 cn,
                 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
                 float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
                 float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
-                tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
-                tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
+                tres1 = internal::vroundq_u32_f32(vf1);
+                tres2 = internal::vroundq_u32_f32(vf2);
                 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
                 vst1_u8(drow + x - 8, vmovn_u16(t0));
 #else
@@ -508,7 +508,6 @@ void blur5x5(const Size2D &size, s32 cn,
 #define FLOAT_VARIANT_1_25
 #ifdef FLOAT_VARIANT_1_25
     float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
-    float32x4_t v0_5 = vdupq_n_f32 (.5f);
 #else
     const int16x8_t vScale = vmovq_n_s16(1310);
 #endif
@@ -752,8 +751,8 @@ void blur5x5(const Size2D &size, s32 cn,
             uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
             float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
             float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
-            tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
-            tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
+            tres1 = internal::vroundq_u32_f32(vf1);
+            tres2 = internal::vroundq_u32_f32(vf2);
             t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
             vst1_u8(drow + x - 8, vmovn_u16(t0));
 #else

diff --git a/3rdparty/carotene/src/colorconvert.cpp b/3rdparty/carotene/src/colorconvert.cpp
@@ -40,6 +40,7 @@
 #include "common.hpp"
 
 #include "saturate_cast.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -1166,17 +1167,10 @@ inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const ui
     vSt3 = vmulq_f32(vHF1, vDivTab);
     vSt4 = vmulq_f32(vHF2, vDivTab);
 
-    float32x4_t bias = vdupq_n_f32(0.5f);
-
-    vSt1 = vaddq_f32(vSt1, bias);
-    vSt2 = vaddq_f32(vSt2, bias);
-    vSt3 = vaddq_f32(vSt3, bias);
-    vSt4 = vaddq_f32(vSt4, bias);
-
-    uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);
-    uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);
-    uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);
-    uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);
+    uint32x4_t vRes1 = internal::vroundq_u32_f32(vSt1);
+    uint32x4_t vRes2 = internal::vroundq_u32_f32(vSt2);
+    uint32x4_t vRes3 = internal::vroundq_u32_f32(vSt3);
+    uint32x4_t vRes4 = internal::vroundq_u32_f32(vSt4);
 
     int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
     int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));

diff --git a/3rdparty/carotene/src/common.hpp b/3rdparty/carotene/src/common.hpp
@@ -58,6 +58,17 @@
 
 namespace CAROTENE_NS { namespace internal {
 
+#ifndef CAROTENE_NEON_ARCH  // not supplied on the command line: derive it from the compiler target
+#    if defined(__aarch64__) || defined(__aarch32__)
+#        define CAROTENE_NEON_ARCH 8
+#    else
+#        define CAROTENE_NEON_ARCH 7
+#    endif
+#endif
+#if ( !defined(__aarch64__) && !defined(__aarch32__) ) && (CAROTENE_NEON_ARCH == 8 )  // level 8 forced on a non-A32/A64 target
+#    error("ARMv7 doesn't support A32/A64 Neon instructions")
+#endif
+
 inline void prefetch(const void *ptr, size_t offset = 32*10)
 {
 #if defined __GNUC__