From 4a7c4656e97cd358737046de8270bb2518b705a4 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Thu, 30 Jun 2022 18:59:32 +0000 Subject: [PATCH 01/12] Add new rvv backend (partially implemented). --- .../opencv2/core/hal/intrin_riscv_vector.hpp | 490 ++++++++++++++++++ 1 file changed, 490 insertions(+) create mode 100644 modules/core/include/opencv2/core/hal/intrin_riscv_vector.hpp diff --git a/modules/core/include/opencv2/core/hal/intrin_riscv_vector.hpp b/modules/core/include/opencv2/core/hal/intrin_riscv_vector.hpp new file mode 100644 index 000000000000..9758de191f46 --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_riscv_vector.hpp @@ -0,0 +1,490 @@ + +#ifndef OPENCV_HAL_INTRIN_RVV_VEC_HPP +#define OPENCV_HAL_INTRIN_RVV_VEC_HPP + +#include +#include +#include + +#ifndef CV_RVV_MAX_VLEN +#define CV_RVV_MAX_VLEN 1024 +#endif + +namespace cv +{ +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +#define CV_SIMD_SCALABLE 1 +#define CV_SIMD_SCALABLE_64F 1 + +using v_uint8 = vuint8m1_t; +using v_int8 = vint8m1_t; +using v_uint16 = vuint16m1_t; +using v_int16 = vint16m1_t; +using v_uint32 = vuint32m1_t; +using v_int32 = vint32m1_t; +using v_uint64 = vuint64m1_t; +using v_int64 = vint64m1_t; + +using v_float32 = vfloat32m1_t; +#if CV_SIMD_SCALABLE_64F +using v_float64 = vfloat64m1_t; +#endif + +using uchar = unsigned char; +using schar = signed char; +using ushort = unsigned short; +using uint = unsigned int; +using uint64 = unsigned long int; +using int64 = long int; + + + +template +struct VTraits; + +template <> +struct VTraits +{ + static inline int vlanes() { return vsetvlmax_e8m1(); } + using lane_type = uchar; + static const int max_nlanes = CV_RVV_MAX_VLEN/8; +}; + +template <> +struct VTraits +{ + static inline int vlanes() { return vsetvlmax_e8m1(); } + using lane_type = schar; + static const int max_nlanes = CV_RVV_MAX_VLEN/8; +}; +template <> +struct VTraits +{ + static inline int vlanes() { return vsetvlmax_e16m1(); } + using lane_type = ushort; + 
static const int max_nlanes = CV_RVV_MAX_VLEN/16; +}; +template <> +struct VTraits +{ + static inline int vlanes() { return vsetvlmax_e16m1(); } + using lane_type = short; + static const int max_nlanes = CV_RVV_MAX_VLEN/16; +}; +template <> +struct VTraits +{ + static inline int vlanes() { return vsetvlmax_e32m1(); } + using lane_type = uint; + static const int max_nlanes = CV_RVV_MAX_VLEN/32; +}; +template <> +struct VTraits +{ + static inline int vlanes() { return vsetvlmax_e32m1(); } + using lane_type = int; + static const int max_nlanes = CV_RVV_MAX_VLEN/32; +}; + +template <> +struct VTraits +{ + static inline int vlanes() { return vsetvlmax_e32m1(); } + using lane_type = float; + static const int max_nlanes = CV_RVV_MAX_VLEN/32; +}; +template <> +struct VTraits +{ + static inline int vlanes() { return vsetvlmax_e64m1(); } + using lane_type = uint64; + static const int max_nlanes = CV_RVV_MAX_VLEN/64; +}; +template <> +struct VTraits +{ + static inline int vlanes() { return vsetvlmax_e64m1(); } + using lane_type = int64; + static const int max_nlanes = CV_RVV_MAX_VLEN/64; +}; +#if CV_SIMD_SCALABLE_64F +template <> +struct VTraits +{ + static inline int vlanes() { return vsetvlmax_e64m1(); } + using lane_type = double; + static const int max_nlanes = CV_RVV_MAX_VLEN/64; +}; +#endif + +//////////// get0 //////////// +#define OPENCV_HAL_IMPL_RVV_GRT0_INT(_Tpvec, _Tp) \ +inline _Tp v_get0(v_##_Tpvec v) \ +{ \ + return vmv_x(v); \ +} + +OPENCV_HAL_IMPL_RVV_GRT0_INT(uint8, uchar) +OPENCV_HAL_IMPL_RVV_GRT0_INT(int8, schar) +OPENCV_HAL_IMPL_RVV_GRT0_INT(uint16, ushort) +OPENCV_HAL_IMPL_RVV_GRT0_INT(int16, short) +OPENCV_HAL_IMPL_RVV_GRT0_INT(uint32, unsigned) +OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int) +OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64) +OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64) + +inline float v_get0(v_float32 v) \ +{ \ + return vfmv_f(v); \ +} +#if CV_SIMD_SCALABLE_64F +inline double v_get0(v_float64 v) \ +{ \ + return vfmv_f(v); \ +} +#endif + 
+//////////// Initial //////////// + +#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \ +inline v_##_Tpvec v_setzero_##suffix1() \ +{ \ + return vmv_v_x_##suffix2##m1(0, vl); \ +} \ +inline v_##_Tpvec v_setall_##suffix1(_Tp v) \ +{ \ + return vmv_v_x_##suffix2##m1(v, vl); \ +} + +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8, schar, s8, i8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16, ushort, u16, u16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16, short, s16, i16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32, uint, u32, u32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32, int, s32, i32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64, uint64, u64, u64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64, int64, s64, i64, VTraits::vlanes()) + +#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \ +inline v_##_Tpv v_setzero_##suffix() \ +{ \ + return vfmv_v_f_##suffix##m1(0, vl); \ +} \ +inline v_##_Tpv v_setall_##suffix(_Tp v) \ +{ \ + return vfmv_v_f_##suffix##m1(v, vl); \ +} + +OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits::vlanes()) +#endif + +//////////// Reinterpret //////////// +#define OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(_Tpvec1, suffix1) \ +inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec1& v) \ +{ \ + return v;\ +} +OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint8, u8) +OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint16, u16) +OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint32, u32) +OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint64, u64) +OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8) +OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16) +OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32) +OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64) 
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64) +#endif +// TODO: can be simplified by using overloaded RV intrinsic +#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \ +inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \ +{ \ + return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\ +} \ +inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \ +{ \ + return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\ +} + +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, float64, u64, f64, u64, f64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64, float64, s64, f64, i64, f64) +#endif +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint16, u8, u16, u8, u16) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint32, u8, u32, u8, u32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint64, u8, u64, u8, u64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint32, u16, u32, u16, u32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint64, u16, u64, u16, u64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, uint64, u32, u64, u32, u64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int16, s8, s16, i8, i16) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int32, s8, s32, i8, i32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int64, s8, s64, i8, i64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int32, s16, s32, i16, i32) 
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int64, s16, s64, i16, i64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, int64, s32, s64, i32, i64) + + +#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \ +inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \ +{ \ + return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v));\ +} \ +inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \ +{ \ + return vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v));\ +} + +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int16, u8, s16, u, i, 8, 16) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int32, u8, s32, u, i, 8, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int64, u8, s64, u, i, 8, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int8, u16, s8, u, i, 16, 8) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int32, u16, s32, u, i, 16, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int64, u16, s64, u, i, 16, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int8, u32, s8, u, i, 32, 8) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int16, u32, s16, u, i, 32, 16) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float32, s8, f32, i, 
f, 8, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float32, s16, f32, i, f, 16, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float32, s64, f32, i, f, 64, 32) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float64, u8, f64, u, f, 8, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float64, u16, f64, u, f, 16, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float64, u32, f64, u, f, 32, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64) +// Three times reinterpret +inline v_float32 v_reinterpret_as_f32(const v_float64& v) \ +{ \ + return vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v)));\ +} + +inline v_float64 v_reinterpret_as_f64(const v_float32& v) \ +{ \ + return vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v)));\ +} +#endif + + +////////////// Load/Store ////////////// +#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ \ + return vle##width##_v_##suffix##m1(ptr, vl); \ +} \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ \ + return vle##width##_v_##suffix##m1(ptr, vl); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ \ + vse##width##_v_##suffix##m1(ptr, a, vl); \ +} \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ \ + return vle##width##_v_##suffix##m1(ptr, hvl); \ +} \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + return vslideup(vle##width##_v_##suffix##m1(ptr0, hvl), vle##width##_v_##suffix##m1(ptr1, hvl), hvl, vl); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ \ + vse##width(ptr, a, vl); \ +} \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ \ + 
vse##width(ptr, a, vl); \ +} \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ \ + vse##width(ptr, a, vl); \ +} \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ \ + vse##width(ptr, a, hvl); \ +} \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + vse##width(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \ +} \ +inline _Tpvec v_load(std::initializer_list<_Tp> nScalars) \ +{ \ + assert(nScalars.size() == vl); \ + return vle##width##_v_##suffix##m1(nScalars.begin(), nScalars.size()); \ +} \ +template \ +_Tpvec v_load_##suffix(Targs... nScalars) \ +{ \ + return v_load({nScalars...}); \ +} + + +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m1_t, uchar, VTraits::vlanes() / 2, VTraits::vlanes(), 8, u8, vmv_v_x_u8m1) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m1_t, schar, VTraits::vlanes() / 2, VTraits::vlanes(), 8, i8, vmv_v_x_i8m1) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m1_t, ushort, VTraits::vlanes() / 2, VTraits::vlanes(), 16, u16, vmv_v_x_u16m1) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m1_t, short, VTraits::vlanes() / 2, VTraits::vlanes(), 16, i16, vmv_v_x_i16m1) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m1_t, unsigned int, VTraits::vlanes() / 2, VTraits::vlanes(), 32, u32, vmv_v_x_u32m1) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m1_t, int, VTraits::vlanes() / 2, VTraits::vlanes(), 32, i32, vmv_v_x_i32m1) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m1_t, uint64, VTraits::vlanes() / 2, VTraits::vlanes(), 64, u64, vmv_v_x_u64m1) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m1_t, int64, VTraits::vlanes() / 2, VTraits::vlanes(), 64, i64, vmv_v_x_i64m1) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m1_t, float, VTraits::vlanes() /2 , VTraits::vlanes(), 32, f32, vfmv_v_f_f32m1) + +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m1_t, double, VTraits::vlanes() / 2, VTraits::vlanes(), 64, f64, vfmv_v_f_f64m1) +#endif + 
+////////////// Lookup table access //////////////////// +#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_lut(const _Tp* tab, const int* idx) \ +{ \ + vuint32##suffix##_t vidx = vmul(vreinterpret_u32##suffix(vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ + return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ +} \ +inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \ +{ \ + std::vector idx_; \ + for (size_t i = 0; i < VTraits::vlanes(); ++i) { \ + idx_.push_back(idx[i]); \ + idx_.push_back(idx[i]+1); \ + } \ + vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ + return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ +} \ +inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \ +{ \ + std::vector idx_; \ + for (size_t i = 0; i < VTraits::vlanes(); ++i) { \ + idx_.push_back(idx[i]); \ + idx_.push_back(idx[i]+1); \ + idx_.push_back(idx[i]+2); \ + idx_.push_back(idx[i]+3); \ + } \ + vuint32##suffix##_t vidx = vmul(vle32_v_u32##suffix(idx_.data(), VTraits<_Tpvec>::vlanes()), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ + return vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ +} +OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m4) +OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m2) +OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m1) +OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, mf2) +OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m1) + +inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } +inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } +inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } +inline v_uint16 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } +inline v_uint16 
v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } +inline v_uint16 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } +inline v_uint32 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } +inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } +inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); } +inline v_uint64 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } +inline v_uint64 v_lut_quads(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_quads((const int64_t*)tab, idx)); } + + +////////////// Min/Max ////////////// + +#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return intrin(a, b, vl); \ +} + +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_min, vminu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_max, vmaxu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_min, vmin, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_max, vmax, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_min, vminu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_max, vmaxu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_min, vmin, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_max, vmax, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, vminu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, vmaxu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, vmin, 
VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, vmax, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, vfmin, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, vfmax, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_min, vminu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64, v_max, vmaxu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_min, vmin, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64, v_max, vmax, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, vfmin, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, vfmax, VTraits::vlanes()) +#endif + + +//////////// Value reordering //////////// + +#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tp, _Tpwvec, _Tpwvec_m2, _Tpvec, width, suffix, suffix2, cvt) \ +inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ +{ \ + _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \ + b0 = vget_##suffix##m1(temp, 0); \ + b1 = vget_##suffix##m1(temp, 1); \ +} \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ \ + _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \ + return vget_##suffix##m1(temp, 0); \ +} \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ \ + _Tpwvec_m2 temp = cvt(a, vsetvlmax_e##width##m1()); \ + return vget_##suffix##m1(temp, 1); \ +} \ +inline _Tpwvec v_load_expand(const _Tp* ptr) \ +{ \ + return cvt(vle##width##_v_##suffix2##mf2(ptr, vsetvlmax_e##width##m1()), vsetvlmax_e##width##m1()); \ +} + +OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m2_t, v_uint8, 8, u16, u8, vwcvtu_x) +OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m2_t, v_int8, 8, i16, i8, vwcvt_x) +OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m2_t, v_uint16, 16, u32, u16, vwcvtu_x) +OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m2_t, v_int16, 16, i32, i16, vwcvt_x) +OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m2_t, v_uint32, 32, u64, u32, vwcvtu_x) 
+OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m2_t, v_int32, 32, i64, i32, vwcvt_x) + +inline v_uint32 v_load_expand_q(const uchar* ptr) +{ + return vwcvtu_x(vwcvtu_x(vle8_v_u8mf4(ptr, VTraits::vlanes()), VTraits::vlanes()), VTraits::vlanes()); +} + +inline v_int32 v_load_expand_q(const schar* ptr) +{ + return vwcvt_x(vwcvt_x(vle8_v_i8mf4(ptr, VTraits::vlanes()), VTraits::vlanes()), VTraits::vlanes()); +} + + +////// FP16 support /////// + +inline v_float32 v_load_expand(const float16_t* ptr) +{ + // TODO + return vundefined_f32m1(); +} + +inline void v_cleanup() {} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +} //namespace cv + +#endif \ No newline at end of file From f4884dd90e953565d8871a1ba1910559863891d1 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Thu, 30 Jun 2022 19:03:56 +0000 Subject: [PATCH 02/12] Modify the framework of Universal Intrinsic. --- .../core/include/opencv2/core/hal/intrin.hpp | 295 +++++++++++++++++- .../opencv2/core/hal/simd_utils.impl.hpp | 42 ++- platforms/linux/riscv64-clang.toolchain.cmake | 4 + 3 files changed, 337 insertions(+), 4 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index ac331f2154de..1f1c5a290b5f 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -200,7 +200,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; # undef CV_RVV #endif -#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP) +#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071) && !defined(CV_FORCE_SIMD128_CPP) #define CV__SIMD_FORWARD 128 #include "opencv2/core/hal/intrin_forward.hpp" #endif @@ -229,9 +229,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP) #include "opencv2/core/hal/intrin_wasm.hpp" -#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) +#elif CV_RVV && 
!defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_NEW) #include "opencv2/core/hal/intrin_rvv.hpp" - +#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_NEW +#include "opencv2/core/hal/intrin_riscv_vector.hpp" #else #include "opencv2/core/hal/intrin_cpp.hpp" @@ -314,6 +315,14 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_SIMD512_FP16 0 #endif +#ifndef CV_SIMD_SCALABLE +#define CV_SIMD_SCALABLE 0 +#endif + +#ifndef CV_SIMD_SCALABLE_64F +#define CV_SIMD_SCALABLE_64F 0 +#endif + //================================================================================================== template struct V_RegTraits @@ -375,6 +384,18 @@ template struct V_RegTraits CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void); CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16); #endif +#if CV_SIMD_SCALABLE + CV_DEF_REG_TRAITS(v, v_uint8, uchar, u8, v_uint8, v_uint16, v_uint32, v_int8, void); + CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void); + CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void); + CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void); + CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void); + CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void); + CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32); + CV_DEF_REG_TRAITS(v, v_uint64, uint64, u64, v_uint64, void, void, v_int64, void); + CV_DEF_REG_TRAITS(v, v_int64, int64, s64, v_uint64, void, void, v_int64, void); + CV_DEF_REG_TRAITS(v, v_float64, double, f64, v_float64, void, void, v_int64, v_int32); +#endif //! 
@endcond #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512) @@ -488,6 +509,17 @@ namespace CV__SIMD_NAMESPACE { #define VXPREFIX(func) v##func } // namespace using namespace CV__SIMD_NAMESPACE; + +#elif CV_SIMD_SCALABLE +#define CV__SIMD_NAMESPACE simd +namespace CV__SIMD_NAMESPACE { + #define CV_SIMD 0 + #define CV_SIMD_WIDTH 128 /* 1024/8 */ + + #define VXPREFIX(func) v##func +} // namespace +using namespace CV__SIMD_NAMESPACE; + #endif namespace CV__SIMD_NAMESPACE { @@ -663,6 +695,263 @@ namespace CV__SIMD_NAMESPACE { /** @brief SIMD processing state cleanup call */ inline void vx_cleanup() { VXPREFIX(_cleanup)(); } +#if CV_SIMD + // Compatibility layer + #define CV_SIMD_SCALABLE 0 + #define CV_SIMD_SCALABLE_64F 0 + + template + struct VTraits; + + template <> + struct VTraits + { + static inline int vlanes() { return v_uint8::nlanes; } + static const int nlanes = v_uint8::nlanes; + using lane_type = uchar; + static const int max_nlanes = nlanes; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_int8::nlanes; } + static const int nlanes = v_int8::nlanes; + using lane_type = schar; + static const int max_nlanes = nlanes; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_uint16::nlanes; } + static const int nlanes = v_uint16::nlanes; + using lane_type = ushort; + static const int max_nlanes = nlanes; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_int16::nlanes; } + static const int nlanes = v_int16::nlanes; + using lane_type = short; + static const int max_nlanes = nlanes; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_uint32::nlanes; } + static const int nlanes = v_uint32::nlanes; + using lane_type = uint; + static const int max_nlanes = nlanes; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_int32::nlanes; } + static const int nlanes = v_int32::nlanes; + using lane_type = int; 
+ static const int max_nlanes = nlanes; + }; + + template <> + struct VTraits + { + static inline int vlanes() { return v_float32::nlanes; } + static const int nlanes = v_float32::nlanes; + using lane_type = float; + static const int max_nlanes = nlanes; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_uint64::nlanes; } + static const int nlanes = v_uint64::nlanes; + using lane_type = uint64; + static const int max_nlanes = nlanes; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_int64::nlanes; } + static const int nlanes = v_int64::nlanes; + using lane_type = int64; + static const int max_nlanes = nlanes; + }; + #if CV_SIMD_64F + template <> + struct VTraits + { + static inline int vlanes() { return v_float64::nlanes; } + static const int nlanes = v_float64::nlanes; + using lane_type = double; + static const int max_nlanes = nlanes; + }; + #endif + + #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ + inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \ + { \ + return a + b; \ + } \ + inline _Tpvec v_sub(const _Tpvec& a, const _Tpvec& b) \ + { \ + return a - b; \ + } \ + template \ + inline _Tpvec v_add(_Tpvec f1, _Tpvec f2, Args... 
vf) { \ + return v_add(f1 + f2, vf...); \ + } + + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) + #endif + + #define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \ + inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \ + { \ + return a & b; \ + } \ + inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \ + { \ + return a | b; \ + } \ + inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \ + { \ + return a ^ b; \ + } \ + inline _Tpvec v_not(const _Tpvec& a) \ + { \ + return ~a; \ + } + + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64) + + + #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ + { \ + return a * b; \ + } \ + template \ + inline _Tpvec v_mul(_Tpvec f1, _Tpvec f2, Args... 
vf) { \ + return v_mul(f1 * f2, vf...); \ + } + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) + #endif + + + inline v_float32 v_div(const v_float32& a, const v_float32& b) \ + { \ + return a / b; \ + } + #if CV_SIMD_64F + inline v_float64 v_div(const v_float64& a, const v_float64& b) \ + { \ + return a / b; \ + } + #endif + + #define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \ + inline _Tpvec v_##intrin(const _Tpvec& a, const _Tpvec& b) \ + { \ + return a op b; \ + } + + #define OPENCV_HAL_WRAP_CMP(_Tpvec) \ + OPENCV_HAL_WRAP_CMP_OP(_Tpvec, eq, ==) \ + OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ne, !=) \ + OPENCV_HAL_WRAP_CMP_OP(_Tpvec, lt, <) \ + OPENCV_HAL_WRAP_CMP_OP(_Tpvec, gt, >) \ + OPENCV_HAL_WRAP_CMP_OP(_Tpvec, le, <=) \ + OPENCV_HAL_WRAP_CMP_OP(_Tpvec, ge, >=) + + OPENCV_HAL_WRAP_CMP(v_uint8) + OPENCV_HAL_WRAP_CMP(v_uint16) + OPENCV_HAL_WRAP_CMP(v_uint32) + // OPENCV_HAL_WRAP_CMP(v_uint64) + OPENCV_HAL_WRAP_CMP(v_int8) + OPENCV_HAL_WRAP_CMP(v_int16) + OPENCV_HAL_WRAP_CMP(v_int32) + // OPENCV_HAL_WRAP_CMP(v_int64) + OPENCV_HAL_WRAP_CMP(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_CMP(v_float64) + #endif + + //////////// get0 //////////// + #define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \ + inline _Tp v_get0(v_##_Tpvec v) \ + { \ + return v.get0(); \ + } + + OPENCV_HAL_WRAP_GRT0_INT(uint8, uchar) + OPENCV_HAL_WRAP_GRT0_INT(int8, schar) + OPENCV_HAL_WRAP_GRT0_INT(uint16, ushort) + OPENCV_HAL_WRAP_GRT0_INT(int16, short) + OPENCV_HAL_WRAP_GRT0_INT(uint32, unsigned) + OPENCV_HAL_WRAP_GRT0_INT(int32, int) + OPENCV_HAL_WRAP_GRT0_INT(uint64, uint64) + OPENCV_HAL_WRAP_GRT0_INT(int64, int64) + OPENCV_HAL_WRAP_GRT0_INT(float32, float) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_GRT0_INT(float64, double) + #endif 
+ + #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \ + inline _Tp v_extract_highest(_Tpvec v) \ + { \ + return v_extract_n(v); \ + } + + OPENCV_HAL_WRAP_EXTRACT(v_uint8, uchar, VTraits::nlanes) + OPENCV_HAL_WRAP_EXTRACT(v_int8, schar, VTraits::nlanes) + OPENCV_HAL_WRAP_EXTRACT(v_uint16, ushort, VTraits::nlanes) + OPENCV_HAL_WRAP_EXTRACT(v_int16, short, VTraits::nlanes) + OPENCV_HAL_WRAP_EXTRACT(v_uint32, unsigned int, VTraits::nlanes) + OPENCV_HAL_WRAP_EXTRACT(v_int32, int, VTraits::nlanes) + OPENCV_HAL_WRAP_EXTRACT(v_uint64, uint64, VTraits::nlanes) + OPENCV_HAL_WRAP_EXTRACT(v_int64, int64, VTraits::nlanes) + OPENCV_HAL_WRAP_EXTRACT(v_float32, float, VTraits::nlanes) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_EXTRACT(v_float64, double, VTraits::nlanes) + #endif + + #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \ + inline _Tpvec v_broadcast_highest(_Tpvec v) \ + { \ + return v_broadcast_element::nlanes-1>(v); \ + } + + OPENCV_HAL_WRAP_BROADCAST(v_uint32) + OPENCV_HAL_WRAP_BROADCAST(v_int32) + OPENCV_HAL_WRAP_BROADCAST(v_float32) + + +#endif //CV_SIMD //! 
@cond IGNORED diff --git a/modules/core/include/opencv2/core/hal/simd_utils.impl.hpp b/modules/core/include/opencv2/core/hal/simd_utils.impl.hpp index fff8f942b8d7..0a1ab2c52307 100644 --- a/modules/core/include/opencv2/core/hal/simd_utils.impl.hpp +++ b/modules/core/include/opencv2/core/hal/simd_utils.impl.hpp @@ -128,8 +128,48 @@ template<> inline Type2Vec512_Traits::vec_type v512_setall(const #endif // SIMD512 +#if CV_SIMD_SCALABLE +template struct Type2Vec_Traits; +#define CV_INTRIN_DEF_TYPE2VEC_TRAITS(type_, vec_type_) \ + template<> struct Type2Vec_Traits \ + { \ + typedef vec_type_ vec_type; \ + } + +CV_INTRIN_DEF_TYPE2VEC_TRAITS(uchar, v_uint8); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(schar, v_int8); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(ushort, v_uint16); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(short, v_int16); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(unsigned, v_uint32); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(int, v_int32); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(float, v_float32); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(uint64, v_uint64); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(int64, v_int64); +#if CV_SIMD_SCALABLE_64F +CV_INTRIN_DEF_TYPE2VEC_TRAITS(double, v_float64); +#endif +template static inline +typename Type2Vec_Traits<_T>::vec_type v_setall(const _T& a); + +template<> inline Type2Vec_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); } +template<> inline Type2Vec_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); } +template<> inline Type2Vec_Traits::vec_type v_setall(const ushort& a) { return v_setall_u16(a); } +template<> inline Type2Vec_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); } +template<> inline Type2Vec_Traits< uint>::vec_type v_setall< uint>(const uint& a) { return v_setall_u32(a); } +template<> inline Type2Vec_Traits< int>::vec_type v_setall< int>(const int& a) { return v_setall_s32(a); } +template<> inline Type2Vec_Traits::vec_type v_setall(const uint64& a) { return v_setall_u64(a); } +template<> 
inline Type2Vec_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); } +template<> inline Type2Vec_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); } +#if CV_SIMD_SCALABLE_64F +template<> inline Type2Vec_Traits::vec_type v_setall(const double& a) { return v_setall_f64(a); } +#endif +#endif -#if CV_SIMD_WIDTH == 16 + +#if CV_SIMD_SCALABLE +template static inline +typename Type2Vec_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); } +#elif CV_SIMD_WIDTH == 16 template static inline typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); } #elif CV_SIMD_WIDTH == 32 diff --git a/platforms/linux/riscv64-clang.toolchain.cmake b/platforms/linux/riscv64-clang.toolchain.cmake index 62d9e293d275..a14ad3f368c5 100644 --- a/platforms/linux/riscv64-clang.toolchain.cmake +++ b/platforms/linux/riscv64-clang.toolchain.cmake @@ -22,6 +22,10 @@ set(CMAKE_CXX_FLAGS "-march=rv64gcv --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2") +OPTION(NEWRVV "use new rvv ui" ON) # Enabled by default +IF(NEWRVV) + ADD_DEFINITIONS(-DCV_RVV_NEW) +ENDIF(NEWRVV) set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT}) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) From 7b67f3afe022be58c431b80962f54aa53c3e1642 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Thu, 30 Jun 2022 19:06:29 +0000 Subject: [PATCH 03/12] Add CV_SIMD macro guards to current UI code. 
--- modules/core/src/arithm.simd.hpp | 20 +++++++++++++++++++ .../fluid/gfluidcore_func.dispatch.cpp | 5 +++-- .../src/backends/fluid/gfluidcore_func.hpp | 2 +- modules/imgproc/src/color_lab.cpp | 2 ++ modules/imgproc/src/color_yuv.simd.hpp | 8 ++++++++ 5 files changed, 34 insertions(+), 3 deletions(-) diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp index f88597aacc68..06ebfb767842 100644 --- a/modules/core/src/arithm.simd.hpp +++ b/modules/core/src/arithm.simd.hpp @@ -266,24 +266,30 @@ struct op_absdiff template<> struct op_absdiff { +#if CV_SIMD static inline v_int8 r(const v_int8& a, const v_int8& b) { return v_absdiffs(a, b); } +#endif static inline schar r(schar a, schar b) { return c_absdiff(a, b); } }; template<> struct op_absdiff { +#if CV_SIMD static inline v_int16 r(const v_int16& a, const v_int16& b) { return v_absdiffs(a, b); } +#endif static inline short r(short a, short b) { return c_absdiff(a, b); } }; template<> struct op_absdiff { +#if CV_SIMD static inline v_int32 r(const v_int32& a, const v_int32& b) { return v_reinterpret_as_s32(v_absdiff(a, b)); } +#endif static inline int r(int a, int b) { return c_absdiff(a, b); } }; @@ -1430,11 +1436,13 @@ struct op_mul template struct op_mul_scale { +#if CV_SIMD static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); return v_scalar * a * b; } +#endif static inline T1 r(T1 a, T1 b, const T2* scalar) { return c_mul(a, b, *scalar); } static inline Tvec pre(const Tvec&, const Tvec& res) @@ -1569,6 +1577,7 @@ struct op_div_f template struct op_div_scale { +#if CV_SIMD static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ -1579,6 +1588,7 @@ struct op_div_scale const Tvec v_zero = vx_setall(0); return v_select(denom == v_zero, v_zero, res); } +#endif static inline T1 r(T1 a, T1 denom, const T2* scalar) { 
CV_StaticAssert(std::numeric_limits::is_integer, ""); @@ -1589,11 +1599,13 @@ struct op_div_scale template<> struct op_div_scale { +#if CV_SIMD static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); return a * v_scalar / b; } +#endif static inline float r(float a, float denom, const float* scalar) { return c_div(a, denom, *scalar); } }; @@ -1673,11 +1685,13 @@ DEFINE_SIMD_ALL(div, div_loop) template struct op_add_scale { +#if CV_SIMD static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar) { const v_float32 v_alpha = vx_setall_f32(*scalar); return v_fma(a, v_alpha, b); } +#endif static inline T1 r(T1 a, T1 b, const T2* scalar) { return c_add(a, b, *scalar); } static inline Tvec pre(const Tvec&, const Tvec& res) @@ -1704,6 +1718,7 @@ struct op_add_scale template struct op_add_weighted { +#if CV_SIMD static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars) { const v_float32 v_alpha = vx_setall_f32(scalars[0]); @@ -1711,6 +1726,7 @@ struct op_add_weighted const v_float32 v_gamma = vx_setall_f32(scalars[2]); return v_fma(a, v_alpha, v_fma(b, v_beta, v_gamma)); } +#endif static inline T1 r(T1 a, T1 b, const T2* scalars) { return c_add(a, b, scalars[0], scalars[1], scalars[2]); } static inline Tvec pre(const Tvec&, const Tvec& res) @@ -1819,6 +1835,7 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d) template struct op_recip { +#if CV_SIMD static inline v_float32 r(const v_float32& a, const T2* scalar) { const v_float32 v_scalar = vx_setall_f32(*scalar); @@ -1829,6 +1846,7 @@ struct op_recip const Tvec v_zero = vx_setall(0); return v_select(denom == v_zero, v_zero, res); } +#endif static inline T1 r(T1 denom, const T2* scalar) { CV_StaticAssert(std::numeric_limits::is_integer, ""); @@ -1839,11 +1857,13 @@ struct op_recip template<> struct op_recip { +#if CV_SIMD static inline v_float32 r(const v_float32& a, const float* scalar) { 
const v_float32 v_scalar = vx_setall_f32(*scalar); return v_scalar / a; } +#endif static inline float r(float denom, const float* scalar) { return c_div(*scalar, denom); } }; diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp index c9d329b2ff35..6171bff8020d 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp @@ -6,6 +6,8 @@ #if !defined(GAPI_STANDALONE) +#include +#if CV_SIMD #include "gfluidcore_func.hpp" #include "gfluidcore_func.simd.hpp" @@ -14,7 +16,6 @@ #include "gfluidutils.hpp" #include -#include #include #include @@ -394,5 +395,5 @@ CONVERTTO_SCALED_SIMD(float, float) } // namespace fluid } // namespace gapi } // namespace cv - +#endif // CV_SIMD #endif // !defined(GAPI_STANDALONE) diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp index 81aa098b6418..aec03c0b5011 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp @@ -6,7 +6,7 @@ #pragma once -#if !defined(GAPI_STANDALONE) +#if !defined(GAPI_STANDALONE) && CV_SIMD #include diff --git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp index c5ebe30fe154..3b18944a0c6b 100644 --- a/modules/imgproc/src/color_lab.cpp +++ b/modules/imgproc/src/color_lab.cpp @@ -3612,6 +3612,7 @@ struct Luv2RGBinteger } } +#if CV_SIMD inline void processLuvToXYZ(const v_uint8& lv, const v_uint8& uv, const v_uint8& vv, v_int32 (&x)[4], v_int32 (&y)[4], v_int32 (&z)[4]) const { @@ -3717,6 +3718,7 @@ struct Luv2RGBinteger z[k] = v_max(zero, v_min(base2, z[k])); } } +#endif void operator()(const uchar* src, uchar* dst, int n) const { diff --git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp index 196a03b99558..dd7d21c75db3 100644 --- 
a/modules/imgproc/src/color_yuv.simd.hpp +++ b/modules/imgproc/src/color_yuv.simd.hpp @@ -1038,6 +1038,7 @@ static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, i buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu; } +#if CV_SIMD static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, v_int32 (&ruv)[4], v_int32 (&guv)[4], @@ -1067,6 +1068,7 @@ static inline void uvToRGBuv(const v_uint8& u, const v_uint8& v, buv[k] = vshift + ub * uu[k]; } } +#endif static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, const int buv, uchar& r, uchar& g, uchar& b, uchar& a) @@ -1079,6 +1081,7 @@ static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, co a = uchar(0xff); } +#if CV_SIMD static inline void yRGBuvToRGBA(const v_uint8& vy, const v_int32 (&ruv)[4], const v_int32 (&guv)[4], @@ -1117,6 +1120,7 @@ static inline void yRGBuvToRGBA(const v_uint8& vy, gg = v_pack_u(g0, g1); bb = v_pack_u(b0, b1); } +#endif template static inline void cvtYuv42xxp2RGB8(const uchar u, const uchar v, @@ -1426,6 +1430,7 @@ static inline uchar rgbToY42x(uchar r, uchar g, uchar b) return saturate_cast(yy >> ITUR_BT_601_SHIFT); } +#if CV_SIMD static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint8& b) { const int shifted16 = (16 << ITUR_BT_601_SHIFT); @@ -1455,6 +1460,7 @@ static inline v_uint8 rgbToY42x(const v_uint8& r, const v_uint8& g, const v_uint return v_pack(y0, y1); } +#endif static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v) { @@ -1467,6 +1473,7 @@ static inline void rgbToUV42x(uchar r, uchar g, uchar b, uchar& u, uchar& v) v = saturate_cast(vv >> ITUR_BT_601_SHIFT); } +#if CV_SIMD static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint8& g0, const v_uint8& g1, const v_uint8& b0, const v_uint8& b1, v_uint8& u, v_uint8& v) { @@ -1514,6 +1521,7 @@ static inline void rgbToUV42x(const v_uint8& r0, const v_uint8& r1, const v_uint u = 
v_pack_u(u0, u1); v = v_pack_u(v0, v1); } +#endif struct RGB8toYUV420pInvoker: public ParallelLoopBody From 02b203bed624af2244760768b2bbd6c18087e907 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Thu, 30 Jun 2022 19:07:07 +0000 Subject: [PATCH 04/12] Use vlanes() instead of nlanes. --- modules/calib3d/src/stereosgbm.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp index 70eb3c658ae5..e30973ec9416 100644 --- a/modules/calib3d/src/stereosgbm.cpp +++ b/modules/calib3d/src/stereosgbm.cpp @@ -177,7 +177,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y, { int x, c, width = img1.cols, cn = img1.channels(); int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); - int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1; + int D = (int)alignSize(maxD - minD, VTraits::vlanes()), width1 = maxX1 - minX1; //This minX1 & maxX2 correction is defining which part of calculatable line must be calculated //That is needs of parallel algorithm xrange_min = (xrange_min < 0) ? 0: xrange_min; @@ -502,8 +502,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); const int D = params.numDisparities; int width1 = maxX1 - minX1; - int Da = (int)alignSize(D, v_int16::nlanes); - int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D + int Da = (int)alignSize(D,VTraits::vlanes()); + int Dlra = Da + VTraits::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2; int npasses = params.isFullDP() ? 
2 : 1; @@ -977,11 +977,10 @@ struct CalcVerticalSums: public ParallelLoopBody width = img1.cols; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); D = maxD - minD; - Da = (int)alignSize(D, v_int16::nlanes); - Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D + Da = (int)alignSize(D, VTraits::vlanes()); + Dlra = Da + VTraits::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D width1 = maxX1 - minX1; D = params.numDisparities; - Da = (int)alignSize(D, v_int16::nlanes); } void operator()(const Range& range) const CV_OVERRIDE @@ -1235,8 +1234,8 @@ struct CalcHorizontalSums: public ParallelLoopBody INVALID_DISP = minD - 1; INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; D = maxD - minD; - Da = (int)alignSize(D, v_int16::nlanes); - Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D + Da = (int)alignSize(D, VTraits::vlanes()); + Dlra = Da + VTraits::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D width1 = maxX1 - minX1; } @@ -1484,8 +1483,8 @@ static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2, int width = disp1.cols, height = disp1.rows; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); int width1 = maxX1 - minX1; - int Da = (int)alignSize(params.numDisparities, v_int16::nlanes); - int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D + int Da = (int)alignSize(params.numDisparities, VTraits::vlanes()); + int Dlra = Da + VTraits::vlanes();//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D int INVALID_DISP = minD - 1; int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; @@ -1630,7 +1629,7 @@ SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1, width = img1->cols; height = img1->rows; minD = params.minDisparity; 
maxD = minD + params.numDisparities; D = maxD - minD; minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1; - Da = (int)alignSize(D, v_int16::nlanes); + Da = (int)alignSize(D, VTraits::vlanes()); SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1; From b58109cd14973d9d379cce3a4e638431bea1bab8 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Thu, 30 Jun 2022 19:07:39 +0000 Subject: [PATCH 05/12] Modify the UI test. --- modules/core/test/test_intrin128.simd.hpp | 2 +- modules/core/test/test_intrin_utils.hpp | 612 +++++++++++++++------- 2 files changed, 425 insertions(+), 189 deletions(-) diff --git a/modules/core/test/test_intrin128.simd.hpp b/modules/core/test/test_intrin128.simd.hpp index 1d9bee2d331d..46e18020f767 100644 --- a/modules/core/test/test_intrin128.simd.hpp +++ b/modules/core/test/test_intrin128.simd.hpp @@ -7,7 +7,7 @@ #include "opencv2/core/hal/intrin.hpp" #undef CV__SIMD_FORCE_WIDTH -#if CV_SIMD_WIDTH != 16 +#if CV_SIMD && CV_SIMD_WIDTH != 16 #error "Invalid build configuration" #endif diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 3f196f134238..9397e26c1b16 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -25,6 +25,7 @@ void test_hal_intrin_float16(); template struct Data; template struct initializer; +#if CV_SIMD template <> struct initializer<64> { template static R init(const Data & d) @@ -77,16 +78,85 @@ template <> struct initializer<2> } }; +#else + +template <> struct initializer<128> +{ + template static R init(const Data & d) + { + return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15], + d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31], + d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47], + d[48], d[49], 
d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63], + d[64], d[65], d[66], d[67], d[68], d[69], d[70], d[71], d[72], d[73], d[74], d[75], d[76], d[77], d[78], d[79], + d[80], d[81], d[82], d[83], d[84], d[85], d[86], d[87], d[88], d[89], d[90], d[91], d[92], d[93], d[94], d[95], + d[96], d[97], d[98], d[99], d[100], d[101], d[102], d[103], d[104], d[105], d[106], d[107], d[108], d[109], d[110], d[111], + d[112], d[113], d[114], d[115], d[116], d[117], d[118], d[119], d[120], d[121], d[122], d[123], d[124], d[125], d[126], d[127]}); + } +}; + +template <> struct initializer<64> +{ + template static R init(const Data & d) + { + return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15], + d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31], + d[32], d[33], d[34], d[35], d[36], d[37], d[38], d[39], d[40], d[41], d[42], d[43], d[44], d[45], d[46], d[47], + d[48], d[49], d[50], d[51], d[52], d[53], d[54], d[55], d[56], d[57], d[58], d[59], d[60], d[61], d[62], d[63]}); + } +}; + +template <> struct initializer<32> +{ + template static R init(const Data & d) + { + return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15], + d[16], d[17], d[18], d[19], d[20], d[21], d[22], d[23], d[24], d[25], d[26], d[27], d[28], d[29], d[30], d[31]}); + } +}; + +template <> struct initializer<16> +{ + template static R init(const Data & d) + { + return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]}); + } +}; + +template <> struct initializer<8> +{ + template static R init(const Data & d) + { + return v_load({d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]}); + } +}; + +template <> struct initializer<4> +{ + template static R init(const Data & d) + { + return v_load({d[0], d[1], d[2], d[3]}); + } +}; + 
+template <> struct initializer<2> +{ + template static R init(const Data & d) + { + return v_load({d[0], d[1]}); + } +}; +#endif //================================================================================================== template struct Data { - typedef typename R::lane_type LaneType; + typedef typename VTraits::lane_type LaneType; typedef typename V_TypeTraits::int_type int_type; Data() { - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) d[i] = (LaneType)(i + 1); } Data(LaneType val) @@ -99,7 +169,7 @@ template struct Data } operator R () { - return initializer().init(*this); + return initializer::max_nlanes>().init(*this); } Data & operator=(const R & r) { @@ -108,17 +178,17 @@ template struct Data } template Data & operator*=(T m) { - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) d[i] *= (LaneType)m; return *this; } template Data & operator+=(T m) { - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) d[i] += (LaneType)m; return *this; } - void fill(LaneType val, int s, int c = R::nlanes) + void fill(LaneType val, int s, int c = VTraits::vlanes()) { for (int i = s; i < c; ++i) d[i] = val; @@ -129,26 +199,26 @@ template struct Data } void reverse() { - for (int i = 0; i < R::nlanes / 2; ++i) - std::swap(d[i], d[R::nlanes - i - 1]); + for (int i = 0; i < VTraits::vlanes() / 2; ++i) + std::swap(d[i], d[VTraits::vlanes() - i - 1]); } const LaneType & operator[](int i) const { #if 0 // TODO: strange bug - AVX2 tests are failed with this - CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, ""); + CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)VTraits::vlanes(), ""); #else - CV_Assert(i >= 0 && i < R::nlanes); + CV_Assert(i >= 0 && i < VTraits::max_nlanes); #endif return d[i]; } LaneType & operator[](int i) { - CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, ""); + CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)VTraits::max_nlanes, ""); return d[i]; } int_type 
as_int(int i) const { - CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)R::nlanes, ""); + CV_CheckGE(i, 0, ""); CV_CheckLT(i, (int)VTraits::max_nlanes, ""); union { LaneType l; @@ -159,11 +229,11 @@ template struct Data } const LaneType * mid() const { - return d + R::nlanes / 2; + return d + VTraits::vlanes() / 2; } LaneType * mid() { - return d + R::nlanes / 2; + return d + VTraits::vlanes() / 2; } LaneType sum(int s, int c) { @@ -174,11 +244,11 @@ template struct Data } LaneType sum() { - return sum(0, R::nlanes); + return sum(0, VTraits::vlanes()); } bool operator==(const Data & other) const { - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) if (d[i] != other.d[i]) return false; return true; @@ -193,17 +263,17 @@ template struct Data } bool isValue(uchar val) const { - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) if (d[i] != val) return false; return true; } - LaneType d[R::nlanes]; + LaneType d[VTraits::max_nlanes]; }; template struct AlignedData { - Data CV_DECL_ALIGNED(CV_SIMD_WIDTH) a; // aligned + Data CV_DECL_ALIGNED(sizeof(typename VTraits::lane_type)*VTraits::max_nlanes) a; // aligned char dummy; Data u; // unaligned }; @@ -211,11 +281,11 @@ template struct AlignedData template std::ostream & operator<<(std::ostream & out, const Data & d) { out << "{ "; - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { - // out << std::hex << +V_TypeTraits::reinterpret_int(d.d[i]); + // out << std::hex << +V_TypeTraits::lane_type>::reinterpret_int(d.d[i]); out << +d.d[i]; - if (i + 1 < R::nlanes) + if (i + 1 < VTraits::vlanes()) out << ", "; } out << " }"; @@ -246,7 +316,7 @@ inline unsigned pack_saturate_cast(uint64 a) { return static_c template struct TheTest { - typedef typename R::lane_type LaneType; + typedef typename VTraits::lane_type LaneType; template static inline void EXPECT_COMPARE_EQ(const T1 a, const T2 b) @@ -260,37 +330,37 @@ template struct TheTest 
AlignedData out; // check if addresses are aligned and unaligned respectively - EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH); - EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH); - EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH); - EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH); + EXPECT_EQ((size_t)0, (size_t)&data.a.d % (sizeof(typename VTraits::lane_type) * VTraits::vlanes())); + EXPECT_NE((size_t)0, (size_t)&data.u.d % (sizeof(typename VTraits::lane_type) * VTraits::vlanes())); + EXPECT_EQ((size_t)0, (size_t)&out.a.d % (sizeof(typename VTraits::lane_type) * VTraits::vlanes())); + EXPECT_NE((size_t)0, (size_t)&out.u.d % (sizeof(typename VTraits::lane_type) * VTraits::vlanes())); // check some initialization methods R r1 = data.a; R r2 = vx_load(data.u.d); R r3 = vx_load_aligned(data.a.d); R r4(r2); - EXPECT_EQ(data.a[0], r1.get0()); - EXPECT_EQ(data.u[0], r2.get0()); - EXPECT_EQ(data.a[0], r3.get0()); - EXPECT_EQ(data.u[0], r4.get0()); + EXPECT_EQ(data.a[0], v_get0(r1)); + EXPECT_EQ(data.u[0], v_get0(r2)); + EXPECT_EQ(data.a[0], v_get0(r3)); + EXPECT_EQ(data.u[0], v_get0(r4)); R r_low = vx_load_low((LaneType*)data.u.d); - EXPECT_EQ(data.u[0], r_low.get0()); + EXPECT_EQ(data.u[0], v_get0(r_low)); v_store(out.u.d, r_low); - for (int i = 0; i < R::nlanes/2; ++i) + for (int i = 0; i < VTraits::vlanes()/2; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ((LaneType)data.u[i], (LaneType)out.u[i]); } - R r_low_align8byte = vx_load_low((LaneType*)((char*)data.u.d + (CV_SIMD_WIDTH / 2))); - EXPECT_EQ(data.u[R::nlanes/2], r_low_align8byte.get0()); + R r_low_align8byte = vx_load_low((LaneType*)((char*)data.u.d + (sizeof(typename VTraits::lane_type) * VTraits::vlanes() / 2))); + EXPECT_EQ(data.u[VTraits::vlanes()/2], v_get0(r_low_align8byte)); v_store(out.u.d, r_low_align8byte); - for (int i = 0; i < R::nlanes/2; ++i) + for (int i = 0; i < VTraits::vlanes()/2; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ((LaneType)data.u[i + 
R::nlanes/2], (LaneType)out.u[i]); + EXPECT_EQ((LaneType)data.u[i + VTraits::vlanes()/2], (LaneType)out.u[i]); } // check some store methods @@ -318,7 +388,7 @@ template struct TheTest Data resZ, resV; resZ.fill((LaneType)0); resV.fill((LaneType)8); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ((LaneType)0, resZ[i]); @@ -348,13 +418,16 @@ template struct TheTest #elif CV_SIMD_WIDTH == 64 R setall_res1 = v512_setall((LaneType)5); R setall_res2 = v512_setall(6); +#elif CV_SIMD_SCALABLE + R setall_res1 = v_setall((LaneType)5); + R setall_res2 = v_setall(6); #else #error "Configuration error" #endif #if CV_SIMD_WIDTH > 0 Data setall_res1_; v_store(setall_res1_.d, setall_res1); Data setall_res2_; v_store(setall_res2_.d, setall_res2); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ((LaneType)5, setall_res1_[i]); @@ -366,7 +439,7 @@ template struct TheTest R vx_setall_res2 = vx_setall(12); Data vx_setall_res1_; v_store(vx_setall_res1_.d, vx_setall_res1); Data vx_setall_res2_; v_store(vx_setall_res2_.d, vx_setall_res2); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ((LaneType)11, vx_setall_res1_[i]); @@ -378,14 +451,14 @@ template struct TheTest uint64 a = CV_BIG_INT(0x7fffffffffffffff); uint64 b = (uint64)CV_BIG_INT(0xcfffffffffffffff); v_uint64x2 uint64_vec(a, b); - EXPECT_EQ(a, uint64_vec.get0()); + EXPECT_EQ(a, v_get0(uint64_vec)); EXPECT_EQ(b, v_extract_n<1>(uint64_vec)); } { int64 a = CV_BIG_INT(0x7fffffffffffffff); int64 b = CV_BIG_INT(-1); v_int64x2 int64_vec(a, b); - EXPECT_EQ(a, int64_vec.get0()); + EXPECT_EQ(a, v_get0(int64_vec)); EXPECT_EQ(b, v_extract_n<1>(int64_vec)); } #endif @@ -404,8 +477,8 @@ template struct TheTest R a = data1, b = data2, c = data3; R d = data1, e = data2, f = data3, g = data4; - 
LaneType buf3[R::nlanes * 3]; - LaneType buf4[R::nlanes * 4]; + LaneType buf3[VTraits::max_nlanes * 3]; /* compile-time bound: vlanes() is a runtime value and would make this a VLA */ + LaneType buf4[VTraits::max_nlanes * 4]; v_store_interleave(buf3, a, b, c); v_store_interleave(buf4, d, e, f, g); @@ -416,7 +489,7 @@ template struct TheTest v_load_deinterleave(buf3, a, b, c); v_load_deinterleave(buf4, d, e, f, g); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(data1, Data(a)); @@ -440,7 +513,7 @@ template struct TheTest R a = data1, b = data2; - LaneType buf2[R::nlanes * 2]; + LaneType buf2[VTraits::max_nlanes * 2]; /* compile-time bound, same reason as buf3/buf4 */ v_store_interleave(buf2, a, b); @@ -449,7 +522,7 @@ template struct TheTest v_load_deinterleave(buf2, a, b); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(data1, Data(a)); @@ -475,7 +548,7 @@ template struct TheTest f = v_expand_high(a); Data resC = c, resD = d, resE = e, resF = f; - const int n = Rx2::nlanes; + const int n = VTraits::vlanes(); for (int i = 0; i < n; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); @@ -494,7 +567,7 @@ template struct TheTest typedef typename V_RegTraits::q_reg Rx4; Data data; Data out = vx_load_expand_q(data.d); - const int n = Rx4::nlanes; + const int n = VTraits::vlanes(); for (int i = 0; i < n; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); @@ -510,8 +583,8 @@ template struct TheTest dataB.reverse(); R a = dataA, b = dataB; - Data resC = a + b, resD = a - b; - for (int i = 0; i < R::nlanes; ++i) + Data resC = v_add(a, b), resD = v_sub(a, b); + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(saturate_cast(dataA[i] + dataB[i]), resC[i]); @@ -530,7 +603,7 @@ template struct TheTest Data resC = v_add_wrap(a, b), resD = v_sub_wrap(a, b), resE = v_mul_wrap(a, b); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i));
EXPECT_EQ((LaneType)(dataA[i] + dataB[i]), resC[i]); @@ -547,8 +620,8 @@ template struct TheTest dataB.reverse(); R a = dataA, b = dataB; - Data resC = a * b; - for (int i = 0; i < R::nlanes; ++i) + Data resC = v_mul(a, b); + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(saturate_cast(dataA[i] * dataB[i]), resC[i]); @@ -563,8 +636,8 @@ template struct TheTest dataB.reverse(); R a = dataA, b = dataB; - Data resC = a / b; - for (int i = 0; i < R::nlanes; ++i) + Data resC = v_div(a, b); + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(dataA[i] / dataB[i], resC[i]); @@ -583,12 +656,12 @@ template struct TheTest v_mul_expand(a, b, c, d); Data resC = c, resD = d; - const int n = R::nlanes / 2; + const int n = VTraits::vlanes() / 2; for (int i = 0; i < n; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ((typename Rx2::lane_type)dataA[i] * dataB[i], resC[i]); - EXPECT_EQ((typename Rx2::lane_type)dataA[i + n] * dataB[i + n], resD[i]); + EXPECT_EQ((typename VTraits::lane_type)dataA[i] * dataB[i], resC[i]); + EXPECT_EQ((typename VTraits::lane_type)dataA[i + n] * dataB[i + n], resD[i]); } return *this; @@ -603,11 +676,11 @@ template struct TheTest R c = v_mul_hi(a, b); Data resC = c; - const int n = R::nlanes / 2; + const int n = VTraits::vlanes() / 2; for (int i = 0; i < n; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ((typename R::lane_type)((dataA[i] * dataB[i]) >> 16), resC[i]); + EXPECT_EQ((typename VTraits::lane_type)((dataA[i] * dataB[i]) >> 16), resC[i]); } return *this; @@ -616,17 +689,19 @@ template struct TheTest TheTest & test_abs() { typedef typename V_RegTraits::u_reg Ru; - typedef typename Ru::lane_type u_type; + typedef typename VTraits::lane_type u_type; + typedef typename VTraits::lane_type R_type; Data dataA, dataB(10); R a = dataA, b = dataB; - a = a - b; + a = v_sub(a, b); Data resC = v_abs(a); - for (int i = 0; i < Ru::nlanes; ++i) + for (int i = 
0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ((u_type)std::abs(dataA[i] - dataB[i]), resC[i]); + R_type ssub = dataA[i] - dataB[i] < std::numeric_limits::min() ? std::numeric_limits::min() : dataA[i] - dataB[i]; + EXPECT_EQ((u_type)std::abs(ssub), resC[i]); } return *this; @@ -640,9 +715,9 @@ template struct TheTest dataA[0] = static_cast(std::numeric_limits::max()); R a = dataA; - Data resB = a << s, resC = v_shl(a), resD = a >> s, resE = v_shr(a); + Data resB = v_shl(a), resC = v_shl(a), resD = v_shr(a), resE = v_shr(a); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(static_cast(dataA[i] << s), resB[i]); @@ -660,14 +735,14 @@ template struct TheTest dataB += 1; R a = dataA, b = dataB; - Data resC = (a == b); - Data resD = (a != b); - Data resE = (a > b); - Data resF = (a >= b); - Data resG = (a < b); - Data resH = (a <= b); + Data resC = (v_eq(a, b)); + Data resD = (v_ne(a, b)); + Data resE = (v_gt(a, b)); + Data resF = (v_ge(a, b)); + Data resG = (v_lt(a, b)); + Data resH = (v_le(a, b)); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0); @@ -683,23 +758,23 @@ template struct TheTest TheTest & test_dotprod() { typedef typename V_RegTraits::w_reg Rx2; - typedef typename Rx2::lane_type w_type; + typedef typename VTraits::lane_type w_type; Data dataA, dataB; - dataA += std::numeric_limits::max() - R::nlanes; - dataB += std::numeric_limits::min() + R::nlanes; + dataA += std::numeric_limits::max() - VTraits::vlanes(); + dataB += std::numeric_limits::min() + VTraits::vlanes(); R a = dataA, b = dataB; Data dataC; dataC += std::numeric_limits::is_signed ? 
std::numeric_limits::min() : - std::numeric_limits::max() - R::nlanes * (dataB[0] + 1); + std::numeric_limits::max() - VTraits::vlanes() * (dataB[0] + 1); Rx2 c = dataC; Data resD = v_dotprod(a, b), resE = v_dotprod(a, b, c); - const int n = R::nlanes / 2; + const int n = VTraits::vlanes() / 2; w_type sumAB = 0, sumABC = 0, tmp_sum; for (int i = 0; i < n; ++i) { @@ -725,11 +800,11 @@ template struct TheTest TheTest & test_dotprod_expand() { typedef typename V_RegTraits::q_reg Rx4; - typedef typename Rx4::lane_type l4_type; + typedef typename VTraits::lane_type l4_type; Data dataA, dataB; - dataA += std::numeric_limits::max() - R::nlanes; - dataB += std::numeric_limits::min() + R::nlanes; + dataA += std::numeric_limits::max() - VTraits::vlanes(); + dataB += std::numeric_limits::min() + VTraits::vlanes(); R a = dataA, b = dataB; Data dataC; @@ -739,7 +814,7 @@ template struct TheTest resE = v_dotprod_expand(a, b, c); l4_type sumAB = 0, sumABC = 0, tmp_sum; - for (int i = 0; i < Rx4::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); tmp_sum = (l4_type)dataA[i*4] * (l4_type)dataB[i*4] + @@ -766,7 +841,7 @@ template struct TheTest { #if CV_SIMD_64F Data dataA, dataB; - dataA += std::numeric_limits::max() - R::nlanes; + dataA += std::numeric_limits::max() - VTraits::vlanes(); dataB += std::numeric_limits::min(); R a = dataA, b = dataB; @@ -777,7 +852,7 @@ template struct TheTest resB = v_dotprod_expand(b, b), resC = v_dotprod_expand(a, b, c); - const int n = R::nlanes / 2; + const int n = VTraits::vlanes() / 2; for (int i = 0; i < n; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); @@ -797,8 +872,8 @@ template struct TheTest Data dataA, dataB(2); R a = dataA, b = dataB; - Data resC = a & b, resD = a | b, resE = a ^ b, resF = ~a; - for (int i = 0; i < R::nlanes; ++i) + Data resC = v_and(a, b), resD = v_or(a, b), resE = v_xor(a, b), resF = v_not(a); + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", 
i)); EXPECT_EQ(dataA[i] & dataB[i], resC[i]); @@ -817,7 +892,7 @@ template struct TheTest R a = dataA, d = dataD; Data resB = v_sqrt(a), resC = v_invsqrt(a), resE = v_abs(d); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_COMPARE_EQ((float)std::sqrt(dataA[i]), (float)resB[i]); @@ -835,7 +910,7 @@ template struct TheTest R a = dataA, b = dataB; Data resC = v_min(a, b), resD = v_max(a, b); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(std::min(dataA[i], dataB[i]), resC[i]); @@ -863,7 +938,7 @@ template struct TheTest R a = dataA; Data resB = v_popcount(a); - for (int i = 0; i < Ru::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) EXPECT_EQ(popcountTable[i + 1], resB[i]); return *this; @@ -872,7 +947,7 @@ template struct TheTest TheTest & test_absdiff() { typedef typename V_RegTraits::u_reg Ru; - typedef typename Ru::lane_type u_type; + typedef typename VTraits::lane_type u_type; Data dataA(std::numeric_limits::max()), dataB(std::numeric_limits::min()); dataA[0] = (LaneType)-1; @@ -882,7 +957,7 @@ template struct TheTest R a = dataA, b = dataB; Data resC = v_absdiff(a, b); const u_type mask = std::numeric_limits::is_signed ? (u_type)(1 << (sizeof(u_type)*8 - 1)) : 0; - for (int i = 0; i < Ru::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); u_type uA = dataA[i] ^ mask; @@ -902,7 +977,7 @@ template struct TheTest dataB[1] = -2; R a = dataA, b = dataB; Data resC = v_absdiff(a, b); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(dataA[i] > dataB[i] ? 
dataA[i] - dataB[i] : dataB[i] - dataA[i], resC[i]); @@ -920,7 +995,7 @@ template struct TheTest dataB[1] = (LaneType)-2; R a = dataA, b = dataB; Data resC = v_absdiffs(a, b); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { EXPECT_EQ(saturate_cast(std::abs(dataA[i] - dataB[i])), resC[i]); } @@ -930,27 +1005,40 @@ template struct TheTest TheTest & test_reduce() { Data dataA; + LaneType min = VTraits::vlanes(), max = 0; int sum = 0; - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { + min = std::min(min, dataA[i]); + max = std::max(max, dataA[i]); sum += (int)(dataA[i]); // To prevent a constant overflow with int8 } R a = dataA; - EXPECT_EQ((LaneType)1, (LaneType)v_reduce_min(a)); - EXPECT_EQ((LaneType)(R::nlanes), (LaneType)v_reduce_max(a)); + EXPECT_EQ((LaneType)min, (LaneType)v_reduce_min(a)); + EXPECT_EQ((LaneType)max, (LaneType)v_reduce_max(a)); EXPECT_EQ((int)(sum), (int)v_reduce_sum(a)); - dataA[0] += R::nlanes; - R an = dataA; - EXPECT_EQ((LaneType)2, (LaneType)v_reduce_min(an)); + dataA[0] += VTraits::vlanes(); + R an = dataA; + min = VTraits::vlanes(); + for (int i = 0; i < VTraits::vlanes(); ++i) + { + min = std::min(min, dataA[i]); + } + EXPECT_EQ((LaneType)min, (LaneType)v_reduce_min(an)); return *this; } TheTest & test_reduce_sad() { - Data dataA, dataB(R::nlanes/2); + Data dataA, dataB(VTraits::vlanes()/2); R a = dataA; R b = dataB; - EXPECT_EQ((unsigned)(R::nlanes*R::nlanes/4), v_reduce_sad(a, b)); + uint sum = 0; + for (int i = 0; i < VTraits::vlanes(); ++i) + { + sum += std::abs(int(dataA[i] - dataB[i])); + } + EXPECT_EQ(sum, v_reduce_sad(a, b)); return *this; } @@ -958,8 +1046,8 @@ template struct TheTest { typedef typename V_RegTraits::int_reg int_reg; typedef typename V_RegTraits::u_reg uint_reg; - typedef typename int_reg::lane_type int_type; - typedef typename uint_reg::lane_type uint_type; + typedef typename VTraits::lane_type int_type; + typedef typename 
VTraits::lane_type uint_type; Data dataA, dataB(0), dataC, dataD(1), dataE(2); dataA[1] *= (LaneType)-1; @@ -971,18 +1059,18 @@ template struct TheTest all1s; all1s.ui = (uint_type)-1; LaneType mask_one = all1s.l; - dataB[R::nlanes - 1] = mask_one; + dataB[VTraits::vlanes() - 1] = mask_one; R l = dataB; dataB[1] = mask_one; - dataB[R::nlanes / 2] = mask_one; + dataB[VTraits::vlanes() / 2] = mask_one; dataC *= (LaneType)-1; R a = dataA, b = dataB, c = dataC, d = dataD, e = dataE; - dataC[R::nlanes - 1] = 0; + dataC[VTraits::vlanes() - 1] = 0; R nl = dataC; EXPECT_EQ(2, v_signmask(a)); -#if CV_SIMD_WIDTH <= 32 - EXPECT_EQ(2 | (1 << (R::nlanes / 2)) | (1 << (R::nlanes - 1)), v_signmask(b)); +#if (CV_SIMD_WIDTH <= 32) && (!CV_SIMD_SCALABLE) + EXPECT_EQ(2 | (1 << (VTraits::vlanes() / 2)) | (1 << (VTraits::vlanes() - 1)), v_signmask(b)); #endif EXPECT_EQ(false, v_check_all(a)); @@ -996,7 +1084,7 @@ template struct TheTest EXPECT_EQ(true, v_check_any(l)); R f = v_select(b, d, e); Data resF = f; - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); int_type m2 = dataB.as_int(i); @@ -1011,7 +1099,7 @@ template struct TheTest { SCOPED_TRACE(s); typedef typename V_RegTraits::w_reg Rx2; - typedef typename Rx2::lane_type w_type; + typedef typename VTraits::lane_type w_type; Data dataA, dataB; dataA += std::numeric_limits::is_signed ? 
-10 : 10; dataB *= 10; @@ -1027,7 +1115,7 @@ template struct TheTest Data resF(0); v_rshr_pack_store(resF.d, b); - const int n = Rx2::nlanes; + const int n = VTraits::vlanes(); const w_type add = (w_type)1 << (s - 1); for (int i = 0; i < n; ++i) { @@ -1051,7 +1139,7 @@ template struct TheTest //typedef typename V_RegTraits::w_type LaneType_w; typedef typename V_RegTraits::w_reg R2; typedef typename V_RegTraits::int_reg Ri2; - typedef typename Ri2::lane_type w_type; + typedef typename VTraits::lane_type w_type; Data dataA, dataB; dataA += -10; @@ -1068,7 +1156,7 @@ template struct TheTest Data resF(0); v_rshr_pack_u_store(resF.d, b); - const int n = Ri2::nlanes; + const int n = VTraits::vlanes(); const w_type add = (w_type)1 << (s - 1); for (int i = 0; i < n; ++i) { @@ -1090,26 +1178,26 @@ template struct TheTest { // 16-bit Data dataA, dataB; - dataB.fill(0, R::nlanes / 2); + dataB.fill(0, VTraits::vlanes() / 2); R a = dataA, b = dataB; - Data maskA = a == b, maskB = a != b; + Data maskA = v_eq(a, b), maskB = v_ne(a, b); a = maskA; b = maskB; Data res = v_pack_b(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); - for (int i = 0; i < v_uint16::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(maskA[i * 2], res[i]); - EXPECT_EQ(maskB[i * 2], res[i + v_uint16::nlanes]); + EXPECT_EQ(maskB[i * 2], res[i + VTraits::vlanes()]); } // 32-bit Data dataC, dataD; - dataD.fill(0, R::nlanes / 2); + dataD.fill(0, VTraits::vlanes() / 2); R c = dataC, d = dataD; - Data maskC = c == d, maskD = c != d; + Data maskC = v_eq(c, d), maskD = v_ne(c, d); c = maskC; d = maskD; res = v_pack_b @@ -1118,21 +1206,21 @@ template struct TheTest v_reinterpret_as_u32(c), v_reinterpret_as_u32(d) ); - for (int i = 0; i < v_uint32::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(maskA[i * 4], res[i]); - EXPECT_EQ(maskB[i * 4], res[i + v_uint32::nlanes]); - EXPECT_EQ(maskC[i * 4], 
res[i + v_uint32::nlanes * 2]); - EXPECT_EQ(maskD[i * 4], res[i + v_uint32::nlanes * 3]); + EXPECT_EQ(maskB[i * 4], res[i + VTraits::vlanes()]); + EXPECT_EQ(maskC[i * 4], res[i + VTraits::vlanes() * 2]); + EXPECT_EQ(maskD[i * 4], res[i + VTraits::vlanes() * 3]); } // 64-bit Data dataE, dataF, dataG(0), dataH(0xFF); - dataF.fill(0, R::nlanes / 2); + dataF.fill(0, VTraits::vlanes() / 2); R e = dataE, f = dataF, g = dataG, h = dataH; - Data maskE = e == f, maskF = e != f; + Data maskE = v_eq(e, f), maskF = v_ne(e, f); e = maskE; f = maskF; res = v_pack_b @@ -1143,18 +1231,18 @@ template struct TheTest v_reinterpret_as_u64(g), v_reinterpret_as_u64(h) ); - for (int i = 0; i < v_uint64::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(maskA[i * 8], res[i]); - EXPECT_EQ(maskB[i * 8], res[i + v_uint64::nlanes]); - EXPECT_EQ(maskC[i * 8], res[i + v_uint64::nlanes * 2]); - EXPECT_EQ(maskD[i * 8], res[i + v_uint64::nlanes * 3]); - - EXPECT_EQ(maskE[i * 8], res[i + v_uint64::nlanes * 4]); - EXPECT_EQ(maskF[i * 8], res[i + v_uint64::nlanes * 5]); - EXPECT_EQ(dataG[i * 8], res[i + v_uint64::nlanes * 6]); - EXPECT_EQ(dataH[i * 8], res[i + v_uint64::nlanes * 7]); + EXPECT_EQ(maskB[i * 8], res[i + VTraits::vlanes()]); + EXPECT_EQ(maskC[i * 8], res[i + VTraits::vlanes() * 2]); + EXPECT_EQ(maskD[i * 8], res[i + VTraits::vlanes() * 3]); + + EXPECT_EQ(maskE[i * 8], res[i + VTraits::vlanes() * 4]); + EXPECT_EQ(maskF[i * 8], res[i + VTraits::vlanes() * 5]); + EXPECT_EQ(dataG[i * 8], res[i + VTraits::vlanes() * 6]); + EXPECT_EQ(dataH[i * 8], res[i + VTraits::vlanes() * 7]); } return *this; @@ -1174,7 +1262,7 @@ template struct TheTest Data resC = c, resD = d, resE = e, resF = f, resLo = lo, resHi = hi; - const int n = R::nlanes/2; + const int n = VTraits::vlanes()/2; for (int i = 0; i < n; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); @@ -1204,10 +1292,10 @@ template struct TheTest Data resB = v_reverse(a); - for (int i = 0; i < 
R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ(dataA[R::nlanes - i - 1], resB[i]); + EXPECT_EQ(dataA[VTraits::vlanes() - i - 1], resB[i]); } return *this; @@ -1223,11 +1311,11 @@ template struct TheTest Data resC = v_extract(a, b); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - if (i + s >= R::nlanes) - EXPECT_EQ(dataB[i - R::nlanes + s], resC[i]); + if (i + s >= VTraits::vlanes()) + EXPECT_EQ(dataB[i - VTraits::vlanes() + s], resC[i]); else EXPECT_EQ(dataA[i + s], resC[i]); } @@ -1249,16 +1337,16 @@ template struct TheTest Data resE = v_rotate_left(a); Data resF = v_rotate_left(a, b); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - if (i + s >= R::nlanes) + if (i + s >= VTraits::vlanes()) { EXPECT_EQ((LaneType)0, resC[i]); - EXPECT_EQ(dataB[i - R::nlanes + s], resD[i]); + EXPECT_EQ(dataB[i - VTraits::vlanes() + s], resD[i]); - EXPECT_EQ((LaneType)0, resE[i - R::nlanes + s]); - EXPECT_EQ(dataB[i], resF[i - R::nlanes + s]); + EXPECT_EQ((LaneType)0, resE[i - VTraits::vlanes() + s]); + EXPECT_EQ(dataB[i], resF[i - VTraits::vlanes() + s]); } else { @@ -1287,6 +1375,19 @@ template struct TheTest return *this; } + TheTest & test_extract_highest() + { + Data dataA; + LaneType test_value = (LaneType)(VTraits::vlanes()-1 + 50); + dataA[VTraits::vlanes()-1] = test_value; + R a = dataA; + + LaneType res = v_extract_highest(a); + EXPECT_EQ(test_value, res); + + return *this; + } + template TheTest & test_broadcast_element() { @@ -1298,7 +1399,24 @@ template struct TheTest Data res = v_broadcast_element(a); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) + { + SCOPED_TRACE(i); + EXPECT_EQ(test_value, res[i]); + } + return *this; + } + + TheTest & test_broadcast_highest() + { + Data dataA; + LaneType test_value = 
(LaneType)(VTraits::vlanes()-1 + 50); + dataA[VTraits::vlanes()-1] = test_value; + R a = dataA; + + Data res = v_broadcast_highest(a); + + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(i); EXPECT_EQ(test_value, res[i]); @@ -1323,11 +1441,11 @@ template struct TheTest resG = v_sqr_magnitude(a1, a2), resH = v_muladd(a1, a2, a3); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(cvRound(data1[i]), resB[i]); - EXPECT_EQ((typename Ri::lane_type)data1[i], resC[i]); + EXPECT_EQ((typename VTraits::lane_type)data1[i], resC[i]); EXPECT_EQ(cvFloor(data1[i]), resD[i]); EXPECT_EQ(cvCeil(data1[i]), resE[i]); @@ -1347,11 +1465,11 @@ template struct TheTest R a = dataA; Rt b = v_cvt_f32(a); Data resB = b; - int n = std::min(Rt::nlanes, R::nlanes); + int n = std::min(VTraits::vlanes(), VTraits::vlanes()); for (int i = 0; i < n; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]); + EXPECT_EQ((typename VTraits::lane_type)dataA[i], resB[i]); } return *this; } @@ -1367,16 +1485,16 @@ template struct TheTest Rt c = v_cvt_f64_high(a); Data resB = b; Data resC = c; - int n = std::min(Rt::nlanes, R::nlanes); + int n = std::min(VTraits::vlanes(), VTraits::vlanes()); for (int i = 0; i < n; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ((typename Rt::lane_type)dataA[i], resB[i]); + EXPECT_EQ((typename VTraits::lane_type)dataA[i], resB[i]); } for (int i = 0; i < n; ++i) { SCOPED_TRACE(cv::format("i=%d", i)); - EXPECT_EQ((typename Rt::lane_type)dataA[i+n], resC[i]); + EXPECT_EQ((typename VTraits::lane_type)dataA[i+n], resC[i]); } #endif return *this; @@ -1387,7 +1505,7 @@ template struct TheTest #if CV_SIMD_64F Data dataA(std::numeric_limits::max()), dataB(std::numeric_limits::min()); - dataB += R::nlanes; + dataB += VTraits::vlanes(); R a = dataA, b = dataB; v_float64 c = v_cvt_f64(a), d = v_cvt_f64(b); @@ -1395,7 +1513,7 @@ template struct 
TheTest Data resC = c; Data resD = d; - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ((double)dataA[i], resC[i]); @@ -1414,8 +1532,9 @@ template struct TheTest R v = dataV, a = dataA, b = dataB, c = dataC, d = dataD; Data res = v_matmul(v, a, b, c, d); - for (int i = 0; i < R::nlanes; i += 4) - { + // for (int i = 0; i < VTraits::vlanes(); i += 4) + // { + int i = 0; for (int j = i; j < i + 4; ++j) { SCOPED_TRACE(cv::format("i=%d j=%d", i, j)); @@ -1425,11 +1544,12 @@ template struct TheTest + dataV[i + 3] * dataD[j]; EXPECT_COMPARE_EQ(val, res[j]); } - } + // } Data resAdd = v_matmuladd(v, a, b, c, d); - for (int i = 0; i < R::nlanes; i += 4) - { + // for (int i = 0; i < VTraits::vlanes(); i += 4) + // { + i = 0; for (int j = i; j < i + 4; ++j) { SCOPED_TRACE(cv::format("i=%d j=%d", i, j)); @@ -1439,7 +1559,7 @@ template struct TheTest + dataD[j]; EXPECT_COMPARE_EQ(val, resAdd[j]); } - } + // } return *this; } @@ -1455,8 +1575,9 @@ template struct TheTest e, f, g, h); Data res[4] = {e, f, g, h}; - for (int i = 0; i < R::nlanes; i += 4) - { + // for (int i = 0; i < VTraits::vlanes(); i += 4) + // { + int i = 0; for (int j = 0; j < 4; ++j) { SCOPED_TRACE(cv::format("i=%d j=%d", i, j)); @@ -1465,7 +1586,7 @@ template struct TheTest EXPECT_EQ(dataC[i + j], res[j][i + 2]); EXPECT_EQ(dataD[i + j], res[j][i + 3]); } - } + // } return *this; } @@ -1479,14 +1600,15 @@ template struct TheTest R a = dataA, b = dataB, c = dataC, d = dataD; Data res = v_reduce_sum4(a, b, c, d); - for (int i = 0; i < R::nlanes; i += 4) - { + // for (int i = 0; i < VTraits::vlanes(); i += 4) + // { + int i = 0; SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_COMPARE_EQ(dataA.sum(i, 4), res[i]); EXPECT_COMPARE_EQ(dataB.sum(i, 4), res[i + 1]); EXPECT_COMPARE_EQ(dataC.sum(i, 4), res[i + 2]); EXPECT_COMPARE_EQ(dataD.sum(i, 4), res[i + 3]); - } + // } return *this; } @@ -1495,19 +1617,19 @@ template struct TheTest 
printf("test_loadstore_fp16_f32 ...\n"); AlignedData data; data.a.clear(); data.a.d[0] = 0x3c00; // 1.0 - data.a.d[R::nlanes - 1] = (unsigned short)0xc000; // -2.0 + data.a.d[VTraits::vlanes() - 1] = (unsigned short)0xc000; // -2.0 AlignedData data_f32; data_f32.a.clear(); AlignedData out; R r1 = vx_load_expand((const cv::float16_t*)data.a.d); R r2(r1); - EXPECT_EQ(1.0f, r1.get0()); + EXPECT_EQ(1.0f, v_get0(r1)); v_store(data_f32.a.d, r2); - EXPECT_EQ(-2.0f, data_f32.a.d[R::nlanes - 1]); + EXPECT_EQ(-2.0f, data_f32.a.d[VTraits::vlanes() - 1]); out.a.clear(); v_pack_store((cv::float16_t*)out.a.d, r2); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { EXPECT_EQ(data.a[i], out.a[i]) << "i=" << i; } @@ -1523,18 +1645,18 @@ template struct TheTest AlignedData out; // check if addresses are aligned and unaligned respectively - EXPECT_EQ((size_t)0, (size_t)&data.a.d % CV_SIMD_WIDTH); - EXPECT_NE((size_t)0, (size_t)&data.u.d % CV_SIMD_WIDTH); - EXPECT_EQ((size_t)0, (size_t)&out.a.d % CV_SIMD_WIDTH); - EXPECT_NE((size_t)0, (size_t)&out.u.d % CV_SIMD_WIDTH); + EXPECT_EQ((size_t)0, (size_t)&data.a.d % VTraits::max_nlanes); + EXPECT_NE((size_t)0, (size_t)&data.u.d % VTraits::max_nlanes); + EXPECT_EQ((size_t)0, (size_t)&out.a.d % VTraits::max_nlanes); + EXPECT_NE((size_t)0, (size_t)&out.u.d % VTraits::max_nlanes); // check some initialization methods R r1 = data.u; R r2 = vx_load_expand((const float16_t*)data.a.d); R r3(r2); - EXPECT_EQ(data.u[0], r1.get0()); - EXPECT_EQ(data.a[0], r2.get0()); - EXPECT_EQ(data.a[0], r3.get0()); + EXPECT_EQ(data.u[0], v_get0(r1)); + EXPECT_EQ(data.a[0], v_get0(r2)); + EXPECT_EQ(data.a[0], v_get0(r3)); // check some store methods out.a.clear(); @@ -1552,8 +1674,8 @@ template struct TheTest v_float32 r1 = vx_load(data.a.d); v_float16 r2 = v_cvt_f16(r1, vx_setzero_f32()); v_float32 r3 = v_cvt_f32(r2); - EXPECT_EQ(0x3c00, r2.get0()); - EXPECT_EQ(r3.get0(), r1.get0()); + EXPECT_EQ(0x3c00, v_get0(r2)); + 
EXPECT_EQ(v_get0(r3), v_get0(r1)); return *this; } @@ -1565,7 +1687,7 @@ template struct TheTest Data dataA, dataB; R a = dataA, b = dataB; - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { dataA[i] = dataB[i]; } @@ -1576,14 +1698,14 @@ template struct TheTest Data resC = (a == b); Data resD = (a != b); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0); EXPECT_EQ(dataA[i] != dataB[i], resD[i] != 0); } - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { dataA[i] = dataB[i] = (LaneType)-1; } @@ -1593,7 +1715,7 @@ template struct TheTest resC = (a == b); resD = (a != b); - for (int i = 0; i < R::nlanes; ++i) + for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ(dataA[i] == dataB[i], resC[i] != 0); @@ -1605,10 +1727,8 @@ template struct TheTest }; -#if 1 +#if CV_SIMD #define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*(int)sizeof(v_uint8), CV__TRACE_FUNCTION); -#endif - //============= 8-bit integer ===================================================================== void test_hal_intrin_uint8() @@ -1955,6 +2075,122 @@ void test_hal_intrin_float16() std::cout << "SKIP: CV_FP16 is not available" << std::endl; #endif } +#elif CV_SIMD_SCALABLE //Temporary +#define DUMP_ENTRY(type) printf("SIMD: %s\n", CV__TRACE_FUNCTION); + + +//============= 8-bit integer ===================================================================== + +void test_hal_intrin_uint8() +{ + DUMP_ENTRY(v_uint8); + // typedef v_uint8 R; + TheTest() + .test_loadstore() + .test_min_max() + ; +} + +void test_hal_intrin_int8() +{ + DUMP_ENTRY(v_int8); + // typedef v_int8 R; + TheTest() + .test_loadstore() + .test_min_max() + ; +} + +//============= 16-bit integer ===================================================================== + +void test_hal_intrin_uint16() +{ + 
DUMP_ENTRY(v_uint16); + // typedef v_uint16 R; + TheTest() + .test_loadstore() + .test_min_max() + ; +} + +void test_hal_intrin_int16() +{ + DUMP_ENTRY(v_int16); + // typedef v_int16 R; + TheTest() + .test_loadstore() + .test_min_max() + ; +} + +//============= 32-bit integer ===================================================================== + +void test_hal_intrin_uint32() +{ + DUMP_ENTRY(v_uint32); + // typedef v_uint32 R; + TheTest() + .test_loadstore() + .test_min_max() + ; +} + +void test_hal_intrin_int32() +{ + DUMP_ENTRY(v_int32); + // typedef v_int32 R; + TheTest() + .test_loadstore() + .test_min_max() + ; +} + +//============= 64-bit integer ===================================================================== + +void test_hal_intrin_uint64() +{ + DUMP_ENTRY(v_uint64); + // typedef v_uint64 R; + TheTest() + .test_loadstore() + ; +} + +void test_hal_intrin_int64() +{ + DUMP_ENTRY(v_int64); + // typedef v_int64 R; + TheTest() + .test_loadstore() + ; +} + +//============= Floating point ===================================================================== +void test_hal_intrin_float32() +{ + DUMP_ENTRY(v_float32); + // typedef v_float32 R; + TheTest() + .test_loadstore() + .test_min_max() + ; +} + +void test_hal_intrin_float64() +{ + DUMP_ENTRY(v_float64); +#if CV_SIMD_64F + // typedef v_float64 R; + TheTest() + .test_loadstore() + .test_min_max() + ; + +#endif +} + +#endif + /*#if defined(CV_CPU_DISPATCH_MODE_FP16) && CV_CPU_DISPATCH_MODE == FP16 void test_hal_intrin_float16() From e9fc981349693fedf7ac9a8d9f11a49c8f0925ea Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Thu, 30 Jun 2022 19:08:59 +0000 Subject: [PATCH 06/12] Enable the new RVV (scalable) backend. 
--- modules/imgproc/src/median_blur.simd.hpp | 201 ++++++----------------- 1 file changed, 47 insertions(+), 154 deletions(-) diff --git a/modules/imgproc/src/median_blur.simd.hpp b/modules/imgproc/src/median_blur.simd.hpp index 068b7d638ff3..5759c7aade65 100644 --- a/modules/imgproc/src/median_blur.simd.hpp +++ b/modules/imgproc/src/median_blur.simd.hpp @@ -497,7 +497,6 @@ struct MinMax8u { typedef uchar value_type; typedef int arg_type; - enum { SIZE = 1 }; arg_type load(const uchar* ptr) { return *ptr; } void store(uchar* ptr, arg_type val) { *ptr = (uchar)val; } void operator()(arg_type& a, arg_type& b) const @@ -511,7 +510,6 @@ struct MinMax16u { typedef ushort value_type; typedef int arg_type; - enum { SIZE = 1 }; arg_type load(const ushort* ptr) { return *ptr; } void store(ushort* ptr, arg_type val) { *ptr = (ushort)val; } void operator()(arg_type& a, arg_type& b) const @@ -526,7 +524,6 @@ struct MinMax16s { typedef short value_type; typedef int arg_type; - enum { SIZE = 1 }; arg_type load(const short* ptr) { return *ptr; } void store(short* ptr, arg_type val) { *ptr = (short)val; } void operator()(arg_type& a, arg_type& b) const @@ -541,7 +538,6 @@ struct MinMax32f { typedef float value_type; typedef float arg_type; - enum { SIZE = 1 }; arg_type load(const float* ptr) { return *ptr; } void store(float* ptr, arg_type val) { *ptr = val; } void operator()(arg_type& a, arg_type& b) const @@ -552,14 +548,13 @@ struct MinMax32f } }; -#if CV_SIMD +#if CV_SIMD || CV_SIMD_SCALABLE struct MinMaxVec8u { typedef uchar value_type; - typedef v_uint8x16 arg_type; - enum { SIZE = v_uint8x16::nlanes }; - arg_type load(const uchar* ptr) { return v_load(ptr); } + typedef v_uint8 arg_type; + arg_type load(const uchar* ptr) { return vx_load(ptr); } void store(uchar* ptr, const arg_type &val) { v_store(ptr, val); } void operator()(arg_type& a, arg_type& b) const { @@ -567,27 +562,14 @@ struct MinMaxVec8u a = v_min(a, b); b = v_max(b, t); } -#if CV_SIMD_WIDTH > 16 - typedef 
v_uint8 warg_type; - enum { WSIZE = v_uint8::nlanes }; - warg_type wload(const uchar* ptr) { return vx_load(ptr); } - void store(uchar* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif }; struct MinMaxVec16u { typedef ushort value_type; - typedef v_uint16x8 arg_type; - enum { SIZE = v_uint16x8::nlanes }; - arg_type load(const ushort* ptr) { return v_load(ptr); } + typedef v_uint16 arg_type; + arg_type load(const ushort* ptr) { return vx_load(ptr); } void store(ushort* ptr, const arg_type &val) { v_store(ptr, val); } void operator()(arg_type& a, arg_type& b) const { @@ -595,27 +577,14 @@ struct MinMaxVec16u a = v_min(a, b); b = v_max(b, t); } -#if CV_SIMD_WIDTH > 16 - typedef v_uint16 warg_type; - enum { WSIZE = v_uint16::nlanes }; - warg_type wload(const ushort* ptr) { return vx_load(ptr); } - void store(ushort* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif }; struct MinMaxVec16s { typedef short value_type; - typedef v_int16x8 arg_type; - enum { SIZE = v_int16x8::nlanes }; - arg_type load(const short* ptr) { return v_load(ptr); } + typedef v_int16 arg_type; + arg_type load(const short* ptr) { return vx_load(ptr); } void store(short* ptr, const arg_type &val) { v_store(ptr, val); } void operator()(arg_type& a, arg_type& b) const { @@ -623,27 +592,14 @@ struct MinMaxVec16s a = v_min(a, b); b = v_max(b, t); } -#if CV_SIMD_WIDTH > 16 - typedef v_int16 warg_type; - enum { WSIZE = v_int16::nlanes }; - warg_type wload(const short* ptr) { return vx_load(ptr); } - void store(short* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif }; struct MinMaxVec32f { typedef float value_type; - typedef 
v_float32x4 arg_type; - enum { SIZE = v_float32x4::nlanes }; - arg_type load(const float* ptr) { return v_load(ptr); } + typedef v_float32 arg_type; + arg_type load(const float* ptr) { return vx_load(ptr); } void store(float* ptr, const arg_type &val) { v_store(ptr, val); } void operator()(arg_type& a, arg_type& b) const { @@ -651,18 +607,6 @@ struct MinMaxVec32f a = v_min(a, b); b = v_max(b, t); } -#if CV_SIMD_WIDTH > 16 - typedef v_float32 warg_type; - enum { WSIZE = v_float32::nlanes }; - warg_type wload(const float* ptr) { return vx_load(ptr); } - void store(float* ptr, const warg_type &val) { v_store(ptr, val); } - void operator()(warg_type& a, warg_type& b) const - { - warg_type t = a; - a = v_min(a, b); - b = v_max(b, t); - } -#endif }; #else @@ -683,9 +627,6 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) typedef typename Op::value_type T; typedef typename Op::arg_type WT; typedef typename VecOp::arg_type VT; -#if CV_SIMD_WIDTH > 16 - typedef typename VecOp::warg_type WVT; -#endif const T* src = _src.ptr(); T* dst = _dst.ptr(); @@ -747,22 +688,11 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) if( limit == size.width ) break; -#if CV_SIMD_WIDTH > 16 - for( ; j <= size.width - VecOp::WSIZE - cn; j += VecOp::WSIZE ) - { - WVT p0 = vop.wload(row0+j-cn), p1 = vop.wload(row0+j), p2 = vop.wload(row0+j+cn); - WVT p3 = vop.wload(row1+j-cn), p4 = vop.wload(row1+j), p5 = vop.wload(row1+j+cn); - WVT p6 = vop.wload(row2+j-cn), p7 = vop.wload(row2+j), p8 = vop.wload(row2+j+cn); - - vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1); - vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5); - vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7); - vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); - vop(p4, p2); vop(p6, p4); vop(p4, p2); - vop.store(dst+j, p4); - } +#if CV_SIMD || CV_SIMD_SCALABLE + for( ; j <= size.width - VTraits::vlanes() - cn; j += VTraits::vlanes() ) +#else + for( ; j <= size.width - 1 - cn; j += 1 ) #endif - for( ; j <= size.width 
- VecOp::SIZE - cn; j += VecOp::SIZE ) { VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn); VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn); @@ -862,79 +792,42 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) if( limit == size.width ) break; -#if CV_SIMD_WIDTH > 16 - for( ; j <= size.width - VecOp::WSIZE - cn*2; j += VecOp::WSIZE ) - { - WVT p[25]; - for( k = 0; k < 5; k++ ) - { - const T* rowk = row[k]; - p[k*5] = vop.wload(rowk+j-cn*2); p[k*5+1] = vop.wload(rowk+j-cn); - p[k*5+2] = vop.wload(rowk+j); p[k*5+3] = vop.wload(rowk+j+cn); - p[k*5+4] = vop.wload(rowk+j+cn*2); - } - - vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]); - vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]); - vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]); - vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]); - vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]); - vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]); - vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]); - vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]); - vop(p[9], p[10]); vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]); - vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]); - vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]); - vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]); - vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]); - vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]); - vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); 
vop(p[20], p[24]); vop(p[14], p[16]); - vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]); - vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]); - vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]); - vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]); - vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]); - vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]); - vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]); - vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]); - vop.store(dst+j, p[12]); - } +#if CV_SIMD || CV_SIMD_SCALABLE + for( ; j <= size.width - VTraits::vlanes() - cn*2; j += VTraits::vlanes() ) +#else + for( ; j <= size.width - 1 - cn*2; j += 1 ) #endif - for( ; j <= size.width - VecOp::SIZE - cn*2; j += VecOp::SIZE ) { - VT p[25]; - for( k = 0; k < 5; k++ ) - { - const T* rowk = row[k]; - p[k*5] = vop.load(rowk+j-cn*2); p[k*5+1] = vop.load(rowk+j-cn); - p[k*5+2] = vop.load(rowk+j); p[k*5+3] = vop.load(rowk+j+cn); - p[k*5+4] = vop.load(rowk+j+cn*2); - } - - vop(p[1], p[2]); vop(p[0], p[1]); vop(p[1], p[2]); vop(p[4], p[5]); vop(p[3], p[4]); - vop(p[4], p[5]); vop(p[0], p[3]); vop(p[2], p[5]); vop(p[2], p[3]); vop(p[1], p[4]); - vop(p[1], p[2]); vop(p[3], p[4]); vop(p[7], p[8]); vop(p[6], p[7]); vop(p[7], p[8]); - vop(p[10], p[11]); vop(p[9], p[10]); vop(p[10], p[11]); vop(p[6], p[9]); vop(p[8], p[11]); - vop(p[8], p[9]); vop(p[7], p[10]); vop(p[7], p[8]); vop(p[9], p[10]); vop(p[0], p[6]); - vop(p[4], p[10]); vop(p[4], p[6]); vop(p[2], p[8]); vop(p[2], p[4]); vop(p[6], p[8]); - vop(p[1], p[7]); vop(p[5], p[11]); vop(p[5], p[7]); vop(p[3], p[9]); vop(p[3], p[5]); - vop(p[7], p[9]); vop(p[1], p[2]); vop(p[3], p[4]); vop(p[5], p[6]); vop(p[7], p[8]); - vop(p[9], p[10]); 
vop(p[13], p[14]); vop(p[12], p[13]); vop(p[13], p[14]); vop(p[16], p[17]); - vop(p[15], p[16]); vop(p[16], p[17]); vop(p[12], p[15]); vop(p[14], p[17]); vop(p[14], p[15]); - vop(p[13], p[16]); vop(p[13], p[14]); vop(p[15], p[16]); vop(p[19], p[20]); vop(p[18], p[19]); - vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[21], p[23]); vop(p[22], p[24]); - vop(p[22], p[23]); vop(p[18], p[21]); vop(p[20], p[23]); vop(p[20], p[21]); vop(p[19], p[22]); - vop(p[22], p[24]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[12], p[18]); - vop(p[16], p[22]); vop(p[16], p[18]); vop(p[14], p[20]); vop(p[20], p[24]); vop(p[14], p[16]); - vop(p[18], p[20]); vop(p[22], p[24]); vop(p[13], p[19]); vop(p[17], p[23]); vop(p[17], p[19]); - vop(p[15], p[21]); vop(p[15], p[17]); vop(p[19], p[21]); vop(p[13], p[14]); vop(p[15], p[16]); - vop(p[17], p[18]); vop(p[19], p[20]); vop(p[21], p[22]); vop(p[23], p[24]); vop(p[0], p[12]); - vop(p[8], p[20]); vop(p[8], p[12]); vop(p[4], p[16]); vop(p[16], p[24]); vop(p[12], p[16]); - vop(p[2], p[14]); vop(p[10], p[22]); vop(p[10], p[14]); vop(p[6], p[18]); vop(p[6], p[10]); - vop(p[10], p[12]); vop(p[1], p[13]); vop(p[9], p[21]); vop(p[9], p[13]); vop(p[5], p[17]); - vop(p[13], p[17]); vop(p[3], p[15]); vop(p[11], p[23]); vop(p[11], p[15]); vop(p[7], p[19]); - vop(p[7], p[11]); vop(p[11], p[13]); vop(p[11], p[12]); - vop.store(dst+j, p[12]); + VT p0 = vop.load(row[0]+j-cn*2), p5 = vop.load(row[1]+j-cn*2), p10 = vop.load(row[2]+j-cn*2), p15 = vop.load(row[3]+j-cn*2), p20 = vop.load(row[4]+j-cn*2); + VT p1 = vop.load(row[0]+j-cn*1), p6 = vop.load(row[1]+j-cn*1), p11 = vop.load(row[2]+j-cn*1), p16 = vop.load(row[3]+j-cn*1), p21 = vop.load(row[4]+j-cn*1); + VT p2 = vop.load(row[0]+j-cn*0), p7 = vop.load(row[1]+j-cn*0), p12 = vop.load(row[2]+j-cn*0), p17 = vop.load(row[3]+j-cn*0), p22 = vop.load(row[4]+j-cn*0); + VT p3 = vop.load(row[0]+j+cn*1), p8 = vop.load(row[1]+j+cn*1), p13 = vop.load(row[2]+j+cn*1), p18 = 
vop.load(row[3]+j+cn*1), p23 = vop.load(row[4]+j+cn*1); + VT p4 = vop.load(row[0]+j+cn*2), p9 = vop.load(row[1]+j+cn*2), p14 = vop.load(row[2]+j+cn*2), p19 = vop.load(row[3]+j+cn*2), p24 = vop.load(row[4]+j+cn*2); + + vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4); + vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4); + vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8); + vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11); + vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6); + vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8); + vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5); + vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8); + vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17); + vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15); + vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19); + vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24); + vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22); + vop(p22, p24); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p12, p18); + vop(p16, p22); vop(p16, p18); vop(p14, p20); vop(p20, p24); vop(p14, p16); + vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19); + vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16); + vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12); + vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16); + vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10); + vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17); + vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19); + vop(p7, p11); vop(p11, p13); vop(p11, p12); + vop.store(dst+j, p12); } limit = size.width; From d20bb8d708e9954a301c522126978843dcbf5e95 Mon Sep 17 00:00:00 2001 From: HAN 
Liutong Date: Fri, 1 Jul 2022 06:20:08 +0000 Subject: [PATCH 07/12] Remove whitespace. --- .../core/include/opencv2/core/hal/intrin.hpp | 38 +++++++++---------- .../opencv2/core/hal/intrin_riscv_vector.hpp | 34 ++++++++--------- modules/core/test/test_intrin_utils.hpp | 2 +- modules/imgproc/src/color_yuv.simd.hpp | 2 +- 4 files changed, 38 insertions(+), 38 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 1f1c5a290b5f..21434ec86417 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -704,7 +704,7 @@ namespace CV__SIMD_NAMESPACE { struct VTraits; template <> - struct VTraits + struct VTraits { static inline int vlanes() { return v_uint8::nlanes; } static const int nlanes = v_uint8::nlanes; @@ -712,76 +712,76 @@ namespace CV__SIMD_NAMESPACE { static const int max_nlanes = nlanes; }; template <> - struct VTraits + struct VTraits { static inline int vlanes() { return v_int8::nlanes; } - static const int nlanes = v_int8::nlanes; + static const int nlanes = v_int8::nlanes; using lane_type = schar; static const int max_nlanes = nlanes; }; template <> - struct VTraits + struct VTraits { static inline int vlanes() { return v_uint16::nlanes; } - static const int nlanes = v_uint16::nlanes; + static const int nlanes = v_uint16::nlanes; using lane_type = ushort; static const int max_nlanes = nlanes; }; template <> - struct VTraits + struct VTraits { static inline int vlanes() { return v_int16::nlanes; } - static const int nlanes = v_int16::nlanes; + static const int nlanes = v_int16::nlanes; using lane_type = short; static const int max_nlanes = nlanes; }; template <> - struct VTraits + struct VTraits { static inline int vlanes() { return v_uint32::nlanes; } - static const int nlanes = v_uint32::nlanes; + static const int nlanes = v_uint32::nlanes; using lane_type = uint; static const int max_nlanes = nlanes; }; template <> - 
struct VTraits + struct VTraits { static inline int vlanes() { return v_int32::nlanes; } - static const int nlanes = v_int32::nlanes; + static const int nlanes = v_int32::nlanes; using lane_type = int; static const int max_nlanes = nlanes; }; template <> - struct VTraits + struct VTraits { static inline int vlanes() { return v_float32::nlanes; } - static const int nlanes = v_float32::nlanes; + static const int nlanes = v_float32::nlanes; using lane_type = float; static const int max_nlanes = nlanes; }; template <> - struct VTraits + struct VTraits { static inline int vlanes() { return v_uint64::nlanes; } - static const int nlanes = v_uint64::nlanes; + static const int nlanes = v_uint64::nlanes; using lane_type = uint64; static const int max_nlanes = nlanes; }; template <> - struct VTraits + struct VTraits { static inline int vlanes() { return v_int64::nlanes; } - static const int nlanes = v_int64::nlanes; + static const int nlanes = v_int64::nlanes; using lane_type = int64; static const int max_nlanes = nlanes; }; #if CV_SIMD_64F template <> - struct VTraits + struct VTraits { static inline int vlanes() { return v_float64::nlanes; } - static const int nlanes = v_float64::nlanes; + static const int nlanes = v_float64::nlanes; using lane_type = double; static const int max_nlanes = nlanes; }; diff --git a/modules/core/include/opencv2/core/hal/intrin_riscv_vector.hpp b/modules/core/include/opencv2/core/hal/intrin_riscv_vector.hpp index 9758de191f46..2010a817516b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_riscv_vector.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_riscv_vector.hpp @@ -44,65 +44,65 @@ template struct VTraits; template <> -struct VTraits +struct VTraits { - static inline int vlanes() { return vsetvlmax_e8m1(); } + static inline int vlanes() { return vsetvlmax_e8m1(); } using lane_type = uchar; static const int max_nlanes = CV_RVV_MAX_VLEN/8; }; template <> -struct VTraits +struct VTraits { - static inline int vlanes() { return 
vsetvlmax_e8m1(); } + static inline int vlanes() { return vsetvlmax_e8m1(); } using lane_type = schar; static const int max_nlanes = CV_RVV_MAX_VLEN/8; }; template <> -struct VTraits +struct VTraits { - static inline int vlanes() { return vsetvlmax_e16m1(); } + static inline int vlanes() { return vsetvlmax_e16m1(); } using lane_type = ushort; static const int max_nlanes = CV_RVV_MAX_VLEN/16; }; template <> -struct VTraits +struct VTraits { - static inline int vlanes() { return vsetvlmax_e16m1(); } + static inline int vlanes() { return vsetvlmax_e16m1(); } using lane_type = short; static const int max_nlanes = CV_RVV_MAX_VLEN/16; }; template <> -struct VTraits +struct VTraits { - static inline int vlanes() { return vsetvlmax_e32m1(); } + static inline int vlanes() { return vsetvlmax_e32m1(); } using lane_type = uint; static const int max_nlanes = CV_RVV_MAX_VLEN/32; }; template <> -struct VTraits +struct VTraits { - static inline int vlanes() { return vsetvlmax_e32m1(); } + static inline int vlanes() { return vsetvlmax_e32m1(); } using lane_type = int; static const int max_nlanes = CV_RVV_MAX_VLEN/32; }; template <> -struct VTraits +struct VTraits { - static inline int vlanes() { return vsetvlmax_e32m1(); } + static inline int vlanes() { return vsetvlmax_e32m1(); } using lane_type = float; static const int max_nlanes = CV_RVV_MAX_VLEN/32; }; template <> -struct VTraits +struct VTraits { static inline int vlanes() { return vsetvlmax_e64m1(); } using lane_type = uint64; static const int max_nlanes = CV_RVV_MAX_VLEN/64; }; template <> -struct VTraits +struct VTraits { static inline int vlanes() { return vsetvlmax_e64m1(); } using lane_type = int64; @@ -110,7 +110,7 @@ struct VTraits }; #if CV_SIMD_SCALABLE_64F template <> -struct VTraits +struct VTraits { static inline int vlanes() { return vsetvlmax_e64m1(); } using lane_type = double; diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 9397e26c1b16..e6c9bcea5121 100644 
--- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -1018,7 +1018,7 @@ template struct TheTest EXPECT_EQ((LaneType)max, (LaneType)v_reduce_max(a)); EXPECT_EQ((int)(sum), (int)v_reduce_sum(a)); dataA[0] += VTraits::vlanes(); - R an = dataA; + R an = dataA; min = VTraits::vlanes(); for (int i = 0; i < VTraits::vlanes(); ++i) { diff --git a/modules/imgproc/src/color_yuv.simd.hpp b/modules/imgproc/src/color_yuv.simd.hpp index dd7d21c75db3..b5f73d873a73 100644 --- a/modules/imgproc/src/color_yuv.simd.hpp +++ b/modules/imgproc/src/color_yuv.simd.hpp @@ -1081,7 +1081,7 @@ static inline void yRGBuvToRGBA(const uchar vy, const int ruv, const int guv, co a = uchar(0xff); } -#if CV_SIMD +#if CV_SIMD static inline void yRGBuvToRGBA(const v_uint8& vy, const v_int32 (&ruv)[4], const v_int32 (&guv)[4], From a8875ee2d24fe479f0623973e5df4f19803e88e1 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Wed, 6 Jul 2022 08:43:53 +0000 Subject: [PATCH 08/12] Rename and some others modify. 
--- .../core/include/opencv2/core/hal/intrin.hpp | 36 +++++++------------ ...scv_vector.hpp => intrin_rvv_scalable.hpp} | 0 modules/imgproc/src/median_blur.simd.hpp | 10 +++--- platforms/linux/riscv64-clang.toolchain.cmake | 8 ++--- 4 files changed, 23 insertions(+), 31 deletions(-) rename modules/core/include/opencv2/core/hal/{intrin_riscv_vector.hpp => intrin_rvv_scalable.hpp} (100%) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 21434ec86417..38f1739171d2 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -229,10 +229,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP) #include "opencv2/core/hal/intrin_wasm.hpp" -#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_NEW) +#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && !defined(CV_RVV_SCALABLE) #include "opencv2/core/hal/intrin_rvv.hpp" -#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_NEW -#include "opencv2/core/hal/intrin_riscv_vector.hpp" +#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) && CV_RVV_SCALABLE +#include "opencv2/core/hal/intrin_rvv_scalable.hpp" #else #include "opencv2/core/hal/intrin_cpp.hpp" @@ -707,83 +707,73 @@ namespace CV__SIMD_NAMESPACE { struct VTraits { static inline int vlanes() { return v_uint8::nlanes; } - static const int nlanes = v_uint8::nlanes; + enum { nlanes = 16, max_nlanes = nlanes }; using lane_type = uchar; - static const int max_nlanes = nlanes; }; template <> struct VTraits { static inline int vlanes() { return v_int8::nlanes; } - static const int nlanes = v_int8::nlanes; + enum { nlanes = 16, max_nlanes = nlanes }; using lane_type = schar; - static const int max_nlanes = nlanes; }; template <> struct VTraits { static inline int vlanes() { return v_uint16::nlanes; } - static const int nlanes = v_uint16::nlanes; + enum { nlanes = 8, max_nlanes = nlanes }; 
using lane_type = ushort; - static const int max_nlanes = nlanes; }; template <> struct VTraits { static inline int vlanes() { return v_int16::nlanes; } - static const int nlanes = v_int16::nlanes; + enum { nlanes = 8, max_nlanes = nlanes }; using lane_type = short; - static const int max_nlanes = nlanes; }; template <> struct VTraits { static inline int vlanes() { return v_uint32::nlanes; } - static const int nlanes = v_uint32::nlanes; + enum { nlanes = 4, max_nlanes = nlanes }; using lane_type = uint; - static const int max_nlanes = nlanes; }; template <> struct VTraits { static inline int vlanes() { return v_int32::nlanes; } - static const int nlanes = v_int32::nlanes; + enum { nlanes = 4, max_nlanes = nlanes }; using lane_type = int; - static const int max_nlanes = nlanes; }; template <> struct VTraits { static inline int vlanes() { return v_float32::nlanes; } - static const int nlanes = v_float32::nlanes; + enum { nlanes = 4, max_nlanes = nlanes }; using lane_type = float; - static const int max_nlanes = nlanes; }; template <> struct VTraits { static inline int vlanes() { return v_uint64::nlanes; } - static const int nlanes = v_uint64::nlanes; + enum { nlanes = 2, max_nlanes = nlanes }; using lane_type = uint64; - static const int max_nlanes = nlanes; }; template <> struct VTraits { static inline int vlanes() { return v_int64::nlanes; } - static const int nlanes = v_int64::nlanes; + enum { nlanes = 2, max_nlanes = nlanes }; using lane_type = int64; - static const int max_nlanes = nlanes; }; #if CV_SIMD_64F template <> struct VTraits { static inline int vlanes() { return v_float64::nlanes; } - static const int nlanes = v_float64::nlanes; + enum { nlanes = 2, max_nlanes = nlanes }; using lane_type = double; - static const int max_nlanes = nlanes; }; #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_riscv_vector.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp similarity index 100% rename from 
modules/core/include/opencv2/core/hal/intrin_riscv_vector.hpp rename to modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp diff --git a/modules/imgproc/src/median_blur.simd.hpp b/modules/imgproc/src/median_blur.simd.hpp index 5759c7aade65..90f0b2033021 100644 --- a/modules/imgproc/src/median_blur.simd.hpp +++ b/modules/imgproc/src/median_blur.simd.hpp @@ -689,10 +689,11 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) break; #if CV_SIMD || CV_SIMD_SCALABLE - for( ; j <= size.width - VTraits::vlanes() - cn; j += VTraits::vlanes() ) + int nlanes = VTraits::vlanes(); #else - for( ; j <= size.width - 1 - cn; j += 1 ) + int nlanes = 1; #endif + for( ; j <= size.width - nlanes - cn; j += nlanes ) { VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn); VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn); @@ -793,10 +794,11 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) break; #if CV_SIMD || CV_SIMD_SCALABLE - for( ; j <= size.width - VTraits::vlanes() - cn*2; j += VTraits::vlanes() ) + int nlanes = VTraits::vlanes(); #else - for( ; j <= size.width - 1 - cn*2; j += 1 ) + int nlanes = 1; #endif + for( ; j <= size.width - nlanes - cn*2; j += nlanes ) { VT p0 = vop.load(row[0]+j-cn*2), p5 = vop.load(row[1]+j-cn*2), p10 = vop.load(row[2]+j-cn*2), p15 = vop.load(row[3]+j-cn*2), p20 = vop.load(row[4]+j-cn*2); VT p1 = vop.load(row[0]+j-cn*1), p6 = vop.load(row[1]+j-cn*1), p11 = vop.load(row[2]+j-cn*1), p16 = vop.load(row[3]+j-cn*1), p21 = vop.load(row[4]+j-cn*1); diff --git a/platforms/linux/riscv64-clang.toolchain.cmake b/platforms/linux/riscv64-clang.toolchain.cmake index a14ad3f368c5..2efd67ad9385 100644 --- a/platforms/linux/riscv64-clang.toolchain.cmake +++ b/platforms/linux/riscv64-clang.toolchain.cmake @@ -22,10 +22,10 @@ set(CMAKE_CXX_FLAGS "-march=rv64gcv --gcc-toolchain=${RISCV_GCC_INSTALL_ROOT} -w set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2") set(CMAKE_CXX_FLAGS_RELEASE 
"${CMAKE_CXX_FLAGS_RELEASE} -O2") -OPTION(NEWRVV "use new rvv ui" ON) # Enabled by default -IF(NEWRVV) - ADD_DEFINITIONS(-DCV_RVV_NEW) -ENDIF(NEWRVV) +OPTION(RISCV_RVV_SCALABLE "Use scalable RVV API on RISC-V" ON) # Enabled by default +IF(RISCV_RVV_SCALABLE) + ADD_DEFINITIONS(-DCV_RVV_SCALABLE) +ENDIF() set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT}) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) From 405d43b3880fda4981ad349637f00dfa04a74809 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Wed, 6 Jul 2022 14:45:40 +0000 Subject: [PATCH 09/12] Update intrin.hpp but still not work on AVX/SSE --- .../core/include/opencv2/core/hal/intrin.hpp | 149 ++++++++++++++++++ .../opencv2/core/hal/intrin_rvv_scalable.hpp | 6 +- 2 files changed, 152 insertions(+), 3 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 38f1739171d2..cb5d5b75dcfd 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -702,7 +702,155 @@ namespace CV__SIMD_NAMESPACE { template struct VTraits; +#if CV_SIMD512 + template <> + struct VTraits + { + static inline int vlanes() { return v_uint8::nlanes; } + enum { nlanes = 64, max_nlanes = nlanes }; + using lane_type = uchar; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_int8::nlanes; } + enum { nlanes = 64, max_nlanes = nlanes }; + using lane_type = schar; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_uint16::nlanes; } + enum { nlanes = 32, max_nlanes = nlanes }; + using lane_type = ushort; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_int16::nlanes; } + enum { nlanes = 32, max_nlanes = nlanes }; + using lane_type = short; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_uint32::nlanes; } + enum { nlanes = 16, max_nlanes = nlanes }; + using lane_type = uint; + }; + template <> + struct 
VTraits + { + static inline int vlanes() { return v_int32::nlanes; } + enum { nlanes = 16, max_nlanes = nlanes }; + using lane_type = int; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_float32::nlanes; } + enum { nlanes = 16, max_nlanes = nlanes }; + using lane_type = float; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_uint64::nlanes; } + enum { nlanes = 8, max_nlanes = nlanes }; + using lane_type = uint64; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_int64::nlanes; } + enum { nlanes = 8, max_nlanes = nlanes }; + using lane_type = int64; + }; + #if CV_SIMD_64F + template <> + struct VTraits + { + static inline int vlanes() { return v_float64::nlanes; } + enum { nlanes = 8, max_nlanes = nlanes }; + using lane_type = double; + }; + #endif +#elif CV_SIMD256 + template <> + struct VTraits + { + static inline int vlanes() { return v_uint8::nlanes; } + enum { nlanes = 32, max_nlanes = nlanes }; + using lane_type = uchar; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_int8::nlanes; } + enum { nlanes = 32, max_nlanes = nlanes }; + using lane_type = schar; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_uint16::nlanes; } + enum { nlanes = 16, max_nlanes = nlanes }; + using lane_type = ushort; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_int16::nlanes; } + enum { nlanes = 16, max_nlanes = nlanes }; + using lane_type = short; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_uint32::nlanes; } + enum { nlanes = 8, max_nlanes = nlanes }; + using lane_type = uint; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_int32::nlanes; } + enum { nlanes = 8, max_nlanes = nlanes }; + using lane_type = int; + }; + + template <> + struct VTraits + { + static inline int vlanes() { return v_float32::nlanes; } + enum { nlanes = 8, 
max_nlanes = nlanes }; + using lane_type = float; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_uint64::nlanes; } + enum { nlanes = 4, max_nlanes = nlanes }; + using lane_type = uint64; + }; + template <> + struct VTraits + { + static inline int vlanes() { return v_int64::nlanes; } + enum { nlanes = 4, max_nlanes = nlanes }; + using lane_type = int64; + }; + #if CV_SIMD_64F + template <> + struct VTraits + { + static inline int vlanes() { return v_float64::nlanes; } + enum { nlanes = 4, max_nlanes = nlanes }; + using lane_type = double; + }; + #endif +#elif CV_SIMD128 template <> struct VTraits { @@ -776,6 +924,7 @@ namespace CV__SIMD_NAMESPACE { using lane_type = double; }; #endif +#endif #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ inline _Tpvec v_add(const _Tpvec& a, const _Tpvec& b) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index 2010a817516b..b9eb691f3aca 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -1,6 +1,6 @@ -#ifndef OPENCV_HAL_INTRIN_RVV_VEC_HPP -#define OPENCV_HAL_INTRIN_RVV_VEC_HPP +#ifndef OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP +#define OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP #include #include @@ -487,4 +487,4 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END } //namespace cv -#endif \ No newline at end of file +#endif //OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP \ No newline at end of file From 1af0e0fd6aff3432b5c36c27363de423cbc8d1a6 Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Mon, 18 Jul 2022 04:02:30 +0000 Subject: [PATCH 10/12] Update conditional compilation macros. 
--- modules/core/include/opencv2/core/hal/intrin.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index cb5d5b75dcfd..0041030dadcf 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -702,7 +702,7 @@ namespace CV__SIMD_NAMESPACE { template struct VTraits; -#if CV_SIMD512 +#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512) template <> struct VTraits { @@ -776,7 +776,7 @@ namespace CV__SIMD_NAMESPACE { using lane_type = double; }; #endif -#elif CV_SIMD256 +#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256) template <> struct VTraits { @@ -850,7 +850,7 @@ namespace CV__SIMD_NAMESPACE { using lane_type = double; }; #endif -#elif CV_SIMD128 +#elif CV_SIMD128 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128) template <> struct VTraits { From 8a8f1b41fbd40409802aecad42772581b1445b7a Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Mon, 18 Jul 2022 04:04:30 +0000 Subject: [PATCH 11/12] Use static variable for vlanes. 
--- .../opencv2/core/hal/intrin_rvv_scalable.hpp | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index b9eb691f3aca..b984411436ad 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -38,7 +38,10 @@ using uint = unsigned int; using uint64 = unsigned long int; using int64 = long int; - +static const int __cv_rvv_e8_nlanes = vsetvlmax_e8m1(); +static const int __cv_rvv_e16_nlanes = vsetvlmax_e16m1(); +static const int __cv_rvv_e32_nlanes = vsetvlmax_e32m1(); +static const int __cv_rvv_e64_nlanes = vsetvlmax_e64m1(); template struct VTraits; @@ -46,7 +49,7 @@ struct VTraits; template <> struct VTraits { - static inline int vlanes() { return vsetvlmax_e8m1(); } + static inline int vlanes() { return __cv_rvv_e8_nlanes; } using lane_type = uchar; static const int max_nlanes = CV_RVV_MAX_VLEN/8; }; @@ -54,35 +57,35 @@ struct VTraits template <> struct VTraits { - static inline int vlanes() { return vsetvlmax_e8m1(); } + static inline int vlanes() { return __cv_rvv_e8_nlanes; } using lane_type = schar; static const int max_nlanes = CV_RVV_MAX_VLEN/8; }; template <> struct VTraits { - static inline int vlanes() { return vsetvlmax_e16m1(); } + static inline int vlanes() { return __cv_rvv_e16_nlanes; } using lane_type = ushort; static const int max_nlanes = CV_RVV_MAX_VLEN/16; }; template <> struct VTraits { - static inline int vlanes() { return vsetvlmax_e16m1(); } + static inline int vlanes() { return __cv_rvv_e16_nlanes; } using lane_type = short; static const int max_nlanes = CV_RVV_MAX_VLEN/16; }; template <> struct VTraits { - static inline int vlanes() { return vsetvlmax_e32m1(); } + static inline int vlanes() { return __cv_rvv_e32_nlanes; } using lane_type = uint; static const int max_nlanes = CV_RVV_MAX_VLEN/32; 
}; template <> struct VTraits { - static inline int vlanes() { return vsetvlmax_e32m1(); } + static inline int vlanes() { return __cv_rvv_e32_nlanes; } using lane_type = int; static const int max_nlanes = CV_RVV_MAX_VLEN/32; }; @@ -90,21 +93,21 @@ struct VTraits template <> struct VTraits { - static inline int vlanes() { return vsetvlmax_e32m1(); } + static inline int vlanes() { return __cv_rvv_e32_nlanes; } using lane_type = float; static const int max_nlanes = CV_RVV_MAX_VLEN/32; }; template <> struct VTraits { - static inline int vlanes() { return vsetvlmax_e64m1(); } + static inline int vlanes() { return __cv_rvv_e64_nlanes; } using lane_type = uint64; static const int max_nlanes = CV_RVV_MAX_VLEN/64; }; template <> struct VTraits { - static inline int vlanes() { return vsetvlmax_e64m1(); } + static inline int vlanes() { return __cv_rvv_e64_nlanes; } using lane_type = int64; static const int max_nlanes = CV_RVV_MAX_VLEN/64; }; @@ -112,7 +115,7 @@ struct VTraits template <> struct VTraits { - static inline int vlanes() { return vsetvlmax_e64m1(); } + static inline int vlanes() { return __cv_rvv_e64_nlanes; } using lane_type = double; static const int max_nlanes = CV_RVV_MAX_VLEN/64; }; From a59d2115be79e0032f51cb19e1c4061eb6d431cc Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Tue, 19 Jul 2022 07:25:36 +0000 Subject: [PATCH 12/12] Use max_nlanes for array defining. 
--- modules/core/test/test_intrin_utils.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index e6c9bcea5121..5d4442f1111a 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -477,8 +477,8 @@ template struct TheTest R a = data1, b = data2, c = data3; R d = data1, e = data2, f = data3, g = data4; - LaneType buf3[VTraits::vlanes() * 3]; - LaneType buf4[VTraits::vlanes() * 4]; + LaneType buf3[VTraits::max_nlanes * 3]; + LaneType buf4[VTraits::max_nlanes * 4]; v_store_interleave(buf3, a, b, c); v_store_interleave(buf4, d, e, f, g); @@ -513,7 +513,7 @@ template struct TheTest R a = data1, b = data2; - LaneType buf2[VTraits::vlanes() * 2]; + LaneType buf2[VTraits::max_nlanes * 2]; v_store_interleave(buf2, a, b);