
Commit

Merge pull request #24324 from hanliutong:rewrite-fluid
Rewrite Universal Intrinsic code: gapi module (fluid part).
asmorkalov committed Oct 13, 2023
2 parents b3d3acf + 419060d commit cd7cbe3
Showing 5 changed files with 352 additions and 375 deletions.
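The change applies one mechanical pattern across the fluid kernels: every #if CV_SIMD guard gains a CV_SIMD_SCALABLE alternative, compile-time vector-type ::nlanes constants become runtime VTraits<...>::vlanes() queries, and operator overloads on vector values become named intrinsics (v_add, v_and, v_ge, ...). Scalable backends such as RISC-V RVV only fix the vector length at run time, so none of the compile-time forms can work there. A minimal sketch of the target style under those assumptions (vadd_u8_row is an illustrative name, not from this commit):

    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    // Add two uchar rows; the lane count is queried at run time, so the same
    // source compiles for fixed-width targets (SSE/NEON/AVX) and scalable RVV.
    static int vadd_u8_row(const uchar a[], const uchar b[], uchar out[], int len)
    {
        const int nlanes = VTraits<v_uint8>::vlanes(); // deliberately not constexpr
        int x = 0;
        for (; x <= len - nlanes; x += nlanes)
        {
            v_uint8 va = vx_load(&a[x]);
            v_uint8 vb = vx_load(&b[x]);
            vx_store(&out[x], v_add(va, vb)); // named op instead of va + vb
        }
        return x; // the caller runs a scalar loop for the remaining tail
    }
    #endif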
60 changes: 29 additions & 31 deletions modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -13,7 +13,7 @@
#include <opencv2/core/hal/hal.hpp>
#include <opencv2/core/hal/intrin.hpp>

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
#include "gfluidcore_func.hpp"
#endif

@@ -113,7 +113,7 @@ static inline DST divr(SRC1 x, SRC2 y, float scale=1)
// Fluid kernels: addWeighted
//
//---------------------------
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in)
{
return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in)));
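v_load_f32 above is a widening load: vx_load_expand reads half a vector's worth of ushort values and widens them to 32-bit lanes, v_reinterpret_as_s32 relabels those lanes as signed (lossless, since every ushort fits in int32), and v_cvt_f32 converts them to float. A hedged usage sketch built on the helper just defined (demo_scale is a made-up name):

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    // Scale a run of ushort pixels into floats, one float vector per step.
    static void demo_scale(const ushort* in, float* out, int len, float k)
    {
        const int nlanes = VTraits<v_float32>::vlanes();
        for (int x = 0; x + nlanes <= len; x += nlanes)
        {
            v_float32 v = v_load_f32(&in[x]);              // expand + convert
            vx_store(&out[x], v_mul(v, vx_setall_f32(k))); // named multiply
        }
        // scalar tail omitted for brevity
    }
    #endif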
@@ -150,8 +150,8 @@ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], DST out[],
((std::is_same<SRC, short>::value) && (std::is_same<DST, short>::value)),
"This templated overload is only for short and ushort type combinations.");

-constexpr int nlanes = (std::is_same<DST, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
-                       static_cast<int>(v_int16::nlanes);
+const int nlanes = (std::is_same<DST, ushort>::value) ? static_cast<int>(VTraits<v_uint16>::vlanes()) :
+                   static_cast<int>(VTraits<v_int16>::vlanes());

if (length < nlanes)
return 0;
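The early return 0 above is half of a contract used throughout this file: each *_simd helper processes as many full vectors as it can and returns the index of the first unprocessed element, and the caller finishes with a scalar tail loop (visible in the run_arithm hunks below). A standalone hedged sketch of the helper side, using float to keep the types simple (add_simd_f32 is an illustrative name):

    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    // Returns the index where vector processing stopped; 0 means the row was
    // shorter than one vector, so the caller's scalar loop does all the work.
    static int add_simd_f32(const float in1[], const float in2[], float out[], int length)
    {
        const int nlanes = VTraits<v_float32>::vlanes();
        if (length < nlanes)
            return 0;
        int x = 0;
        for (; x <= length - nlanes; x += nlanes)
            vx_store(&out[x], v_add(vx_load(&in1[x]), vx_load(&in2[x])));
        return x;
    }
    #endif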
@@ -189,7 +189,7 @@ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], uchar out[],
const float _alpha, const float _beta,
const float _gamma, int length)
{
-constexpr int nlanes = v_uint8::nlanes;
+const int nlanes = VTraits<v_uint8>::vlanes();

if (length < nlanes)
return 0;
@@ -298,7 +298,7 @@ GAPI_FLUID_KERNEL(GFluidAddW, cv::gapi::core::GAddW, false)

enum Arithm { ARITHM_ABSDIFF, ARITHM_ADD, ARITHM_SUBTRACT, ARITHM_MULTIPLY, ARITHM_DIVIDE };

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
CV_ALWAYS_INLINE void absdiff_store(short out[], const v_int16& a, const v_int16& b, int x)
{
vx_store(&out[x], v_absdiffs(a, b));
@@ -322,7 +322,7 @@ CV_ALWAYS_INLINE void absdiff_store(float out[], const v_float32& a, const v_flo
template<typename T, typename VT>
CV_ALWAYS_INLINE int absdiff_impl(const T in1[], const T in2[], T out[], int length)
{
-constexpr int nlanes = static_cast<int>(VT::nlanes);
+const int nlanes = static_cast<int>(VTraits<VT>::vlanes());

if (length < nlanes)
return 0;
@@ -403,7 +403,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
{
case ARITHM_ADD:
{
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
x = add_simd(in1, in2, out, length);
#endif
for (; x < length; ++x)
@@ -412,7 +412,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
}
case ARITHM_SUBTRACT:
{
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
x = sub_simd(in1, in2, out, length);
#endif
for (; x < length; ++x)
@@ -421,7 +421,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
}
case ARITHM_MULTIPLY:
{
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
x = mul_simd(in1, in2, out, length, scale);
#endif
for (; x < length; ++x)
@@ -430,7 +430,7 @@ CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2
}
case ARITHM_DIVIDE:
{
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
x = div_simd(in1, in2, out, length, scale);
#endif
for (; x < length; ++x)
@@ -569,7 +569,7 @@ static void run_absdiff(Buffer &dst, const View &src1, const View &src2)

int x = 0;

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
x = absdiff_simd(in1, in2, out, length);
#endif
for (; x < length; ++x)
@@ -660,7 +660,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
case ARITHM_ADD:
{
int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
w = addc_simd(in, scalar, out, length, chan);
#endif
for (; w < length; ++w)
@@ -671,7 +671,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
case ARITHM_SUBTRACT:
{
int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
w = subc_simd(in, scalar, out, length, chan);
#endif
for (; w < length; ++w)
@@ -681,7 +681,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
case ARITHM_MULTIPLY:
{
int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
w = mulc_simd(in, scalar, out, length, chan, scale);
#endif
for (; w < width; ++w)
@@ -709,7 +709,7 @@ CV_ALWAYS_INLINE void run_arithm_rs(Buffer &dst, const View &src, const float sc
case ARITHM_SUBTRACT:
{
int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
w = subrc_simd(scalar, in, out, length, chan);
#endif
for (; w < length; ++w)
@@ -721,7 +721,7 @@ CV_ALWAYS_INLINE void run_arithm_rs(Buffer &dst, const View &src, const float sc
case ARITHM_DIVIDE:
{
int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
w = divrc_simd(scalar, in, out, length, chan, scale);
#endif
for (; w < length; ++w)
@@ -744,7 +744,7 @@ CV_ALWAYS_INLINE void setScratchSize(Buffer& scratch, const int buflen)

CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
{
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
// 512 bits / 32 bits = 16: the number of float32 elements a 512-bit (AVX-512) SIMD vector can contain.
constexpr int maxNlanes = 16;
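The scratch buffer sized by this maxNlanes constant pre-broadcasts the scalar operand, so the SIMD path can fetch it with plain vector loads instead of rebuilding it on every row. A hedged sketch of one plausible replication step, which this diff does not show (broadcast_scalar is a made-up name):

    // Replicate chan per-channel scalar values cyclically into nlanes floats,
    // e.g. chan = 3, nlanes = 8 -> s0 s1 s2 s0 s1 s2 s0 s1.
    static void broadcast_scalar(const float scalar[], int chan, float scratch[], int nlanes)
    {
        for (int i = 0; i < nlanes; ++i)
            scratch[i] = scalar[i % chan];
    }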

@@ -783,7 +783,7 @@ CV_ALWAYS_INLINE void run_absdiffc(Buffer& dst, const View& src, const float sca
const int length = width * chan;

int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
w = absdiffc_simd(in, scalar, out, length, chan);
#endif

@@ -1076,7 +1076,7 @@ CV_ALWAYS_INLINE void run_divc(Buffer& dst, const View& src, Buffer& scratch,
const int length = width * chan;

int w = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
int scratch_length = scratch.length();
int indicator_offset = scratch_length - 1;
const int set_mask_indicator = static_cast<int>(*(scratch.OutLine<float>() + (indicator_offset)));
@@ -1143,7 +1143,7 @@ GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, true)

static void initScratch(const GMatDesc&, const GScalarDesc&, double, int, Buffer& scratch)
{
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
// 512 bits / 32 bits = 16: the number of float32 elements a 512-bit (AVX-512) SIMD vector can contain.
constexpr int maxNlanes = 16;

@@ -1565,7 +1565,7 @@ template<typename SRC, typename DST>
CV_ALWAYS_INLINE void convertto_impl(const SRC in[], DST out[], const int length)
{
int x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
x = convertto_simd(in, out, length);
#endif
// tail of SIMD cycle
@@ -1580,7 +1580,7 @@ CV_ALWAYS_INLINE void convertto_impl(const SRC *in, DST* out, const float alpha,
const int length)
{
int x = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
x = convertto_scaled_simd(in, out, alpha, beta, length);
#endif

@@ -2096,9 +2096,7 @@ static void run_inrange3(uchar out[], const uchar in[], int width,
v_load_deinterleave(&in[3*w], i0, i1, i2);

v_uint8x16 o;
-o = (i0 >= v_setall_u8(lower[0])) & (i0 <= v_setall_u8(upper[0])) &
-    (i1 >= v_setall_u8(lower[1])) & (i1 <= v_setall_u8(upper[1])) &
-    (i2 >= v_setall_u8(lower[2])) & (i2 <= v_setall_u8(upper[2]));
+o = v_and(v_and(v_and(v_and(v_and(v_ge(i0, v_setall_u8(lower[0])), v_le(i0, v_setall_u8(upper[0]))), v_ge(i1, v_setall_u8(lower[1]))), v_le(i1, v_setall_u8(upper[1]))), v_ge(i2, v_setall_u8(lower[2]))), v_le(i2, v_setall_u8(upper[2])));

v_store(&out[w], o);
}
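The rewritten assignment above chains five v_and calls over six comparisons because scalable vector types provide no operator& or relational operators. A more readable equivalent with temporaries, hedged and assuming a 128-bit backend as the surrounding v_uint8x16 code does (inrange3_mask is a made-up name):

    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    #if CV_SIMD128
    // All-ones in each lane where lower[c] <= i_c <= upper[c] for all three channels.
    static v_uint8x16 inrange3_mask(const v_uint8x16& i0, const v_uint8x16& i1,
                                    const v_uint8x16& i2,
                                    const uchar lower[], const uchar upper[])
    {
        v_uint8x16 m0 = v_and(v_ge(i0, v_setall_u8(lower[0])), v_le(i0, v_setall_u8(upper[0])));
        v_uint8x16 m1 = v_and(v_ge(i1, v_setall_u8(lower[1])), v_le(i1, v_setall_u8(upper[1])));
        v_uint8x16 m2 = v_and(v_ge(i2, v_setall_u8(lower[2])), v_le(i2, v_setall_u8(upper[2])));
        return v_and(v_and(m0, m1), m2);
    }
    #endif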
@@ -2226,7 +2224,7 @@ static void run_select_row3(int width, uchar out[], uchar in1[], uchar in2[], uc
v_load_deinterleave(&in2[3*w], a2, b2, c2);

mask = v_load(&in3[w]);
-mask = mask != v_setzero_u8();
+mask = v_ne(mask, v_setzero_u8());

a = v_select(mask, a1, a2);
b = v_select(mask, b1, b2);
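v_ne(mask, v_setzero_u8()) normalizes any nonzero mask byte to all-ones so that v_select, a lane-wise ternary, can blend its two inputs. A standalone hedged sketch of the idiom (blend16 is a made-up name):

    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    #if CV_SIMD128
    // out[i] = cond[i] != 0 ? a[i] : b[i] for 16 uchar lanes.
    static void blend16(const uchar cond[], const uchar a[], const uchar b[], uchar out[])
    {
        v_uint8x16 mask = v_ne(v_load(cond), v_setzero_u8()); // 0x00 or 0xFF per lane
        v_store(out, v_select(mask, v_load(a), v_load(b)));
    }
    #endif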
@@ -2332,7 +2330,7 @@ GAPI_FLUID_KERNEL(GFluidSplit3, cv::gapi::core::GSplit3, false)
int width = src.length();
int w = 0;

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
w = split3_simd(in, out1, out2, out3, width);
#endif

@@ -2364,7 +2362,7 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
int width = src.length();
int w = 0;

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
w = split4_simd(in, out1, out2, out3, out4, width);
#endif

@@ -2389,7 +2387,7 @@ CV_ALWAYS_INLINE void run_merge3(Buffer& dst, const View& src1, const View& src2
int width = dst.length();
int w = 0;

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
w = merge3_simd(in1, in2, in3, out, width);
#endif

@@ -2442,7 +2440,7 @@ GAPI_FLUID_KERNEL(GFluidMerge4, cv::gapi::core::GMerge4, false)

int w = 0; // cycle counter

-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
w = merge4_simd(in1, in2, in3, in4, out, width);
#endif

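The split and merge kernels above delegate to split3_simd, split4_simd, merge3_simd, and merge4_simd from gfluidcore_func; helpers of this kind are typically built on interleaving loads and stores. A hedged sketch of one vector-width step of a 3-channel split, not the commit's actual helper (split3_step is a made-up name):

    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    #if (CV_SIMD || CV_SIMD_SCALABLE)
    // One vector-width step of packed BGR -> three planes; returns the next index.
    static int split3_step(const uchar in[], uchar b[], uchar g[], uchar r[], int w)
    {
        v_uint8 vb, vg, vr;
        v_load_deinterleave(&in[3 * w], vb, vg, vr); // gathers every 3rd byte per register
        vx_store(&b[w], vb);
        vx_store(&g[w], vg);
        vx_store(&r[w], vr);
        return w + VTraits<v_uint8>::vlanes();
    }
    #endif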
@@ -7,7 +7,7 @@
#if !defined(GAPI_STANDALONE)

#include <opencv2/core/hal/intrin.hpp>
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
#include "gfluidcore_func.hpp"
#include "gfluidcore_func.simd.hpp"

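The hunk above guards the dispatch side of these helpers: OpenCV compiles gfluidcore_func.simd.hpp once per enabled instruction set and routes each call through a thin run-time dispatcher. A hedged sketch of that dispatcher's shape, using OpenCV's CV_CPU_DISPATCH machinery (the generated include name and the flat namespace are assumptions):

    #include "gfluidcore_func.hpp"
    #include "gfluidcore_func.simd.hpp"
    // Generated by the build system: forward declarations per compiled ISA variant.
    #include "gfluidcore_func.simd_declarations.hpp"

    int add_simd(const uchar in1[], const uchar in2[], uchar out[], int length)
    {
        // Selects, e.g., the AVX2 build of add_simd when the CPU supports it,
        // and falls back to the baseline build otherwise; the macro returns.
        CV_CPU_DISPATCH(add_simd, (in1, in2, out, length), CV_CPU_DISPATCH_MODES_ALL);
    }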
2 changes: 1 addition & 1 deletion modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -6,7 +6,7 @@

#pragma once

-#if !defined(GAPI_STANDALONE) && CV_SIMD
+#if !defined(GAPI_STANDALONE) && (CV_SIMD || CV_SIMD_SCALABLE)

#include <opencv2/core.hpp>

