-
-
Notifications
You must be signed in to change notification settings - Fork 9.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SIMD: Add sum intrinsics for float/double. #17681
Conversation
@seiko2plus looks ok? |
|
||
// Horizontal add: Calculates the sum of all vector elements. | ||
NPY_FINLINE float npyv_sum_f32(float32x4_t a) | ||
{ | ||
float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); | ||
return vget_lane_f32(vpadd_f32(r, r), 0); | ||
} | ||
#ifdef __aarch64__ | ||
NPY_FINLINE double npyv_sum_f64(float64x2_t a) | ||
{ | ||
return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); | ||
} | ||
#endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Horizontal add: Calculates the sum of all vector elements. | |
NPY_FINLINE float npyv_sum_f32(float32x4_t a) | |
{ | |
float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); | |
return vget_lane_f32(vpadd_f32(r, r), 0); | |
} | |
#ifdef __aarch64__ | |
NPY_FINLINE double npyv_sum_f64(float64x2_t a) | |
{ | |
return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); | |
} | |
#endif | |
// Horizontal add: Calculates the sum of all vector elements. | |
#if NPY_SIMD_F64 | |
#define npyv_sum_f32 vaddvq_f32 | |
#define npyv_sum_f64 vaddvq_f64 | |
#else | |
NPY_FINLINE float npyv_sum_f32(npyv_f32 a) | |
{ | |
float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); | |
return vget_lane_f32(vpadd_f32(r, r), 0); | |
} | |
#endif |
EDIT: bring back vpadd_f32
as it was; it should perform better than extracting two scalars.
NPY_FINLINE float npyv_sum_f32(npyv_f32 a) | ||
{ | ||
return vec_extract(a, 0) + vec_extract(a, 1) + | ||
vec_extract(a, 2) + vec_extract(a, 3); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NPY_FINLINE float npyv_sum_f32(npyv_f32 a) | |
{ | |
return vec_extract(a, 0) + vec_extract(a, 1) + | |
vec_extract(a, 2) + vec_extract(a, 3); | |
} | |
NPY_FINLINE float npyv_sum_f32(npyv_f32 a) | |
{ | |
npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a)); | |
return vec_extract(sum, 0) + vec_extract(sum, 1); | |
} |
EDIT: my bad, fixed swapping the highest half
@@ -117,3 +117,23 @@ | |||
} | |||
#endif // !NPY_HAVE_FMA3 | |||
#endif // _NPY_SIMD_AVX2_ARITHMETIC_H | |||
|
|||
// Horizontal add: Calculates the sum of all vector elements. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please, move the intrinsics inside the header guard _NPY_SIMD_AVX2_ARITHMETIC_H
,
same thing for other SIMD extensions.
I prefer adding a testing unit for any new intrinsics to keep things under control. // try to follow the current definitions in the way of sorting the source
// this is how you should define the new Python methods
SIMD_IMPL_INTRIN_1(sum_f32, f32, vf32)
#if NPY_SIMD_F64
SIMD_IMPL_INTRIN_1(sum_f64, f64, vf64)
#endif
// and this is how we attach them
SIMD_INTRIN_DEF(sum_f32)
#if NPY_SIMD_F64
SIMD_INTRIN_DEF(sum_f64)
#endif Once you get done bringing the new methods to def test_reduce(self):
data = self._data()
vdata = self.load(data)
# reduce sum
data_sum = sum(data)
vsum = self.sum(vdata)
assert vsum == data_sum You can also have some fun and try to use them from Python: # 1- bring the baseline via dict `targets` or attribute `baseline`
from numpy.core._simd import targets, baseline
npyv = targets["baseline"] # or baseline
# you can also dump `targets` to get the supported SIMD extensions
# by default build option `--simd-test` contains the most common SIMD extensions
if not npyv.simd: # equivalent to C def `NPY_SIMD`
print((
"How that possible? changed the default build settings?\n"
"maybe you running it under armhf, then get targets['NEON']"
))
return
a = npyv.load_f32(range(npyv.nlanes_f32))
print("sum of f32", npyv.sum_f32(a))
if npyv.simd_f64: # equivalent to C def `NPY_SIMD_F64`
b = npyv.load_f64(range(npyv.nlanes_f64))
print("sum of f64", npyv.sum_f64(b)) EDIT: improve the examples |
@seiko2plus Thanks for your detailed recommendations, The |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, Thank you.
|
||
data_sum = sum(data) | ||
vsum = self.sum(vdata) | ||
assert vsum == data_sum |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nice!
Thanks @Qiyu8 |
The original PR is too large to review, so I will split it into several small PRs. Here are the sum intrinsics that were about to be used in
einsum
; the intrinsics have been fully discussed and tested.