Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: improve the speed of array conversions using AVX2 if available #21123

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/release/upcoming_changes/21123.performance.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Faster casting on modern x86-64 processors
------------------------------------------
Implicit/explicit casting is now significantly faster on contiguous arrays on
processors supporting AVX2. This speeds up many functions such as `numpy.sum`,
`numpy.prod`, `numpy.cumsum`, `numpy.cumprod`, `numpy.all` and `numpy.any`.
Functions like `numpy.mean`, or basic binary operations with a constant of a
different type that require the array to be cast, are a bit faster.
67 changes: 48 additions & 19 deletions numpy/core/src/multiarray/lowlevel_strided_loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,15 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
/************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

/**begin repeat
* #isa = , _avx2#
* #ISA = , AVX2#
* #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)#
* #ATTR = , NPY_GCC_TARGET_AVX2#
*/

#if @CHK@

/**begin repeat1
*
* #NAME1 = BOOL,
* UBYTE, USHORT, UINT, ULONG, ULONGLONG,
Expand Down Expand Up @@ -737,7 +746,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
* #is_complex1 = 0*15, 1*3#
*/

/**begin repeat1
/**begin repeat2
*
* #NAME2 = BOOL,
* UBYTE, USHORT, UINT, ULONG, ULONGLONG,
Expand Down Expand Up @@ -766,7 +775,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
* #is_complex2 = 0*15, 1*3#
*/

/**begin repeat2
/**begin repeat3
* #prefix = _aligned,,_aligned_contig,_contig#
* #aligned = 1,0,1,0#
* #contig = 0,0,1,1#
Expand Down Expand Up @@ -839,8 +848,9 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

#endif

static NPY_GCC_OPT_3 int
@prefix@_cast_@name1@_to_@name2@(
#if @CHK@
static NPY_GCC_OPT_3 @ATTR@ int
@prefix@_cast_@name1@_to_@name2@@isa@(
PyArrayMethod_Context *context, char *const *args,
const npy_intp *dimensions, const npy_intp *strides,
NpyAuxData *NPY_UNUSED(data))
Expand Down Expand Up @@ -868,7 +878,7 @@ static NPY_GCC_OPT_3 int
assert(N == 0 || npy_is_aligned(dst, _ALIGN(_TYPE2)));
#endif

/*printf("@prefix@_cast_@name1@_to_@name2@\n");*/
/*printf("@prefix@_cast_@name1@_to_@name2@@isa@\n");*/

while (N--) {
#if @aligned@
Expand Down Expand Up @@ -932,26 +942,27 @@ static NPY_GCC_OPT_3 int
}
return 0;
}
#endif

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif

/**end repeat3**/

/**end repeat2**/

/**end repeat1**/

/**end repeat**/

NPY_NO_EXPORT PyArrayMethod_StridedLoop *
PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
static PyArrayMethod_StridedLoop *
PyArray_GetStridedNumericCastFn_body@isa@(int aligned, npy_intp src_stride,
npy_intp dst_stride,
int src_type_num, int dst_type_num)
{
switch (src_type_num) {
/**begin repeat
/**begin repeat1
*
* #NAME1 = BOOL,
* UBYTE, USHORT, UINT, ULONG, ULONGLONG,
Expand All @@ -973,7 +984,7 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
case NPY_@NAME1@:
/*printf("test fn %d - second %d\n", NPY_@NAME1@, dst_type_num);*/
switch (dst_type_num) {
/**begin repeat1
/**begin repeat2
*
* #NAME2 = BOOL,
* UBYTE, USHORT, UINT, ULONG, ULONGLONG,
Expand All @@ -997,34 +1008,52 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
# if NPY_USE_UNALIGNED_ACCESS
if (src_stride == sizeof(@type1@) &&
dst_stride == sizeof(@type2@)) {
return &_aligned_contig_cast_@name1@_to_@name2@;
return &_aligned_contig_cast_@name1@_to_@name2@@isa@;
}
else {
return &_aligned_cast_@name1@_to_@name2@;
return &_aligned_cast_@name1@_to_@name2@@isa@;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Meant to do a review, but then posted instead:

IIRC this branch is never used (NPY_USE_UNALIGNED_ACCESS is always 0 here) and I don't think vectorization is OK if it was used. So I would either not do this, or just delete the whole # if block: It doesn't really add a whole lot.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am unsure of the meaning of NPY_USE_UNALIGNED_ACCESS, but if we can remove it here because it is always set to 0, why not remove it from the whole file, and possibly the whole codebase? It would make the code a bit cleaner and more readable.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the comment on this macro, vectorization should be disabled at the build level. AFAIK, telling the compiler that AVX can be used does not change anything on its own. So far, using O3 causes GCC to auto-vectorize the code, as opposed to O2; however, newer versions of GCC (starting from GCC 12) enable auto-vectorization even at O2, which is currently the default optimization level for NumPy. Thus, this change should not cause more harm than the status quo (though I think the code path enabled by NPY_USE_UNALIGNED_ACCESS is certainly already harmful). What do you think about that?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have to look closer, but I think we should just delete this code (if you agree with that).

}
# else
if (src_stride == sizeof(@type1@) &&
dst_stride == sizeof(@type2@)) {
return aligned ?
&_aligned_contig_cast_@name1@_to_@name2@ :
&_contig_cast_@name1@_to_@name2@;
&_aligned_contig_cast_@name1@_to_@name2@@isa@ :
&_contig_cast_@name1@_to_@name2@@isa@;
}
else {
return aligned ? &_aligned_cast_@name1@_to_@name2@ :
&_cast_@name1@_to_@name2@;
return aligned ? &_aligned_cast_@name1@_to_@name2@@isa@ :
&_cast_@name1@_to_@name2@@isa@;
}
# endif

/**end repeat1**/
/**end repeat2**/
}
/*printf("switched test fn %d - second %d\n", NPY_@NAME1@, dst_type_num);*/

/**end repeat**/
/**end repeat1**/
}

return NULL;
}

#endif

/**end repeat**/

/*
 * Return the strided numeric-cast loop for the given source/destination
 * type pair and stride pattern.  Dispatches at runtime between the
 * baseline lookup body and the AVX2-compiled body (when the build has
 * attribute-target support and the executing CPU reports AVX2), so a
 * single binary can exploit AVX2 without requiring it.
 */
NPY_NO_EXPORT PyArrayMethod_StridedLoop *
PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
                                npy_intp dst_stride,
                                int src_type_num, int dst_type_num)
{
#ifdef HAVE_ATTRIBUTE_TARGET_AVX2
    /* Runtime CPU check: fall through to the baseline body otherwise. */
    if (NPY_CPU_HAVE(AVX2)) {
        return PyArray_GetStridedNumericCastFn_body_avx2(
                aligned, src_stride, dst_stride,
                src_type_num, dst_type_num);
    }
#endif
    return PyArray_GetStridedNumericCastFn_body(
            aligned, src_stride, dst_stride,
            src_type_num, dst_type_num);
}


/****************** PRIMITIVE FLAT TO/FROM NDIM FUNCTIONS ******************/

Expand Down