diff --git a/doc/release/upcoming_changes/21123.performance.rst b/doc/release/upcoming_changes/21123.performance.rst new file mode 100644 index 000000000000..90f652fd3590 --- /dev/null +++ b/doc/release/upcoming_changes/21123.performance.rst @@ -0,0 +1,7 @@ +Faster casting on modern x86-64 processors +------------------------------------------ +Implicit/explicit casting is now significantly faster on contiguous arrays on +processors supporting AVX2. This speeds up many functions, such as `numpy.sum`, +`numpy.prod`, `numpy.cumsum`, `numpy.cumprod`, `numpy.all` and `numpy.any`. +Functions like `numpy.mean`, or basic binary operations with a constant of a +different type that require the array to be cast, are also a bit faster. diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src index 8e3afd3cc658..4bcdf18d05dd 100644 --- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -709,6 +709,15 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop * /************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/ /**begin repeat + * #isa = , _avx2# + * #ISA = , AVX2# + * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)# + * #ATTR = , NPY_GCC_TARGET_AVX2# + */ + +#if @CHK@ + +/**begin repeat1 * * #NAME1 = BOOL, * UBYTE, USHORT, UINT, ULONG, ULONGLONG, @@ -737,7 +746,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop * * #is_complex1 = 0*15, 1*3# */ -/**begin repeat1 +/**begin repeat2 * * #NAME2 = BOOL, * UBYTE, USHORT, UINT, ULONG, ULONGLONG, @@ -766,7 +775,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop * * #is_complex2 = 0*15, 1*3# */ -/**begin repeat2 +/**begin repeat3 * #prefix = _aligned,,_aligned_contig,_contig# * #aligned = 1,0,1,0# * #contig = 0,0,1,1# @@ -839,8 +848,9 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop * #endif -static NPY_GCC_OPT_3 int -@prefix@_cast_@name1@_to_@name2@( +#if @CHK@ +static NPY_GCC_OPT_3 @ATTR@ int +@prefix@_cast_@name1@_to_@name2@@isa@( PyArrayMethod_Context *context, 
char *const *args, const npy_intp *dimensions, const npy_intp *strides, NpyAuxData *NPY_UNUSED(data)) @@ -868,7 +878,7 @@ static NPY_GCC_OPT_3 int assert(N == 0 || npy_is_aligned(dst, _ALIGN(_TYPE2))); #endif - /*printf("@prefix@_cast_@name1@_to_@name2@\n");*/ + /*printf("@prefix@_cast_@name1@_to_@name2@@isa@\n");*/ while (N--) { #if @aligned@ @@ -932,6 +942,7 @@ static NPY_GCC_OPT_3 int } return 0; } +#endif #undef _CONVERT_FN #undef _TYPE2 @@ -939,19 +950,19 @@ static NPY_GCC_OPT_3 int #endif +/**end repeat3**/ + /**end repeat2**/ /**end repeat1**/ -/**end repeat**/ - -NPY_NO_EXPORT PyArrayMethod_StridedLoop * -PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride, +static PyArrayMethod_StridedLoop * +PyArray_GetStridedNumericCastFn_body@isa@(int aligned, npy_intp src_stride, npy_intp dst_stride, int src_type_num, int dst_type_num) { switch (src_type_num) { -/**begin repeat +/**begin repeat1 * * #NAME1 = BOOL, * UBYTE, USHORT, UINT, ULONG, ULONGLONG, @@ -973,7 +984,7 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride, case NPY_@NAME1@: /*printf("test fn %d - second %d\n", NPY_@NAME1@, dst_type_num);*/ switch (dst_type_num) { -/**begin repeat1 +/**begin repeat2 * * #NAME2 = BOOL, * UBYTE, USHORT, UINT, ULONG, ULONGLONG, @@ -997,34 +1008,52 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride, # if NPY_USE_UNALIGNED_ACCESS if (src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) { - return &_aligned_contig_cast_@name1@_to_@name2@; + return &_aligned_contig_cast_@name1@_to_@name2@@isa@; } else { - return &_aligned_cast_@name1@_to_@name2@; + return &_aligned_cast_@name1@_to_@name2@@isa@; } # else if (src_stride == sizeof(@type1@) && dst_stride == sizeof(@type2@)) { return aligned ? - &_aligned_contig_cast_@name1@_to_@name2@ : - &_contig_cast_@name1@_to_@name2@; + &_aligned_contig_cast_@name1@_to_@name2@@isa@ : + &_contig_cast_@name1@_to_@name2@@isa@; } else { - return aligned ? 
&_aligned_cast_@name1@_to_@name2@ : - &_cast_@name1@_to_@name2@; + return aligned ? &_aligned_cast_@name1@_to_@name2@@isa@ : + &_cast_@name1@_to_@name2@@isa@; } # endif -/**end repeat1**/ +/**end repeat2**/ } /*printf("switched test fn %d - second %d\n", NPY_@NAME1@, dst_type_num);*/ -/**end repeat**/ +/**end repeat1**/ } return NULL; } +#endif + +/**end repeat**/ + +NPY_NO_EXPORT PyArrayMethod_StridedLoop * +PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride, + npy_intp dst_stride, + int src_type_num, int dst_type_num) +{ + #ifdef HAVE_ATTRIBUTE_TARGET_AVX2 + if (NPY_CPU_HAVE(AVX2)) { + return PyArray_GetStridedNumericCastFn_body_avx2(aligned, src_stride, dst_stride, src_type_num, dst_type_num); + } + #endif + + return PyArray_GetStridedNumericCastFn_body(aligned, src_stride, dst_stride, src_type_num, dst_type_num); +} + /****************** PRIMITIVE FLAT TO/FROM NDIM FUNCTIONS ******************/