Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: improve the speed of array conversions using AVX2 if available #21123

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/release/upcoming_changes/21123.performance.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Faster casting on modern x86-64 processors
------------------------------------------
Implicit/explicit casting is now significantly faster on contiguous arrays on
processors supporting AVX2. This speeds up many functions such as `numpy.sum`,
`numpy.prod`, `numpy.cumsum`, `numpy.cumprod`, `numpy.all` and `numpy.any`.
Functions like `numpy.mean`, or basic binary operations with a constant of a
different type that require the array to be cast, are a bit faster.
67 changes: 48 additions & 19 deletions numpy/core/src/multiarray/lowlevel_strided_loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,15 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
/************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

/**begin repeat
* #isa = , _avx2#
* #ISA = , AVX2#
* #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)#
* #ATTR = , NPY_GCC_TARGET_AVX2#
*/

#if @CHK@

/**begin repeat1
*
* #NAME1 = BOOL,
* UBYTE, USHORT, UINT, ULONG, ULONGLONG,
Expand Down Expand Up @@ -737,7 +746,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
* #is_complex1 = 0*15, 1*3#
*/

/**begin repeat1
/**begin repeat2
*
* #NAME2 = BOOL,
* UBYTE, USHORT, UINT, ULONG, ULONGLONG,
Expand Down Expand Up @@ -766,7 +775,7 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *
* #is_complex2 = 0*15, 1*3#
*/

/**begin repeat2
/**begin repeat3
* #prefix = _aligned,,_aligned_contig,_contig#
* #aligned = 1,0,1,0#
* #contig = 0,0,1,1#
Expand Down Expand Up @@ -839,8 +848,9 @@ NPY_NO_EXPORT PyArrayMethod_StridedLoop *

#endif

static NPY_GCC_OPT_3 int
@prefix@_cast_@name1@_to_@name2@(
#if @CHK@
static NPY_GCC_OPT_3 @ATTR@ int
@prefix@_cast_@name1@_to_@name2@@isa@(
PyArrayMethod_Context *context, char *const *args,
const npy_intp *dimensions, const npy_intp *strides,
NpyAuxData *NPY_UNUSED(data))
Expand Down Expand Up @@ -868,7 +878,7 @@ static NPY_GCC_OPT_3 int
assert(N == 0 || npy_is_aligned(dst, _ALIGN(_TYPE2)));
#endif

/*printf("@prefix@_cast_@name1@_to_@name2@\n");*/
/*printf("@prefix@_cast_@name1@_to_@name2@@isa@\n");*/

while (N--) {
#if @aligned@
Expand Down Expand Up @@ -932,26 +942,27 @@ static NPY_GCC_OPT_3 int
}
return 0;
}
#endif

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif

/**end repeat3**/

/**end repeat2**/

/**end repeat1**/

/**end repeat**/

NPY_NO_EXPORT PyArrayMethod_StridedLoop *
PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
static PyArrayMethod_StridedLoop *
PyArray_GetStridedNumericCastFn_body@isa@(int aligned, npy_intp src_stride,
npy_intp dst_stride,
int src_type_num, int dst_type_num)
{
switch (src_type_num) {
/**begin repeat
/**begin repeat1
*
* #NAME1 = BOOL,
* UBYTE, USHORT, UINT, ULONG, ULONGLONG,
Expand All @@ -973,7 +984,7 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
case NPY_@NAME1@:
/*printf("test fn %d - second %d\n", NPY_@NAME1@, dst_type_num);*/
switch (dst_type_num) {
/**begin repeat1
/**begin repeat2
*
* #NAME2 = BOOL,
* UBYTE, USHORT, UINT, ULONG, ULONGLONG,
Expand All @@ -997,34 +1008,52 @@ PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
# if NPY_USE_UNALIGNED_ACCESS
if (src_stride == sizeof(@type1@) &&
dst_stride == sizeof(@type2@)) {
return &_aligned_contig_cast_@name1@_to_@name2@;
return &_aligned_contig_cast_@name1@_to_@name2@@isa@;
}
else {
return &_aligned_cast_@name1@_to_@name2@;
return &_aligned_cast_@name1@_to_@name2@@isa@;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Meant to do a review, but then posted instead:

IIRC this branch is never used (NPY_USE_UNALIGNED_ACCESS is always 0 here) and I don't think vectorization is OK if it was used. So I would either not do this, or just delete the whole # if block: It doesn't really add a whole lot.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am unsure of the meaning of NPY_USE_UNALIGNED_ACCESS, but if we can remove it here because it is always set to 0, why not remove it from the whole file, and possibly the whole codebase? It would make the code a bit cleaner and more readable.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the comment on this macro, vectorization should be disabled at the build level. AFAIK, telling the compiler that AVX can be used does not change anything on its own. So far, using O3 causes GCC to auto-vectorize the code, as opposed to O2; however, newer versions of GCC (starting from GCC 12) enable auto-vectorization even at O2, which is currently the default optimization level for NumPy. Thus, this change should not cause more harm than the status quo (though I think the code path enabled by NPY_USE_UNALIGNED_ACCESS is certainly already harmful). What do you think about that?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have to look closer, but I think we should just delete this code (if you agree with that).

}
# else
if (src_stride == sizeof(@type1@) &&
dst_stride == sizeof(@type2@)) {
return aligned ?
&_aligned_contig_cast_@name1@_to_@name2@ :
&_contig_cast_@name1@_to_@name2@;
&_aligned_contig_cast_@name1@_to_@name2@@isa@ :
&_contig_cast_@name1@_to_@name2@@isa@;
}
else {
return aligned ? &_aligned_cast_@name1@_to_@name2@ :
&_cast_@name1@_to_@name2@;
return aligned ? &_aligned_cast_@name1@_to_@name2@@isa@ :
&_cast_@name1@_to_@name2@@isa@;
}
# endif

/**end repeat1**/
/**end repeat2**/
}
/*printf("switched test fn %d - second %d\n", NPY_@NAME1@, dst_type_num);*/

/**end repeat**/
/**end repeat1**/
}

return NULL;
}

#endif

/**end repeat**/

/*
 * Return the strided numeric-cast loop for the given source/destination
 * type pair and stride pattern.  Dispatches at runtime between the
 * baseline lookup body and the AVX2-compiled body (when the build has
 * attribute-target support and the executing CPU reports AVX2), so a
 * single binary can exploit AVX2 without requiring it.
 */
NPY_NO_EXPORT PyArrayMethod_StridedLoop *
PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
                                npy_intp dst_stride,
                                int src_type_num, int dst_type_num)
{
#ifdef HAVE_ATTRIBUTE_TARGET_AVX2
    /* Runtime CPU check: fall through to the baseline body otherwise. */
    if (NPY_CPU_HAVE(AVX2)) {
        return PyArray_GetStridedNumericCastFn_body_avx2(
                aligned, src_stride, dst_stride,
                src_type_num, dst_type_num);
    }
#endif
    return PyArray_GetStridedNumericCastFn_body(
            aligned, src_stride, dst_stride,
            src_type_num, dst_type_num);
}


/****************** PRIMITIVE FLAT TO/FROM NDIM FUNCTIONS ******************/

Expand Down