2 changes: 2 additions & 0 deletions numpy/_core/src/common/simd/avx512/avx512.h
@@ -11,6 +11,8 @@
 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
 #define NPY_SIMD_MAXLOAD_STRIDE32  (0x7fffffff / 16)
 #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
+#define NPY_SIMD_MAXLOAD_STRIDE64  (0x7fffffff / 16)
+#define NPY_SIMD_MAXSTORE_STRIDE64 (0x7fffffff / 16)

 typedef __m512i npyv_u8;
 typedef __m512i npyv_s8;
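The new 64-bit limits mirror the existing 32-bit ones: with `_mm512_i32gather_*`/`_mm512_i32scatter_*`, each index is a signed 32-bit element, so a stride only qualifies for SIMD if its largest index stays below `INT_MAX`. A minimal sketch of the kind of check these constants feed, assuming the shape of NumPy's `npyv_loadable_stride_f64` helper (the standalone name and the element-unit stride below are illustrative assumptions, not the library's exact API):

```c
#include <stdlib.h>   /* llabs */

/* Same constant as the header above: INT_MAX / 16 leaves enough headroom
 * that a signed 32-bit gather/scatter index cannot overflow for any lane. */
#define MAXLOAD_STRIDE64 (0x7fffffff / 16)

/* Illustrative stand-in for npyv_loadable_stride_f64: `stride` is taken
 * in elements here; nonzero means i32gather indices are safe. */
static inline int
loadable_stride_f64(long long stride)
{
    return llabs(stride) <= MAXLOAD_STRIDE64;
}
```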
28 changes: 0 additions & 28 deletions numpy/_core/src/umath/fast_loop_macros.h
@@ -323,34 +323,6 @@ abs_ptrdiff(char *a, char *b)
 ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
  ((abs_ptrdiff(args[1], args[0]) == 0))))

-/*
- * Avoid using SIMD for very large step sizes for several reasons:
- * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
- *    in which case we need two i64gather instructions and an additional vinsertf32x8
- *    instruction to load a single zmm register (since one i64gather instruction
- *    loads into a ymm register). This is not ideal for performance.
- * 2) Gather and scatter instructions can be slow when the loads/stores
- *    cross page boundaries.
- *
- * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
- * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
- * ensures this. The condition also requires that the input and output arrays
- * should have no overlap in memory.
- */
-#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
-    ((labs(steps[0]) < MAX_STEP_SIZE) && \
-     (labs(steps[1]) < MAX_STEP_SIZE) && \
-     (labs(steps[2]) < MAX_STEP_SIZE) && \
-     (nomemoverlap(args[0], steps[0], args[2], steps[2], dimensions[0])) && \
-     (nomemoverlap(args[1], steps[1], args[2], steps[2], dimensions[0])))
-
-#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
-    ((labs(steps[0]) < MAX_STEP_SIZE) && \
-     (labs(steps[1]) < MAX_STEP_SIZE) && \
-     (labs(steps[2]) < MAX_STEP_SIZE) && \
-     (nomemoverlap(args[0], steps[0], args[2], steps[2], dimensions[0])) && \
-     (nomemoverlap(args[0], steps[0], args[1], steps[1], dimensions[0])))
-
 /*
  * 1) Output should be contiguous, can handle strided input data
  * 2) Input step should be smaller than MAX_STEP_SIZE for performance
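Both deleted macros bundled two separate conditions: every step bounded by `MAX_STEP_SIZE`, and pairwise `nomemoverlap` tests between inputs and outputs. The dispatch sites now state the same two conditions directly, via the `npyv_*_stride_*` helpers and `is_mem_overlap`. A sketch of an extent-intersection overlap test in that spirit, assuming byte strides and an explicit item size (the function name and exact arithmetic are illustrative, not NumPy's implementation):

```c
#include <stddef.h>

/* Illustrative overlap test: two strided streams of n items of
 * `itemsize` bytes overlap iff their byte extents intersect. */
static int
strided_overlap(const char *a, ptrdiff_t step_a,
                const char *b, ptrdiff_t step_b,
                ptrdiff_t n, size_t itemsize)
{
    if (n <= 0) {
        return 0;
    }
    /* lowest and one-past-highest byte touched by each stream */
    const char *a_lo = step_a >= 0 ? a : a + (n - 1) * step_a;
    const char *a_hi = (step_a >= 0 ? a + (n - 1) * step_a : a) + itemsize;
    const char *b_lo = step_b >= 0 ? b : b + (n - 1) * step_b;
    const char *b_hi = (step_b >= 0 ? b + (n - 1) * step_b : b) + itemsize;
    return a_lo < b_hi && b_lo < a_hi;
}
```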
13 changes: 11 additions & 2 deletions numpy/_core/src/umath/loops_exponent_log.dispatch.c.src
@@ -1350,12 +1350,17 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
  * #TYPE = FLOAT, DOUBLE#
  * #c = f, #
  * #C = F, #
+ * #suffix = f32, f64#
  */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_frexp)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
 #ifdef SIMD_AVX512_SKX
-    if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
+    if ((npyv_loadable_stride_@suffix@(steps[0])) &&
+        (npyv_storable_stride_@suffix@(steps[1])) &&
+        (npyv_storable_stride_@suffix@(steps[2])) &&
+        (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) &&
+        (!is_mem_overlap(args[0], steps[0], args[1], steps[1], dimensions[0]))) {
         AVX512_SKX_frexp_@TYPE@(args, dimensions, steps);
         return;
     }
@@ -1370,7 +1375,11 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_ldexp)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
 #ifdef SIMD_AVX512_SKX
-    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
+    if ((npyv_loadable_stride_@suffix@(steps[0])) &&
+        (npyv_storable_stride_@suffix@(steps[1])) &&
+        (npyv_storable_stride_@suffix@(steps[2])) &&
+        (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) &&
+        (!is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
         AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps);
         return;
     }
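For the DOUBLE/f64 template instance (the `.c.src` preprocessor substitutes `@TYPE@`/`@suffix@` pairs positionally: FLOAT/f32, DOUBLE/f64), the new frexp guard expands to roughly the fragment below. The per-step comments reflect frexp's one-input, two-output signature and are my reading, not annotations from the source:

```c
#ifdef SIMD_AVX512_SKX
    if ((npyv_loadable_stride_f64(steps[0])) &&   /* input stride */
        (npyv_storable_stride_f64(steps[1])) &&   /* first output (mantissa) */
        (npyv_storable_stride_f64(steps[2])) &&   /* second output (exponent) */
        (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) &&
        (!is_mem_overlap(args[0], steps[0], args[1], steps[1], dimensions[0]))) {
        AVX512_SKX_frexp_DOUBLE(args, dimensions, steps);
        return;
    }
#endif
```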