Merge pull request #26055 from eendebakpt/optimize_power_double

ENH: Optimize np.power(x, 2) for double and float type
numpy · May 14, 2024 · 35c4f37 · 35c4f37
2 parents 1d3201e + 51bedd5
commit 35c4f37
Show file tree

Hide file tree

Showing 2 changed files with 61 additions and 2 deletions.
diff --git a/numpy/_core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/_core/src/umath/loops_umath_fp.dispatch.c.src
@@ -226,14 +226,27 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
  *  #type = npy_double, npy_float#
  *  #vsub = , f#
  *  #sfx  = f64, f32#
+ *  #sqrt  = sqrt, sqrtf#
  */
 /**begin repeat1
- *  #func = power, arctan2#
- *  #intrin = pow, atan2#
+ *  #func = power#
+ *  #intrin = pow#
  */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
+    int stride_zero = steps[1]==0;
+    if (stride_zero) {
+        BINARY_DEFS
+        const @type@ in2 = *(@type@ *)ip2;
+        if (in2 == 2.0) {
+            BINARY_LOOP_SLIDING {
+                const @type@ in1 = *(@type@ *)ip1;
+                *(@type@ *)op1 = in1 * in1;
+            }
+            return;
+        }
+    }
 #if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
     const @type@ *src1 = (@type@*)args[0];
     const @type@ *src2 = (@type@*)args[1];
@@ -258,4 +271,37 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
     }
 }
 /**end repeat1**/
+
+/**begin repeat1
+ *  #func = arctan2#
+ *  #intrin = atan2#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const @type@ *src1 = (@type@*)args[0];
+    const @type@ *src2 = (@type@*)args[1];
+          @type@ *dst  = (@type@*)args[2];
+    const int lsize = sizeof(src1[0]);
+    const npy_intp ssrc1 = steps[0] / lsize;
+    const npy_intp ssrc2 = steps[1] / lsize;
+    const npy_intp sdst  = steps[2] / lsize;
+    const npy_intp len = dimensions[0];
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    if (!is_mem_overlap(src1, steps[0], dst, steps[2], len) && !is_mem_overlap(src2, steps[1], dst, steps[2], len) &&
+        npyv_loadable_stride_@sfx@(ssrc1) && npyv_loadable_stride_@sfx@(ssrc2) &&
+        npyv_storable_stride_@sfx@(sdst)) {
+        simd_@intrin@_@sfx@(src1, ssrc1, src2, ssrc2, dst, sdst, len);
+        return;
+    }
+#endif
+    BINARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        const @type@ in2 = *(@type@ *)ip2;
+        *(@type@ *)op1 = npy_@intrin@@vsub@(in1, in2);
+    }
+}
+/**end repeat1**/
+
 /**end repeat**/
diff --git a/numpy/_core/tests/test_umath.py b/numpy/_core/tests/test_umath.py
@@ -1235,6 +1235,19 @@ def test_float_to_inf_power(self):
             r = np.array([1, 1, np.inf, 0, np.inf, 0, np.inf, 0], dt)
             assert_equal(np.power(a, b), r)
 
+    def test_power_fast_paths(self):
+        # gh-26055
+        for dt in [np.float32, np.float64]:
+            a = np.array([0, 1.1, 2, 12e12, -10., np.inf, -np.inf], dt)
+            expected = np.array([0.0, 1.21, 4., 1.44e+26, 100, np.inf, np.inf])
+            result = np.power(a, 2.)
+            assert_array_max_ulp(result, expected.astype(dt), maxulp=1)
+
+            a = np.array([0, 1.1, 2, 12e12], dt)
+            expected = np.sqrt(a).astype(dt)
+            result = np.power(a, 0.5)
+            assert_array_max_ulp(result, expected, maxulp=1)
+
 
 class TestFloat_power:
     def test_type_conversion(self):