Commit

Merge pull request #23174 from Developer-Ecosystem-Engineering/fix-simd-multiply-and-divide-on-apple-silicon

BUG: Fix Apple silicon builds by working around clang partial load bug in …
seberg committed Feb 10, 2023
2 parents 16ddf19 + bc6c53f commit 6dadb8c
Showing 1 changed file with 83 additions and 2 deletions.
85 changes: 83 additions & 2 deletions numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -32,6 +32,58 @@
** Defining ufunc inner functions
********************************************************************************/

/*
* clang has a bug that's present at -O1 or greater. When partially loading a
* vector register for a divide operation, the remaining elements are set
* to 1 to avoid divide-by-zero. The partial load is paired with a partial
* store after the divide operation. clang notices that the entire register
* is not needed for the store and optimizes out the fill of 1 to the remaining
* elements. This causes either a divide-by-zero or 0/0 with invalid exception
* that we were trying to avoid by filling.
*
* Using a dummy variable marked 'volatile' convinces clang not to ignore
* the explicit fill of remaining elements. If `-ftrapping-math` is
* supported, then it'll also avoid the bug. `-ftrapping-math` is supported
* on Apple clang v12+ for x86_64. It is not currently supported for arm64.
 * `-ftrapping-math` is set by default for NumPy builds in
* numpy/distutils/ccompiler.py.
*
 * Note: Apple Clang and upstream Clang use different, overlapping version numbering schemes.
*/
#if defined(__clang__)
#if defined(__apple_build_version__)
// Apple Clang
#if __apple_build_version__ < 12000000
// Apple Clang before v12
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
#elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
// Apple Clang after v12, targeting i386 or x86_64
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
#else
// Apple Clang after v12, not targeting i386 or x86_64
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
#endif
#else
// Clang, not Apple Clang
#if __clang_major__ < 10
// Clang before v10
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
#elif defined(_MSC_VER)
// clang-cl has the same bug
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
#elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
// Clang v10+, targeting i386 or x86_64
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
#else
// Clang v10+, not targeting i386 or x86_64
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
#endif
#endif
#else
// Not a Clang compiler
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
#endif
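
To make the failure mode described above concrete, here is a minimal, stand-alone sketch (not the NumPy kernel itself) that models a 4-lane vector as a plain array. `LANES`, `load_till`, and `store_till` are hypothetical stand-ins for the `npyv_*_till_*` universal intrinsics, and the `volatile` divisor plays the same role as the workaround added in this diff.

/* Minimal sketch of the partial load/divide/store pattern and the volatile
 * workaround. LANES, load_till, and store_till are hypothetical stand-ins
 * for the npyv_*_till_* universal intrinsics; this is not the dispatch source. */
#include <stdio.h>

#define LANES 4

/* Partial load: copy `len` lanes, fill the rest with `fill` so the divide
 * below never sees a zero (or garbage) in the unused lanes. */
static void load_till(float dst[LANES], const float *src, int len, float fill)
{
    for (int i = 0; i < LANES; i++) {
        dst[i] = (i < len) ? src[i] : fill;
    }
}

/* Partial store: only the first `len` lanes are written back. Because the
 * filled lanes are never stored, an optimizer may decide the fill is dead
 * code and drop it -- which is exactly the clang bug being worked around. */
static void store_till(float *dst, const float src[LANES], int len)
{
    for (int i = 0; i < len; i++) {
        dst[i] = src[i];
    }
}

int main(void)
{
    const float num[3] = {6.0f, 9.0f, 12.0f};
    const float den[3] = {2.0f, 3.0f, 4.0f};
    float out[3];

    float a[LANES], r[LANES];
    load_till(a, num, 3, 1.0f);

    /* The volatile divisor mirrors the workaround in this commit: the
     * compiler must materialize the 1.0 fill in the unused lane instead of
     * discarding it, so lane 3 computes 1.0/1.0 rather than x/0 or 0/0. */
    volatile float b[LANES];
    for (int i = 0; i < LANES; i++) {
        b[i] = (i < 3) ? den[i] : 1.0f;
    }
    for (int i = 0; i < LANES; i++) {
        r[i] = a[i] / b[i];
    }
    store_till(out, r, 3);

    for (int i = 0; i < 3; i++) {
        printf("%g\n", out[i]);   /* prints 3 3 3 */
    }
    return 0;
}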

/**begin repeat
* Float types
* #type = npy_float, npy_double#
@@ -96,7 +148,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
npyv_store_@sfx@((@type@*)dst, r0);
npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
#if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
const int vstop = hstep - 1;
#else
const int vstop = 0;
#endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
for (; len > vstop; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
#if @is_div@
npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
@@ -107,6 +164,15 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
npyv_store_till_@sfx@((@type@*)dst, len, r);
}
#if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
        // Peeled last partial iteration for divide; kept separate to work around the clang partial-load bug
if(len > 0){
npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
npyv_store_till_@sfx@((@type@*)dst, len, r);
}
#endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
}
else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) {
npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0));
@@ -118,7 +184,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
npyv_store_@sfx@((@type@*)dst, r0);
npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
#if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
const int vstop = hstep - 1;
#else
const int vstop = 0;
#endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
for (; len > vstop; len -= hstep, src1 += vstep, dst += vstep) {
#if @is_div@ || @is_mul@
npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
#else
@@ -127,6 +198,14 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
npyv_store_till_@sfx@((@type@*)dst, len, r);
}
#if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
        // Peeled last partial iteration for multiply/divide; kept separate to work around the clang partial-load bug
if(len > 0){
volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
npyv_store_till_@sfx@((@type@*)dst, len, r);
}
#endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
}
else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) {
npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1));
@@ -182,6 +261,8 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
/**end repeat1**/
/**end repeat**/

#undef WORKAROUND_CLANG_PARTIAL_LOAD_BUG
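
The restructured loops above all follow the same shape: when the workaround is active, the main loop stops one vector short (`vstop = hstep - 1`) so that the final partial chunk is peeled into a separate block where the divisor is loaded through a volatile variable. A scalar sketch of that control flow, with a hypothetical 4-lane `HSTEP` and `div_loop` standing in for the vectorized kernel, might look like this:

/* Scalar sketch of the loop peeling used in this diff. HSTEP and div_loop
 * are hypothetical; the real kernel operates on npyv_* vectors. */
#include <stddef.h>

#define HSTEP 4   /* lanes per vector chunk (hypothetical) */

static void div_loop(float *dst, const float *src0, const float *src1,
                     ptrdiff_t len, int workaround)
{
    /* With the workaround enabled, stop the main loop one chunk early so
     * that 1..HSTEP trailing elements are left for the peeled iteration. */
    const ptrdiff_t vstop = workaround ? HSTEP - 1 : 0;

    for (; len > vstop; len -= HSTEP, src0 += HSTEP, src1 += HSTEP, dst += HSTEP) {
        ptrdiff_t n = (len < HSTEP) ? len : HSTEP;   /* partial only when the workaround is off */
        for (ptrdiff_t i = 0; i < n; i++) {
            dst[i] = src0[i] / src1[i];
        }
    }
    if (workaround && len > 0) {
        /* Peeled partial iteration: in the real kernel this is where the
         * divisor vector is loaded into a volatile variable. */
        for (ptrdiff_t i = 0; i < len; i++) {
            dst[i] = src0[i] / src1[i];
        }
    }
}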

//###############################################################################
//## Complex Single/Double precision
//###############################################################################