Commit

Merge pull request #23174 from Developer-Ecosystem-Engineering/fix-simd-multiply-and-divide-on-apple-silicon

BUG: Fix Apple silicon builds by working around clang partial load bug in …
seberg committed Feb 10, 2023
2 parents 16ddf19 + bc6c53f commit 6dadb8c
Showing 1 changed file with 83 additions and 2 deletions.
85 changes: 83 additions & 2 deletions numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -32,6 +32,58 @@
** Defining ufunc inner functions
********************************************************************************/

/*
* clang has a bug that's present at -O1 or greater. When partially loading a
* vector register for a divide operation, the remaining elements are set
* to 1 to avoid divide-by-zero. The partial load is paired with a partial
* store after the divide operation. clang notices that the entire register
* is not needed for the store and optimizes out the fill of 1 to the remaining
* elements. This causes either a divide-by-zero or 0/0 with invalid exception
* that we were trying to avoid by filling.
*
* Using a dummy variable marked 'volatile' convinces clang not to ignore
* the explicit fill of remaining elements. If `-ftrapping-math` is
* supported, then it'll also avoid the bug. `-ftrapping-math` is supported
* on Apple clang v12+ for x86_64. It is not currently supported for arm64.
 * `-ftrapping-math` is set by default for NumPy builds in
* numpy/distutils/ccompiler.py.
*
 * Note: Apple Clang and upstream Clang use different, overlapping version numbering schemes.
*/
#if defined(__clang__)
#if defined(__apple_build_version__)
// Apple Clang
#if __apple_build_version__ < 12000000
// Apple Clang before v12
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
#elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
// Apple Clang after v12, targeting i386 or x86_64
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
#else
// Apple Clang after v12, not targeting i386 or x86_64
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
#endif
#else
// Clang, not Apple Clang
#if __clang_major__ < 10
// Clang before v10
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
#elif defined(_MSC_VER)
// clang-cl has the same bug
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
#elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
// Clang v10+, targeting i386 or x86_64
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
#else
// Clang v10+, not targeting i386 or x86_64
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
#endif
#endif
#else
// Not a Clang compiler
#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
#endif
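
To make the failure mode described above concrete, here is a minimal, stand-alone sketch (not the NumPy kernel itself) that models a 4-lane vector as a plain array. `LANES`, `load_till`, and `store_till` are hypothetical stand-ins for the `npyv_*_till_*` universal intrinsics, and the `volatile` divisor plays the same role as the workaround added in this diff.

/* Minimal sketch of the partial load/divide/store pattern and the volatile
 * workaround. LANES, load_till, and store_till are hypothetical stand-ins
 * for the npyv_*_till_* universal intrinsics; this is not the dispatch source. */
#include <stdio.h>

#define LANES 4

/* Partial load: copy `len` lanes, fill the rest with `fill` so the divide
 * below never sees a zero (or garbage) in the unused lanes. */
static void load_till(float dst[LANES], const float *src, int len, float fill)
{
    for (int i = 0; i < LANES; i++) {
        dst[i] = (i < len) ? src[i] : fill;
    }
}

/* Partial store: only the first `len` lanes are written back. Because the
 * filled lanes are never stored, an optimizer may decide the fill is dead
 * code and drop it -- which is exactly the clang bug being worked around. */
static void store_till(float *dst, const float src[LANES], int len)
{
    for (int i = 0; i < len; i++) {
        dst[i] = src[i];
    }
}

int main(void)
{
    const float num[3] = {6.0f, 9.0f, 12.0f};
    const float den[3] = {2.0f, 3.0f, 4.0f};
    float out[3];

    float a[LANES], r[LANES];
    load_till(a, num, 3, 1.0f);

    /* The volatile divisor mirrors the workaround in this commit: the
     * compiler must materialize the 1.0 fill in the unused lane instead of
     * discarding it, so lane 3 computes 1.0/1.0 rather than x/0 or 0/0. */
    volatile float b[LANES];
    for (int i = 0; i < LANES; i++) {
        b[i] = (i < 3) ? den[i] : 1.0f;
    }
    for (int i = 0; i < LANES; i++) {
        r[i] = a[i] / b[i];
    }
    store_till(out, r, 3);

    for (int i = 0; i < 3; i++) {
        printf("%g\n", out[i]);   /* prints 3 3 3 */
    }
    return 0;
}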

/**begin repeat
* Float types
* #type = npy_float, npy_double#
@@ -96,7 +148,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
npyv_store_@sfx@((@type@*)dst, r0);
npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
#if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
const int vstop = hstep - 1;
#else
const int vstop = 0;
#endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
for (; len > vstop; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
#if @is_div@
npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
@@ -107,6 +164,15 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
npyv_store_till_@sfx@((@type@*)dst, len, r);
}
#if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
        // Peeled last partial iteration for divide; kept separate to work around the clang partial-load bug
if(len > 0){
npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@);
volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
npyv_store_till_@sfx@((@type@*)dst, len, r);
}
#endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
}
else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) {
npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0));
@@ -118,7 +184,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
npyv_store_@sfx@((@type@*)dst, r0);
npyv_store_@sfx@((@type@*)(dst + vstep), r1);
}
for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
#if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
const int vstop = hstep - 1;
#else
const int vstop = 0;
#endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
for (; len > vstop; len -= hstep, src1 += vstep, dst += vstep) {
#if @is_div@ || @is_mul@
npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
#else
@@ -127,6 +198,14 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
npyv_store_till_@sfx@((@type@*)dst, len, r);
}
#if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
        // Peeled last partial iteration for multiply/divide; kept separate to work around the clang partial-load bug
if(len > 0){
volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@);
npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b);
npyv_store_till_@sfx@((@type@*)dst, len, r);
}
#endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG
}
else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) {
npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1));
@@ -182,6 +261,8 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed)
/**end repeat1**/
/**end repeat**/

#undef WORKAROUND_CLANG_PARTIAL_LOAD_BUG
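
The restructured loops above all follow the same shape: when the workaround is active, the main loop stops one vector short (`vstop = hstep - 1`) so that the final partial chunk is peeled into a separate block where the divisor is loaded through a volatile variable. A scalar sketch of that control flow, with a hypothetical 4-lane `HSTEP` and `div_loop` standing in for the vectorized kernel, might look like this:

/* Scalar sketch of the loop peeling used in this diff. HSTEP and div_loop
 * are hypothetical; the real kernel operates on npyv_* vectors. */
#include <stddef.h>

#define HSTEP 4   /* lanes per vector chunk (hypothetical) */

static void div_loop(float *dst, const float *src0, const float *src1,
                     ptrdiff_t len, int workaround)
{
    /* With the workaround enabled, stop the main loop one chunk early so
     * that 1..HSTEP trailing elements are left for the peeled iteration. */
    const ptrdiff_t vstop = workaround ? HSTEP - 1 : 0;

    for (; len > vstop; len -= HSTEP, src0 += HSTEP, src1 += HSTEP, dst += HSTEP) {
        ptrdiff_t n = (len < HSTEP) ? len : HSTEP;   /* partial only when the workaround is off */
        for (ptrdiff_t i = 0; i < n; i++) {
            dst[i] = src0[i] / src1[i];
        }
    }
    if (workaround && len > 0) {
        /* Peeled partial iteration: in the real kernel this is where the
         * divisor vector is loaded into a volatile variable. */
        for (ptrdiff_t i = 0; i < len; i++) {
            dst[i] = src0[i] / src1[i];
        }
    }
}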

//###############################################################################
//## Complex Single/Double precision
//###############################################################################