numpy · charris · May 10, 2018 · May 3, 2018 · May 5, 2018 · May 6, 2018
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst
@@ -20,15 +20,18 @@ New functions
 * ``nanquantile`` function, an interface to ``nanpercentile`` without factors
   of 100
 
-* `np.printoptions`, the context manager which sets print options temporarily
+* `np.printoptions`, a context manager that sets print options temporarily
   for the scope of the ``with`` block::
 
     >>> with np.printoptions(precision=2):
     ...     print(np.array([2.0]) / 3)
     [0.67]
 
-  * `np.histogram_bin_edges`, a function to get the edges of the bins used by a histogram
-    without needing to calculate the histogram.
+* `np.histogram_bin_edges`, a function to get the edges of the bins used by a histogram
+  without needing to calculate the histogram.
+
+* `npy_get_floatstatus_barrier`` and ``npy_clear_floatstatus_barrier`` have been added to
+  deal with compiler optimization changing the order of operations. See below for details.
 
 Deprecations
 ============
@@ -42,6 +45,7 @@ Deprecations
   * `np.ma.loads`, `np.ma.dumps`
   * `np.ma.load`, `np.ma.dump` - these functions already failed on python 3,
     when called with a string.
+
 * Direct imports from the following modules is deprecated. All testing related
   imports should come from `numpy.testing`.
   * `np.testing.utils`
@@ -55,7 +59,7 @@ Deprecations
   In the future, it might return a different result. Use `np.sum(np.from_iter(generator))`
   or the built-in Python `sum` instead.
 
-* Users of the C-API should call ``PyArrayResolveWriteBackIfCopy`` or 
+* Users of the C-API should call ``PyArrayResolveWriteBackIfCopy`` or
   ``PyArray_DiscardWritbackIfCopy`` on any array with the ``WRITEBACKIFCOPY``
   flag set, before the array is deallocated. A deprecation warning will be
   emitted if those calls are not used when needed.
@@ -81,7 +85,7 @@ are some circumstances where nditer doesn't actually give you a view onto the
 writable array. Instead, it gives you a copy, and if you make changes to the
 copy, nditer later writes those changes back into your actual array. Currently,
 this writeback occurs when the array objects are garbage collected, which makes
-this API error-prone on CPython and entirely broken on PyPy. Therefore, 
+this API error-prone on CPython and entirely broken on PyPy. Therefore,
 ``nditer`` should now be used as a context manager whenever using ``nditer``
 with writeable arrays (``with np.nditer(...) as it: ...``). You may also
 explicitly call ``it.close()`` for cases where a context manager is unusable,
@@ -123,6 +127,14 @@ C API changes
 * ``NpyIter_Close`` has been added and should be called before
   ``NpyIter_Deallocate`` to resolve possible writeback-enabled arrays.
 
+* Functions ``npy_get_floatstatus_barrier`` and ``npy_clear_floatstatus_barrier``
+  have been added and should be used in place of the ``npy_get_floatstatus``and
+  ``npy_clear_status`` functions. Optimizing compilers like GCC 8.1 and Clang
+  were rearranging the order of operations when the previous functions were
+  used in the ufunc SIMD functions, resulting in the floatstatus flags being '
+  checked before the operation whose status we wanted to check was run.  
+  See `#10339 <https://github.com/numpy/numpy/issues/10370>`__.
+
 New Features
 ============
 

diff --git a/doc/source/reference/c-api.coremath.rst b/doc/source/reference/c-api.coremath.rst
@@ -185,12 +185,33 @@ Those can be useful for precise floating point comparison.
 
     .. versionadded:: 1.9.0
 
+.. c:function:: int npy_get_floatstatus_barrier(char*)
+
+    Get floating point status. A pointer to a local variable is passed in to
+    prevent aggresive compiler optimizations from reodering this function call.
+    Returns a bitmask with following possible flags:
+
+    * NPY_FPE_DIVIDEBYZERO
+    * NPY_FPE_OVERFLOW
+    * NPY_FPE_UNDERFLOW
+    * NPY_FPE_INVALID
+
+    .. versionadded:: 1.15.0
+
 .. c:function:: int npy_clear_floatstatus()
 
     Clears the floating point status. Returns the previous status mask.
 
     .. versionadded:: 1.9.0
 
+.. c:function:: int npy_clear_floatstatus(char*)
+
+    Clears the floating point status. A pointer to a local variable is passed in to
+    prevent aggresive compiler optimizations from reodering this function call.
+    Returns the previous status mask.
+
+    .. versionadded:: 1.15.0
+n
 Complex functions
 ~~~~~~~~~~~~~~~~~
 

diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
@@ -524,8 +524,14 @@ npy_clongdouble npy_catanhl(npy_clongdouble z);
 #define NPY_FPE_UNDERFLOW     4
 #define NPY_FPE_INVALID       8
 
-int npy_get_floatstatus(void);
+int npy_clear_floatstatus_barrier(char*);
+int npy_get_floatstatus_barrier(char*);
+/*
+ * use caution with these - clang and gcc8.1 are known to reorder calls
+ * to this form of the function which can defeat the check
+ */
 int npy_clear_floatstatus(void);
+int npy_get_floatstatus(void);
 void npy_set_floatstatus_divbyzero(void);
 void npy_set_floatstatus_overflow(void);
 void npy_set_floatstatus_underflow(void);

diff --git a/numpy/core/src/npymath/ieee754.c.src b/numpy/core/src/npymath/ieee754.c.src
@@ -6,6 +6,7 @@
  */
 #include "npy_math_common.h"
 #include "npy_math_private.h"
+#include "numpy/utils.h"
 
 #ifndef HAVE_COPYSIGN
 double npy_copysign(double x, double y)
@@ -557,6 +558,15 @@ npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y)
 }
 #endif
 
+int npy_clear_floatstatus() {
+    char x=0;
+    return npy_clear_floatstatus_barrier(&x);
+}
+int npy_get_floatstatus() {
+    char x=0;
+    return npy_get_floatstatus_barrier(&x);
+}
+
 /*
  * Functions to set the floating point status word.
  * keep in sync with NO_FLOATING_POINT_SUPPORT in ufuncobject.h
@@ -574,18 +584,24 @@ npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y)
     defined(__NetBSD__)
 #include <ieeefp.h>
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char * param))
 {
     int fpstatus = fpgetsticky();
+    /*
+     * By using a volatile, the compiler cannot reorder this call
+     */
+    if (param != NULL) {
+        volatile char NPY_UNUSED(c) = *(char*)param;
+    }
     return ((FP_X_DZ  & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
            ((FP_X_OFL & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
            ((FP_X_UFL & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
            ((FP_X_INV & fpstatus) ? NPY_FPE_INVALID : 0);
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char * param)
 {
-    int fpstatus = npy_get_floatstatus();
+    int fpstatus = npy_get_floatstatus_barrier(param);
     fpsetsticky(0);
 
     return fpstatus;
@@ -617,21 +633,27 @@ void npy_set_floatstatus_invalid(void)
       (defined(__FreeBSD__) && (__FreeBSD_version >= 502114))
 #  include <fenv.h>
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char* param)
 {
     int fpstatus = fetestexcept(FE_DIVBYZERO | FE_OVERFLOW |
                                 FE_UNDERFLOW | FE_INVALID);
+    /*
+     * By using a volatile, the compiler cannot reorder this call
+     */
+    if (param != NULL) {
+        volatile char NPY_UNUSED(c) = *(char*)param;
+    }
 
     return ((FE_DIVBYZERO  & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
            ((FE_OVERFLOW   & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
            ((FE_UNDERFLOW  & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
            ((FE_INVALID    & fpstatus) ? NPY_FPE_INVALID : 0);
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char * param)
 {
     /* testing float status is 50-100 times faster than clearing on x86 */
-    int fpstatus = npy_get_floatstatus();
+    int fpstatus = npy_get_floatstatus_barrier(param);
     if (fpstatus != 0) {
         feclearexcept(FE_DIVBYZERO | FE_OVERFLOW |
                       FE_UNDERFLOW | FE_INVALID);
@@ -665,18 +687,24 @@ void npy_set_floatstatus_invalid(void)
 #include <float.h>
 #include <fpxcp.h>
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char *param)
 {
     int fpstatus = fp_read_flag();
+    /*
+     * By using a volatile, the compiler cannot reorder this call
+     */
+    if (param != NULL) {
+        volatile char NPY_UNUSED(c) = *(char*)param;
+    }
     return ((FP_DIV_BY_ZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
            ((FP_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
            ((FP_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
            ((FP_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char * param)
 {
-    int fpstatus = npy_get_floatstatus();
+    int fpstatus = npy_get_floatstatus_barrier(param);
     fp_swap_flag(0);
 
     return fpstatus;
@@ -710,8 +738,11 @@ void npy_set_floatstatus_invalid(void)
 #include <float.h>
 
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char *param)
 {
+    /*
+     * By using a volatile, the compiler cannot reorder this call
+     */
 #if defined(_WIN64)
     int fpstatus = _statusfp();
 #else
@@ -720,15 +751,18 @@ int npy_get_floatstatus(void)
     _statusfp2(&fpstatus, &fpstatus2);
     fpstatus |= fpstatus2;
 #endif
+    if (param != NULL) {
+        volatile char NPY_UNUSED(c) = *(char*)param;
+    }
     return ((SW_ZERODIVIDE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
            ((SW_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
            ((SW_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
            ((SW_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char *param)
 {
-    int fpstatus = npy_get_floatstatus();
+    int fpstatus = npy_get_floatstatus_barrier(param);
     _clearfp();
 
     return fpstatus;
@@ -739,18 +773,24 @@ int npy_clear_floatstatus(void)
 
 #include <machine/fpu.h>
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char *param)
 {
     unsigned long fpstatus = ieee_get_fp_control();
+    /*
+     * By using a volatile, the compiler cannot reorder this call
+     */
+    if (param != NULL) {
+        volatile char NPY_UNUSED(c) = *(char*)param;
+    }
     return  ((IEEE_STATUS_DZE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
             ((IEEE_STATUS_OVF & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
             ((IEEE_STATUS_UNF & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
             ((IEEE_STATUS_INV & fpstatus) ? NPY_FPE_INVALID : 0);
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char *param)
 {
-    long fpstatus = npy_get_floatstatus();
+    int fpstatus = npy_get_floatstatus_barrier(param);
     /* clear status bits as well as disable exception mode if on */
     ieee_set_fp_control(0);
 
@@ -759,13 +799,14 @@ int npy_clear_floatstatus(void)
 
 #else
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char NPY_UNUSED(*param))
 {
     return 0;
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char *param)
 {
+    int fpstatus = npy_get_floatstatus_barrier(param);
     return 0;
 }
 

diff --git a/numpy/core/src/umath/extobj.c b/numpy/core/src/umath/extobj.c
@@ -284,7 +284,7 @@ _check_ufunc_fperr(int errmask, PyObject *extobj, const char *ufunc_name) {
     if (!errmask) {
         return 0;
     }
-    fperr = PyUFunc_getfperr();
+    fperr = npy_get_floatstatus_barrier((char*)extobj);
     if (!fperr) {
         return 0;
     }

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
@@ -1819,7 +1819,7 @@ NPY_NO_EXPORT void
             *((npy_bool *)op1) = @func@(in1) != 0;
         }
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat1**/
 
@@ -1901,7 +1901,7 @@ NPY_NO_EXPORT void
             *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2;
         }
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat1**/
 
@@ -1991,7 +1991,7 @@ NPY_NO_EXPORT void
             *((@type@ *)op1) = tmp + 0;
         }
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 
 NPY_NO_EXPORT void
@@ -2177,7 +2177,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
         const npy_half in1 = *(npy_half *)ip1;
         *((npy_bool *)op1) = @func@(in1) != 0;
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat**/
 
@@ -2239,7 +2239,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
         const npy_half in2 = *(npy_half *)ip2;
         *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? in1 : in2;
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat**/
 
@@ -2681,7 +2681,7 @@ NPY_NO_EXPORT void
         const @ftype@ in1i = ((@ftype@ *)ip1)[1];
         *((npy_bool *)op1) = @func@(in1r) @OP@ @func@(in1i);
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat1**/
 
@@ -2790,7 +2790,7 @@ NPY_NO_EXPORT void
             ((@ftype@ *)op1)[1] = in2i;
         }
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat1**/
 

diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
@@ -537,7 +537,7 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
     }
 
     /* Start with the floating-point exception flags cleared */
-    PyUFunc_clearfperr();
+    npy_clear_floatstatus_barrier((char*)&iter);
 
     if (NpyIter_GetIterSize(iter) != 0) {
         NpyIter_IterNextFunc *iternext;