alternative fix for npy_get_floatstatus, npy_clear_floatstatus

numpy · May 5, 2018 · f21ad36 · f21ad36
1 parent a51f86b
commit f21ad36
Show file tree

Hide file tree

Showing 8 changed files with 111 additions and 148 deletions.
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst
@@ -55,7 +55,7 @@ Deprecations
   In the future, it might return a different result. Use `np.sum(np.from_iter(generator))`
   or the built-in Python `sum` instead.
 
-* Users of the C-API should call ``PyArrayResolveWriteBackIfCopy`` or 
+* Users of the C-API should call ``PyArrayResolveWriteBackIfCopy`` or
   ``PyArray_DiscardWritbackIfCopy`` on any array with the ``WRITEBACKIFCOPY``
   flag set, before the array is deallocated. A deprecation warning will be
   emitted if those calls are not used when needed.
@@ -81,7 +81,7 @@ are some circumstances where nditer doesn't actually give you a view onto the
 writable array. Instead, it gives you a copy, and if you make changes to the
 copy, nditer later writes those changes back into your actual array. Currently,
 this writeback occurs when the array objects are garbage collected, which makes
-this API error-prone on CPython and entirely broken on PyPy. Therefore, 
+this API error-prone on CPython and entirely broken on PyPy. Therefore,
 ``nditer`` should now be used as a context manager whenever using ``nditer``
 with writeable arrays (``with np.nditer(...) as it: ...``). You may also
 explicitly call ``it.close()`` for cases where a context manager is unusable,
@@ -123,6 +123,16 @@ C API changes
 * ``NpyIter_Close`` has been added and should be called before
   ``NpyIter_Deallocate`` to resolve possible writeback-enabled arrays.
 
+* ``npy_get_floatstatus_barrier`` and ``npy_clear_floatstatus_barrier`` have
+  been added, they should be used instead of ``npy_get_floatstatus`` and
+  ``npy_clear_status`` respectively. Optimizing compilers like GCC 8.1 and
+  Clang rearrange the order of operations when calling a function near SIMD
+  operations. The earlier versions of theses functions were thus moved around,
+  changing the results of ``do simd operation, check floatstatus flags`` to
+  ``check floatstatus flags, simd operation`` leading to wrong results. The new
+  forms of these functions accept a pointer to a local variable, and by
+  assigning to that value within the function prevent the reordering.
+
 New Features
 ============
 

diff --git a/doc/source/reference/c-api.coremath.rst b/doc/source/reference/c-api.coremath.rst
@@ -183,17 +183,35 @@ Those can be useful for precise floating point comparison.
     * NPY_FPE_UNDERFLOW
     * NPY_FPE_INVALID
 
-    If used in conjunction with intrisic functions, optimizing compilers can
-    reorder the call and put it before the intrisics, defeating the check.
-
     .. versionadded:: 1.9.0
 
+.. c:function:: int npy_get_floatstatus_barrier(void*)
+
+    Get floating point status. A pointer to a local variable is passed in to
+    prevent aggresive compiler optimizations from reodering this function call.
+    Returns a bitmask with following possible flags:
+
+    * NPY_FPE_DIVIDEBYZERO
+    * NPY_FPE_OVERFLOW
+    * NPY_FPE_UNDERFLOW
+    * NPY_FPE_INVALID
+
+    .. versionadded:: 1.15.0
+
 .. c:function:: int npy_clear_floatstatus()
 
     Clears the floating point status. Returns the previous status mask.
 
     .. versionadded:: 1.9.0
 
+.. c:function:: int npy_clear_floatstatus(void*)
+
+    Clears the floating point status. A pointer to a local variable is passed in to
+    prevent aggresive compiler optimizations from reodering this function call.
+    Returns the previous status mask.
+
+    .. versionadded:: 1.15.0
+
 Complex functions
 ~~~~~~~~~~~~~~~~~
 

diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
@@ -524,8 +524,14 @@ npy_clongdouble npy_catanhl(npy_clongdouble z);
 #define NPY_FPE_UNDERFLOW     4
 #define NPY_FPE_INVALID       8
 
-int npy_get_floatstatus(void);
+int npy_clear_floatstatus_barrier(char*);
+int npy_get_floatstatus_barrier(char*);
+/*
+ * use caution with these - clang and gcc8.1 are known to reorder calls
+ * to this form of the function which can defeat the check
+ */
 int npy_clear_floatstatus(void);
+int npy_get_floatstatus(void);
 void npy_set_floatstatus_divbyzero(void);
 void npy_set_floatstatus_overflow(void);
 void npy_set_floatstatus_underflow(void);

diff --git a/numpy/core/src/npymath/ieee754.c.src b/numpy/core/src/npymath/ieee754.c.src
@@ -6,6 +6,7 @@
  */
 #include "npy_math_common.h"
 #include "npy_math_private.h"
+#include "numpy/utils.h"
 
 #ifndef HAVE_COPYSIGN
 double npy_copysign(double x, double y)
@@ -557,6 +558,15 @@ npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y)
 }
 #endif
 
+int npy_clear_floatstatus() {
+    char x=0;
+    return npy_clear_floatstatus_barrier(&x);
+}
+int npy_get_floatstatus() {
+    char x=0;
+    return npy_get_floatstatus_barrier(&x);
+}
+
 /*
  * Functions to set the floating point status word.
  * keep in sync with NO_FLOATING_POINT_SUPPORT in ufuncobject.h
@@ -574,18 +584,22 @@ npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y)
     defined(__NetBSD__)
 #include <ieeefp.h>
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char * param))
 {
+    /*
+     * By using a volatile, the compiler cannot reorder this call
+     */
+    volatile char c = *(char*)param;
     int fpstatus = fpgetsticky();
     return ((FP_X_DZ  & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
            ((FP_X_OFL & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
            ((FP_X_UFL & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
            ((FP_X_INV & fpstatus) ? NPY_FPE_INVALID : 0);
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char * param)
 {
-    int fpstatus = npy_get_floatstatus();
+    int fpstatus = npy_get_floatstatus_barrier(param);
     fpsetsticky(0);
 
     return fpstatus;
@@ -617,8 +631,12 @@ void npy_set_floatstatus_invalid(void)
       (defined(__FreeBSD__) && (__FreeBSD_version >= 502114))
 #  include <fenv.h>
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char* param)
 {
+    /*
+     * By using a volatile, the compiler cannot reorder this call
+     */
+    volatile char NPY_UNUSED(c) = *(char*)param;
     int fpstatus = fetestexcept(FE_DIVBYZERO | FE_OVERFLOW |
                                 FE_UNDERFLOW | FE_INVALID);
 
@@ -628,10 +646,10 @@ int npy_get_floatstatus(void)
            ((FE_INVALID    & fpstatus) ? NPY_FPE_INVALID : 0);
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char * param)
 {
     /* testing float status is 50-100 times faster than clearing on x86 */
-    int fpstatus = npy_get_floatstatus();
+    int fpstatus = npy_get_floatstatus_barrier(param);
     if (fpstatus != 0) {
         feclearexcept(FE_DIVBYZERO | FE_OVERFLOW |
                       FE_UNDERFLOW | FE_INVALID);
@@ -665,18 +683,22 @@ void npy_set_floatstatus_invalid(void)
 #include <float.h>
 #include <fpxcp.h>
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char *param)
 {
+    /*
+     * By using a volatile, the compiler cannot reorder this call
+     */
+    volatile char c = *(char*)param;
     int fpstatus = fp_read_flag();
     return ((FP_DIV_BY_ZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
            ((FP_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
            ((FP_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
            ((FP_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char * param)
 {
-    int fpstatus = npy_get_floatstatus();
+    int fpstatus = npy_get_floatstatus_barrier(param);
     fp_swap_flag(0);
 
     return fpstatus;
@@ -710,8 +732,12 @@ void npy_set_floatstatus_invalid(void)
 #include <float.h>
 
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char *param)
 {
+    /*
+     * By using a volatile, the compiler cannot reorder this call
+     */
+    volatile char c = *(char*)param;
 #if defined(_WIN64)
     int fpstatus = _statusfp();
 #else
@@ -726,9 +752,9 @@ int npy_get_floatstatus(void)
            ((SW_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char *param)
 {
-    int fpstatus = npy_get_floatstatus();
+    int fpstatus = npy_get_floatstatus_barrier(param);
     _clearfp();
 
     return fpstatus;
@@ -739,18 +765,22 @@ int npy_clear_floatstatus(void)
 
 #include <machine/fpu.h>
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char *param)
 {
+    /*
+     * By using a volatile, the compiler cannot reorder this call
+     */
+    volatile char c = *(char*)param;
     unsigned long fpstatus = ieee_get_fp_control();
     return  ((IEEE_STATUS_DZE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
             ((IEEE_STATUS_OVF & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
             ((IEEE_STATUS_UNF & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
             ((IEEE_STATUS_INV & fpstatus) ? NPY_FPE_INVALID : 0);
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char *param)
 {
-    long fpstatus = npy_get_floatstatus();
+    int fpstatus = npy_get_floatstatus_barrier(param);
     /* clear status bits as well as disable exception mode if on */
     ieee_set_fp_control(0);
 
@@ -759,13 +789,18 @@ int npy_clear_floatstatus(void)
 
 #else
 
-int npy_get_floatstatus(void)
+int npy_get_floatstatus_barrier(char *param)
 {
+    /*
+     * By using a volatile, the compiler cannot reorder this call
+     */
+    volatile char c = *(char*)param;
     return 0;
 }
 
-int npy_clear_floatstatus(void)
+int npy_clear_floatstatus_barrier(char *param)
 {
+    int fpstatus = npy_get_floatstatus_barrier(param);
     return 0;
 }
 

diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
@@ -1819,7 +1819,7 @@ NPY_NO_EXPORT void
             *((npy_bool *)op1) = @func@(in1) != 0;
         }
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat1**/
 
@@ -1901,7 +1901,7 @@ NPY_NO_EXPORT void
             *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2;
         }
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat1**/
 
@@ -1991,7 +1991,7 @@ NPY_NO_EXPORT void
             *((@type@ *)op1) = tmp + 0;
         }
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 
 NPY_NO_EXPORT void
@@ -2177,7 +2177,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
         const npy_half in1 = *(npy_half *)ip1;
         *((npy_bool *)op1) = @func@(in1) != 0;
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat**/
 
@@ -2239,7 +2239,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
         const npy_half in2 = *(npy_half *)ip2;
         *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? in1 : in2;
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat**/
 
@@ -2681,7 +2681,7 @@ NPY_NO_EXPORT void
         const @ftype@ in1i = ((@ftype@ *)ip1)[1];
         *((npy_bool *)op1) = @func@(in1r) @OP@ @func@(in1i);
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat1**/
 
@@ -2790,7 +2790,7 @@ NPY_NO_EXPORT void
             ((@ftype@ *)op1)[1] = in2i;
         }
     }
-    npy_clear_floatstatus();
+    npy_clear_floatstatus_barrier((char*)dimensions);
 }
 /**end repeat1**/
 

diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
@@ -1026,17 +1026,12 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
     assert(n < (stride) || npy_is_aligned(&ip[i], 16));
     if (i + 3 * stride <= n) {
         /* load the first elements */
-        /*
-         * TODO: find a more explicit way to prevent optimization
-         * compilers reorder npy_get_floatstatus to before minps
-         * volatile seems  to be sufficient but is brittle and opaque
-         */
-        volatile @vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
-        volatile @vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
+        @vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
+        @vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
         i += 2 * stride;
 
         /* minps/minpd will set invalid flag if nan is encountered */
-        npy_clear_floatstatus();
+        npy_clear_floatstatus_barrier((char*)&c1);
         LOOP_BLOCKED(@type@, 32) {
             @vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
             @vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
@@ -1045,7 +1040,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
         }
         c1 = @vpre@_@VOP@_@vsuf@(c1, c2);
 
-        if (npy_get_floatstatus() & NPY_FPE_INVALID) {
+        if (npy_get_floatstatus_barrier((char*)&c1) & NPY_FPE_INVALID) {
             *op = @nan@;
         }
         else {

diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
@@ -100,7 +100,8 @@ PyUFunc_getfperr(void)
      * non-clearing get was only added in 1.9 so this function always cleared
      * keep it so just in case third party code relied on the clearing
      */
-    return npy_clear_floatstatus();
+    char param = 0;
+    return npy_clear_floatstatus_barrier(&param);
 }
 
 #define HANDLEIT(NAME, str) {if (retstatus & NPY_FPE_##NAME) {          \
@@ -133,7 +134,8 @@ NPY_NO_EXPORT int
 PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first)
 {
     /* clearing is done for backward compatibility */
-    int retstatus = npy_clear_floatstatus();
+    int retstatus;
+    retstatus = npy_clear_floatstatus_barrier((char*)&retstatus);
 
     return PyUFunc_handlefperr(errmask, errobj, retstatus, first);
 }
@@ -144,7 +146,8 @@ PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first)
 NPY_NO_EXPORT void
 PyUFunc_clearfperr()
 {
-    npy_clear_floatstatus();
+    char param = 0;
+    npy_clear_floatstatus_barrier(&param);
 }
 
 /*