Skip to content

Commit

Permalink
alternative fix for npy_get_floatstatus, npy_clear_floatstatus
Browse files Browse the repository at this point in the history
  • Loading branch information
mattip committed May 5, 2018
1 parent a51f86b commit f21ad36
Show file tree
Hide file tree
Showing 8 changed files with 111 additions and 148 deletions.
14 changes: 12 additions & 2 deletions doc/release/1.15.0-notes.rst
Expand Up @@ -55,7 +55,7 @@ Deprecations
In the future, it might return a different result. Use `np.sum(np.from_iter(generator))`
or the built-in Python `sum` instead.

* Users of the C-API should call ``PyArrayResolveWriteBackIfCopy`` or
* Users of the C-API should call ``PyArrayResolveWriteBackIfCopy`` or
``PyArray_DiscardWritbackIfCopy`` on any array with the ``WRITEBACKIFCOPY``
flag set, before the array is deallocated. A deprecation warning will be
emitted if those calls are not used when needed.
Expand All @@ -81,7 +81,7 @@ are some circumstances where nditer doesn't actually give you a view onto the
writable array. Instead, it gives you a copy, and if you make changes to the
copy, nditer later writes those changes back into your actual array. Currently,
this writeback occurs when the array objects are garbage collected, which makes
this API error-prone on CPython and entirely broken on PyPy. Therefore,
this API error-prone on CPython and entirely broken on PyPy. Therefore,
``nditer`` should now be used as a context manager whenever using ``nditer``
with writeable arrays (``with np.nditer(...) as it: ...``). You may also
explicitly call ``it.close()`` for cases where a context manager is unusable,
Expand Down Expand Up @@ -123,6 +123,16 @@ C API changes
* ``NpyIter_Close`` has been added and should be called before
``NpyIter_Deallocate`` to resolve possible writeback-enabled arrays.

* ``npy_get_floatstatus_barrier`` and ``npy_clear_floatstatus_barrier`` have
been added, they should be used instead of ``npy_get_floatstatus`` and
``npy_clear_status`` respectively. Optimizing compilers like GCC 8.1 and
Clang rearrange the order of operations when calling a function near SIMD
operations. The earlier versions of theses functions were thus moved around,
changing the results of ``do simd operation, check floatstatus flags`` to
``check floatstatus flags, simd operation`` leading to wrong results. The new
forms of these functions accept a pointer to a local variable, and by
assigning to that value within the function prevent the reordering.

New Features
============

Expand Down
24 changes: 21 additions & 3 deletions doc/source/reference/c-api.coremath.rst
Expand Up @@ -183,17 +183,35 @@ Those can be useful for precise floating point comparison.
* NPY_FPE_UNDERFLOW
* NPY_FPE_INVALID
If used in conjunction with intrisic functions, optimizing compilers can
reorder the call and put it before the intrisics, defeating the check.
.. versionadded:: 1.9.0
.. c:function:: int npy_get_floatstatus_barrier(void*)
Get floating point status. A pointer to a local variable is passed in to
prevent aggresive compiler optimizations from reodering this function call.
Returns a bitmask with following possible flags:
* NPY_FPE_DIVIDEBYZERO
* NPY_FPE_OVERFLOW
* NPY_FPE_UNDERFLOW
* NPY_FPE_INVALID
.. versionadded:: 1.15.0
.. c:function:: int npy_clear_floatstatus()
Clears the floating point status. Returns the previous status mask.
.. versionadded:: 1.9.0
.. c:function:: int npy_clear_floatstatus(void*)
Clears the floating point status. A pointer to a local variable is passed in to
prevent aggresive compiler optimizations from reodering this function call.
Returns the previous status mask.
.. versionadded:: 1.15.0
Complex functions
~~~~~~~~~~~~~~~~~
Expand Down
8 changes: 7 additions & 1 deletion numpy/core/include/numpy/npy_math.h
Expand Up @@ -524,8 +524,14 @@ npy_clongdouble npy_catanhl(npy_clongdouble z);
#define NPY_FPE_UNDERFLOW 4
#define NPY_FPE_INVALID 8

int npy_get_floatstatus(void);
int npy_clear_floatstatus_barrier(char*);
int npy_get_floatstatus_barrier(char*);
/*
* use caution with these - clang and gcc8.1 are known to reorder calls
* to this form of the function which can defeat the check
*/
int npy_clear_floatstatus(void);
int npy_get_floatstatus(void);
void npy_set_floatstatus_divbyzero(void);
void npy_set_floatstatus_overflow(void);
void npy_set_floatstatus_underflow(void);
Expand Down
69 changes: 52 additions & 17 deletions numpy/core/src/npymath/ieee754.c.src
Expand Up @@ -6,6 +6,7 @@
*/
#include "npy_math_common.h"
#include "npy_math_private.h"
#include "numpy/utils.h"

#ifndef HAVE_COPYSIGN
double npy_copysign(double x, double y)
Expand Down Expand Up @@ -557,6 +558,15 @@ npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y)
}
#endif

int npy_clear_floatstatus() {
char x=0;
return npy_clear_floatstatus_barrier(&x);
}
int npy_get_floatstatus() {
char x=0;
return npy_get_floatstatus_barrier(&x);
}

/*
* Functions to set the floating point status word.
* keep in sync with NO_FLOATING_POINT_SUPPORT in ufuncobject.h
Expand All @@ -574,18 +584,22 @@ npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y)
defined(__NetBSD__)
#include <ieeefp.h>

int npy_get_floatstatus(void)
int npy_get_floatstatus_barrier(char * param))
{
/*
* By using a volatile, the compiler cannot reorder this call
*/
volatile char c = *(char*)param;
int fpstatus = fpgetsticky();
return ((FP_X_DZ & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
((FP_X_OFL & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
((FP_X_UFL & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
((FP_X_INV & fpstatus) ? NPY_FPE_INVALID : 0);
}

int npy_clear_floatstatus(void)
int npy_clear_floatstatus_barrier(char * param)
{
int fpstatus = npy_get_floatstatus();
int fpstatus = npy_get_floatstatus_barrier(param);
fpsetsticky(0);

return fpstatus;
Expand Down Expand Up @@ -617,8 +631,12 @@ void npy_set_floatstatus_invalid(void)
(defined(__FreeBSD__) && (__FreeBSD_version >= 502114))
# include <fenv.h>

int npy_get_floatstatus(void)
int npy_get_floatstatus_barrier(char* param)
{
/*
* By using a volatile, the compiler cannot reorder this call
*/
volatile char NPY_UNUSED(c) = *(char*)param;
int fpstatus = fetestexcept(FE_DIVBYZERO | FE_OVERFLOW |
FE_UNDERFLOW | FE_INVALID);

Expand All @@ -628,10 +646,10 @@ int npy_get_floatstatus(void)
((FE_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
}

int npy_clear_floatstatus(void)
int npy_clear_floatstatus_barrier(char * param)
{
/* testing float status is 50-100 times faster than clearing on x86 */
int fpstatus = npy_get_floatstatus();
int fpstatus = npy_get_floatstatus_barrier(param);
if (fpstatus != 0) {
feclearexcept(FE_DIVBYZERO | FE_OVERFLOW |
FE_UNDERFLOW | FE_INVALID);
Expand Down Expand Up @@ -665,18 +683,22 @@ void npy_set_floatstatus_invalid(void)
#include <float.h>
#include <fpxcp.h>

int npy_get_floatstatus(void)
int npy_get_floatstatus_barrier(char *param)
{
/*
* By using a volatile, the compiler cannot reorder this call
*/
volatile char c = *(char*)param;
int fpstatus = fp_read_flag();
return ((FP_DIV_BY_ZERO & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
((FP_OVERFLOW & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
((FP_UNDERFLOW & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
((FP_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
}

int npy_clear_floatstatus(void)
int npy_clear_floatstatus_barrier(char * param)
{
int fpstatus = npy_get_floatstatus();
int fpstatus = npy_get_floatstatus_barrier(param);
fp_swap_flag(0);

return fpstatus;
Expand Down Expand Up @@ -710,8 +732,12 @@ void npy_set_floatstatus_invalid(void)
#include <float.h>


int npy_get_floatstatus(void)
int npy_get_floatstatus_barrier(char *param)
{
/*
* By using a volatile, the compiler cannot reorder this call
*/
volatile char c = *(char*)param;
#if defined(_WIN64)
int fpstatus = _statusfp();
#else
Expand All @@ -726,9 +752,9 @@ int npy_get_floatstatus(void)
((SW_INVALID & fpstatus) ? NPY_FPE_INVALID : 0);
}

int npy_clear_floatstatus(void)
int npy_clear_floatstatus_barrier(char *param)
{
int fpstatus = npy_get_floatstatus();
int fpstatus = npy_get_floatstatus_barrier(param);
_clearfp();

return fpstatus;
Expand All @@ -739,18 +765,22 @@ int npy_clear_floatstatus(void)

#include <machine/fpu.h>

int npy_get_floatstatus(void)
int npy_get_floatstatus_barrier(char *param)
{
/*
* By using a volatile, the compiler cannot reorder this call
*/
volatile char c = *(char*)param;
unsigned long fpstatus = ieee_get_fp_control();
return ((IEEE_STATUS_DZE & fpstatus) ? NPY_FPE_DIVIDEBYZERO : 0) |
((IEEE_STATUS_OVF & fpstatus) ? NPY_FPE_OVERFLOW : 0) |
((IEEE_STATUS_UNF & fpstatus) ? NPY_FPE_UNDERFLOW : 0) |
((IEEE_STATUS_INV & fpstatus) ? NPY_FPE_INVALID : 0);
}

int npy_clear_floatstatus(void)
int npy_clear_floatstatus_barrier(char *param)
{
long fpstatus = npy_get_floatstatus();
int fpstatus = npy_get_floatstatus_barrier(param);
/* clear status bits as well as disable exception mode if on */
ieee_set_fp_control(0);

Expand All @@ -759,13 +789,18 @@ int npy_clear_floatstatus(void)

#else

int npy_get_floatstatus(void)
int npy_get_floatstatus_barrier(char *param)
{
/*
* By using a volatile, the compiler cannot reorder this call
*/
volatile char c = *(char*)param;
return 0;
}

int npy_clear_floatstatus(void)
int npy_clear_floatstatus_barrier(char *param)
{
int fpstatus = npy_get_floatstatus_barrier(param);
return 0;
}

Expand Down
14 changes: 7 additions & 7 deletions numpy/core/src/umath/loops.c.src
Expand Up @@ -1819,7 +1819,7 @@ NPY_NO_EXPORT void
*((npy_bool *)op1) = @func@(in1) != 0;
}
}
npy_clear_floatstatus();
npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

Expand Down Expand Up @@ -1901,7 +1901,7 @@ NPY_NO_EXPORT void
*((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2;
}
}
npy_clear_floatstatus();
npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

Expand Down Expand Up @@ -1991,7 +1991,7 @@ NPY_NO_EXPORT void
*((@type@ *)op1) = tmp + 0;
}
}
npy_clear_floatstatus();
npy_clear_floatstatus_barrier((char*)dimensions);
}

NPY_NO_EXPORT void
Expand Down Expand Up @@ -2177,7 +2177,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
const npy_half in1 = *(npy_half *)ip1;
*((npy_bool *)op1) = @func@(in1) != 0;
}
npy_clear_floatstatus();
npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat**/

Expand Down Expand Up @@ -2239,7 +2239,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
const npy_half in2 = *(npy_half *)ip2;
*((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? in1 : in2;
}
npy_clear_floatstatus();
npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat**/

Expand Down Expand Up @@ -2681,7 +2681,7 @@ NPY_NO_EXPORT void
const @ftype@ in1i = ((@ftype@ *)ip1)[1];
*((npy_bool *)op1) = @func@(in1r) @OP@ @func@(in1i);
}
npy_clear_floatstatus();
npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

Expand Down Expand Up @@ -2790,7 +2790,7 @@ NPY_NO_EXPORT void
((@ftype@ *)op1)[1] = in2i;
}
}
npy_clear_floatstatus();
npy_clear_floatstatus_barrier((char*)dimensions);
}
/**end repeat1**/

Expand Down
13 changes: 4 additions & 9 deletions numpy/core/src/umath/simd.inc.src
Expand Up @@ -1026,17 +1026,12 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
assert(n < (stride) || npy_is_aligned(&ip[i], 16));
if (i + 3 * stride <= n) {
/* load the first elements */
/*
* TODO: find a more explicit way to prevent optimization
* compilers reorder npy_get_floatstatus to before minps
* volatile seems to be sufficient but is brittle and opaque
*/
volatile @vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
volatile @vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
@vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
@vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
i += 2 * stride;

/* minps/minpd will set invalid flag if nan is encountered */
npy_clear_floatstatus();
npy_clear_floatstatus_barrier((char*)&c1);
LOOP_BLOCKED(@type@, 32) {
@vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
@vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
Expand All @@ -1045,7 +1040,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
}
c1 = @vpre@_@VOP@_@vsuf@(c1, c2);

if (npy_get_floatstatus() & NPY_FPE_INVALID) {
if (npy_get_floatstatus_barrier((char*)&c1) & NPY_FPE_INVALID) {
*op = @nan@;
}
else {
Expand Down
9 changes: 6 additions & 3 deletions numpy/core/src/umath/ufunc_object.c
Expand Up @@ -100,7 +100,8 @@ PyUFunc_getfperr(void)
* non-clearing get was only added in 1.9 so this function always cleared
* keep it so just in case third party code relied on the clearing
*/
return npy_clear_floatstatus();
char param = 0;
return npy_clear_floatstatus_barrier(&param);
}

#define HANDLEIT(NAME, str) {if (retstatus & NPY_FPE_##NAME) { \
Expand Down Expand Up @@ -133,7 +134,8 @@ NPY_NO_EXPORT int
PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first)
{
/* clearing is done for backward compatibility */
int retstatus = npy_clear_floatstatus();
int retstatus;
retstatus = npy_clear_floatstatus_barrier((char*)&retstatus);

return PyUFunc_handlefperr(errmask, errobj, retstatus, first);
}
Expand All @@ -144,7 +146,8 @@ PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first)
NPY_NO_EXPORT void
PyUFunc_clearfperr()
{
npy_clear_floatstatus();
char param = 0;
npy_clear_floatstatus_barrier(&param);
}

/*
Expand Down

0 comments on commit f21ad36

Please sign in to comment.