ENH: Vectorize float min/max operation with sse2
Improves performance by a factor of ~1.5/3.0 for float/double.
juliantaylor committed Jun 9, 2013
1 parent 558cd20 commit eb6cf4b
Showing 4 changed files with 138 additions and 25 deletions.
8 changes: 4 additions & 4 deletions doc/release/1.8.0-notes.rst
@@ -149,10 +149,10 @@ advantage of compiler builtins to avoid expensive calls to libc.
This improves performance of these operations by about a factor of two on gnu
libc systems.

Performance improvements to `sqrt` and `abs`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `sqrt` and `abs` functions for unit stride elementary operations have been
improved to make use of SSE2 CPU SIMD instructions.
Performance improvements to `sqrt`, `abs` and `min/max`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `sqrt`, `abs`, `min/max` functions for unit stride elementary operations
have been improved to make use of SSE2 CPU SIMD instructions.
This improves performance of these operations up to 4x/2x for float32/float64
depending on the location of the data in the CPU caches. The performance gain
is greatest for in-place operations.
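
For readers unfamiliar with SSE2, the core idea behind these loops is to process several unit-stride elements per instruction: four floats or two doubles per 128-bit register. A minimal sketch of the idea (illustrative only, not part of this patch; the real loops in simd.inc.src below also handle misalignment and remainder elements):

    #include <emmintrin.h>  /* SSE2 intrinsics */

    /* square root of n floats, 4 at a time; assumes n is a multiple of 4
     * and both pointers are 16-byte aligned */
    static void sqrt_sse2_sketch(float *op, const float *ip, long n)
    {
        for (long i = 0; i < n; i += 4) {
            __m128 d = _mm_load_ps(&ip[i]);        /* load 4 floats */
            _mm_store_ps(&op[i], _mm_sqrt_ps(d));  /* 4 square roots at once */
        }
    }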
3 changes: 3 additions & 0 deletions numpy/core/src/umath/loops.c.src
@@ -1418,6 +1418,9 @@ NPY_NO_EXPORT void
{
/* */
if (IS_BINARY_REDUCE) {
if (run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
return;
}
BINARY_REDUCE_LOOP(@type@) {
const @type@ in2 = *(@type@ *)ip2;
io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
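
The scalar fallback kept above is the templated reduce loop. Hand-expanded for a maximum reduction over doubles it looks roughly like the sketch below (illustration, not generated code); the `|| npy_isnan(io1)` keeps the accumulator once it has become NaN, so NaNs propagate through the reduction:

    /* hand-expanded sketch of the DOUBLE maximum reduce loop */
    double io1 = *(double *)args[0];          /* accumulator */
    char *ip2 = args[1];
    for (npy_intp k = 0; k < dimensions[0]; k++, ip2 += steps[1]) {
        const double in2 = *(double *)ip2;
        /* keep io1 if it is already >= in2 or is NaN, else take in2 */
        io1 = (io1 >= in2 || npy_isnan(io1)) ? io1 : in2;
    }
    *(double *)args[0] = io1;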
131 changes: 110 additions & 21 deletions numpy/core/src/umath/simd.inc.src
@@ -17,9 +17,14 @@

#include "lowlevel_strided_loops.h"
#include "npy_config.h"
/* for NO_FLOATING_POINT_SUPPORT */
#include "numpy/ufuncobject.h"
#include <assert.h>
#include <stdlib.h>

int PyUFunc_getfperr(void);
void PyUFunc_clearfperr(void);

/*
* stride is equal to element size and input and destination are equal or
* don't overlap within one register
@@ -29,21 +34,23 @@
(npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
((abs(args[1] - args[0]) >= (vsize)) || ((abs(args[1] - args[0]) == 0))))

#define IS_BLOCKABLE_REDUCE(esize, vsize) \
(steps[1] == (esize) && abs(args[1] - args[0]) >= (vsize))


/* align var to alignment */
#define UNARY_LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
alignment, n);\
for(i = 0; i < peel; i++)

#define UNARY_LOOP_BLOCKED(type, vsize)\
#define LOOP_BLOCKED(type, vsize)\
for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
i += (vsize / sizeof(type)))

#define UNARY_LOOP_BLOCKED_END\
#define LOOP_BLOCKED_END\
for (; i < n; i++)
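
These three macros generate the usual peel/main/tail structure of an aligned SIMD loop. For a float pointer and 16-byte vectors, a loop written with them expands to roughly the following (hand-expanded sketch for illustration):

    npy_intp i, peel = npy_aligned_block_offset(op, sizeof(float), 16, n);
    /* peel: scalar iterations until op is 16-byte aligned */
    for (i = 0; i < peel; i++) {
        /* ... scalar body ... */
    }
    /* main: whole 16-byte blocks, 4 floats per iteration */
    for (; i < npy_blocked_end(peel, sizeof(float), 16, n); i += 16 / sizeof(float)) {
        /* ... vector body ... */
    }
    /* tail: leftover elements that do not fill a vector */
    for (; i < n; i++) {
        /* ... scalar body ... */
    }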


/*
* Dispatcher functions
* decide whether the operation can be vectorized and run it
@@ -58,28 +65,31 @@
*/

/**begin repeat1
* #func = sqrt, absolute#
* #func = sqrt, absolute, minimum, maximum#
* #check = IS_BLOCKABLE_UNARY, IS_BLOCKABLE_UNARY, IS_BLOCKABLE_REDUCE, IS_BLOCKABLE_REDUCE#
* #name = unary, unary, unary_reduce, unary_reduce#
*/

#if @vector@

/* prototypes */
static void
sse2_@func@_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n);
sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);

#endif

static NPY_INLINE int
run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @vector@ && defined HAVE_EMMINTRIN_H
if (IS_BLOCKABLE_UNARY(sizeof(@type@), 16)) {
if (@check@(sizeof(@type@), 16)) {
sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
return 1;
}
#endif
return 0;
}

/**end repeat1**/

/**end repeat**/
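
After template expansion each run_* dispatcher is a thin guard around the SSE2 kernel. For a FLOAT maximum reduction the generated function looks roughly like this (hand-expanded sketch):

    static NPY_INLINE int
    run_unary_reduce_simd_maximum_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
    {
    #if defined HAVE_EMMINTRIN_H
        if (IS_BLOCKABLE_REDUCE(sizeof(npy_float), 16)) {
            sse2_maximum_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
            return 1;   /* handled by the SIMD kernel */
        }
    #endif
        return 0;       /* caller falls back to the scalar loop */
    }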
@@ -89,6 +99,34 @@ run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
* Vectorized operations
*/

#ifdef HAVE_EMMINTRIN_H
#include <emmintrin.h>

/**begin repeat
* horizontal reductions on a vector
* # VOP = min, max#
*/

static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
{
npy_float r;
__m128 tmp = _mm_movehl_ps(v, v); /* c d ... */
__m128 m = _mm_@VOP@_ps(v, tmp); /* m(ac) m(bd) ... */
tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */
_mm_store_ss(&r, _mm_@VOP@_ps(tmp, m)); /* m(acbd) ... */
return r;
}

static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
{
npy_double r;
__m128d tmp = _mm_unpackhi_pd(v, v); /* b b */
_mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */
return r;
}

/**end repeat**/
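
These helpers reduce one SSE register to a single scalar: the `__m128` variant folds the vector in half twice (movehl, then a shuffle of lane 1), the `__m128d` variant once (unpackhi). A small standalone sketch of the float variant with the template syntax written out for `min` (illustration only):

    #include <emmintrin.h>
    #include <stdio.h>

    /* fold (a b c d) down to min(a, b, c, d) -- same shuffle scheme as above */
    static float hmin_ps(__m128 v)
    {
        float r;
        __m128 tmp = _mm_movehl_ps(v, v);                     /* c d c d */
        __m128 m   = _mm_min_ps(v, tmp);                      /* m(ac) m(bd) . . */
        tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));  /* m(bd) broadcast */
        _mm_store_ss(&r, _mm_min_ps(tmp, m));                 /* m(abcd) in lane 0 */
        return r;
    }

    int main(void)
    {
        __m128 v = _mm_setr_ps(4.0f, 1.0f, 3.0f, 2.0f);
        printf("%f\n", hmin_ps(v));   /* prints 1.000000 */
        return 0;
    }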

/**begin repeat
* #type = npy_float, npy_double#
* #TYPE = FLOAT, DOUBLE#
@@ -97,40 +135,38 @@ run_unary_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
* #vtype = __m128, __m128d#
* #vpre = _mm, _mm#
* #vsuf = ps, pd#
* #nan = NPY_NANF, NPY_NAN#
*/

#ifdef HAVE_EMMINTRIN_H
#include <emmintrin.h>


static void
sse2_sqrt_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
{
/* align output to 16 bytes */
UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
op[i] = @scalarf@(ip[i]);
}
assert(npy_is_aligned(&op[i], 16));
if (npy_is_aligned(&ip[i], 16)) {
UNARY_LOOP_BLOCKED(@type@, 16) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
}
}
else {
UNARY_LOOP_BLOCKED(@type@, 16) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
}
}
UNARY_LOOP_BLOCKED_END {
LOOP_BLOCKED_END {
op[i] = @scalarf@(ip[i]);
}
}


static void
sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
sse2_absolute_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
{
/*
* get 0x7FFFFFFF mask (everything but signbit set)
@@ -140,34 +176,87 @@ sse2_absolute_@TYPE@(@type@ * op, const @type@ * ip, const npy_intp n)
const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);

/* align output to 16 bytes */
UNARY_LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
/* add 0 to clear -0.0 */
op[i] = tmp + 0;
}
assert(npy_is_aligned(&op[i], 16));
if (npy_is_aligned(&ip[i], 16)) {
UNARY_LOOP_BLOCKED(@type@, 16) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
}
}
else {
UNARY_LOOP_BLOCKED(@type@, 16) {
LOOP_BLOCKED(@type@, 16) {
@vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_andnot_@vsuf@(mask, a));
}
}
UNARY_LOOP_BLOCKED_END {
LOOP_BLOCKED_END {
const @type@ tmp = ip[i] > 0 ? ip[i]: -ip[i];
/* add 0 to clear -0.0 */
op[i] = tmp + 0;
}
}
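
The constant -0.0 has only the sign bit set, so `andnot(mask, a)` clears exactly the sign bit of every lane, which is the absolute value for IEEE floats (and maps -0.0 to +0.0). A scalar illustration of the same bit trick (a sketch, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* clear the sign bit of an IEEE single -- the scalar analogue of
     * _mm_andnot_ps(_mm_set1_ps(-0.0f), a) used above */
    static float abs_by_mask(float x)
    {
        uint32_t bits;
        memcpy(&bits, &x, sizeof(bits));   /* type-pun safely */
        bits &= ~UINT32_C(0x80000000);     /* drop the sign bit */
        memcpy(&x, &bits, sizeof(x));
        return x;
    }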


/**begin repeat1
* #kind = maximum, minimum#
* #VOP = max, min#
* #OP = >=, <=#
**/
/* argument order is swapped relative to the unary functions: for a reduce, args[1] is the input array and args[0] the output */
static void
sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
{
LOOP_BLOCK_ALIGN_VAR(ip, @type@, 16) {
*op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
}
assert(npy_is_aligned(&ip[i], 16));
if (i + 2 * 16 / sizeof(@type@) <= n) {
/* load the first elements */
@vtype@ c = @vpre@_load_@vsuf@((@type@*)&ip[i]);
#ifdef NO_FLOATING_POINT_SUPPORT
@vtype@ cnan = @vpre@_cmpneq_@vsuf@(c, c); /* all-ones in NaN lanes */
#else
/* minps/minpd will set invalid flag if nan is encountered */
PyUFunc_clearfperr();
#endif
i += 16 / sizeof(@type@);

LOOP_BLOCKED(@type@, 16) {
@vtype@ v = @vpre@_load_@vsuf@((@type@*)&ip[i]);
c = @vpre@_@VOP@_@vsuf@(c, v);
#ifdef NO_FLOATING_POINT_SUPPORT
/* check for nan, breaking the loop makes non nan case slow */
cnan = @vpre@_or_@vsuf@(@vpre@_cmpneq_@vsuf@(v, v), cnan);
}

if (@vpre@_movemask_@vsuf@(cnan)) {
*op = @nan@;
return;
}
#else
}
#endif
{
@type@ tmp = sse2_horizontal_@VOP@_@vtype@(c);
if (PyUFunc_getfperr() & UFUNC_FPE_INVALID)
*op = @nan@;
else
*op = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp;
}
}
LOOP_BLOCKED_END {
*op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
}
}
/**end repeat1**/

/**end repeat**/
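
One subtlety in the kernel above: minps/maxps are not NaN-propagating; they simply return the second operand when either input is NaN, so the kernel has to detect NaNs separately, either by accumulating a `cmpneq(v, v)` mask or by checking the FPU invalid flag after the loop. A small standalone demonstration of both building blocks (illustrative only):

    #include <emmintrin.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        __m128 a = _mm_setr_ps(NAN, 1.0f, 2.0f, 3.0f);
        __m128 b = _mm_setr_ps(9.0f, 1.0f, 2.0f, 3.0f);
        float out[4];

        /* minps returns the second operand when either lane is NaN */
        _mm_storeu_ps(out, _mm_min_ps(a, b));
        printf("lane 0 = %f\n", out[0]);           /* 9.0 -- the NaN is lost */

        /* cmpneq(v, v) is all-ones exactly in the NaN lanes */
        int mask = _mm_movemask_ps(_mm_cmpneq_ps(a, a));
        printf("nan lanes = 0x%x\n", mask);        /* 0x1 -- lane 0 is NaN */
        return 0;
    }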

#endif /* HAVE_EMMINTRIN_H */

#endif
21 changes: 21 additions & 0 deletions numpy/core/tests/test_umath.py
@@ -687,6 +687,27 @@ def test_sign(self):
np.seterr(**olderr)


class TestMinMax(TestCase):
def test_minmax_blocked(self):
"simd tests on max/min"
for dt in [np.float32, np.float64]:
for out, inp, msg in gen_alignment_data(dtype=dt, type='unary',
max_size=17):
for i in range(inp.size):
inp[:] = np.arange(inp.size, dtype=dt)
inp[i] = np.nan
self.assertTrue(np.isnan(inp.max()),
msg=repr(inp) + '\n' + msg)
self.assertTrue(np.isnan(inp.min()),
msg=repr(inp) + '\n' + msg)

inp[i] = 1e10
assert_equal(inp.max(), 1e10, err_msg=msg)
inp[i] = -1e10
assert_equal(inp.min(), -1e10, err_msg=msg)



class TestAbsolute(TestCase):
def test_abs_blocked(self):
"simd tests on abs"