BUG: make checking for sse intrinsics more robust
Check for two intrinsics from [ex]mmintrin.h instead of only checking for
the existence of the headers.
E.g. with mingw 4.2, emmintrin.h can be included even if SSE2 is disabled.
closes #3760
juliantaylor committed Sep 21, 2013
1 parent fde3dee commit fd2e110
Showing 6 changed files with 38 additions and 20 deletions.
8 changes: 8 additions & 0 deletions numpy/core/include/numpy/npy_common.h
@@ -18,6 +18,14 @@
#define NPY_GCC_UNROLL_LOOPS
#endif

#if defined HAVE_XMMINTRIN_H && defined HAVE__MM_LOAD_PS
#define NPY_HAVE_SSE_INTRINSICS
#endif

#if defined HAVE_EMMINTRIN_H && defined HAVE__MM_LOAD_PD
#define NPY_HAVE_SSE2_INTRINSICS
#endif

/*
* give a hint to the compiler which branch is more likely or unlikely
* to occur, e.g. rare error cases:
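
The new NPY_HAVE_SSE_INTRINSICS/NPY_HAVE_SSE2_INTRINSICS macros are only defined when both the header and a representative intrinsic were detected at build time, so they are the right thing to gate vector code on. A minimal illustrative sketch of the guard pattern with a scalar fallback (not part of the commit; add_one is a hypothetical helper):

    #include "numpy/npy_common.h"
    #ifdef NPY_HAVE_SSE2_INTRINSICS
    #include <emmintrin.h>
    #endif

    /* add 1.0 to each element; SSE2 path when available, scalar otherwise */
    static void
    add_one(double *x, npy_intp n)
    {
        npy_intp i = 0;
    #ifdef NPY_HAVE_SSE2_INTRINSICS
        const __m128d one = _mm_set1_pd(1.0);
        /* two doubles per iteration, unaligned loads/stores for simplicity */
        for (; i + 2 <= n; i += 2) {
            _mm_storeu_pd(&x[i], _mm_add_pd(_mm_loadu_pd(&x[i]), one));
        }
    #endif
        for (; i < n; i++) {
            x[i] += 1.0;
        }
    }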
10 changes: 8 additions & 2 deletions numpy/core/setup.py
@@ -165,8 +165,14 @@ def check_funcs(funcs_name):
if config.check_func("", decl=False, call=False, headers=[h]):
moredefs.append((fname2def(h).replace(".", "_"), 1))

for f, args in OPTIONAL_INTRINSICS:
if config.check_func(f, decl=False, call=True, call_args=args):
for tup in OPTIONAL_INTRINSICS:
headers = None
if len(tup) == 2:
f, args = tup
else:
f, args, headers = tup[0], tup[1], [tup[2]]
if config.check_func(f, decl=False, call=True, call_args=args,
headers=headers):
moredefs.append((fname2def(f), 1))

for dec, fn in OPTIONAL_GCC_ATTRIBUTES:
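
With the optional headers argument, config.check_func(f, decl=False, call=True, call_args=args, headers=headers) compiles and links a small test program but never runs it. Roughly, and as an illustrative reconstruction rather than the literal distutils output, the probe for the SSE2 entry looks like this:

    /* Probe for ("_mm_load_pd", '(double*)0', "emmintrin.h"): it must
     * include the header AND call the intrinsic, so a header that can be
     * included but is unusable (e.g. mingw 4.2 with SSE2 disabled) now
     * fails the check at compile time. The program is only built, never
     * executed, so the null pointer argument is harmless. */
    #include <emmintrin.h>

    int
    main(void)
    {
        _mm_load_pd((double*)0);
        return 0;
    }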
5 changes: 4 additions & 1 deletion numpy/core/setup_common.py
@@ -107,14 +107,17 @@ def check_api_version(apiversion, codegen_dir):
"emmintrin.h", # SSE2
]

# optional gcc compiler builtins and their call arguments
# optional gcc compiler builtins and their call arguments and optionally a
# required header
# call arguments are required as the compiler will do strict signature checking
OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
("__builtin_isinf", '5.'),
("__builtin_isfinite", '5.'),
("__builtin_bswap32", '5u'),
("__builtin_bswap64", '5u'),
("__builtin_expect", '5, 0'),
("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE
("_mm_load_pd", '(double*)0', "emmintrin.h"), # SSE2
]

# gcc function attributes
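
The call arguments have to match the intrinsic's real prototype because the compiler type-checks builtins and intrinsics at the call site; that is what makes the probe meaningful. For reference, the prototypes behind the two new entries (as declared in the SSE headers) are:

    /* '(float*)0' and '(double*)0' above satisfy these prototypes; an
     * argument of the wrong type would make the probe fail to compile
     * even where SSE is available. */
    __m128  _mm_load_ps(float const *mem_addr);   /* SSE,  xmmintrin.h */
    __m128d _mm_load_pd(double const *mem_addr);  /* SSE2, emmintrin.h */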
6 changes: 3 additions & 3 deletions numpy/core/src/multiarray/einsum.c.src
@@ -14,16 +14,16 @@

#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include <numpy/npy_common.h>
#include <numpy/arrayobject.h>
#include <numpy/halffloat.h>
#include <npy_pycompat.h>
#include <npy_config.h>

#include <ctype.h>

#include "convert.h"

#ifdef HAVE_XMMINTRIN_H
#ifdef NPY_HAVE_SSE_INTRINSICS
#define EINSUM_USE_SSE1 1
#else
#define EINSUM_USE_SSE1 0
@@ -32,7 +32,7 @@
/*
* TODO: Only some SSE2 for float64 is implemented.
*/
#ifdef HAVE_EMMINTRIN_H
#ifdef NPY_HAVE_SSE2_INTRINSICS
#define EINSUM_USE_SSE2 1
#else
#define EINSUM_USE_SSE2 0
3 changes: 2 additions & 1 deletion numpy/core/src/umath/loops.c.src
@@ -10,6 +10,7 @@
#define NO_IMPORT_ARRAY
#endif

#include "numpy/npy_common.h"
#include "numpy/arrayobject.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_math.h"
@@ -564,7 +565,7 @@ NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
if(IS_BINARY_REDUCE) {
#ifdef HAVE_EMMINTRIN_H
#ifdef NPY_HAVE_SSE2_INTRINSICS
/*
* stick with our variant for more reliable performance; the only known
* platform which outperforms it by ~20% is an i7 with glibc 2.17
26 changes: 13 additions & 13 deletions numpy/core/src/umath/simd.inc.src
@@ -16,10 +16,10 @@
#define __NPY_SIMD_INC

#include "lowlevel_strided_loops.h"
#include "npy_config.h"
#include "numpy/npy_common.h"
/* for NO_FLOATING_POINT_SUPPORT */
#include "numpy/ufuncobject.h"
#ifdef HAVE_EMMINTRIN_H
#ifdef NPY_HAVE_SSE2_INTRINSICS
#include <emmintrin.h>
#endif
#include <assert.h>
@@ -140,7 +140,7 @@ static const npy_int32 fanout_4[] = {
* #name = unary, unary, unary_reduce, unary_reduce#
*/

#if @vector@ && defined HAVE_EMMINTRIN_H
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
@@ -151,7 +151,7 @@ sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);
static NPY_INLINE int
run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @vector@ && defined HAVE_EMMINTRIN_H
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
if (@check@(sizeof(@type@), 16)) {
sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
return 1;
@@ -167,7 +167,7 @@ run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
* # kind = add, subtract, multiply, divide#
*/

#if @vector@ && defined HAVE_EMMINTRIN_H
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
@@ -185,7 +185,7 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
static NPY_INLINE int
run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @vector@ && defined HAVE_EMMINTRIN_H
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@type@ * ip1 = (@type@ *)args[0];
@type@ * ip2 = (@type@ *)args[1];
@type@ * op = (@type@ *)args[2];
@@ -216,7 +216,7 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
* #simd = 1, 1, 1, 1, 1, 1, 0, 0#
*/

#if @vector@ && @simd@ && defined HAVE_EMMINTRIN_H
#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
@@ -234,7 +234,7 @@ sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
static NPY_INLINE int
run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @vector@ && @simd@ && defined HAVE_EMMINTRIN_H
#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS
@type@ * ip1 = (@type@ *)args[0];
@type@ * ip2 = (@type@ *)args[1];
npy_bool * op = (npy_bool *)args[2];
@@ -278,7 +278,7 @@ sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
static NPY_INLINE int
run_binary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined HAVE_EMMINTRIN_H
#if defined NPY_HAVE_SSE2_INTRINSICS
if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_BINARY(sizeof(npy_bool), 16)) {
sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
(npy_bool*)args[1], dimensions[0]);
@@ -295,7 +295,7 @@ sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n);
static NPY_INLINE int
run_reduce_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined HAVE_EMMINTRIN_H
#if defined NPY_HAVE_SSE2_INTRINSICS
if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_REDUCE(sizeof(npy_bool), 16)) {
sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
dimensions[0]);
@@ -317,7 +317,7 @@ sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
static NPY_INLINE int
run_unary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined HAVE_EMMINTRIN_H
#if defined NPY_HAVE_SSE2_INTRINSICS
if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_UNARY(sizeof(npy_bool), 16)) {
sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
return 1;
@@ -328,7 +328,7 @@ run_unary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)

/**end repeat**/

#ifdef HAVE_EMMINTRIN_H
#ifdef NPY_HAVE_SSE2_INTRINSICS

/*
* Vectorized operations
@@ -843,6 +843,6 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)

/**end repeat**/

#endif /* HAVE_EMMINTRIN_H */
#endif /* NPY_HAVE_SSE2_INTRINSICS */

#endif
