BUG: make checking for sse intrinsics more robust
Check for two intrinsics from [ex]mmintrin.h instead of only checking for
the existence of the headers.
E.g. with mingw 4.2, emmintrin.h can be included even if SSE2 is disabled.
closes #3760
juliantaylor committed Sep 21, 2013
1 parent fde3dee commit fd2e110
Showing 6 changed files with 38 additions and 20 deletions.
8 changes: 8 additions & 0 deletions numpy/core/include/numpy/npy_common.h
@@ -18,6 +18,14 @@
#define NPY_GCC_UNROLL_LOOPS
#endif

#if defined HAVE_XMMINTRIN_H && defined HAVE__MM_LOAD_PS
#define NPY_HAVE_SSE_INTRINSICS
#endif

#if defined HAVE_EMMINTRIN_H && defined HAVE__MM_LOAD_PD
#define NPY_HAVE_SSE2_INTRINSICS
#endif

/*
* give a hint to the compiler which branch is more likely or unlikely
* to occur, e.g. rare error cases:
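
The new NPY_HAVE_SSE_INTRINSICS/NPY_HAVE_SSE2_INTRINSICS macros are only defined when both the header and a representative intrinsic were detected at build time, so they are the right thing to gate vector code on. A minimal illustrative sketch of the guard pattern with a scalar fallback (not part of the commit; add_one is a hypothetical helper):

    #include "numpy/npy_common.h"
    #ifdef NPY_HAVE_SSE2_INTRINSICS
    #include <emmintrin.h>
    #endif

    /* add 1.0 to each element; SSE2 path when available, scalar otherwise */
    static void
    add_one(double *x, npy_intp n)
    {
        npy_intp i = 0;
    #ifdef NPY_HAVE_SSE2_INTRINSICS
        const __m128d one = _mm_set1_pd(1.0);
        /* two doubles per iteration, unaligned loads/stores for simplicity */
        for (; i + 2 <= n; i += 2) {
            _mm_storeu_pd(&x[i], _mm_add_pd(_mm_loadu_pd(&x[i]), one));
        }
    #endif
        for (; i < n; i++) {
            x[i] += 1.0;
        }
    }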
10 changes: 8 additions & 2 deletions numpy/core/setup.py
@@ -165,8 +165,14 @@ def check_funcs(funcs_name):
if config.check_func("", decl=False, call=False, headers=[h]):
moredefs.append((fname2def(h).replace(".", "_"), 1))

for f, args in OPTIONAL_INTRINSICS:
if config.check_func(f, decl=False, call=True, call_args=args):
for tup in OPTIONAL_INTRINSICS:
headers = None
if len(tup) == 2:
f, args = tup
else:
f, args, headers = tup[0], tup[1], [tup[2]]
if config.check_func(f, decl=False, call=True, call_args=args,
headers=headers):
moredefs.append((fname2def(f), 1))

for dec, fn in OPTIONAL_GCC_ATTRIBUTES:
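
With the optional headers argument, config.check_func(f, decl=False, call=True, call_args=args, headers=headers) compiles and links a small test program but never runs it. Roughly, and as an illustrative reconstruction rather than the literal distutils output, the probe for the SSE2 entry looks like this:

    /* Probe for ("_mm_load_pd", '(double*)0', "emmintrin.h"): it must
     * include the header AND call the intrinsic, so a header that can be
     * included but is unusable (e.g. mingw 4.2 with SSE2 disabled) now
     * fails the check at compile time. The program is only built, never
     * executed, so the null pointer argument is harmless. */
    #include <emmintrin.h>

    int
    main(void)
    {
        _mm_load_pd((double*)0);
        return 0;
    }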
5 changes: 4 additions & 1 deletion numpy/core/setup_common.py
@@ -107,14 +107,17 @@ def check_api_version(apiversion, codegen_dir):
"emmintrin.h", # SSE2
]

# optional gcc compiler builtins and their call arguments
# optional gcc compiler builtins and their call arguments and optionally a
# required header
# call arguments are required as the compiler will do strict signature checking
OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
("__builtin_isinf", '5.'),
("__builtin_isfinite", '5.'),
("__builtin_bswap32", '5u'),
("__builtin_bswap64", '5u'),
("__builtin_expect", '5, 0'),
("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE
("_mm_load_pd", '(double*)0', "emmintrin.h"), # SSE2
]

# gcc function attributes
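
The call arguments have to match the intrinsic's real prototype because the compiler type-checks builtins and intrinsics at the call site; that is what makes the probe meaningful. For reference, the prototypes behind the two new entries (as declared in the SSE headers) are:

    /* '(float*)0' and '(double*)0' above satisfy these prototypes; an
     * argument of the wrong type would make the probe fail to compile
     * even where SSE is available. */
    __m128  _mm_load_ps(float const *mem_addr);   /* SSE,  xmmintrin.h */
    __m128d _mm_load_pd(double const *mem_addr);  /* SSE2, emmintrin.h */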
6 changes: 3 additions & 3 deletions numpy/core/src/multiarray/einsum.c.src
@@ -14,16 +14,16 @@

#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include <numpy/npy_common.h>
#include <numpy/arrayobject.h>
#include <numpy/halffloat.h>
#include <npy_pycompat.h>
#include <npy_config.h>

#include <ctype.h>

#include "convert.h"

#ifdef HAVE_XMMINTRIN_H
#ifdef NPY_HAVE_SSE_INTRINSICS
#define EINSUM_USE_SSE1 1
#else
#define EINSUM_USE_SSE1 0
@@ -32,7 +32,7 @@
/*
* TODO: Only some SSE2 for float64 is implemented.
*/
#ifdef HAVE_EMMINTRIN_H
#ifdef NPY_HAVE_SSE2_INTRINSICS
#define EINSUM_USE_SSE2 1
#else
#define EINSUM_USE_SSE2 0
3 changes: 2 additions & 1 deletion numpy/core/src/umath/loops.c.src
@@ -10,6 +10,7 @@
#define NO_IMPORT_ARRAY
#endif

#include "numpy/npy_common.h"
#include "numpy/arrayobject.h"
#include "numpy/ufuncobject.h"
#include "numpy/npy_math.h"
@@ -564,7 +565,7 @@ NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
if(IS_BINARY_REDUCE) {
#ifdef HAVE_EMMINTRIN_H
#ifdef NPY_HAVE_SSE2_INTRINSICS
/*
* stick with our variant for more reliable performance; the only known
* platform which outperforms it by ~20% is an i7 with glibc 2.17
26 changes: 13 additions & 13 deletions numpy/core/src/umath/simd.inc.src
@@ -16,10 +16,10 @@
#define __NPY_SIMD_INC

#include "lowlevel_strided_loops.h"
#include "npy_config.h"
#include "numpy/npy_common.h"
/* for NO_FLOATING_POINT_SUPPORT */
#include "numpy/ufuncobject.h"
#ifdef HAVE_EMMINTRIN_H
#ifdef NPY_HAVE_SSE2_INTRINSICS
#include <emmintrin.h>
#endif
#include <assert.h>
@@ -140,7 +140,7 @@ static const npy_int32 fanout_4[] = {
* #name = unary, unary, unary_reduce, unary_reduce#
*/

#if @vector@ && defined HAVE_EMMINTRIN_H
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
@@ -151,7 +151,7 @@ sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);
static NPY_INLINE int
run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @vector@ && defined HAVE_EMMINTRIN_H
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
if (@check@(sizeof(@type@), 16)) {
sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
return 1;
@@ -167,7 +167,7 @@ run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
* # kind = add, subtract, multiply, divide#
*/

#if @vector@ && defined HAVE_EMMINTRIN_H
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
@@ -185,7 +185,7 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
static NPY_INLINE int
run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @vector@ && defined HAVE_EMMINTRIN_H
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@type@ * ip1 = (@type@ *)args[0];
@type@ * ip2 = (@type@ *)args[1];
@type@ * op = (@type@ *)args[2];
@@ -216,7 +216,7 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
* #simd = 1, 1, 1, 1, 1, 1, 0, 0#
*/

#if @vector@ && @simd@ && defined HAVE_EMMINTRIN_H
#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
@@ -234,7 +234,7 @@ sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
static NPY_INLINE int
run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @vector@ && @simd@ && defined HAVE_EMMINTRIN_H
#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS
@type@ * ip1 = (@type@ *)args[0];
@type@ * ip2 = (@type@ *)args[1];
npy_bool * op = (npy_bool *)args[2];
@@ -278,7 +278,7 @@ sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
static NPY_INLINE int
run_binary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined HAVE_EMMINTRIN_H
#if defined NPY_HAVE_SSE2_INTRINSICS
if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_BINARY(sizeof(npy_bool), 16)) {
sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
(npy_bool*)args[1], dimensions[0]);
@@ -295,7 +295,7 @@ sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n);
static NPY_INLINE int
run_reduce_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined HAVE_EMMINTRIN_H
#if defined NPY_HAVE_SSE2_INTRINSICS
if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_REDUCE(sizeof(npy_bool), 16)) {
sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
dimensions[0]);
@@ -317,7 +317,7 @@ sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
static NPY_INLINE int
run_unary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined HAVE_EMMINTRIN_H
#if defined NPY_HAVE_SSE2_INTRINSICS
if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_UNARY(sizeof(npy_bool), 16)) {
sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
return 1;
@@ -328,7 +328,7 @@ run_unary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)

/**end repeat**/

#ifdef HAVE_EMMINTRIN_H
#ifdef NPY_HAVE_SSE2_INTRINSICS

/*
* Vectorized operations
@@ -843,6 +843,6 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)

/**end repeat**/

#endif /* HAVE_EMMINTRIN_H */
#endif /* NPY_HAVE_SSE2_INTRINSICS */

#endif
