diff --git a/CHANGES.rst b/CHANGES.rst index 1dfe4ad0..47a7e6ed 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -26,6 +26,7 @@ Bug fixes ---------- - Incorrect calculations for non-native endian data [#191] - Workaround for GNU Assembler bug causing incorrect calculations [#196] +- Only report runtime isa support if we also have compiler support [#200] diff --git a/common.mk b/common.mk index 03de6818..5ae0f9f6 100644 --- a/common.mk +++ b/common.mk @@ -400,6 +400,8 @@ ifeq ($(DO_CHECKS), 1) CFLAGS += -xCORE-AVX2 endif + CFLAGS += -DGAS_BUG_DISABLE_AVX512 + ifneq ($(GAS_BUG_WARNING_PRINTED),1) $(warning $(ccred)DISABLING AVX-512 SUPPORT DUE TO GNU ASSEMBLER BUG. UPGRADE TO BINUTILS >=2.32 TO FIX THIS.$(ccreset)) endif diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src index 21c00513..84fe2dda 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src @@ -114,63 +114,63 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_rp_pi_mocks_driver_DOUBLE(const stru /* Array of function pointers */ countpairs_mocks_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_rp_pi_mocks_avx512_intrinsics_DOUBLE, + countpairs_rp_pi_mocks_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ - countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE, + countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_rp_pi_mocks_avx512_intrinsics_DOUBLE, #endif - countpairs_rp_pi_mocks_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif - int 
curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42): function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + 
case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); @@ -180,17 +180,18 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_rp_pi_mocks_driver_DOUBLE(const stru old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { - printf("Unknown kernel!\n"); + fprintf(stderr,"Unknown kernel!\n"); + return NULL; } } diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src index df60763e..b52a67c4 100644 --- a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src +++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src @@ -115,63 +115,63 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_s_mu_mocks_driver_DOUBLE(const struc /* Array of function pointers */ countpairs_mocks_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_s_mu_mocks_avx512_intrinsics_DOUBLE, + countpairs_s_mu_mocks_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_s_mu_mocks_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ 
countpairs_s_mu_mocks_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_s_mu_mocks_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_s_mu_mocks_avx512_intrinsics_DOUBLE, #endif - countpairs_s_mu_mocks_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42): function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); @@ -181,15 +181,15 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_s_mu_mocks_driver_DOUBLE(const struc old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - 
fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { fprintf(stderr,"Unknown kernel!\n"); return NULL; diff --git a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src index 164228f4..1fd759b2 100644 --- a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src +++ b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src @@ -94,63 +94,61 @@ countpairs_theta_mocks_func_ptr_DOUBLE countpairs_theta_mocks_driver_DOUBLE(cons /* Array of function pointers */ countpairs_theta_mocks_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_theta_mocks_avx512_instrinsics_DOUBLE, + countpairs_theta_mocks_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_theta_mocks_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ - countpairs_theta_mocks_avx_instrinsics_DOUBLE, + countpairs_theta_mocks_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_theta_mocks_sse_instrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_theta_mocks_avx512_intrinsics_DOUBLE, #endif - countpairs_theta_mocks_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - /* Since highest_isa is only used in cases where SSE4.2 or AVX is defined, - without this protection, there will be an unnecessary function call - and an unused variable compiler warning. 
*/ - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - 
default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } if(function_dispatch >= num_functions) { @@ -162,15 +160,15 @@ countpairs_theta_mocks_func_ptr_DOUBLE countpairs_theta_mocks_driver_DOUBLE(cons old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { fprintf(stderr,"Unknown kernel!\n"); return NULL; diff --git a/mocks/DDtheta_mocks/countpairs_theta_mocks_kernels.c.src b/mocks/DDtheta_mocks/countpairs_theta_mocks_kernels.c.src index 7c3169e0..8dbc916b 100644 --- a/mocks/DDtheta_mocks/countpairs_theta_mocks_kernels.c.src +++ b/mocks/DDtheta_mocks/countpairs_theta_mocks_kernels.c.src @@ -246,7 +246,7 @@ static inline int countpairs_theta_mocks_fallback_DOUBLE(const int64_t N0, DOUBL #if defined(__SSE4_2__) #include "sse_calls.h" -static inline int countpairs_theta_mocks_sse_instrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, +static inline int countpairs_theta_mocks_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE 
*weights1, @@ -586,7 +586,7 @@ static inline int countpairs_theta_mocks_sse_instrinsics_DOUBLE(const int64_t N0 #if defined(__AVX__) #include "avx_calls.h" -static inline int countpairs_theta_mocks_avx_instrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, +static inline int countpairs_theta_mocks_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, @@ -917,7 +917,7 @@ static inline int countpairs_theta_mocks_avx_instrinsics_DOUBLE(const int64_t N0 #if defined(__AVX512F__) #include "avx512_calls.h" -static inline int countpairs_theta_mocks_avx512_instrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, +static inline int countpairs_theta_mocks_avx512_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, diff --git a/mocks/python_bindings/_countpairs_mocks.c b/mocks/python_bindings/_countpairs_mocks.c index 9fa8dcb1..30c214a1 100644 --- a/mocks/python_bindings/_countpairs_mocks.c +++ b/mocks/python_bindings/_countpairs_mocks.c @@ -868,7 +868,7 @@ PyObject *PyInit__countpairs_mocks(void) /* Load `numpy` functionality. */ import_array(); - highest_isa_mocks = instrset_detect(); + highest_isa_mocks = get_max_usable_isa(); #if PY_MAJOR_VERSION >= 3 return module; diff --git a/mocks/vpf_mocks/countspheres_mocks_impl.c.src b/mocks/vpf_mocks/countspheres_mocks_impl.c.src index 70f6d4c5..6b55081b 100644 --- a/mocks/vpf_mocks/countspheres_mocks_impl.c.src +++ b/mocks/vpf_mocks/countspheres_mocks_impl.c.src @@ -52,64 +52,65 @@ vpf_mocks_func_ptr_DOUBLE vpf_mocks_driver_DOUBLE(const struct config_options *o return function; } - //Seriously this is the declaration for the function pointers...here be dragons. 
+ /* Array of function pointers */ vpf_mocks_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - vpf_mocks_avx512_intrinsics_DOUBLE, + vpf_mocks_fallback_DOUBLE, +#ifdef __SSE4_2__ + vpf_mocks_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ vpf_mocks_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - vpf_mocks_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + vpf_mocks_avx512_intrinsics_DOUBLE, #endif - vpf_mocks_fallback_DOUBLE }; + const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX): function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); @@ -118,6 +119,22 @@ vpf_mocks_func_ptr_DOUBLE vpf_mocks_driver_DOUBLE(const struct config_options *o function = allfunctions[function_dispatch]; old_isa = options->instruction_set; + if(options->verbose){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ + fprintf(stderr,"Using fallback kernel\n"); + } else if(function_dispatch == sse_index){ + fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else 
if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); + } else { + fprintf(stderr,"Unknown kernel!\n"); + return NULL; + } + } + return function; } diff --git a/theory/DD/countpairs_impl.c.src b/theory/DD/countpairs_impl.c.src index 7b8603c4..0d37dda3 100644 --- a/theory/DD/countpairs_impl.c.src +++ b/theory/DD/countpairs_impl.c.src @@ -19,7 +19,7 @@ #include "defs.h" #include "utils.h" //all of the utilities #include "progressbar.h" //for the progressbar -#include "cpu_features.h" //prototype instrset_detect required for runtime dispatch +#include "cpu_features.h" //prototype get_max_usable_isa required for runtime dispatch #include "gridlink_impl_DOUBLE.h"//function proto-type for gridlink #include "gridlink_utils_DOUBLE.h" //for associated helper routines @@ -48,60 +48,61 @@ countpairs_func_ptr_DOUBLE countpairs_driver_DOUBLE(const struct config_options /* Array of function pointers */ countpairs_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_avx512_intrinsics_DOUBLE, + countpairs_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ countpairs_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_avx512_intrinsics_DOUBLE, #endif - countpairs_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0;//Set default to fastest available + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } if(function_dispatch >= num_functions) { @@ -113,15 +114,15 
@@ countpairs_func_ptr_DOUBLE countpairs_driver_DOUBLE(const struct config_options old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { fprintf(stderr,"Unknown kernel!\n"); return NULL; diff --git a/theory/DDrppi/countpairs_rp_pi_impl.c.src b/theory/DDrppi/countpairs_rp_pi_impl.c.src index d82bcfce..32564125 100644 --- a/theory/DDrppi/countpairs_rp_pi_impl.c.src +++ b/theory/DDrppi/countpairs_rp_pi_impl.c.src @@ -20,7 +20,7 @@ #include "defs.h" #include "utils.h" //all of the utilities #include "progressbar.h" //for the progressbar -#include "cpu_features.h" //prototype instrset_detect required for runtime dispatch +#include "cpu_features.h" //prototype get_max_usable_isa required for runtime dispatch #include "gridlink_impl_DOUBLE.h"//function proto-type for gridlink #include "gridlink_utils_DOUBLE.h" //for associated helper routines @@ -45,67 +45,67 @@ countpairs_rp_pi_func_ptr_DOUBLE countpairs_rp_pi_driver_DOUBLE(const struct con return function; } - /* Array of function pointers */ countpairs_rp_pi_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_rp_pi_avx512_intrinsics_DOUBLE, + countpairs_rp_pi_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_rp_pi_sse_intrinsics_DOUBLE, #endif #ifdef 
__AVX__ - countpairs_rp_pi_avx_intrinsics_DOUBLE, + countpairs_rp_pi_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_rp_pi_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_rp_pi_avx512_intrinsics_DOUBLE, #endif - countpairs_rp_pi_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } if(function_dispatch >= num_functions) { - fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", + fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); return NULL; } @@ -113,15 +113,15 @@ countpairs_rp_pi_func_ptr_DOUBLE countpairs_rp_pi_driver_DOUBLE(const struct con old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } 
else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { fprintf(stderr,"Unknown kernel!\n"); return NULL; diff --git a/theory/DDsmu/countpairs_s_mu_impl.c.src b/theory/DDsmu/countpairs_s_mu_impl.c.src index 6d30470b..09ad5b60 100644 --- a/theory/DDsmu/countpairs_s_mu_impl.c.src +++ b/theory/DDsmu/countpairs_s_mu_impl.c.src @@ -20,7 +20,7 @@ #include "defs.h" #include "utils.h" //all of the utilities #include "progressbar.h" //for the progressbar -#include "cpu_features.h" //prototype instrset_detect required for runtime dispatch +#include "cpu_features.h" //prototype get_max_usable_isa required for runtime dispatch #include "gridlink_impl_DOUBLE.h"//function proto-type for gridlink #include "gridlink_utils_DOUBLE.h" //for associated helper routines @@ -48,65 +48,65 @@ countpairs_s_mu_func_ptr_DOUBLE countpairs_s_mu_driver_DOUBLE(const struct confi /* Array of function pointers */ countpairs_s_mu_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_s_mu_avx512_intrinsics_DOUBLE, + countpairs_s_mu_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_s_mu_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ - countpairs_s_mu_avx_intrinsics_DOUBLE, + countpairs_s_mu_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_s_mu_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_s_mu_avx512_intrinsics_DOUBLE, #endif - countpairs_s_mu_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || 
defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { - fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", + fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); return NULL; } @@ -114,17 +114,17 @@ countpairs_s_mu_func_ptr_DOUBLE countpairs_s_mu_driver_DOUBLE(const struct confi old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - 
} else if(function_dispatch == avx512_offset){ - fprintf(stderr, "Using AVX512F kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr, "Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ - fprintf(stderr, "Using SSE kernel\n"); + } else if(function_dispatch == sse_index){ + fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { - fprintf(stderr, "Unknown kernel!\n"); + fprintf(stderr,"Unknown kernel!\n"); return NULL; } } diff --git a/theory/python_bindings/_countpairs.c b/theory/python_bindings/_countpairs.c index 6c1904de..a02689c2 100644 --- a/theory/python_bindings/_countpairs.c +++ b/theory/python_bindings/_countpairs.c @@ -987,7 +987,7 @@ PyMODINIT_FUNC init_countpairs(void) /* Load `numpy` functionality. */ import_array(); - highest_isa = instrset_detect(); + highest_isa = get_max_usable_isa(); #if PY_MAJOR_VERSION >= 3 return module; diff --git a/theory/vpf/countspheres_impl.c.src b/theory/vpf/countspheres_impl.c.src index e9f3d19a..fd144a57 100644 --- a/theory/vpf/countspheres_impl.c.src +++ b/theory/vpf/countspheres_impl.c.src @@ -44,63 +44,65 @@ vpf_func_ptr_DOUBLE vpf_driver_DOUBLE(const struct config_options *options) return function; } - //Seriously this is the declaration for the function pointers...here be dragons. 
+/* Array of function pointers */ vpf_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - vpf_avx512_intrinsics_DOUBLE, + vpf_fallback_DOUBLE, +#ifdef __SSE4_2__ + vpf_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ vpf_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - vpf_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + vpf_avx512_intrinsics_DOUBLE, #endif - vpf_fallback_DOUBLE }; + const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + int curr_index = 0; + + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); @@ -109,6 +111,22 @@ vpf_func_ptr_DOUBLE vpf_driver_DOUBLE(const struct config_options *options) function = allfunctions[function_dispatch]; old_isa = options->instruction_set; + if(options->verbose){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ + fprintf(stderr,"Using fallback kernel\n"); + } else if(function_dispatch == sse_index){ + fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == 
avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); + } else { + fprintf(stderr,"Unknown kernel!\n"); + return NULL; + } + } + return function; } diff --git a/theory/wp/countpairs_wp_impl.c.src b/theory/wp/countpairs_wp_impl.c.src index d54725d5..4c348614 100644 --- a/theory/wp/countpairs_wp_impl.c.src +++ b/theory/wp/countpairs_wp_impl.c.src @@ -20,7 +20,7 @@ #include "utils.h" //all of the utilities #include "progressbar.h" //for the progressbar -#include "cpu_features.h" //prototype instrset_detect required for runtime dispatch +#include "cpu_features.h" //prototype get_max_usable_isa required for runtime dispatch #include "gridlink_impl_DOUBLE.h"//function proto-type for gridlink #include "gridlink_utils_DOUBLE.h" //for associated helper routines @@ -47,72 +47,74 @@ wp_func_ptr_DOUBLE wp_driver_DOUBLE(const struct config_options *options) return function; } - //Seriously this is the declaration for the function pointers + /* Array of function pointers */ wp_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - wp_avx512_intrinsics_DOUBLE, -#endif -#ifdef __AVX2__ - wp_avx2_intrinsics_DOUBLE, + wp_fallback_DOUBLE, +#ifdef __SSE4_2__ + wp_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ - wp_avx_intrinsics_DOUBLE, + wp_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - wp_sse_intrinsics_DOUBLE, +#ifdef __AVX2__ + wp_avx2_intrinsics_DOUBLE, +#endif +#ifdef __AVX512F__ + wp_avx512_intrinsics_DOUBLE, #endif - wp_fallback_DOUBLE }; + const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX2__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? 
curr_offset:fallback_offset; - curr_offset++; -#endif - - /* Check for AVX2 support */ - int avx2_offset = fallback_offset; -#ifdef __AVX2__ - avx2_offset = highest_isa >= 8 ? curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? curr_offset:fallback_offset; - curr_offset++; + /* Now check if AVX2 is supported by the CPU */ + int avx2_index = curr_index; +#ifdef __AVX2__ + curr_index++; + if(highest_isa >= AVX2) avx2_index = curr_index; +#endif + + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0;//Set default to fastest available + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { - switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; - case(AVX2):function_dispatch=avx2_offset;break; - 
case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; - } + switch(options->instruction_set) { + case(AVX512F):function_dispatch=avx512_index;break; + case(AVX2):function_dispatch=avx2_index;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; + } } if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", @@ -123,20 +125,21 @@ wp_func_ptr_DOUBLE wp_driver_DOUBLE(const struct config_options *options) old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ - fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx2_offset){ - fprintf(stderr,"Using AVX2 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ - fprintf(stderr,"Using SSE4 kernel\n"); - } else { - printf("Unknown kernel!\n"); - } + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ + fprintf(stderr,"Using fallback kernel\n"); + } else if(function_dispatch == sse_index){ + fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx2_index){ + fprintf(stderr,"Using AVX2 kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); + } else { + fprintf(stderr,"Unknown kernel!\n"); + return NULL; + } } return function; diff --git a/theory/xi/countpairs_xi_impl.c.src b/theory/xi/countpairs_xi_impl.c.src 
index 8523fbd3..ea1b006a 100644 --- a/theory/xi/countpairs_xi_impl.c.src +++ b/theory/xi/countpairs_xi_impl.c.src @@ -20,7 +20,7 @@ #include "defs.h" #include "utils.h" //all of the utilities #include "progressbar.h" //for the progressbar -#include "cpu_features.h" //prototype instrset_detect required for runtime dispatch +#include "cpu_features.h" //prototype get_max_usable_isa required for runtime dispatch #include "gridlink_impl_DOUBLE.h"//function proto-type for gridlink #include "gridlink_utils_DOUBLE.h" //for associated helper routines @@ -45,63 +45,65 @@ xi_func_ptr_DOUBLE xi_driver_DOUBLE(const struct config_options *options) return function; } - //Seriously this is the declaration for the function pointers...here be dragons. + /* Array of function pointers */ xi_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - xi_avx512_intrinsics_DOUBLE, + xi_fallback_DOUBLE, +#ifdef __SSE4_2__ + xi_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ - xi_avx_intrinsics_DOUBLE, + xi_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - xi_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + xi_avx512_intrinsics_DOUBLE, #endif - xi_fallback_DOUBLE }; + const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? 
curr_offset:fallback_offset; - curr_offset++; + int curr_index = 0; + + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX): function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> 
ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); @@ -111,17 +113,17 @@ xi_func_ptr_DOUBLE xi_driver_DOUBLE(const struct config_options *options) old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { - fprintf(stderr, "Unknown kernel!\n"); + fprintf(stderr,"Unknown kernel!\n"); return NULL; } } diff --git a/utils/cpu_features.c b/utils/cpu_features.c index 7ea4060b..7e5b2813 100644 --- a/utils/cpu_features.c +++ b/utils/cpu_features.c @@ -8,15 +8,21 @@ Adapted from Agner Fog's vectorclass: http://agner.org/ */ +#include <stdio.h> + #include "cpu_features.h" -int instrset_detect(void) +// Use CPUID to detect what instruction sets the CPU supports +// The compiler may not support all these features though! 
+// Use get_max_usable_isa() to find the max ISA supported +// by both the compiler and CPU +int runtime_instrset_detect(void) { static int iset = -1; // remember value for next call if (iset >= 0) { return iset; // called before } - iset = 0; // default value + iset = FALLBACK; // default value int abcd[4] = {0,0,0,0}; // cpuid results cpuid(abcd, 0); // call cpuid function 0 if (abcd[0] == 0) return iset; // no further cpuid function supported @@ -26,28 +32,119 @@ int instrset_detect(void) if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE - iset = 1; // 1: SSE supported + iset = SSE; // 1: SSE supported + if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2 - iset = 2; // 2: SSE2 supported + iset = SSE2; // 2: SSE2 supported + if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3 - iset = 3; // 3: SSE3 supported + iset = SSE3; // 3: SSE3 supported + if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3 - iset = 4; // 4: SSSE3 supported + iset = SSSE3; // 4: SSSE3 supported + if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1 - iset = 5; // 5: SSE4.1 supported + iset = SSE4; // 5: SSE4.1 supported + if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2 - iset = 6; // 6: SSE4.2 supported + iset = SSE42; // 6: SSE4.2 supported + if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S. 
if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX - iset = 7; // 7: AVX supported + iset = AVX; // 7: AVX supported + cpuid(abcd, 7); // call cpuid leaf 7 for feature flags if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2 - iset = 8; // 8: AVX2 supported + iset = AVX2; // 8: AVX2 supported + cpuid(abcd, 0xD); // call cpuid leaf 0xD for feature flags if ((abcd[0] & 0x60) != 0x60) return iset; // no AVX512 - iset = 9; // 9: AVX512F supported + iset = AVX512F; // 9: AVX512F supported return iset; } +// Report the max ISA supported by both the CPU and compiler +int get_max_usable_isa(void) +{ + static int iset = -1; // remember value for next call + if (iset >= 0) { + return iset; // called before + } + iset = runtime_instrset_detect(); + + switch(iset){ + case AVX512F: +#ifdef __AVX512F__ + iset = AVX512F; + break; +#elif defined(GAS_BUG_DISABLE_AVX512) + fprintf(stderr, "[Warning] AVX512F is disabled due to a GNU Assembler bug. Upgrade to binutils >= 2.32 to fix this.\n"); +#else + fprintf(stderr, "[Warning] The CPU supports AVX512F but the compiler does not. Can you try another compiler?\n"); +#endif + case AVX2: +#ifdef __AVX2__ + iset = AVX2; + break; +#else + fprintf(stderr, "[Warning] The CPU supports AVX2 but the compiler does not. Can you try another compiler?\n"); +#endif + case AVX: +#ifdef __AVX__ + iset = AVX; + break; +#else + fprintf(stderr, "[Warning] The CPU supports AVX but the compiler does not. Can you try another compiler?\n"); +#endif + case SSE42: +#ifdef __SSE4_2__ + iset = SSE42; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSE4.2 but the compiler does not. Can you try another compiler?\n"); +#endif + case SSE4: +#ifdef __SSE4_1__ + iset = SSE4; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSE4.1 but the compiler does not. 
Can you try another compiler?\n"); +#endif + case SSSE3: +#ifdef __SSSE3__ + iset = SSSE3; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSSE3 but the compiler does not. Can you try another compiler?\n"); +#endif + case SSE3: +#ifdef __SSE3__ + iset = SSE3; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSE3 but the compiler does not. Can you try another compiler?\n"); +#endif + case SSE2: +#ifdef __SSE2__ + iset = SSE2; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSE2 but the compiler does not. Can you try another compiler?\n"); +#endif + case SSE: +#ifdef __SSE__ + iset = SSE; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSE but the compiler does not. Can you try another compiler?\n"); +#endif + case FALLBACK: + default: + iset = FALLBACK; + break; + } + + return iset; +} diff --git a/utils/cpu_features.h b/utils/cpu_features.h index 73c906e8..b85d0f04 100644 --- a/utils/cpu_features.h +++ b/utils/cpu_features.h @@ -17,6 +17,22 @@ extern "C" { #endif +typedef enum { + DEFAULT=-42,/* present simply to make the enum a signed int*/ + FALLBACK=0, /* No special options */ + SSE=1, /* 64 bit vectors */ + SSE2=2, /* 128 bit vectors */ + SSE3=3, /* 128 bit vectors */ + SSSE3=4, /* 128 bit vectors */ + SSE4=5,/* 128bit vectors */ + SSE42=6, /* 128bit vectors with blend operations */ + AVX=7, /* 256bit vector width */ + AVX2=8, /* AVX2 (integer operations)*/ + AVX512F=9,/* AVX 512 Foundation */ + NUM_ISA /*NUM_ISA will be the next integer after + the last declared enum. 
AVX512F:=9 (so, NUM_ISA==10)*/ +} isa; //name for instruction sets -> corresponds to the return values for functions in cpu_features.c + static inline void cpuid (int output[4], int functionnumber) { #if defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax @@ -67,7 +83,8 @@ static inline int64_t xgetbv (int ctr) { #endif } -extern int instrset_detect(void); +extern int runtime_instrset_detect(void); +extern int get_max_usable_isa(void); #ifdef __cplusplus } diff --git a/utils/defs.h b/utils/defs.h index 5674c601..554fc644 100644 --- a/utils/defs.h +++ b/utils/defs.h @@ -8,12 +8,14 @@ #pragma once +#include #include #include #include #include #include "macros.h" +#include "cpu_features.h" #ifdef __cplusplus extern "C" { @@ -21,22 +23,6 @@ extern "C" { #define API_VERSION STR("2.3.1") -typedef enum { - DEFAULT=-42,/* present simply to make the enum a signed int*/ - FALLBACK=0, /* No special options */ - SSE=1, /* 64 bit vectors */ - SSE2=2, /* 128 bit vectors */ - SSE3=3, /* 128 bit vectors */ - SSSE3=4, /* 128 bit vectors */ - SSE4=5,/* 128bit vectors */ - SSE42=6, /* 128bit vectors with blend operations */ - AVX=7, /* 256bit vector width */ - AVX2=8, /* AVX2 (integer operations)*/ - AVX512F=9,/* AVX 512 Foundation */ - NUM_ISA /*NUM_ISA will be the next integer after - the last declared enum. AVX512F:=9 (so, NUM_ISA==10)*/ -} isa;//name for instruction sets -> corresponds to the return from instrset_detect in cpu_features.c - /* Macros as mask for the binning_flags */ /* These consititute the 32 bytes for