From 31f66e1bf12bb89d6ce02c3f5b7f36a1eed8620d Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Mon, 7 Oct 2019 15:21:07 +0000 Subject: [PATCH 1/9] Only report runtime isa support if we also have compiler support --- utils/cpu_features.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/utils/cpu_features.c b/utils/cpu_features.c index 7ea4060b..ba857967 100644 --- a/utils/cpu_features.c +++ b/utils/cpu_features.c @@ -26,28 +26,54 @@ int instrset_detect(void) if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE +#ifdef __SSE__ iset = 1; // 1: SSE supported +#endif + if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2 +#ifdef __SSE2__ iset = 2; // 2: SSE2 supported +#endif + if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3 +#ifdef __SSE3__ iset = 3; // 3: SSE3 supported +#endif + if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3 +#ifdef __SSSE3__ iset = 4; // 4: SSSE3 supported +#endif + if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1 +#ifdef __SSE4_1__ iset = 5; // 5: SSE4.1 supported +#endif + if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2 +#ifdef __SSE4_2__ iset = 6; // 6: SSE4.2 supported +#endif + if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S. 
if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX +#ifdef __AVX__ iset = 7; // 7: AVX supported +#endif + cpuid(abcd, 7); // call cpuid leaf 7 for feature flags if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2 +#ifdef __AVX2__ iset = 8; // 8: AVX2 supported +#endif + cpuid(abcd, 0xD); // call cpuid leaf 0xD for feature flags if ((abcd[0] & 0x60) != 0x60) return iset; // no AVX512 +#ifdef __AVX512F__ iset = 9; // 9: AVX512F supported +#endif return iset; } From 7817b19168fbafb068b3d8e7d537ad44c7070d5e Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Mon, 7 Oct 2019 15:26:58 +0000 Subject: [PATCH 2/9] Update changelog --- CHANGES.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.rst b/CHANGES.rst index 1dfe4ad0..47a7e6ed 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -26,6 +26,7 @@ Bug fixes ---------- - Incorrect calculations for non-native endian data [#191] - Workaround for GNU Assembler bug causing incorrect calculations [#196] +- Only report runtime isa support if we also have compiler support [#200] From 6f7fca7ae4e70009526441dde974686a047a8794 Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Fri, 11 Oct 2019 19:08:03 +0000 Subject: [PATCH 3/9] Warn user if the compiler does not support an instruction set that the CPU supports --- common.mk | 2 + .../countpairs_rp_pi_mocks_impl.c.src | 2 +- .../countpairs_s_mu_mocks_impl.c.src | 2 +- .../countpairs_theta_mocks_impl.c.src | 2 +- mocks/python_bindings/_countpairs_mocks.c | 2 +- mocks/vpf_mocks/countspheres_mocks_impl.c.src | 2 +- theory/DD/countpairs_impl.c.src | 4 +- theory/DDrppi/countpairs_rp_pi_impl.c.src | 4 +- theory/DDsmu/countpairs_s_mu_impl.c.src | 4 +- theory/python_bindings/_countpairs.c | 2 +- theory/vpf/countspheres_impl.c.src | 2 +- theory/wp/countpairs_wp_impl.c.src | 4 +- theory/xi/countpairs_xi_impl.c.src | 4 +- utils/cpu_features.c | 108 +++++++++++++++--- utils/cpu_features.h | 3 +- 15 files changed, 110 insertions(+), 37 deletions(-) diff --git a/common.mk 
b/common.mk index 03de6818..5ae0f9f6 100644 --- a/common.mk +++ b/common.mk @@ -400,6 +400,8 @@ ifeq ($(DO_CHECKS), 1) CFLAGS += -xCORE-AVX2 endif + CFLAGS += -DGAS_BUG_DISABLE_AVX512 + ifneq ($(GAS_BUG_WARNING_PRINTED),1) $(warning $(ccred)DISABLING AVX-512 SUPPORT DUE TO GNU ASSEMBLER BUG. UPGRADE TO BINUTILS >=2.32 TO FIX THIS.$(ccreset)) endif diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src index 21c00513..6189cbc6 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src @@ -129,7 +129,7 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_rp_pi_mocks_driver_DOUBLE(const stru const int num_functions = sizeof(allfunctions)/sizeof(void *); const int fallback_offset = num_functions - 1; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif int curr_offset = 0; diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src index df60763e..0b2ecfb8 100644 --- a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src +++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src @@ -130,7 +130,7 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_s_mu_mocks_driver_DOUBLE(const struc const int num_functions = sizeof(allfunctions)/sizeof(void *); const int fallback_offset = num_functions - 1; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif int curr_offset = 0; diff --git a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src index 164228f4..9c70a156 100644 --- a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src +++ b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src @@ -112,7 +112,7 @@ 
countpairs_theta_mocks_func_ptr_DOUBLE countpairs_theta_mocks_driver_DOUBLE(cons /* Since highest_isa is only used in cases where SSE4.2 or AVX is defined, without this protection, there will be an unnecessary function call and an unused variable compiler warning. */ - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif int curr_offset = 0; diff --git a/mocks/python_bindings/_countpairs_mocks.c b/mocks/python_bindings/_countpairs_mocks.c index 9fa8dcb1..30c214a1 100644 --- a/mocks/python_bindings/_countpairs_mocks.c +++ b/mocks/python_bindings/_countpairs_mocks.c @@ -868,7 +868,7 @@ PyObject *PyInit__countpairs_mocks(void) /* Load `numpy` functionality. */ import_array(); - highest_isa_mocks = instrset_detect(); + highest_isa_mocks = get_max_usable_isa(); #if PY_MAJOR_VERSION >= 3 return module; diff --git a/mocks/vpf_mocks/countspheres_mocks_impl.c.src b/mocks/vpf_mocks/countspheres_mocks_impl.c.src index 70f6d4c5..84480a5b 100644 --- a/mocks/vpf_mocks/countspheres_mocks_impl.c.src +++ b/mocks/vpf_mocks/countspheres_mocks_impl.c.src @@ -68,7 +68,7 @@ vpf_mocks_func_ptr_DOUBLE vpf_mocks_driver_DOUBLE(const struct config_options *o const int num_functions = sizeof(allfunctions)/sizeof(void *); const int fallback_offset = num_functions - 1; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif int curr_offset = 0; diff --git a/theory/DD/countpairs_impl.c.src b/theory/DD/countpairs_impl.c.src index 7b8603c4..cb75b7c0 100644 --- a/theory/DD/countpairs_impl.c.src +++ b/theory/DD/countpairs_impl.c.src @@ -19,7 +19,7 @@ #include "defs.h" #include "utils.h" //all of the utilities #include "progressbar.h" //for the progressbar -#include "cpu_features.h" //prototype instrset_detect required for runtime dispatch +#include "cpu_features.h" //prototype get_max_usable_isa required for runtime dispatch #include 
"gridlink_impl_DOUBLE.h"//function proto-type for gridlink #include "gridlink_utils_DOUBLE.h" //for associated helper routines @@ -63,7 +63,7 @@ countpairs_func_ptr_DOUBLE countpairs_driver_DOUBLE(const struct config_options const int num_functions = sizeof(allfunctions)/sizeof(void *); const int fallback_offset = num_functions - 1; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif int curr_offset = 0; diff --git a/theory/DDrppi/countpairs_rp_pi_impl.c.src b/theory/DDrppi/countpairs_rp_pi_impl.c.src index d82bcfce..87a9abfa 100644 --- a/theory/DDrppi/countpairs_rp_pi_impl.c.src +++ b/theory/DDrppi/countpairs_rp_pi_impl.c.src @@ -20,7 +20,7 @@ #include "defs.h" #include "utils.h" //all of the utilities #include "progressbar.h" //for the progressbar -#include "cpu_features.h" //prototype instrset_detect required for runtime dispatch +#include "cpu_features.h" //prototype get_max_usable_isa required for runtime dispatch #include "gridlink_impl_DOUBLE.h"//function proto-type for gridlink #include "gridlink_utils_DOUBLE.h" //for associated helper routines @@ -63,7 +63,7 @@ countpairs_rp_pi_func_ptr_DOUBLE countpairs_rp_pi_driver_DOUBLE(const struct con const int num_functions = sizeof(allfunctions)/sizeof(void *); const int fallback_offset = num_functions - 1; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif int curr_offset = 0; diff --git a/theory/DDsmu/countpairs_s_mu_impl.c.src b/theory/DDsmu/countpairs_s_mu_impl.c.src index 6d30470b..b88cbf89 100644 --- a/theory/DDsmu/countpairs_s_mu_impl.c.src +++ b/theory/DDsmu/countpairs_s_mu_impl.c.src @@ -20,7 +20,7 @@ #include "defs.h" #include "utils.h" //all of the utilities #include "progressbar.h" //for the progressbar -#include "cpu_features.h" //prototype instrset_detect required for 
runtime dispatch +#include "cpu_features.h" //prototype get_max_usable_isa required for runtime dispatch #include "gridlink_impl_DOUBLE.h"//function proto-type for gridlink #include "gridlink_utils_DOUBLE.h" //for associated helper routines @@ -63,7 +63,7 @@ countpairs_s_mu_func_ptr_DOUBLE countpairs_s_mu_driver_DOUBLE(const struct confi const int num_functions = sizeof(allfunctions)/sizeof(void *); const int fallback_offset = num_functions - 1; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif int curr_offset = 0; diff --git a/theory/python_bindings/_countpairs.c b/theory/python_bindings/_countpairs.c index 6c1904de..a02689c2 100644 --- a/theory/python_bindings/_countpairs.c +++ b/theory/python_bindings/_countpairs.c @@ -987,7 +987,7 @@ PyMODINIT_FUNC init_countpairs(void) /* Load `numpy` functionality. */ import_array(); - highest_isa = instrset_detect(); + highest_isa = get_max_usable_isa(); #if PY_MAJOR_VERSION >= 3 return module; diff --git a/theory/vpf/countspheres_impl.c.src b/theory/vpf/countspheres_impl.c.src index e9f3d19a..97616d2a 100644 --- a/theory/vpf/countspheres_impl.c.src +++ b/theory/vpf/countspheres_impl.c.src @@ -60,7 +60,7 @@ vpf_func_ptr_DOUBLE vpf_driver_DOUBLE(const struct config_options *options) const int num_functions = sizeof(allfunctions)/sizeof(void *); const int fallback_offset = num_functions - 1; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif int curr_offset = 0; /* Check for AVX512F support */ diff --git a/theory/wp/countpairs_wp_impl.c.src b/theory/wp/countpairs_wp_impl.c.src index d54725d5..49f310b5 100644 --- a/theory/wp/countpairs_wp_impl.c.src +++ b/theory/wp/countpairs_wp_impl.c.src @@ -20,7 +20,7 @@ #include "utils.h" //all of the utilities #include "progressbar.h" //for the progressbar 
-#include "cpu_features.h" //prototype instrset_detect required for runtime dispatch +#include "cpu_features.h" //prototype get_max_usable_isa required for runtime dispatch #include "gridlink_impl_DOUBLE.h"//function proto-type for gridlink #include "gridlink_utils_DOUBLE.h" //for associated helper routines @@ -66,7 +66,7 @@ wp_func_ptr_DOUBLE wp_driver_DOUBLE(const struct config_options *options) const int num_functions = sizeof(allfunctions)/sizeof(void *); const int fallback_offset = num_functions - 1; #if defined(__AVX512F__) || defined(__AVX2__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif int curr_offset = 0; diff --git a/theory/xi/countpairs_xi_impl.c.src b/theory/xi/countpairs_xi_impl.c.src index 8523fbd3..d1f01115 100644 --- a/theory/xi/countpairs_xi_impl.c.src +++ b/theory/xi/countpairs_xi_impl.c.src @@ -20,7 +20,7 @@ #include "defs.h" #include "utils.h" //all of the utilities #include "progressbar.h" //for the progressbar -#include "cpu_features.h" //prototype instrset_detect required for runtime dispatch +#include "cpu_features.h" //prototype get_max_usable_isa required for runtime dispatch #include "gridlink_impl_DOUBLE.h"//function proto-type for gridlink #include "gridlink_utils_DOUBLE.h" //for associated helper routines @@ -61,7 +61,7 @@ xi_func_ptr_DOUBLE xi_driver_DOUBLE(const struct config_options *options) const int num_functions = sizeof(allfunctions)/sizeof(void *); const int fallback_offset = num_functions - 1; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - const int highest_isa = instrset_detect(); + const int highest_isa = get_max_usable_isa(); #endif int curr_offset = 0; /* Check for AVX512F support */ diff --git a/utils/cpu_features.c b/utils/cpu_features.c index ba857967..0c06487d 100644 --- a/utils/cpu_features.c +++ b/utils/cpu_features.c @@ -9,8 +9,13 @@ */ #include "cpu_features.h" +#include <stdio.h> -int
instrset_detect(void) +// Use CPUID to detect what instruction sets the CPU supports +// The compiler may not support all these features though! +// Use get_max_usable_isa() to find the max ISA supported +// by both the compiler and CPU +int runtime_instrset_detect(void) { static int iset = -1; // remember value for next call if (iset >= 0) { @@ -26,54 +31,119 @@ int instrset_detect(void) if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE -#ifdef __SSE__ iset = 1; // 1: SSE supported -#endif if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2 -#ifdef __SSE2__ iset = 2; // 2: SSE2 supported -#endif if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3 -#ifdef __SSE3__ iset = 3; // 3: SSE3 supported -#endif if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3 -#ifdef __SSSE3__ iset = 4; // 4: SSSE3 supported -#endif if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1 -#ifdef __SSE4_1__ iset = 5; // 5: SSE4.1 supported -#endif if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2 -#ifdef __SSE4_2__ iset = 6; // 6: SSE4.2 supported -#endif if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S. 
if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX -#ifdef __AVX__ iset = 7; // 7: AVX supported -#endif cpuid(abcd, 7); // call cpuid leaf 7 for feature flags if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2 -#ifdef __AVX2__ iset = 8; // 8: AVX2 supported -#endif cpuid(abcd, 0xD); // call cpuid leaf 0xD for feature flags if ((abcd[0] & 0x60) != 0x60) return iset; // no AVX512 -#ifdef __AVX512F__ iset = 9; // 9: AVX512F supported -#endif return iset; } +// Report the max ISA supported by both the CPU and compiler +int get_max_usable_isa(void) +{ + static int iset = -1; // remember value for next call + if (iset >= 0) { + return iset; // called before + } + iset = runtime_instrset_detect(); + + switch(iset){ + case 9: +#ifdef __AVX512F__ + iset = 9; + break; +#elif defined(GAS_BUG_DISABLE_AVX512) + fprintf(stderr, "[Warning] AVX512 is disabled due to a GNU Assembler bug. Upgrade to binutils >= 2.32 to fix this.\n"); +#else + fprintf(stderr, "[Warning] The CPU supports AVX512 but the compiler does not. Can you try another compiler?\n"); +#endif + case 8: +#ifdef __AVX2__ + iset = 8; + break; +#else + fprintf(stderr, "[Warning] The CPU supports AVX2 but the compiler does not. Can you try another compiler?\n"); +#endif + case 7: +#ifdef __AVX__ + iset = 7; + break; +#else + fprintf(stderr, "[Warning] The CPU supports AVX but the compiler does not. Can you try another compiler?\n"); +#endif + case 6: +#ifdef __SSE4_2__ + iset = 6; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSE4.2 but the compiler does not. Can you try another compiler?\n"); +#endif + case 5: +#ifdef __SSE4_1__ + iset = 5; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSE4.1 but the compiler does not. Can you try another compiler?\n"); +#endif + case 4: +#ifdef __SSSE3__ + iset = 4; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSSE3 but the compiler does not. 
Can you try another compiler?\n"); +#endif + case 3: +#ifdef __SSE3__ + iset = 3; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSE3 but the compiler does not. Can you try another compiler?\n"); +#endif + case 2: +#ifdef __SSE2__ + iset = 2; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSE2 but the compiler does not. Can you try another compiler?\n"); +#endif + case 1: +#ifdef __SSE__ + iset = 1; + break; +#else + fprintf(stderr, "[Warning] The CPU supports SSE but the compiler does not. Can you try another compiler?\n"); +#endif + case 0: + default: + iset = 0; + break; + } + + return iset; +} diff --git a/utils/cpu_features.h b/utils/cpu_features.h index 73c906e8..de8b00fa 100644 --- a/utils/cpu_features.h +++ b/utils/cpu_features.h @@ -67,7 +67,8 @@ static inline int64_t xgetbv (int ctr) { #endif } -extern int instrset_detect(void); +extern int runtime_instrset_detect(void); +extern int get_max_usable_isa(void); #ifdef __cplusplus } From 5c7af95f1dbe31254d9e60f3ce22925088491d94 Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Mon, 14 Oct 2019 16:27:25 +0000 Subject: [PATCH 4/9] Use the highest supported ISA <= the requested version. Needs porting to other correlation functions. 
--- theory/DD/countpairs_impl.c.src | 52 ++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/theory/DD/countpairs_impl.c.src b/theory/DD/countpairs_impl.c.src index cb75b7c0..14ce55de 100644 --- a/theory/DD/countpairs_impl.c.src +++ b/theory/DD/countpairs_impl.c.src @@ -48,48 +48,52 @@ countpairs_func_ptr_DOUBLE countpairs_driver_DOUBLE(const struct config_options /* Array of function pointers */ countpairs_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_avx512_intrinsics_DOUBLE, + countpairs_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ countpairs_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_avx512_intrinsics_DOUBLE, #endif - countpairs_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_offset = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) const int highest_isa = get_max_usable_isa(); #endif int curr_offset = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_offset = curr_offset; +#ifdef __SSE4_2__ curr_offset++; + if(highest_isa >= 6) + sse_offset = curr_offset; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_offset = curr_offset; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; curr_offset++; + if(highest_isa >= 7) + avx_offset = curr_offset; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? 
curr_offset:fallback_offset; + /* Check for AVX512F support */ + int avx512_offset = curr_offset; +#ifdef __AVX512F__ curr_offset++; + if(highest_isa >= 9) + avx512_offset = curr_offset; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_offset != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* num_functions-1 = %d-1)\n", + curr_offset, num_functions); return NULL; } @@ -113,15 +117,15 @@ countpairs_func_ptr_DOUBLE countpairs_driver_DOUBLE(const struct config_options old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) + // Must be ordered low to high, since higher ISA may be aliased to lower ones if(function_dispatch == fallback_offset){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using AVX kernel\n"); } else if(function_dispatch == sse_offset){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_offset){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_offset){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { fprintf(stderr,"Unknown kernel!\n"); return NULL; From a2c3c447ea3c316acf21a2b4d656db5b366986dc Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Mon, 2 Dec 2019 18:38:43 -0500 Subject: [PATCH 5/9] * Rename isa_offset to isa_index in kernel dispatch * Use ISA enums instead of magic numbers * Fix "instrinsics" typo * Port new ISA dispatch to all CFs --- .../countpairs_rp_pi_mocks_impl.c.src | 80 ++++++------ .../countpairs_s_mu_mocks_impl.c.src | 75 +++++------ .../countpairs_theta_mocks_impl.c.src | 79 ++++++------ .../countpairs_theta_mocks_kernels.c.src | 6 +- 
mocks/vpf_mocks/countspheres_mocks_impl.c.src | 80 +++++++----- theory/DD/countpairs_impl.c.src | 50 ++++---- theory/DDrppi/countpairs_rp_pi_impl.c.src | 79 ++++++------ theory/DDsmu/countpairs_s_mu_impl.c.src | 83 ++++++------ theory/vpf/countspheres_impl.c.src | 81 +++++++----- theory/wp/countpairs_wp_impl.c.src | 119 +++++++++--------- theory/xi/countpairs_xi_impl.c.src | 83 ++++++------ utils/cpu_features.c | 61 ++++----- utils/defs.h | 1 + 13 files changed, 473 insertions(+), 404 deletions(-) diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src index 6189cbc6..965dfe2a 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src @@ -114,63 +114,66 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_rp_pi_mocks_driver_DOUBLE(const stru /* Array of function pointers */ countpairs_mocks_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_rp_pi_mocks_avx512_intrinsics_DOUBLE, + countpairs_rp_pi_mocks_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ - countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE, + countpairs_rp_pi_mocks_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_rp_pi_mocks_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_rp_pi_mocks_avx512_intrinsics_DOUBLE, #endif - countpairs_rp_pi_mocks_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) + sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) + avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) + avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch=0;//Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42): function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> ERROR: Could not resolve the correct 
function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); @@ -180,17 +183,18 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_rp_pi_mocks_driver_DOUBLE(const stru old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { - printf("Unknown kernel!\n"); + fprintf(stderr,"Unknown kernel!\n"); + return NULL; } } diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src index 0b2ecfb8..ddf1720f 100644 --- a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src +++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src @@ -115,63 +115,66 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_s_mu_mocks_driver_DOUBLE(const struc /* Array of function pointers */ countpairs_mocks_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_s_mu_mocks_avx512_intrinsics_DOUBLE, + countpairs_s_mu_mocks_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_s_mu_mocks_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ countpairs_s_mu_mocks_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_s_mu_mocks_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_s_mu_mocks_avx512_intrinsics_DOUBLE, #endif - 
countpairs_s_mu_mocks_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) + sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) + avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) + avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch=0;//Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42): function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); @@ -181,15 +184,15 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_s_mu_mocks_driver_DOUBLE(const struc old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - 
fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { fprintf(stderr,"Unknown kernel!\n"); return NULL; diff --git a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src index 9c70a156..0b30cf7f 100644 --- a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src +++ b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src @@ -94,63 +94,64 @@ countpairs_theta_mocks_func_ptr_DOUBLE countpairs_theta_mocks_driver_DOUBLE(cons /* Array of function pointers */ countpairs_theta_mocks_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_theta_mocks_avx512_instrinsics_DOUBLE, + countpairs_theta_mocks_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_theta_mocks_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ - countpairs_theta_mocks_avx_instrinsics_DOUBLE, + countpairs_theta_mocks_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_theta_mocks_sse_instrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_theta_mocks_avx512_intrinsics_DOUBLE, #endif - countpairs_theta_mocks_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) - /* Since highest_isa is only used in cases where SSE4.2 or AVX is defined, - without this protection, there will be an unnecessary function call - and an unused variable compiler warning. */ const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) + sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) + avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) + avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch=0;//Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } if(function_dispatch >= num_functions) { @@ -162,15 +163,15 @@ countpairs_theta_mocks_func_ptr_DOUBLE 
countpairs_theta_mocks_driver_DOUBLE(cons old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { fprintf(stderr,"Unknown kernel!\n"); return NULL; diff --git a/mocks/DDtheta_mocks/countpairs_theta_mocks_kernels.c.src b/mocks/DDtheta_mocks/countpairs_theta_mocks_kernels.c.src index 7c3169e0..8dbc916b 100644 --- a/mocks/DDtheta_mocks/countpairs_theta_mocks_kernels.c.src +++ b/mocks/DDtheta_mocks/countpairs_theta_mocks_kernels.c.src @@ -246,7 +246,7 @@ static inline int countpairs_theta_mocks_fallback_DOUBLE(const int64_t N0, DOUBL #if defined(__SSE4_2__) #include "sse_calls.h" -static inline int countpairs_theta_mocks_sse_instrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, +static inline int countpairs_theta_mocks_sse_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, @@ -586,7 +586,7 @@ static inline int countpairs_theta_mocks_sse_instrinsics_DOUBLE(const int64_t N0 #if defined(__AVX__) #include "avx_calls.h" -static inline int countpairs_theta_mocks_avx_instrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, +static inline int 
countpairs_theta_mocks_avx_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, @@ -917,7 +917,7 @@ static inline int countpairs_theta_mocks_avx_instrinsics_DOUBLE(const int64_t N0 #if defined(__AVX512F__) #include "avx512_calls.h" -static inline int countpairs_theta_mocks_avx512_instrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, +static inline int countpairs_theta_mocks_avx512_intrinsics_DOUBLE(const int64_t N0, DOUBLE *x0, DOUBLE *y0, DOUBLE *z0, const weight_struct_DOUBLE *weights0, const int64_t N1, DOUBLE *x1, DOUBLE *y1, DOUBLE *z1, const weight_struct_DOUBLE *weights1, diff --git a/mocks/vpf_mocks/countspheres_mocks_impl.c.src b/mocks/vpf_mocks/countspheres_mocks_impl.c.src index 84480a5b..96671c6d 100644 --- a/mocks/vpf_mocks/countspheres_mocks_impl.c.src +++ b/mocks/vpf_mocks/countspheres_mocks_impl.c.src @@ -52,64 +52,68 @@ vpf_mocks_func_ptr_DOUBLE vpf_mocks_driver_DOUBLE(const struct config_options *o return function; } - //Seriously this is the declaration for the function pointers...here be dragons. 
+ /* Array of function pointers */ vpf_mocks_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - vpf_mocks_avx512_intrinsics_DOUBLE, + vpf_mocks_fallback_DOUBLE, +#ifdef __SSE4_2__ + vpf_mocks_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ vpf_mocks_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - vpf_mocks_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + vpf_mocks_avx512_intrinsics_DOUBLE, #endif - vpf_mocks_fallback_DOUBLE }; + const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) + sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) + avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) + avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch=0;//Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX): function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); @@ -118,6 +122,22 @@ vpf_mocks_func_ptr_DOUBLE vpf_mocks_driver_DOUBLE(const struct config_options *o function = allfunctions[function_dispatch]; old_isa = options->instruction_set; + if(options->verbose){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ + fprintf(stderr,"Using fallback kernel\n"); + } else if(function_dispatch == sse_index){ + fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == 
avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); + } else { + fprintf(stderr,"Unknown kernel!\n"); + return NULL; + } + } + return function; } diff --git a/theory/DD/countpairs_impl.c.src b/theory/DD/countpairs_impl.c.src index 14ce55de..77a89266 100644 --- a/theory/DD/countpairs_impl.c.src +++ b/theory/DD/countpairs_impl.c.src @@ -61,39 +61,39 @@ countpairs_func_ptr_DOUBLE countpairs_driver_DOUBLE(const struct config_options }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = 0; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = curr_offset; + int sse_index = curr_index; #ifdef __SSE4_2__ - curr_offset++; - if(highest_isa >= 6) - sse_offset = curr_offset; + curr_index++; + if(highest_isa >= SSE42) + sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = curr_offset; + int avx_index = curr_index; #ifdef __AVX__ - curr_offset++; - if(highest_isa >= 7) - avx_offset = curr_offset; + curr_index++; + if(highest_isa >= AVX) + avx_index = curr_index; #endif /* Check for AVX512F support */ - int avx512_offset = curr_offset; + int avx512_index = curr_index; #ifdef __AVX512F__ - curr_offset++; - if(highest_isa >= 9) - avx512_offset = curr_offset; + curr_index++; + if(highest_isa >= AVX512F) + avx512_index = curr_index; #endif - if( curr_offset != num_functions-1) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* num_functions-1 = %d-1)\n", - curr_offset, num_functions); + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } @@ -101,11 +101,11 @@ countpairs_func_ptr_DOUBLE 
countpairs_driver_DOUBLE(const struct config_options /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } if(function_dispatch >= num_functions) { @@ -118,13 +118,13 @@ countpairs_func_ptr_DOUBLE countpairs_driver_DOUBLE(const struct config_options if(options->verbose){ // Must be ordered low to high, since higher ISA may be aliased to lower ones - if(function_dispatch == fallback_offset){ + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); - } else if(function_dispatch == avx_offset){ + } else if(function_dispatch == avx_index){ fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == avx512_offset){ + } else if(function_dispatch == avx512_index){ fprintf(stderr,"Using AVX512 kernel\n"); } else { fprintf(stderr,"Unknown kernel!\n"); diff --git a/theory/DDrppi/countpairs_rp_pi_impl.c.src b/theory/DDrppi/countpairs_rp_pi_impl.c.src index 87a9abfa..cc5934d5 100644 --- a/theory/DDrppi/countpairs_rp_pi_impl.c.src +++ b/theory/DDrppi/countpairs_rp_pi_impl.c.src @@ -45,67 +45,70 @@ countpairs_rp_pi_func_ptr_DOUBLE countpairs_rp_pi_driver_DOUBLE(const struct con return function; } - /* Array of function pointers */ countpairs_rp_pi_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_rp_pi_avx512_intrinsics_DOUBLE, + countpairs_rp_pi_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_rp_pi_sse_intrinsics_DOUBLE, #endif 
#ifdef __AVX__ - countpairs_rp_pi_avx_intrinsics_DOUBLE, + countpairs_rp_pi_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_rp_pi_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_rp_pi_avx512_intrinsics_DOUBLE, #endif - countpairs_rp_pi_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) + sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) + avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) + avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch=0;//Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } if(function_dispatch >= num_functions) { - fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", + fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); return NULL; } @@ -113,15 +116,15 @@ countpairs_rp_pi_func_ptr_DOUBLE countpairs_rp_pi_driver_DOUBLE(const struct con old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else 
if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { fprintf(stderr,"Unknown kernel!\n"); return NULL; diff --git a/theory/DDsmu/countpairs_s_mu_impl.c.src b/theory/DDsmu/countpairs_s_mu_impl.c.src index b88cbf89..534ec459 100644 --- a/theory/DDsmu/countpairs_s_mu_impl.c.src +++ b/theory/DDsmu/countpairs_s_mu_impl.c.src @@ -48,65 +48,68 @@ countpairs_s_mu_func_ptr_DOUBLE countpairs_s_mu_driver_DOUBLE(const struct confi /* Array of function pointers */ countpairs_s_mu_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - countpairs_s_mu_avx512_intrinsics_DOUBLE, + countpairs_s_mu_fallback_DOUBLE, +#ifdef __SSE4_2__ + countpairs_s_mu_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ - countpairs_s_mu_avx_intrinsics_DOUBLE, + countpairs_s_mu_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - countpairs_s_mu_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + countpairs_s_mu_avx512_intrinsics_DOUBLE, #endif - countpairs_s_mu_fallback_DOUBLE }; const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) + sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) + avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) + avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch=0;//Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { - fprintf(stderr,"In %s> ERROR: Could not resolve the correct 
function.\n Function index = %d must lie between [0, %d)\n", + fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); return NULL; } @@ -114,17 +117,17 @@ countpairs_s_mu_func_ptr_DOUBLE countpairs_s_mu_driver_DOUBLE(const struct confi old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr, "Using AVX512F kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr, "Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ - fprintf(stderr, "Using SSE kernel\n"); + } else if(function_dispatch == sse_index){ + fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { - fprintf(stderr, "Unknown kernel!\n"); + fprintf(stderr,"Unknown kernel!\n"); return NULL; } } diff --git a/theory/vpf/countspheres_impl.c.src b/theory/vpf/countspheres_impl.c.src index 97616d2a..c890a747 100644 --- a/theory/vpf/countspheres_impl.c.src +++ b/theory/vpf/countspheres_impl.c.src @@ -44,63 +44,68 @@ vpf_func_ptr_DOUBLE vpf_driver_DOUBLE(const struct config_options *options) return function; } - //Seriously this is the declaration for the function pointers...here be dragons. 
+/* Array of function pointers */ vpf_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - vpf_avx512_intrinsics_DOUBLE, + vpf_fallback_DOUBLE, +#ifdef __SSE4_2__ + vpf_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ vpf_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - vpf_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + vpf_avx512_intrinsics_DOUBLE, #endif - vpf_fallback_DOUBLE }; + const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + int curr_index = 0; + + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) + sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) + avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) + avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch=0;//Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); @@ -109,6 +114,22 @@ vpf_func_ptr_DOUBLE vpf_driver_DOUBLE(const struct config_options *options) function = allfunctions[function_dispatch]; old_isa = options->instruction_set; + if(options->verbose){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ + fprintf(stderr,"Using fallback kernel\n"); + } else if(function_dispatch == sse_index){ + fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + 
fprintf(stderr,"Using AVX512 kernel\n"); + } else { + fprintf(stderr,"Unknown kernel!\n"); + return NULL; + } + } + return function; } diff --git a/theory/wp/countpairs_wp_impl.c.src b/theory/wp/countpairs_wp_impl.c.src index 49f310b5..54ebeb4c 100644 --- a/theory/wp/countpairs_wp_impl.c.src +++ b/theory/wp/countpairs_wp_impl.c.src @@ -47,72 +47,78 @@ wp_func_ptr_DOUBLE wp_driver_DOUBLE(const struct config_options *options) return function; } - //Seriously this is the declaration for the function pointers + /* Array of function pointers */ wp_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - wp_avx512_intrinsics_DOUBLE, -#endif -#ifdef __AVX2__ - wp_avx2_intrinsics_DOUBLE, + wp_fallback_DOUBLE, +#ifdef __SSE4_2__ + wp_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ - wp_avx_intrinsics_DOUBLE, + wp_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - wp_sse_intrinsics_DOUBLE, +#ifdef __AVX2__ + wp_avx2_intrinsics_DOUBLE, +#endif +#ifdef __AVX512F__ + wp_avx512_intrinsics_DOUBLE, #endif - wp_fallback_DOUBLE }; + const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX2__) || defined(__AVX__) || defined(__SSE4_2__) const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; + int curr_index = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; -#endif - - /* Check for AVX2 support */ - int avx2_offset = fallback_offset; -#ifdef __AVX2__ - avx2_offset = highest_isa >= 8 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) + sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) + avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? curr_offset:fallback_offset; - curr_offset++; + /* Now check if AVX2 is supported by the CPU */ + int avx2_index = curr_index; +#ifdef __AVX2__ + curr_index++; + if(highest_isa >= AVX2) + avx2_index = curr_index; +#endif + + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) + avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } int function_dispatch=0;//Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { - switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; - case(AVX2):function_dispatch=avx2_offset;break; - case(AVX):function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; - } + switch(options->instruction_set) { + case(AVX512F):function_dispatch=avx512_index;break; + 
case(AVX2):function_dispatch=avx2_index;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; + } } if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", @@ -123,20 +129,21 @@ wp_func_ptr_DOUBLE wp_driver_DOUBLE(const struct config_options *options) old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ - fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx2_offset){ - fprintf(stderr,"Using AVX2 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using AVX kernel\n"); - } else if(function_dispatch == sse_offset){ - fprintf(stderr,"Using SSE4 kernel\n"); - } else { - printf("Unknown kernel!\n"); - } + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ + fprintf(stderr,"Using fallback kernel\n"); + } else if(function_dispatch == sse_index){ + fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx2_index){ + fprintf(stderr,"Using AVX2 kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); + } else { + fprintf(stderr,"Unknown kernel!\n"); + return NULL; + } } return function; diff --git a/theory/xi/countpairs_xi_impl.c.src b/theory/xi/countpairs_xi_impl.c.src index d1f01115..3fb00d09 100644 --- a/theory/xi/countpairs_xi_impl.c.src +++ b/theory/xi/countpairs_xi_impl.c.src @@ -45,63 +45,68 @@ xi_func_ptr_DOUBLE xi_driver_DOUBLE(const struct config_options *options) return function; } - //Seriously this 
is the declaration for the function pointers...here be dragons. + /* Array of function pointers */ xi_func_ptr_DOUBLE allfunctions[] = { -#ifdef __AVX512F__ - xi_avx512_intrinsics_DOUBLE, + xi_fallback_DOUBLE, +#ifdef __SSE4_2__ + xi_sse_intrinsics_DOUBLE, #endif #ifdef __AVX__ - xi_avx_intrinsics_DOUBLE, + xi_avx_intrinsics_DOUBLE, #endif -#ifdef __SSE4_2__ - xi_sse_intrinsics_DOUBLE, +#ifdef __AVX512F__ + xi_avx512_intrinsics_DOUBLE, #endif - xi_fallback_DOUBLE }; + const int num_functions = sizeof(allfunctions)/sizeof(void *); - const int fallback_offset = num_functions - 1; + const int fallback_index = 0; #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_2__) const int highest_isa = get_max_usable_isa(); #endif - int curr_offset = 0; - /* Check for AVX512F support */ - int avx512_offset = fallback_offset; -#ifdef __AVX512F__ - avx512_offset = highest_isa >= 9 ? curr_offset:fallback_offset; - curr_offset++; + int curr_index = 0; + + /* Is the SSE function supported at runtime and enabled at compile-time?*/ + int sse_index = curr_index; +#ifdef __SSE4_2__ + curr_index++; + if(highest_isa >= SSE42) + sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ - int avx_offset = fallback_offset; + int avx_index = curr_index; #ifdef __AVX__ - avx_offset = highest_isa >= 7 ? curr_offset:fallback_offset; - curr_offset++; + curr_index++; + if(highest_isa >= AVX) + avx_index = curr_index; #endif - /* Is the SSE function supported at runtime and enabled at compile-time?*/ - int sse_offset = fallback_offset; -#ifdef __SSE4_2__ - sse_offset = highest_isa >= 6 ? 
curr_offset:fallback_offset; - curr_offset++; + /* Check for AVX512F support */ + int avx512_index = curr_index; +#ifdef __AVX512F__ + curr_index++; + if(highest_isa >= AVX512F) + avx512_index = curr_index; #endif - if( curr_offset != fallback_offset) { - fprintf(stderr,"ERROR: Bug in code (current offset = %d *should equal* fallback function offset = %d)\n", - curr_offset, fallback_offset); + + if( curr_index != num_functions-1) { + fprintf(stderr,"ERROR: Bug in code (current index = %d *should equal* num_functions-1 = %d-1)\n", + curr_index, num_functions); return NULL; } - int function_dispatch=0; + int function_dispatch=0;//Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { - case(AVX512F):function_dispatch=avx512_offset;break; + case(AVX512F):function_dispatch=avx512_index;break; case(AVX2): - case(AVX): function_dispatch=avx_offset;break; - case(SSE42):function_dispatch=sse_offset;break; - default:function_dispatch=fallback_offset;break; + case(AVX):function_dispatch=avx_index;break; + case(SSE42):function_dispatch=sse_index;break; + default:function_dispatch=fallback_index;break; } } - if(function_dispatch >= num_functions) { fprintf(stderr,"In %s> ERROR: Could not resolve the correct function.\n Function index = %d must lie between [0, %d)\n", __FUNCTION__, function_dispatch, num_functions); @@ -111,17 +116,17 @@ xi_func_ptr_DOUBLE xi_driver_DOUBLE(const struct config_options *options) old_isa = options->instruction_set; if(options->verbose){ - // This must be first (AVX/SSE may be aliased to fallback) - if(function_dispatch == fallback_offset){ + // Must be ordered low to high, since higher ISA may be aliased to lower ones + if(function_dispatch == fallback_index){ fprintf(stderr,"Using fallback kernel\n"); - } else if(function_dispatch == avx512_offset){ - fprintf(stderr,"Using AVX512 kernel\n"); - } else if(function_dispatch == avx_offset){ - fprintf(stderr,"Using 
AVX kernel\n"); - } else if(function_dispatch == sse_offset){ + } else if(function_dispatch == sse_index){ fprintf(stderr,"Using SSE kernel\n"); + } else if(function_dispatch == avx_index){ + fprintf(stderr,"Using AVX kernel\n"); + } else if(function_dispatch == avx512_index){ + fprintf(stderr,"Using AVX512 kernel\n"); } else { - fprintf(stderr, "Unknown kernel!\n"); + fprintf(stderr,"Unknown kernel!\n"); return NULL; } } diff --git a/utils/cpu_features.c b/utils/cpu_features.c index 0c06487d..b1937809 100644 --- a/utils/cpu_features.c +++ b/utils/cpu_features.c @@ -9,6 +9,7 @@ */ #include "cpu_features.h" +#include "defs.h" #include // Use CPUID to detect what instruction sets the CPU supports @@ -21,7 +22,7 @@ int runtime_instrset_detect(void) if (iset >= 0) { return iset; // called before } - iset = 0; // default value + iset = FALLBACK; // default value int abcd[4] = {0,0,0,0}; // cpuid results cpuid(abcd, 0); // call cpuid function 0 if (abcd[0] == 0) return iset; // no further cpuid function supported @@ -31,36 +32,36 @@ int runtime_instrset_detect(void) if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE - iset = 1; // 1: SSE supported + iset = SSE; // 1: SSE supported if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2 - iset = 2; // 2: SSE2 supported + iset = SSE2; // 2: SSE2 supported if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3 - iset = 3; // 3: SSE3 supported + iset = SSE3; // 3: SSE3 supported if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3 - iset = 4; // 4: SSSE3 supported + iset = SSSE3; // 4: SSSE3 supported if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1 - iset = 5; // 5: SSE4.1 supported + iset = SSE4; // 5: SSE4.1 supported if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2 - iset = 6; // 6: SSE4.2 supported + iset = SSE42; 
// 6: SSE4.2 supported if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S. if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX - iset = 7; // 7: AVX supported + iset = AVX; // 7: AVX supported cpuid(abcd, 7); // call cpuid leaf 7 for feature flags if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2 - iset = 8; // 8: AVX2 supported + iset = AVX2; // 8: AVX2 supported cpuid(abcd, 0xD); // call cpuid leaf 0xD for feature flags if ((abcd[0] & 0x60) != 0x60) return iset; // no AVX512 - iset = 9; // 9: AVX512F supported + iset = AVX512F; // 9: AVX512F supported return iset; } @@ -74,74 +75,74 @@ int get_max_usable_isa(void) iset = runtime_instrset_detect(); switch(iset){ - case 9: + case AVX512F: #ifdef __AVX512F__ - iset = 9; + iset = AVX512F; break; #elif defined(GAS_BUG_DISABLE_AVX512) fprintf(stderr, "[Warning] AVX512 is disabled due to a GNU Assembler bug. Upgrade to binutils >= 2.32 to fix this.\n"); #else fprintf(stderr, "[Warning] The CPU supports AVX512 but the compiler does not. Can you try another compiler?\n"); #endif - case 8: + case AVX2: #ifdef __AVX2__ - iset = 8; + iset = AVX2; break; #else fprintf(stderr, "[Warning] The CPU supports AVX2 but the compiler does not. Can you try another compiler?\n"); #endif - case 7: + case AVX: #ifdef __AVX__ - iset = 7; + iset = AVX; break; #else fprintf(stderr, "[Warning] The CPU supports AVX but the compiler does not. Can you try another compiler?\n"); #endif - case 6: + case SSE42: #ifdef __SSE4_2__ - iset = 6; + iset = SSE42; break; #else fprintf(stderr, "[Warning] The CPU supports SSE4.2 but the compiler does not. Can you try another compiler?\n"); #endif - case 5: + case SSE4: #ifdef __SSE4_1__ - iset = 5; + iset = SSE4; break; #else fprintf(stderr, "[Warning] The CPU supports SSE4.1 but the compiler does not. 
Can you try another compiler?\n"); #endif - case 4: + case SSSE3: #ifdef __SSSE3__ - iset = 4; + iset = SSSE3; break; #else fprintf(stderr, "[Warning] The CPU supports SSSE3 but the compiler does not. Can you try another compiler?\n"); #endif - case 3: + case SSE3: #ifdef __SSE3__ - iset = 3; + iset = SSE3; break; #else fprintf(stderr, "[Warning] The CPU supports SSE3 but the compiler does not. Can you try another compiler?\n"); #endif - case 2: + case SSE2: #ifdef __SSE2__ - iset = 2; + iset = SSE2; break; #else fprintf(stderr, "[Warning] The CPU supports SSE2 but the compiler does not. Can you try another compiler?\n"); #endif - case 1: + case SSE: #ifdef __SSE__ - iset = 1; + iset = SSE; break; #else fprintf(stderr, "[Warning] The CPU supports SSE but the compiler does not. Can you try another compiler?\n"); #endif - case 0: + case FALLBACK: default: - iset = 0; + iset = FALLBACK; break; } diff --git a/utils/defs.h b/utils/defs.h index 5674c601..fbb0027e 100644 --- a/utils/defs.h +++ b/utils/defs.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include From f58a40baf612d1156e72527de4c26596b7e9fe4f Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Tue, 3 Dec 2019 17:49:39 -0500 Subject: [PATCH 6/9] Improve warning message --- utils/cpu_features.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/cpu_features.c b/utils/cpu_features.c index b1937809..85ec3d06 100644 --- a/utils/cpu_features.c +++ b/utils/cpu_features.c @@ -80,9 +80,9 @@ int get_max_usable_isa(void) iset = AVX512F; break; #elif defined(GAS_BUG_DISABLE_AVX512) - fprintf(stderr, "[Warning] AVX512 is disabled due to a GNU Assembler bug. Upgrade to binutils >= 2.32 to fix this.\n"); + fprintf(stderr, "[Warning] AVX512F is disabled due to a GNU Assembler bug. Upgrade to binutils >= 2.32 to fix this.\n"); #else - fprintf(stderr, "[Warning] The CPU supports AVX512 but the compiler does not. 
Can you try another compiler?\n"); + fprintf(stderr, "[Warning] The CPU supports AVX512F but the compiler does not. Can you try another compiler?\n"); #endif case AVX2: #ifdef __AVX2__ From 142a75aaf4a3bbb3b209731b731de541dbb30f86 Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Tue, 3 Dec 2019 18:01:42 -0500 Subject: [PATCH 7/9] Put one-line if statements on a single line --- mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src | 9 +++------ mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src | 9 +++------ .../DDtheta_mocks/countpairs_theta_mocks_impl.c.src | 9 +++------ mocks/vpf_mocks/countspheres_mocks_impl.c.src | 9 +++------ theory/DD/countpairs_impl.c.src | 9 +++------ theory/DDrppi/countpairs_rp_pi_impl.c.src | 9 +++------ theory/DDsmu/countpairs_s_mu_impl.c.src | 9 +++------ theory/vpf/countspheres_impl.c.src | 9 +++------ theory/wp/countpairs_wp_impl.c.src | 12 ++++-------- theory/xi/countpairs_xi_impl.c.src | 9 +++------ 10 files changed, 31 insertions(+), 62 deletions(-) diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src index 965dfe2a..3fb48ef6 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src @@ -137,24 +137,21 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_rp_pi_mocks_driver_DOUBLE(const stru int sse_index = curr_index; #ifdef __SSE4_2__ curr_index++; - if(highest_isa >= SSE42) - sse_index = curr_index; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ int avx_index = curr_index; #ifdef __AVX__ curr_index++; - if(highest_isa >= AVX) - avx_index = curr_index; + if(highest_isa >= AVX) avx_index = curr_index; #endif /* Check for AVX512F support */ int avx512_index = curr_index; #ifdef __AVX512F__ curr_index++; - if(highest_isa >= AVX512F) - avx512_index = curr_index; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif if( curr_index != 
num_functions-1) { diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src index ddf1720f..0770ffc3 100644 --- a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src +++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src @@ -138,24 +138,21 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_s_mu_mocks_driver_DOUBLE(const struc int sse_index = curr_index; #ifdef __SSE4_2__ curr_index++; - if(highest_isa >= SSE42) - sse_index = curr_index; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ int avx_index = curr_index; #ifdef __AVX__ curr_index++; - if(highest_isa >= AVX) - avx_index = curr_index; + if(highest_isa >= AVX) avx_index = curr_index; #endif /* Check for AVX512F support */ int avx512_index = curr_index; #ifdef __AVX512F__ curr_index++; - if(highest_isa >= AVX512F) - avx512_index = curr_index; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif if( curr_index != num_functions-1) { diff --git a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src index 0b30cf7f..2730e9b0 100644 --- a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src +++ b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src @@ -117,24 +117,21 @@ countpairs_theta_mocks_func_ptr_DOUBLE countpairs_theta_mocks_driver_DOUBLE(cons int sse_index = curr_index; #ifdef __SSE4_2__ curr_index++; - if(highest_isa >= SSE42) - sse_index = curr_index; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ int avx_index = curr_index; #ifdef __AVX__ curr_index++; - if(highest_isa >= AVX) - avx_index = curr_index; + if(highest_isa >= AVX) avx_index = curr_index; #endif /* Check for AVX512F support */ int avx512_index = curr_index; #ifdef __AVX512F__ curr_index++; - if(highest_isa >= AVX512F) - avx512_index = curr_index; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif 
if( curr_index != num_functions-1) { diff --git a/mocks/vpf_mocks/countspheres_mocks_impl.c.src b/mocks/vpf_mocks/countspheres_mocks_impl.c.src index 96671c6d..8aa0f28a 100644 --- a/mocks/vpf_mocks/countspheres_mocks_impl.c.src +++ b/mocks/vpf_mocks/countspheres_mocks_impl.c.src @@ -77,24 +77,21 @@ vpf_mocks_func_ptr_DOUBLE vpf_mocks_driver_DOUBLE(const struct config_options *o int sse_index = curr_index; #ifdef __SSE4_2__ curr_index++; - if(highest_isa >= SSE42) - sse_index = curr_index; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ int avx_index = curr_index; #ifdef __AVX__ curr_index++; - if(highest_isa >= AVX) - avx_index = curr_index; + if(highest_isa >= AVX) avx_index = curr_index; #endif /* Check for AVX512F support */ int avx512_index = curr_index; #ifdef __AVX512F__ curr_index++; - if(highest_isa >= AVX512F) - avx512_index = curr_index; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif if( curr_index != num_functions-1) { diff --git a/theory/DD/countpairs_impl.c.src b/theory/DD/countpairs_impl.c.src index 77a89266..7906d02a 100644 --- a/theory/DD/countpairs_impl.c.src +++ b/theory/DD/countpairs_impl.c.src @@ -71,24 +71,21 @@ countpairs_func_ptr_DOUBLE countpairs_driver_DOUBLE(const struct config_options int sse_index = curr_index; #ifdef __SSE4_2__ curr_index++; - if(highest_isa >= SSE42) - sse_index = curr_index; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ int avx_index = curr_index; #ifdef __AVX__ curr_index++; - if(highest_isa >= AVX) - avx_index = curr_index; + if(highest_isa >= AVX) avx_index = curr_index; #endif /* Check for AVX512F support */ int avx512_index = curr_index; #ifdef __AVX512F__ curr_index++; - if(highest_isa >= AVX512F) - avx512_index = curr_index; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif if( curr_index != num_functions-1) { diff --git a/theory/DDrppi/countpairs_rp_pi_impl.c.src 
b/theory/DDrppi/countpairs_rp_pi_impl.c.src index cc5934d5..106fa6a3 100644 --- a/theory/DDrppi/countpairs_rp_pi_impl.c.src +++ b/theory/DDrppi/countpairs_rp_pi_impl.c.src @@ -70,24 +70,21 @@ countpairs_rp_pi_func_ptr_DOUBLE countpairs_rp_pi_driver_DOUBLE(const struct con int sse_index = curr_index; #ifdef __SSE4_2__ curr_index++; - if(highest_isa >= SSE42) - sse_index = curr_index; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ int avx_index = curr_index; #ifdef __AVX__ curr_index++; - if(highest_isa >= AVX) - avx_index = curr_index; + if(highest_isa >= AVX) avx_index = curr_index; #endif /* Check for AVX512F support */ int avx512_index = curr_index; #ifdef __AVX512F__ curr_index++; - if(highest_isa >= AVX512F) - avx512_index = curr_index; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif if( curr_index != num_functions-1) { diff --git a/theory/DDsmu/countpairs_s_mu_impl.c.src b/theory/DDsmu/countpairs_s_mu_impl.c.src index 534ec459..9842fc25 100644 --- a/theory/DDsmu/countpairs_s_mu_impl.c.src +++ b/theory/DDsmu/countpairs_s_mu_impl.c.src @@ -71,24 +71,21 @@ countpairs_s_mu_func_ptr_DOUBLE countpairs_s_mu_driver_DOUBLE(const struct confi int sse_index = curr_index; #ifdef __SSE4_2__ curr_index++; - if(highest_isa >= SSE42) - sse_index = curr_index; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ int avx_index = curr_index; #ifdef __AVX__ curr_index++; - if(highest_isa >= AVX) - avx_index = curr_index; + if(highest_isa >= AVX) avx_index = curr_index; #endif /* Check for AVX512F support */ int avx512_index = curr_index; #ifdef __AVX512F__ curr_index++; - if(highest_isa >= AVX512F) - avx512_index = curr_index; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif if( curr_index != num_functions-1) { diff --git a/theory/vpf/countspheres_impl.c.src b/theory/vpf/countspheres_impl.c.src index c890a747..12b3165a 100644 --- 
a/theory/vpf/countspheres_impl.c.src +++ b/theory/vpf/countspheres_impl.c.src @@ -69,24 +69,21 @@ vpf_func_ptr_DOUBLE vpf_driver_DOUBLE(const struct config_options *options) int sse_index = curr_index; #ifdef __SSE4_2__ curr_index++; - if(highest_isa >= SSE42) - sse_index = curr_index; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ int avx_index = curr_index; #ifdef __AVX__ curr_index++; - if(highest_isa >= AVX) - avx_index = curr_index; + if(highest_isa >= AVX) avx_index = curr_index; #endif /* Check for AVX512F support */ int avx512_index = curr_index; #ifdef __AVX512F__ curr_index++; - if(highest_isa >= AVX512F) - avx512_index = curr_index; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif if( curr_index != num_functions-1) { diff --git a/theory/wp/countpairs_wp_impl.c.src b/theory/wp/countpairs_wp_impl.c.src index 54ebeb4c..28553ea8 100644 --- a/theory/wp/countpairs_wp_impl.c.src +++ b/theory/wp/countpairs_wp_impl.c.src @@ -75,32 +75,28 @@ wp_func_ptr_DOUBLE wp_driver_DOUBLE(const struct config_options *options) int sse_index = curr_index; #ifdef __SSE4_2__ curr_index++; - if(highest_isa >= SSE42) - sse_index = curr_index; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ int avx_index = curr_index; #ifdef __AVX__ curr_index++; - if(highest_isa >= AVX) - avx_index = curr_index; + if(highest_isa >= AVX) avx_index = curr_index; #endif /* Now check if AVX2 is supported by the CPU */ int avx2_index = curr_index; #ifdef __AVX2__ curr_index++; - if(highest_isa >= AVX2) - avx2_index = curr_index; + if(highest_isa >= AVX2) avx2_index = curr_index; #endif /* Check for AVX512F support */ int avx512_index = curr_index; #ifdef __AVX512F__ curr_index++; - if(highest_isa >= AVX512F) - avx512_index = curr_index; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif if( curr_index != num_functions-1) { diff --git 
a/theory/xi/countpairs_xi_impl.c.src b/theory/xi/countpairs_xi_impl.c.src index 3fb00d09..2925377f 100644 --- a/theory/xi/countpairs_xi_impl.c.src +++ b/theory/xi/countpairs_xi_impl.c.src @@ -70,24 +70,21 @@ xi_func_ptr_DOUBLE xi_driver_DOUBLE(const struct config_options *options) int sse_index = curr_index; #ifdef __SSE4_2__ curr_index++; - if(highest_isa >= SSE42) - sse_index = curr_index; + if(highest_isa >= SSE42) sse_index = curr_index; #endif /* Now check if AVX is supported by the CPU */ int avx_index = curr_index; #ifdef __AVX__ curr_index++; - if(highest_isa >= AVX) - avx_index = curr_index; + if(highest_isa >= AVX) avx_index = curr_index; #endif /* Check for AVX512F support */ int avx512_index = curr_index; #ifdef __AVX512F__ curr_index++; - if(highest_isa >= AVX512F) - avx512_index = curr_index; + if(highest_isa >= AVX512F) avx512_index = curr_index; #endif if( curr_index != num_functions-1) { From aa57a8a82f77033baab2ced775205d23ca728aeb Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Wed, 4 Dec 2019 10:28:30 -0500 Subject: [PATCH 8/9] Fix default ISA for command line and C --- mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src | 2 +- mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src | 2 +- mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src | 2 +- mocks/vpf_mocks/countspheres_mocks_impl.c.src | 2 +- theory/DD/countpairs_impl.c.src | 2 +- theory/DDrppi/countpairs_rp_pi_impl.c.src | 2 +- theory/DDsmu/countpairs_s_mu_impl.c.src | 2 +- theory/vpf/countspheres_impl.c.src | 2 +- theory/wp/countpairs_wp_impl.c.src | 2 +- theory/xi/countpairs_xi_impl.c.src | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src index 3fb48ef6..84fe2dda 100644 --- a/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src +++ b/mocks/DDrppi_mocks/countpairs_rp_pi_mocks_impl.c.src @@ -160,7 +160,7 @@ countpairs_mocks_func_ptr_DOUBLE 
countpairs_rp_pi_mocks_driver_DOUBLE(const stru return NULL; } - int function_dispatch=0;//Set default to fastest available + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { diff --git a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src index 0770ffc3..b52a67c4 100644 --- a/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src +++ b/mocks/DDsmu_mocks/countpairs_s_mu_mocks_impl.c.src @@ -161,7 +161,7 @@ countpairs_mocks_func_ptr_DOUBLE countpairs_s_mu_mocks_driver_DOUBLE(const struc return NULL; } - int function_dispatch=0;//Set default to fastest available + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { diff --git a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src index 2730e9b0..1fd759b2 100644 --- a/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src +++ b/mocks/DDtheta_mocks/countpairs_theta_mocks_impl.c.src @@ -140,7 +140,7 @@ countpairs_theta_mocks_func_ptr_DOUBLE countpairs_theta_mocks_driver_DOUBLE(cons return NULL; } - int function_dispatch=0;//Set default to fastest available + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { diff --git a/mocks/vpf_mocks/countspheres_mocks_impl.c.src b/mocks/vpf_mocks/countspheres_mocks_impl.c.src index 8aa0f28a..6b55081b 100644 --- a/mocks/vpf_mocks/countspheres_mocks_impl.c.src +++ b/mocks/vpf_mocks/countspheres_mocks_impl.c.src @@ -100,7 +100,7 @@ vpf_mocks_func_ptr_DOUBLE vpf_mocks_driver_DOUBLE(const struct config_options *o return NULL; } - int function_dispatch=0;//Set default to fastest available + 
int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { diff --git a/theory/DD/countpairs_impl.c.src b/theory/DD/countpairs_impl.c.src index 7906d02a..0d37dda3 100644 --- a/theory/DD/countpairs_impl.c.src +++ b/theory/DD/countpairs_impl.c.src @@ -94,7 +94,7 @@ countpairs_func_ptr_DOUBLE countpairs_driver_DOUBLE(const struct config_options return NULL; } - int function_dispatch=0;//Set default to fastest available + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { diff --git a/theory/DDrppi/countpairs_rp_pi_impl.c.src b/theory/DDrppi/countpairs_rp_pi_impl.c.src index 106fa6a3..32564125 100644 --- a/theory/DDrppi/countpairs_rp_pi_impl.c.src +++ b/theory/DDrppi/countpairs_rp_pi_impl.c.src @@ -93,7 +93,7 @@ countpairs_rp_pi_func_ptr_DOUBLE countpairs_rp_pi_driver_DOUBLE(const struct con return NULL; } - int function_dispatch=0;//Set default to fastest available + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { diff --git a/theory/DDsmu/countpairs_s_mu_impl.c.src b/theory/DDsmu/countpairs_s_mu_impl.c.src index 9842fc25..09ad5b60 100644 --- a/theory/DDsmu/countpairs_s_mu_impl.c.src +++ b/theory/DDsmu/countpairs_s_mu_impl.c.src @@ -94,7 +94,7 @@ countpairs_s_mu_func_ptr_DOUBLE countpairs_s_mu_driver_DOUBLE(const struct confi return NULL; } - int function_dispatch=0;//Set default to fastest available + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { diff --git a/theory/vpf/countspheres_impl.c.src b/theory/vpf/countspheres_impl.c.src index 
12b3165a..fd144a57 100644 --- a/theory/vpf/countspheres_impl.c.src +++ b/theory/vpf/countspheres_impl.c.src @@ -92,7 +92,7 @@ vpf_func_ptr_DOUBLE vpf_driver_DOUBLE(const struct config_options *options) return NULL; } - int function_dispatch=0;//Set default to fastest available + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { diff --git a/theory/wp/countpairs_wp_impl.c.src b/theory/wp/countpairs_wp_impl.c.src index 28553ea8..4c348614 100644 --- a/theory/wp/countpairs_wp_impl.c.src +++ b/theory/wp/countpairs_wp_impl.c.src @@ -105,7 +105,7 @@ wp_func_ptr_DOUBLE wp_driver_DOUBLE(const struct config_options *options) return NULL; } - int function_dispatch=0;//Set default to fastest available + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { diff --git a/theory/xi/countpairs_xi_impl.c.src b/theory/xi/countpairs_xi_impl.c.src index 2925377f..ea1b006a 100644 --- a/theory/xi/countpairs_xi_impl.c.src +++ b/theory/xi/countpairs_xi_impl.c.src @@ -93,7 +93,7 @@ xi_func_ptr_DOUBLE xi_driver_DOUBLE(const struct config_options *options) return NULL; } - int function_dispatch=0;//Set default to fastest available + int function_dispatch = num_functions-1; //Set default to fastest available /* Check that cpu supports feature */ if(options->instruction_set >= 0) { switch(options->instruction_set) { From d010d5d590fa03cc8debd3bff866485cb2830180 Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Sun, 8 Dec 2019 20:19:35 -0500 Subject: [PATCH 9/9] Move ISA enum to cpu_features.h --- utils/cpu_features.c | 4 ++-- utils/cpu_features.h | 16 ++++++++++++++++ utils/defs.h | 17 +---------------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/utils/cpu_features.c b/utils/cpu_features.c index 
85ec3d06..7e5b2813 100644 --- a/utils/cpu_features.c +++ b/utils/cpu_features.c @@ -8,10 +8,10 @@ Adapted from Agner Fog's vectorclass: http://agner.org/ */ -#include "cpu_features.h" -#include "defs.h" #include +#include "cpu_features.h" + // Use CPUID to detect what instruction sets the CPU supports // The compiler may not support all these features though! // Use get_max_usable_isa() to find the max ISA supported diff --git a/utils/cpu_features.h b/utils/cpu_features.h index de8b00fa..b85d0f04 100644 --- a/utils/cpu_features.h +++ b/utils/cpu_features.h @@ -17,6 +17,22 @@ extern "C" { #endif +typedef enum { + DEFAULT=-42,/* present simply to make the enum a signed int*/ + FALLBACK=0, /* No special options */ + SSE=1, /* 64 bit vectors */ + SSE2=2, /* 128 bit vectors */ + SSE3=3, /* 128 bit vectors */ + SSSE3=4, /* 128 bit vectors */ + SSE4=5,/* 128bit vectors */ + SSE42=6, /* 128bit vectors with blend operations */ + AVX=7, /* 256bit vector width */ + AVX2=8, /* AVX2 (integer operations)*/ + AVX512F=9,/* AVX 512 Foundation */ + NUM_ISA /*NUM_ISA will be the next integer after + the last declared enum. 
AVX512F:=9 (so, NUM_ISA==10)*/ +} isa; //name for instruction sets -> corresponds to the return values for functions in cpu_features.c + static inline void cpuid (int output[4], int functionnumber) { #if defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax diff --git a/utils/defs.h b/utils/defs.h index fbb0027e..554fc644 100644 --- a/utils/defs.h +++ b/utils/defs.h @@ -15,6 +15,7 @@ #include #include "macros.h" +#include "cpu_features.h" #ifdef __cplusplus extern "C" { @@ -22,22 +23,6 @@ extern "C" { #define API_VERSION STR("2.3.1") -typedef enum { - DEFAULT=-42,/* present simply to make the enum a signed int*/ - FALLBACK=0, /* No special options */ - SSE=1, /* 64 bit vectors */ - SSE2=2, /* 128 bit vectors */ - SSE3=3, /* 128 bit vectors */ - SSSE3=4, /* 128 bit vectors */ - SSE4=5,/* 128bit vectors */ - SSE42=6, /* 128bit vectors with blend operations */ - AVX=7, /* 256bit vector width */ - AVX2=8, /* AVX2 (integer operations)*/ - AVX512F=9,/* AVX 512 Foundation */ - NUM_ISA /*NUM_ISA will be the next integer after - the last declared enum. AVX512F:=9 (so, NUM_ISA==10)*/ -} isa;//name for instruction sets -> corresponds to the return from instrset_detect in cpu_features.c - /* Macros as mask for the binning_flags */ /* These consititute the 32 bytes for