From a3f675308f173815ffb6002b3c5ddd8b2350481d Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 23 Jan 2024 11:18:54 -0800 Subject: [PATCH 1/5] Build issues on macOS: deduce the pointer to arg with templates --- src/xss-common-argsort.h | 46 ++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/src/xss-common-argsort.h b/src/xss-common-argsort.h index 05605831..bf486a2e 100644 --- a/src/xss-common-argsort.h +++ b/src/xss-common-argsort.h @@ -291,14 +291,15 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, template -X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr, - arrsize_t *arg, - arrsize_t left, - arrsize_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) + typename type_t = typename vtype::type_t, + typename argtype_t = typename argtype::type_t> +X86_SIMD_SORT_INLINE arrsize_t argpartition_unrolled(type_t *arr, + argtype_t *arg, + arrsize_t left, + arrsize_t right, + type_t pivot, + type_t *smallest, + type_t *biggest) { if (right - left <= 8 * num_unroll * vtype::numlanes) { return partition_avx512( @@ -422,9 +423,12 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr, return l_store; } -template +template X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, - arrsize_t *arg, + argtype_t *arg, const arrsize_t left, const arrsize_t right) { @@ -468,9 +472,12 @@ X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, } } -template +template X86_SIMD_SORT_INLINE void argsort_64bit_(type_t *arr, - arrsize_t *arg, + argtype_t *arg, arrsize_t left, arrsize_t right, arrsize_t max_iters) @@ -490,10 +497,10 @@ X86_SIMD_SORT_INLINE void argsort_64bit_(type_t *arr, arr, arg + left, (int32_t)(right + 1 - left)); return; } - type_t pivot = get_pivot_64bit(arr, arg, left, right); + type_t pivot = get_pivot_64bit(arr, arg, left, right); type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - arrsize_t pivot_index = partition_avx512_unrolled( + arrsize_t pivot_index = argpartition_unrolled( arr, arg, left, right + 1, pivot, &smallest, &biggest); if (pivot != smallest) argsort_64bit_( @@ -503,9 +510,12 @@ X86_SIMD_SORT_INLINE void argsort_64bit_(type_t *arr, arr, arg, pivot_index, right, max_iters - 1); } -template +template X86_SIMD_SORT_INLINE void argselect_64bit_(type_t *arr, - arrsize_t *arg, + argtype_t *arg, arrsize_t pos, arrsize_t left, arrsize_t right, @@ -526,10 +536,10 @@ X86_SIMD_SORT_INLINE void argselect_64bit_(type_t *arr, arr, arg + left, (int32_t)(right + 1 - left)); return; } - type_t pivot = get_pivot_64bit(arr, arg, left, right); + type_t pivot = get_pivot_64bit(arr, arg, left, right); type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - arrsize_t pivot_index = partition_avx512_unrolled( + arrsize_t pivot_index = argpartition_unrolled( arr, arg, left, right + 1, pivot, &smallest, &biggest); if ((pivot != smallest) && (pos < pivot_index)) argselect_64bit_( From ee072b4a98694656a9deb189f5da05281ab6c105 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 23 Jan 2024 14:19:42 -0800 Subject: [PATCH 2/5] Use uint64_t for arrsize_t on macOS 64-bit --- src/xss-common-argsort.h | 33 --------------------------------- src/xss-common-includes.h | 8 ++++++++ 2 files changed, 8 insertions(+), 33 deletions(-) diff --git a/src/xss-common-argsort.h b/src/xss-common-argsort.h index bf486a2e..075666f6 100644 --- a/src/xss-common-argsort.h +++ b/src/xss-common-argsort.h @@ -559,19 +559,10 @@ avx512_argsort(T *arr, arrsize_t *arg, arrsize_t arrsize, bool hasnan = false) ymm_vector, zmm_vector>::type; -/* Workaround for NumPy failed build on macOS x86_64: implicit instantiation of - * undefined template 'zmm_vector'*/ -#ifdef __APPLE__ - using argtype = - typename std::conditional, - zmm_vector>::type; -#else using argtype = typename std::conditional, zmm_vector>::type; -#endif if (arrsize > 1) { if constexpr (std::is_floating_point_v) { @@ -605,18 +596,10 @@ avx2_argsort(T *arr, arrsize_t *arg, arrsize_t arrsize, bool hasnan = false) avx2_half_vector, avx2_vector>::type; -#ifdef __APPLE__ - using argtype = - typename std::conditional, - avx2_vector>::type; -#else using argtype = typename std::conditional, avx2_vector>::type; -#endif - if (arrsize > 1) { if constexpr (std::is_floating_point_v) { if ((hasnan) && (array_has_nan(arr, arrsize))) { @@ -653,19 +636,10 @@ X86_SIMD_SORT_INLINE void avx512_argselect(T *arr, ymm_vector, zmm_vector>::type; -/* Workaround for NumPy failed build on macOS x86_64: implicit instantiation of - * undefined template 'zmm_vector'*/ -#ifdef __APPLE__ - using argtype = - typename std::conditional, - zmm_vector>::type; -#else using argtype = typename std::conditional, zmm_vector>::type; -#endif if (arrsize > 1) { if constexpr (std::is_floating_point_v) { @@ -702,17 +676,10 @@ X86_SIMD_SORT_INLINE void avx2_argselect(T *arr, avx2_half_vector, avx2_vector>::type; -#ifdef __APPLE__ - using argtype = - typename std::conditional, - avx2_vector>::type; -#else using argtype = typename std::conditional, avx2_vector>::type; -#endif if (arrsize > 1) { if constexpr (std::is_floating_point_v) { diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h index 98a3fe15..aa4070a6 100644 --- a/src/xss-common-includes.h +++ b/src/xss-common-includes.h @@ -90,7 +90,15 @@ constexpr bool always_false = false; #define NETWORK_32BIT_6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 #define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 +/* + * workaround on 64-bit macOS which defines size_t as unsigned long and defines + * uint64_t as unsigned long long, both of which are 8 bytes + */ +#if defined(__APPLE__) && defined(__x86_64__) +typedef uint64_t arrsize_t; +#else typedef size_t arrsize_t; +#endif template struct zmm_vector; From 1558efaac371bb0f50d48353e78ec71cacb770d7 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 23 Jan 2024 15:07:48 -0800 Subject: [PATCH 3/5] Define zmm_vector on macOS --- src/avx512-64bit-common.h | 13 +++++++++++++ src/xss-common-includes.h | 8 -------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 65ee85db..0c4e3d58 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -912,6 +912,19 @@ struct zmm_vector { left_addr, right_addr, k, reg); } }; + +/* + * workaround on 64-bit macOS which defines size_t as unsigned long and defines + * uint64_t as unsigned long long, both of which are 8 bytes + */ +#if defined(__APPLE__) && defined(__x86_64__) +static_assert(sizeof(size_t) == sizeof(uint64_t), + "Size of size_t and uint64_t are not the same"); +template <> +struct zmm_vector : public zmm_vector { +}; +#endif + template <> struct zmm_vector { using type_t = double; diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h index aa4070a6..98a3fe15 100644 --- a/src/xss-common-includes.h +++ b/src/xss-common-includes.h @@ -90,15 +90,7 @@ constexpr bool always_false = false; #define NETWORK_32BIT_6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 #define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 -/* - * workaround on 64-bit macOS which defines size_t as unsigned long and defines - * uint64_t as unsigned long long, both of which are 8 bytes - */ -#if defined(__APPLE__) && defined(__x86_64__) -typedef uint64_t arrsize_t; -#else typedef size_t arrsize_t; -#endif template struct zmm_vector; From 26afea8d8f1f3e472c15d3d5cc1d9de39259b70b Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 23 Jan 2024 15:13:35 -0800 Subject: [PATCH 4/5] Revert "Build issues on macOS: deduce the pointer to arg with templates" This reverts commit a3f675308f173815ffb6002b3c5ddd8b2350481d. --- src/xss-common-argsort.h | 46 ++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/src/xss-common-argsort.h b/src/xss-common-argsort.h index 075666f6..a0a6efb1 100644 --- a/src/xss-common-argsort.h +++ b/src/xss-common-argsort.h @@ -291,15 +291,14 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, template -X86_SIMD_SORT_INLINE arrsize_t argpartition_unrolled(type_t *arr, - argtype_t *arg, - arrsize_t left, - arrsize_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) + typename type_t = typename vtype::type_t> +X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr, + arrsize_t *arg, + arrsize_t left, + arrsize_t right, + type_t pivot, + type_t *smallest, + type_t *biggest) { if (right - left <= 8 * num_unroll * vtype::numlanes) { return partition_avx512( @@ -423,12 +422,9 @@ X86_SIMD_SORT_INLINE arrsize_t argpartition_unrolled(type_t *arr, return l_store; } -template +template X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, - argtype_t *arg, + arrsize_t *arg, const arrsize_t left, const arrsize_t right) { @@ -472,12 +468,9 @@ X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, } } -template +template X86_SIMD_SORT_INLINE void argsort_64bit_(type_t *arr, - argtype_t *arg, + arrsize_t *arg, arrsize_t left, arrsize_t right, arrsize_t max_iters) @@ -497,10 +490,10 @@ X86_SIMD_SORT_INLINE void argsort_64bit_(type_t *arr, arr, arg + left, (int32_t)(right + 1 - left)); return; } - type_t pivot = get_pivot_64bit(arr, arg, left, right); + type_t pivot = get_pivot_64bit(arr, arg, left, right); type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - arrsize_t pivot_index = argpartition_unrolled( + arrsize_t pivot_index = partition_avx512_unrolled( arr, arg, left, right + 1, pivot, &smallest, &biggest); if (pivot != smallest) argsort_64bit_( @@ -510,12 +503,9 @@ X86_SIMD_SORT_INLINE void argsort_64bit_(type_t *arr, arr, arg, pivot_index, right, max_iters - 1); } -template +template X86_SIMD_SORT_INLINE void argselect_64bit_(type_t *arr, - argtype_t *arg, + arrsize_t *arg, arrsize_t pos, arrsize_t left, arrsize_t right, @@ -536,10 +526,10 @@ X86_SIMD_SORT_INLINE void argselect_64bit_(type_t *arr, arr, arg + left, (int32_t)(right + 1 - left)); return; } - type_t pivot = get_pivot_64bit(arr, arg, left, right); + type_t pivot = get_pivot_64bit(arr, arg, left, right); type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - arrsize_t pivot_index = argpartition_unrolled( + arrsize_t pivot_index = partition_avx512_unrolled( arr, arg, left, right + 1, pivot, &smallest, &biggest); if ((pivot != smallest) && (pos < pivot_index)) argselect_64bit_( From 78ed2fda5efaecc28542462bd33273996ced62fb Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 23 Jan 2024 15:20:27 -0800 Subject: [PATCH 5/5] Explicitly use arrsize_t* for argsort_n methods --- src/xss-network-keyvaluesort.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xss-network-keyvaluesort.hpp b/src/xss-network-keyvaluesort.hpp index 8c366a8a..1cbbc159 100644 --- a/src/xss-network-keyvaluesort.hpp +++ b/src/xss-network-keyvaluesort.hpp @@ -442,7 +442,7 @@ bitonic_fullmerge_n_vec(typename keyType::reg_t *keys, template X86_SIMD_SORT_INLINE void argsort_n_vec(typename keyType::type_t *keys, - typename indexType::type_t *indices, + arrsize_t *indices, int N) { using kreg_t = typename keyType::reg_t; @@ -587,7 +587,7 @@ X86_SIMD_SORT_INLINE void kvsort_n_vec(typename keyType::type_t *keys, template X86_SIMD_SORT_INLINE void argsort_n(typename keyType::type_t *keys, - typename indexType::type_t *indices, + arrsize_t *indices, int N) { static_assert(keyType::numlanes == indexType::numlanes,