From 62c0721d9a5aa944901b0eb0510a7f041742c5a4 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 16 Oct 2023 13:14:12 -0700 Subject: [PATCH 1/3] Avoid using _mm512_set_epi16: missing in gcc-8 --- src/avx512-16bit-common.h | 148 ++++++-------------------------------- src/avx512-common-qsort.h | 1 + 2 files changed, 21 insertions(+), 128 deletions(-) diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h index 532da825..288f85d0 100644 --- a/src/avx512-16bit-common.h +++ b/src/avx512-16bit-common.h @@ -99,38 +99,11 @@ struct avx512_16bit_swizzle_ops { __m512i v = vtype::cast_to(reg); if constexpr (scale == 2) { - __m512i mask = _mm512_set_epi16(30, - 31, - 28, - 29, - 26, - 27, - 24, - 25, - 22, - 23, - 20, - 21, - 18, - 19, - 16, - 17, - 14, - 15, - 12, - 13, - 10, - 11, - 8, - 9, - 6, - 7, - 4, - 5, - 2, - 3, - 0, - 1); + std::vector arr + = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, + 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, + 23, 22, 25, 24, 27, 26, 29, 28, 31, 30}; + __m512i mask = _mm512_loadu_si512(arr.data()); v = _mm512_permutexvar_epi16(mask, v); } else if constexpr (scale == 4) { @@ -160,108 +133,27 @@ struct avx512_16bit_swizzle_ops { if constexpr (scale == 2) { return swap_n(reg); } else if constexpr (scale == 4) { - __m512i mask = _mm512_set_epi16(28, - 29, - 30, - 31, - 24, - 25, - 26, - 27, - 20, - 21, - 22, - 23, - 16, - 17, - 18, - 19, - 12, - 13, - 14, - 15, - 8, - 9, - 10, - 11, - 4, - 5, - 6, - 7, - 0, - 1, - 2, - 3); + std::vector arr + = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, + 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, + 21, 20, 27, 26, 25, 24, 31, 30, 29, 28}; + __m512i mask = _mm512_loadu_si512(arr.data()); v = _mm512_permutexvar_epi16(mask, v); } else if constexpr (scale == 8) { - __m512i mask = _mm512_set_epi16(24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7); + std::vector arr + = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, + 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, + 17, 16, 31, 30, 29, 28, 27, 26, 25, 24}; + __m512i mask = _mm512_loadu_si512(arr.data()); v = _mm512_permutexvar_epi16(mask, v); } else if constexpr (scale == 16) { - __m512i mask = _mm512_set_epi16(16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15); + std::vector arr + = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, + 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16}; + __m512i mask = _mm512_loadu_si512(arr.data()); v = _mm512_permutexvar_epi16(mask, v); } else if constexpr (scale == 32) { diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index b969a069..2a39dd25 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -41,6 +41,7 @@ #include #include #include +#include #define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() #define X86_SIMD_SORT_INFINITYF std::numeric_limits::infinity() From 48594f17b78398cbeaeec51de42b8ade22475286 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 16 Oct 2023 13:17:32 -0700 Subject: [PATCH 2/3] Convert to char* to comply with _mm_prefetch(char*, ..) --- src/avx512-common-qsort.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index 2a39dd25..c6829ef4 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -451,8 +451,8 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr, X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes); - _mm_prefetch(arr + right + ii * vtype::numlanes - - num_unroll * vtype::numlanes, + _mm_prefetch((char *)(arr + right + ii * vtype::numlanes + - num_unroll * vtype::numlanes), _MM_HINT_T0); } } @@ -460,8 +460,8 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr, X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes); - _mm_prefetch(arr + left + ii * vtype::numlanes - + num_unroll * vtype::numlanes, + _mm_prefetch((char *)(arr + left + ii * vtype::numlanes + + num_unroll * vtype::numlanes), _MM_HINT_T0); } left += num_unroll * vtype::numlanes; From 65e04b6bc825d9759d49e4997b2962b972d258e0 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 16 Oct 2023 21:45:46 -0700 Subject: [PATCH 3/3] Use popcnt_u32 --- src/avx512-common-qsort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index c6829ef4..1fdf3627 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -250,7 +250,7 @@ X86_SIMD_SORT_INLINE arrsize_t partition_vec(type_t *l_store, reg_t &biggest_vec) { typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec); - arrsize_t amount_ge_pivot = _mm_popcnt_u64(ge_mask); + int amount_ge_pivot = _mm_popcnt_u32((int)ge_mask); vtype::mask_compressstoreu(l_store, vtype::knot_opmask(ge_mask), curr_vec); vtype::mask_compressstoreu(