From 14beea5bbf18131a992ed0e9377959607a48ba1c Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 28 Nov 2023 13:56:20 -0800 Subject: [PATCH 1/6] Support for 32-bit dtype in key-value sort --- examples/avx512-kv.cpp | 6 ++++- lib/x86simdsort-skx.cpp | 44 +++++++++++++++++++++++-------- lib/x86simdsort.cpp | 23 +++++++++------- src/avx512-64bit-common.h | 12 +++++++++ src/avx512-64bit-keyvaluesort.hpp | 39 ++++++++++++++++----------- 5 files changed, 87 insertions(+), 37 deletions(-) diff --git a/examples/avx512-kv.cpp b/examples/avx512-kv.cpp index 26fc910d..c789b7c8 100644 --- a/examples/avx512-kv.cpp +++ b/examples/avx512-kv.cpp @@ -5,6 +5,7 @@ int main() { int64_t arr1[size]; uint64_t arr2[size]; double arr3[size]; + float arr4[size]; avx512_qsort_kv(arr1, arr1, size); avx512_qsort_kv(arr1, arr2, size); avx512_qsort_kv(arr1, arr3, size); @@ -13,6 +14,9 @@ int main() { avx512_qsort_kv(arr2, arr3, size); avx512_qsort_kv(arr3, arr1, size); avx512_qsort_kv(arr3, arr2, size); - avx512_qsort_kv(arr3, arr3, size); + avx512_qsort_kv(arr1, arr4, size); + avx512_qsort_kv(arr2, arr4, size); + avx512_qsort_kv(arr3, arr4, size); + return 0; return 0; } diff --git a/lib/x86simdsort-skx.cpp b/lib/x86simdsort-skx.cpp index 1dabfb71..02faa90f 100644 --- a/lib/x86simdsort-skx.cpp +++ b/lib/x86simdsort-skx.cpp @@ -33,9 +33,34 @@ return avx512_argselect(arr, k, arrsize, hasnan); \ } -#define DEFINE_KEYVALUE_METHODS(type1, type2) \ +#define DEFINE_KEYVALUE_METHODS(type) \ template <> \ - void keyvalue_qsort(type1 *key, type2* val, size_t arrsize, bool hasnan) \ + void keyvalue_qsort(type *key, uint64_t* val, size_t arrsize, bool hasnan) \ + { \ + avx512_qsort_kv(key, val, arrsize, hasnan); \ + } \ + template <> \ + void keyvalue_qsort(type *key, int64_t* val, size_t arrsize, bool hasnan) \ + { \ + avx512_qsort_kv(key, val, arrsize, hasnan); \ + } \ + template <> \ + void keyvalue_qsort(type *key, double* val, size_t arrsize, bool hasnan) \ + { \ + avx512_qsort_kv(key, 
val, arrsize, hasnan); \ + } \ + template <> \ + void keyvalue_qsort(type *key, uint32_t* val, size_t arrsize, bool hasnan) \ + { \ + avx512_qsort_kv(key, val, arrsize, hasnan); \ + } \ + template <> \ + void keyvalue_qsort(type *key, int32_t* val, size_t arrsize, bool hasnan) \ + { \ + avx512_qsort_kv(key, val, arrsize, hasnan); \ + } \ + template <> \ + void keyvalue_qsort(type *key, float* val, size_t arrsize, bool hasnan) \ { \ avx512_qsort_kv(key, val, arrsize, hasnan); \ } \ @@ -49,14 +74,11 @@ namespace avx512 { DEFINE_ALL_METHODS(uint64_t) DEFINE_ALL_METHODS(int64_t) DEFINE_ALL_METHODS(double) - DEFINE_KEYVALUE_METHODS(double, uint64_t) - DEFINE_KEYVALUE_METHODS(double, int64_t) - DEFINE_KEYVALUE_METHODS(double, double) - DEFINE_KEYVALUE_METHODS(uint64_t, uint64_t) - DEFINE_KEYVALUE_METHODS(uint64_t, int64_t) - DEFINE_KEYVALUE_METHODS(uint64_t, double) - DEFINE_KEYVALUE_METHODS(int64_t, uint64_t) - DEFINE_KEYVALUE_METHODS(int64_t, int64_t) - DEFINE_KEYVALUE_METHODS(int64_t, double) + DEFINE_KEYVALUE_METHODS(uint64_t) + DEFINE_KEYVALUE_METHODS(int64_t) + DEFINE_KEYVALUE_METHODS(double) + DEFINE_KEYVALUE_METHODS(uint32_t) + DEFINE_KEYVALUE_METHODS(int32_t) + DEFINE_KEYVALUE_METHODS(float) } // namespace avx512 } // namespace xss diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 86caeb0e..8ebbc6be 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -196,14 +196,19 @@ DISPATCH_ALL(argselect, (ISA_LIST("avx512_skx")), (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(uint64_t, int64_t, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(uint64_t, uint64_t, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(uint64_t, double, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(int64_t, int64_t, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(int64_t, uint64_t, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(int64_t, double, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(double, int64_t, (ISA_LIST("avx512_skx"))) 
-DISPATCH_KEYVALUE_SORT(double, double, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(double, uint64_t, (ISA_LIST("avx512_skx"))) +#define DISPATCH_KEYVALUE_SORT_FORTYPE(type) \ + DISPATCH_KEYVALUE_SORT(type, uint64_t, (ISA_LIST("avx512_skx")))\ + DISPATCH_KEYVALUE_SORT(type, int64_t, (ISA_LIST("avx512_skx")))\ + DISPATCH_KEYVALUE_SORT(type, double, (ISA_LIST("avx512_skx")))\ + DISPATCH_KEYVALUE_SORT(type, uint32_t, (ISA_LIST("avx512_skx")))\ + DISPATCH_KEYVALUE_SORT(type, int32_t, (ISA_LIST("avx512_skx")))\ + DISPATCH_KEYVALUE_SORT(type, float, (ISA_LIST("avx512_skx")))\ + +DISPATCH_KEYVALUE_SORT_FORTYPE(uint64_t) +DISPATCH_KEYVALUE_SORT_FORTYPE(int64_t) +DISPATCH_KEYVALUE_SORT_FORTYPE(double) +DISPATCH_KEYVALUE_SORT_FORTYPE(uint32_t) +DISPATCH_KEYVALUE_SORT_FORTYPE(int32_t) +DISPATCH_KEYVALUE_SORT_FORTYPE(float) } // namespace x86simdsort diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index e7f9f44c..909f3b2b 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -186,6 +186,10 @@ struct ymm_vector { // return _mm256_shuffle_ps(zmm, zmm, mask); //} } + static reg_t sort_vec(reg_t x) + { + return sort_zmm_64bit>(x); + } static void storeu(void *mem, reg_t x) { _mm256_storeu_ps((float *)mem, x); @@ -342,6 +346,10 @@ struct ymm_vector { * 32-bit and 64-bit */ return _mm256_shuffle_epi32(zmm, 0b10110001); } + static reg_t sort_vec(reg_t x) + { + return sort_zmm_64bit>(x); + } static void storeu(void *mem, reg_t x) { _mm256_storeu_si256((__m256i *)mem, x); @@ -498,6 +506,10 @@ struct ymm_vector { * 32-bit and 64-bit */ return _mm256_shuffle_epi32(zmm, 0b10110001); } + static reg_t sort_vec(reg_t x) + { + return sort_zmm_64bit>(x); + } static void storeu(void *mem, reg_t x) { _mm256_storeu_si256((__m256i *)mem, x); diff --git a/src/avx512-64bit-keyvaluesort.hpp b/src/avx512-64bit-keyvaluesort.hpp index 8281d2db..55f79bb1 100644 --- a/src/avx512-64bit-keyvaluesort.hpp +++ b/src/avx512-64bit-keyvaluesort.hpp @@ -558,7 +558,7 @@ 
template (keys, indexes, i, size); if (i == 0) { break; } } @@ -617,26 +617,33 @@ template X86_SIMD_SORT_INLINE void avx512_qsort_kv(T1 *keys, T2 *indexes, arrsize_t arrsize, bool hasnan = false) { - UNUSED(hasnan); + using keytype = typename std::conditional, + zmm_vector>::type; + using valtype = typename std::conditional, + zmm_vector>::type; if (arrsize > 1) { if constexpr (std::is_floating_point_v) { - arrsize_t nan_count - = replace_nan_with_inf>(keys, arrsize); - qsort_64bit_, zmm_vector>( - keys, - indexes, - 0, - arrsize - 1, - 2 * (arrsize_t)log2(arrsize)); + arrsize_t nan_count = 0; + if (UNLIKELY(hasnan)) { + nan_count = replace_nan_with_inf>(keys, + arrsize); + } + qsort_64bit_(keys, + indexes, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize)); replace_inf_with_nan(keys, arrsize, nan_count); } else { - qsort_64bit_, zmm_vector>( - keys, - indexes, - 0, - arrsize - 1, - 2 * (arrsize_t)log2(arrsize)); + UNUSED(hasnan); + qsort_64bit_(keys, + indexes, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize)); } } } From 9b36cf171be84a27210349f00e201a839160196d Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 28 Nov 2023 14:16:09 -0800 Subject: [PATCH 2/6] Add benchmarks for 32-bit key-value sort --- benchmarks/bench-keyvalue.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/bench-keyvalue.hpp b/benchmarks/bench-keyvalue.hpp index 101a8fae..1eaab9e9 100644 --- a/benchmarks/bench-keyvalue.hpp +++ b/benchmarks/bench-keyvalue.hpp @@ -46,3 +46,6 @@ static void simdkvsort(benchmark::State &state, Args &&...args) BENCH_BOTH_KVSORT(uint64_t) BENCH_BOTH_KVSORT(int64_t) BENCH_BOTH_KVSORT(double) +BENCH_BOTH_KVSORT(uint32_t) +BENCH_BOTH_KVSORT(int32_t) +BENCH_BOTH_KVSORT(float) From acd82283b45eb8cff2449359c113296dbb1d5ecd Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 28 Nov 2023 14:16:22 -0800 Subject: [PATCH 3/6] Add tests for 32-bit key-value sort --- tests/test-keyvalue.cpp | 36 ++++++++++++++++++++---------------- 
1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/tests/test-keyvalue.cpp b/tests/test-keyvalue.cpp index 3a73c08e..c82b033a 100644 --- a/tests/test-keyvalue.cpp +++ b/tests/test-keyvalue.cpp @@ -40,28 +40,32 @@ TYPED_TEST_P(simdkvsort, test_kvsort) std::vector key_bckp = key; std::vector val_bckp = val; x86simdsort::keyvalue_qsort(key.data(), val.data(), size, hasnan); - xss::scalar::keyvalue_qsort(key_bckp.data(), val_bckp.data(), size, hasnan); + xss::scalar::keyvalue_qsort( + key_bckp.data(), val_bckp.data(), size, hasnan); ASSERT_EQ(key, key_bckp); - const bool hasDuplicates = std::adjacent_find(key.begin(), key.end()) != key.end(); - if (!hasDuplicates) { - ASSERT_EQ(val, val_bckp); - } - key.clear(); val.clear(); - key_bckp.clear(); val_bckp.clear(); + const bool hasDuplicates + = std::adjacent_find(key.begin(), key.end()) != key.end(); + if (!hasDuplicates) { ASSERT_EQ(val, val_bckp); } + key.clear(); + val.clear(); + key_bckp.clear(); + val_bckp.clear(); } } } REGISTER_TYPED_TEST_SUITE_P(simdkvsort, test_kvsort); -using QKVSortTestTypes = testing::Types, - std::tuple, - std::tuple, - std::tuple, - std::tuple, - std::tuple, - std::tuple, - std::tuple, - std::tuple>; +#define CREATE_TUPLES(type) \ + std::tuple, std::tuple, \ + std::tuple, std::tuple, \ + std::tuple, std::tuple + +using QKVSortTestTypes = testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(xss, simdkvsort, QKVSortTestTypes); From 54f70552f006d9d56f779c54f2d64a37a10d298b Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 28 Nov 2023 14:39:23 -0800 Subject: [PATCH 4/6] update README --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 0b33eda8..4a57a035 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,31 @@ AVX2 specific implementations, please see [README](https://github.com/intel/x86-simd-sort/blob/main/src/README.md) file under `src/` directory. 
The following routines are currently supported: + +#### Sort routines on arrays ```cpp x86simdsort::qsort(T* arr, size_t size, bool hasnan); x86simdsort::qselect(T* arr, size_t k, size_t size, bool hasnan); x86simdsort::partial_qsort(T* arr, size_t k, size_t size, bool hasnan); +``` +Supported datatypes: `T $\in$ [_Float16, uint16_t, int16_t, float, uint32_t, +int32_t, double, uint64_t, int64_t]` + +#### Key-value sort routines on pairs of arrays +```cpp +x86simdsort::keyvalue_qsort(T1* key, T2* val, size_t size, bool hasnan); +``` +Supported datatypes: `T1, T2 $\in$ [float, uint32_t, int32_t, double, +uint64_t, int64_t]` Note that keyvalue sort is not yet supported for 16-bit +data types. + +#### Arg sort routines on arrays +```cpp std::vector arg = x86simdsort::argsort(T* arr, size_t size, bool hasnan); std::vector arg = x86simdsort::argselect(T* arr, size_t k, size_t size, bool hasnan); ``` +Supported datatypes: `T $\in$ [_Float16, uint16_t, int16_t, float, uint32_t, +int32_t, double, uint64_t, int64_t]` ### Build/Install From 12579273af50d1145c002d8bd929f04cc42016c6 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 28 Nov 2023 14:43:01 -0800 Subject: [PATCH 5/6] README: move $\in$ outside code spans in datatype lists --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4a57a035..0d66cddc 100644 --- a/README.md +++ b/README.md @@ -15,14 +15,14 @@ x86simdsort::qsort(T* arr, size_t size, bool hasnan); x86simdsort::qselect(T* arr, size_t k, size_t size, bool hasnan); x86simdsort::partial_qsort(T* arr, size_t k, size_t size, bool hasnan); ``` -Supported datatypes: `T $\in$ [_Float16, uint16_t, int16_t, float, uint32_t, +Supported datatypes: `T` $\in$ `[_Float16, uint16_t, int16_t, float, uint32_t, int32_t, double, uint64_t, int64_t]` #### Key-value sort routines on pairs of arrays ```cpp x86simdsort::keyvalue_qsort(T1* key, T2* val, size_t size, bool hasnan); ``` -Supported datatypes: `T1, T2 $\in$ [float, uint32_t, int32_t, double,
+Supported datatypes: `T1`, `T2` $\in$ `[float, uint32_t, int32_t, double, uint64_t, int64_t]` Note that keyvalue sort is not yet supported for 16-bit data types. @@ -31,7 +31,7 @@ data types. std::vector arg = x86simdsort::argsort(T* arr, size_t size, bool hasnan); std::vector arg = x86simdsort::argselect(T* arr, size_t k, size_t size, bool hasnan); ``` -Supported datatypes: `T $\in$ [_Float16, uint16_t, int16_t, float, uint32_t, +Supported datatypes: `T` $\in$ `[_Float16, uint16_t, int16_t, float, uint32_t, int32_t, double, uint64_t, int64_t]` ### Build/Install From cb4616519f77f3581b6af1c8b64852fcdf5e14d9 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 28 Nov 2023 14:44:08 -0800 Subject: [PATCH 6/6] README: promote section headings by one level --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 0d66cddc..8a6015c0 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ AVX2 specific implementations, please see `src/` directory. The following routines are currently supported: -#### Sort routines on arrays +### Sort routines on arrays ```cpp x86simdsort::qsort(T* arr, size_t size, bool hasnan); x86simdsort::qselect(T* arr, size_t k, size_t size, bool hasnan); @@ -18,7 +18,7 @@ x86simdsort::partial_qsort(T* arr, size_t k, size_t size, bool hasnan); Supported datatypes: `T` $\in$ `[_Float16, uint16_t, int16_t, float, uint32_t, int32_t, double, uint64_t, int64_t]` -#### Key-value sort routines on pairs of arrays +### Key-value sort routines on pairs of arrays ```cpp x86simdsort::keyvalue_qsort(T1* key, T2* val, size_t size, bool hasnan); ``` @@ -26,7 +26,7 @@ Supported datatypes: `T1`, `T2` $\in$ `[float, uint32_t, int32_t, double, uint64_t, int64_t]` Note that keyvalue sort is not yet supported for 16-bit data types.
-#### Arg sort routines on arrays +### Arg sort routines on arrays ```cpp std::vector arg = x86simdsort::argsort(T* arr, size_t size, bool hasnan); std::vector arg = x86simdsort::argselect(T* arr, size_t k, size_t size, bool hasnan); @@ -34,7 +34,7 @@ std::vector arg = x86simdsort::argselect(T* arr, size_t k, size_t size, Supported datatypes: `T` $\in$ `[_Float16, uint16_t, int16_t, float, uint32_t, int32_t, double, uint64_t, int64_t]` -### Build/Install +## Build/Install [meson](https://github.com/mesonbuild/meson) is the used build system. Command to build and install the library: @@ -53,7 +53,7 @@ benchmark](https://github.com/google/benchmark) frameworks respectively. You can configure meson to build them both by using `-Dbuild_tests=true` and `-Dbuild_benchmarks=true`. -### Example usage +## Example usage ```cpp #include "x86simdsort.h" @@ -66,7 +66,7 @@ int main() { ``` -### Details +## Details - `x86simdsort::qsort` is equivalent to `qsort` in [C](https://www.tutorialspoint.com/c_standard_library/c_function_qsort.htm) @@ -95,7 +95,7 @@ argselect) will not use the SIMD based algorithms if they detect NAN's in the array. You can read details of all the implementations [here](https://github.com/intel/x86-simd-sort/src/README.md). -### Downstream projects using x86-simd-sort +## Downstream projects using x86-simd-sort - NumPy uses this as a [submodule](https://github.com/numpy/numpy/pull/22315) to accelerate `np.sort, np.argsort, np.partition and np.argpartition`. - A slightly modifed version this library has been integrated into [openJDK](https://github.com/openjdk/jdk/pull/14227).