From c82b1b2dffb66c323391f2b2815c7e8a36dd0c62 Mon Sep 17 00:00:00 2001 From: Mitchell O'Sullivan Date: Sat, 8 Apr 2023 02:45:43 +1000 Subject: [PATCH 1/7] Fix build bugs --- Makefile | 2 +- meson.build | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index df55afbf..6169ec1b 100644 --- a/Makefile +++ b/Makefile @@ -38,4 +38,4 @@ meson: cd builddir && ninja clean: - $(RM) -rf $(TESTDIR)/*.o $(UTILS)/*.o testexe benchexe builddir + $(RM) -rf $(TESTDIR)/*.o $(BENCHDIR)/*.o $(UTILS)/*.o testexe benchexe builddir diff --git a/meson.build b/meson.build index 79b23927..9193776d 100644 --- a/meson.build +++ b/meson.build @@ -11,8 +11,8 @@ gbench_dep = dependency('benchmark', required : true) fp16code = '''#include int main() { - __mm512h temp = _mm512_set1_ph(1.0f); - __mm512h var2 = _mm512_min_ph(temp, temp); + __m512h temp = _mm512_set1_ph(1.0f); + __m512h var2 = _mm512_min_ph(temp, temp); return 0; } ''' From 3cd66426600ac5b57f0c5c4b8f5582047c7dc149 Mon Sep 17 00:00:00 2001 From: Mitchell O'Sullivan Date: Fri, 7 Apr 2023 23:02:11 +1000 Subject: [PATCH 2/7] Implement partial sorting algorithms Each datatype now supports two partial sorting algorithms: 1) Sort such that a particular index is valid (QuickSelect), and 2) Sort such that the first k indices is valid (PartialQuickSort), where 'valid' means that the elements are in the same position as if the entire array had been sorted. Additionally transferred a few lingering comments from a refactor earlier in the project. --- src/avx512-16bit-common.h | 32 ++++++++++++++++ src/avx512-16bit-qsort.hpp | 29 +++++++++++++++ src/avx512-32bit-qsort.hpp | 61 ++++++++++++++++++++++++++++++ src/avx512-64bit-common.h | 10 ++++- src/avx512-64bit-qsort.hpp | 68 ++++++++++++++++++++++++++++++---- src/avx512-common-qsort.h | 16 +++++++- src/avx512fp16-16bit-qsort.hpp | 11 ++++++ 7 files changed, 217 insertions(+), 10 deletions(-) diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h index 7ab22123..b5ee6e28 100644 --- a/src/avx512-16bit-common.h +++ b/src/avx512-16bit-common.h @@ -259,6 +259,38 @@ X86_SIMD_SORT_INLINE type_t get_pivot_16bit(type_t *arr, return ((type_t *)&sort)[16]; } +template +static void +qselect_16bit_(type_t *arr, int64_t k, + int64_t left, int64_t right, + int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1, comparison_func); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_16bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_16bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if ((pivot != smallest) && (k <= pivot_index)) + qselect_16bit_(arr, k, left, pivot_index - 1, max_iters - 1); + else if ((pivot != biggest) && (k > pivot_index)) + qselect_16bit_(arr, k, pivot_index, right, max_iters - 1); +} + template static void qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index fcbaf879..606f8706 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -405,6 +405,34 @@ replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count) } } +template <> +void avx512_qselect(int16_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_16bit_, int16_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(uint16_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_16bit_, uint16_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qselect_16bit_, uint16_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} + template <> void avx512_qsort(int16_t *arr, int64_t arrsize) { @@ -432,4 +460,5 @@ void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize) replace_inf_with_nan(arr, arrsize, nan_count); } } + #endif // AVX512_QSORT_16BIT diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp index 1cbba00b..5c4bc7af 100644 --- a/src/avx512-32bit-qsort.hpp +++ b/src/avx512-32bit-qsort.hpp @@ -626,6 +626,38 @@ X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr, return ((type_t *)&sort)[8]; } +template +static void +qselect_32bit_(type_t *arr, int64_t k, + int64_t left, int64_t right, + int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_32bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_32bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if ((pivot != smallest) && (k <= pivot_index)) + qselect_32bit_(arr, k, left, pivot_index - 1, max_iters - 1); + else if ((pivot != biggest) && (k > pivot_index)) + qselect_32bit_(arr, k, pivot_index, right, max_iters - 1); +} + template static void qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) @@ -681,6 +713,35 @@ replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count) } } +template <> +void avx512_qselect(int32_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_32bit_, int32_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(uint32_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_32bit_, uint32_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(float *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qselect_32bit_, float>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} + template <> void avx512_qsort(int32_t *arr, int64_t arrsize) { diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 32a4731e..7fc8acf3 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -4,10 +4,16 @@ * Authors: Raghuveer Devulapalli * ****************************************************************/ -#ifndef AVX512_64BIT_COMMOM -#define AVX512_64BIT_COMMOM +#ifndef AVX512_64BIT_COMMON +#define AVX512_64BIT_COMMON #include "avx512-common-qsort.h" +/* + * Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +// ZMM 7, 6, 5, 4, 3, 2, 1, 0 #define NETWORK_64BIT_1 4, 5, 6, 7, 0, 1, 2, 3 #define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7 #define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2 diff --git a/src/avx512-64bit-qsort.hpp b/src/avx512-64bit-qsort.hpp index 62000549..e451634c 100644 --- a/src/avx512-64bit-qsort.hpp +++ b/src/avx512-64bit-qsort.hpp @@ -9,13 +9,6 @@ #include "avx512-64bit-common.h" -/* - * Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -// ZMM 7, 6, 5, 4, 3, 2, 1, 0 - // Assumes zmm is bitonic and performs a recursive half cleaner template X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) @@ -408,6 +401,67 @@ qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) qsort_64bit_(arr, pivot_index, right, max_iters - 1); } +template +static void +qselect_64bit_(type_t *arr, int64_t k, + int64_t left, int64_t right, + int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_64bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_64bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if ((pivot != smallest) && (k <= pivot_index)) + qselect_64bit_(arr, k, left, pivot_index - 1, max_iters - 1); + else if ((pivot != biggest) && (k > pivot_index)) + qselect_64bit_(arr, k, pivot_index, right, max_iters - 1); +} + +template <> +void avx512_qselect(int64_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_64bit_, int64_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(uint64_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_64bit_, uint64_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(double *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qselect_64bit_, double>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} + template <> void avx512_qsort(int64_t *arr, int64_t arrsize) { diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index a80f2721..bd57f44f 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -88,9 +88,23 @@ struct zmm_vector; template void avx512_qsort(T *arr, int64_t arrsize); - void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize); +template +void avx512_qselect(T *arr, int64_t k, int64_t arrsize); +void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize); + +template +void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) { + avx512_qselect(arr, k, arrsize); + avx512_qsort(arr, k); +} +inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k, int64_t arrsize) +{ + avx512_qselect_fp16(arr, k, arrsize); + avx512_qsort_fp16(arr, k); +} + template bool comparison_func(const T &a, const T &b) { diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index 363d2b55..8a9a49ed 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -144,6 +144,17 @@ replace_inf_with_nan(_Float16 *arr, int64_t arrsize, int64_t nan_count) memset(arr + arrsize - nan_count, 0xFF, nan_count * 2); } +template <> +void avx512_qselect(_Float16 *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qselect_16bit_, _Float16>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} + template <> void avx512_qsort(_Float16 *arr, int64_t arrsize) { From 0cc133756079d38f29f4806cb2f7a020fde2ab52 Mon Sep 17 00:00:00 2001 From: Mitchell O'Sullivan Date: Fri, 7 Apr 2023 23:19:35 +1000 Subject: [PATCH 3/7] Refactor test suite and add tests for partial sort --- tests/meson.build | 2 +- tests/test-qsort-common.h | 11 ++++ tests/test_partial_qsort.hpp | 49 +++++++++++++++ tests/test_qselect.hpp | 42 +++++++++++++ tests/{test_qsort.cpp => test_qsort.hpp} | 18 +----- tests/test_qsortfp16.cpp | 76 ++++++++++++++++++++++++ tests/test_sort.cpp | 15 +++++ 7 files changed, 195 insertions(+), 18 deletions(-) create mode 100644 tests/test-qsort-common.h create mode 100644 tests/test_partial_qsort.hpp create mode 100644 tests/test_qselect.hpp rename tests/{test_qsort.cpp => test_qsort.hpp} (70%) create mode 100644 tests/test_sort.cpp diff --git a/tests/meson.build b/tests/meson.build index 0a82d96f..a69222ac 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -11,7 +11,7 @@ endif if cpp.has_argument('-march=icelake-client') libtests += static_library('tests_qsort', - files('test_qsort.cpp', ), + files('test_sort.cpp', ), dependencies: gtest_dep, include_directories : [src, utils], cpp_args : ['-O3', '-march=icelake-client'], diff --git a/tests/test-qsort-common.h b/tests/test-qsort-common.h new file mode 100644 index 00000000..a41b2c63 --- /dev/null +++ b/tests/test-qsort-common.h @@ -0,0 +1,11 @@ +#ifndef AVX512_TEST_COMMON +#define AVX512_TEST_COMMON + +#include "avx512-16bit-qsort.hpp" +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-qsort.hpp" +#include "cpuinfo.h" +#include "rand_array.h" +#include + +#endif diff --git a/tests/test_partial_qsort.hpp b/tests/test_partial_qsort.hpp new file mode 100644 index 00000000..5c08064e --- /dev/null +++ b/tests/test_partial_qsort.hpp @@ -0,0 +1,49 @@ +#include "test-qsort-common.h" + +template +class avx512_partial_sort : public ::testing::Test { +}; +TYPED_TEST_SUITE_P(avx512_partial_sort); + +TYPED_TEST_P(avx512_partial_sort, test_ranges) +{ + int64_t arrsize = 1024; + int64_t nranges = 500; + + if (cpu_has_avx512bw()) { + if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) { + GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; + } + std::vector arr; + std::vector sortedarr; + std::vector psortedarr; + /* Random array */ + arr = get_uniform_rand_array(arrsize); + sortedarr = arr; + /* Sort with std::sort for comparison */ + std::sort(sortedarr.begin(), sortedarr.end()); + + for (size_t ii = 0; ii < nranges; ++ii) { + psortedarr = arr; + + /* Pick a random number of elements to sort at the beginning of the array */ + int k = get_uniform_rand_array(1, arrsize, 1).front(); + + /* Sort the range and verify all the required elements match the presorted set */ + avx512_partial_qsort(psortedarr.data(), k, psortedarr.size()); + for (size_t jj = 0; jj < k; jj++) { + ASSERT_EQ(sortedarr[jj], psortedarr[jj]); + } + + psortedarr.clear(); + } + + arr.clear(); + sortedarr.clear(); + } + else { + GTEST_SKIP() << "Skipping this test, it requires avx512bw"; + } +} + +REGISTER_TYPED_TEST_SUITE_P(avx512_partial_sort, test_ranges); diff --git a/tests/test_qselect.hpp b/tests/test_qselect.hpp new file mode 100644 index 00000000..a697c137 --- /dev/null +++ b/tests/test_qselect.hpp @@ -0,0 +1,42 @@ +#include "test-qsort-common.h" + +template +class avx512_select : public ::testing::Test { +}; +TYPED_TEST_SUITE_P(avx512_select); + +TYPED_TEST_P(avx512_select, test_arrsizes) +{ + if (cpu_has_avx512bw()) { + if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) { + GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; + } + std::vector arrsizes; + for (int64_t ii = 0; ii < 1024; ++ii) { + arrsizes.push_back(ii); + } + std::vector arr; + std::vector sortedarr; + std::vector psortedarr; + for (size_t ii = 0; ii < arrsizes.size(); ++ii) { + /* Random array */ + arr = get_uniform_rand_array(arrsizes[ii]); + sortedarr = arr; + /* Sort with std::sort for comparison */ + std::sort(sortedarr.begin(), sortedarr.end()); + for (size_t k = 0; k < arr.size(); ++k) { + psortedarr = arr; + avx512_qselect(psortedarr.data(), k+1, psortedarr.size()); + ASSERT_EQ(sortedarr[k], psortedarr[k]); + psortedarr.clear(); + } + arr.clear(); + sortedarr.clear(); + } + } + else { + GTEST_SKIP() << "Skipping this test, it requires avx512bw"; + } +} + +REGISTER_TYPED_TEST_SUITE_P(avx512_select, test_arrsizes); diff --git a/tests/test_qsort.cpp b/tests/test_qsort.hpp similarity index 70% rename from tests/test_qsort.cpp rename to tests/test_qsort.hpp index 6d82a35b..65a8eaf6 100644 --- a/tests/test_qsort.cpp +++ b/tests/test_qsort.hpp @@ -3,13 +3,7 @@ * * SPDX-License-Identifier: BSD-3-Clause * *******************************************/ -#include "avx512-16bit-qsort.hpp" -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-qsort.hpp" -#include "cpuinfo.h" -#include "rand_array.h" -#include -#include +#include "test-qsort-common.h" template class avx512_sort : public ::testing::Test { @@ -46,13 +40,3 @@ TYPED_TEST_P(avx512_sort, test_arrsizes) } REGISTER_TYPED_TEST_SUITE_P(avx512_sort, test_arrsizes); - -using Types = testing::Types; -INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_sort, Types); diff --git a/tests/test_qsortfp16.cpp b/tests/test_qsortfp16.cpp index ab5c10fe..094d4cdc 100644 --- a/tests/test_qsortfp16.cpp +++ b/tests/test_qsortfp16.cpp @@ -72,3 +72,79 @@ TEST(avx512_qsort_float16, test_special_floats) GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA"; } } + +TEST(avx512_qselect_float16, test_arrsizes) +{ + if (cpu_has_avx512fp16()) { + std::vector arrsizes; + for (int64_t ii = 0; ii < 1024; ++ii) { + arrsizes.push_back(ii); + } + std::vector<_Float16> arr; + std::vector<_Float16> sortedarr; + std::vector<_Float16> psortedarr; + + for (size_t ii = 0; ii < arrsizes.size(); ++ii) { + /* Random array */ + for (size_t jj = 0; jj < arrsizes[ii]; ++jj) { + _Float16 temp = (float)rand() / (float)(RAND_MAX); + arr.push_back(temp); + sortedarr.push_back(temp); + } + /* Sort with std::sort for comparison */ + std::sort(sortedarr.begin(), sortedarr.end()); + for (size_t k = 0; k < arr.size(); ++k) { + psortedarr = arr; + avx512_qselect<_Float16>(psortedarr.data(), k+1, psortedarr.size()); + ASSERT_EQ(sortedarr[k], psortedarr[k]); + psortedarr.clear(); + } + arr.clear(); + sortedarr.clear(); + } + } + else { + GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA"; + } +} + +TEST(avx512_partial_qsort_float16, test_ranges) +{ + if (cpu_has_avx512fp16()) { + int64_t arrsize = 1024; + int64_t nranges = 500; + + std::vector<_Float16> arr; + std::vector<_Float16> sortedarr; + std::vector<_Float16> psortedarr; + + /* Random array */ + for (size_t ii = 0; ii < arrsize; ++ii) { + _Float16 temp = (float)rand() / (float)(RAND_MAX); + arr.push_back(temp); + sortedarr.push_back(temp); + } + /* Sort with std::sort for comparison */ + std::sort(sortedarr.begin(), sortedarr.end()); + + for (size_t ii = 0; ii < nranges; ++ii) { + psortedarr = arr; + + int k = get_uniform_rand_array(1, arrsize, 1).front(); + + /* Sort the range and verify all the required elements match the presorted set */ + avx512_partial_qsort<_Float16>(psortedarr.data(), k, psortedarr.size()); + for (size_t jj = 0; jj < k; jj++) { + ASSERT_EQ(sortedarr[jj], psortedarr[jj]); + } + + psortedarr.clear(); + } + + arr.clear(); + sortedarr.clear(); + } + else { + GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA"; + } +} diff --git a/tests/test_sort.cpp b/tests/test_sort.cpp new file mode 100644 index 00000000..85a6bd8d --- /dev/null +++ b/tests/test_sort.cpp @@ -0,0 +1,15 @@ +#include "test_qsort.hpp" +#include "test_qselect.hpp" +#include "test_partial_qsort.hpp" + +using QuickSortTestTypes = testing::Types; +INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_sort, QuickSortTestTypes); +INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_select, QuickSortTestTypes); +INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_partial_sort, QuickSortTestTypes); From 120456183e35c21d14b6109a3e793a0fd601f636 Mon Sep 17 00:00:00 2001 From: Mitchell O'Sullivan Date: Sat, 8 Apr 2023 01:55:03 +1000 Subject: [PATCH 4/7] Add benchmarks for partial sorting functions --- benchmarks/bench-qsort-common.h | 11 +++ benchmarks/bench_partial_qsort.hpp | 76 ++++++++++++++++ benchmarks/bench_qselect.hpp | 76 ++++++++++++++++ benchmarks/bench_qsort.cpp | 76 +--------------- benchmarks/bench_qsort.hpp | 68 ++++++++++++++ benchmarks/bench_qsortfp16.cpp | 138 ++++++++++++++++++++++++++++- 6 files changed, 371 insertions(+), 74 deletions(-) create mode 100644 benchmarks/bench-qsort-common.h create mode 100644 benchmarks/bench_partial_qsort.hpp create mode 100644 benchmarks/bench_qselect.hpp create mode 100644 benchmarks/bench_qsort.hpp diff --git a/benchmarks/bench-qsort-common.h b/benchmarks/bench-qsort-common.h new file mode 100644 index 00000000..fe0decf7 --- /dev/null +++ b/benchmarks/bench-qsort-common.h @@ -0,0 +1,11 @@ +#ifndef AVX512_BENCH_COMMON +#define AVX512_BENCH_COMMON + +#include +#include "rand_array.h" +#include "cpuinfo.h" +#include "avx512-16bit-qsort.hpp" +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-qsort.hpp" + +#endif diff --git a/benchmarks/bench_partial_qsort.hpp b/benchmarks/bench_partial_qsort.hpp new file mode 100644 index 00000000..6f31040d --- /dev/null +++ b/benchmarks/bench_partial_qsort.hpp @@ -0,0 +1,76 @@ +#include "bench-qsort-common.h" + +template +static void avx512_partial_qsort(benchmark::State& state) { + if (!cpu_has_avx512bw()) { + state.SkipWithMessage("Requires AVX512 BW ISA"); + } + if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) { + state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); + } + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* Choose random index to sort up until */ + int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + + /* call avx512_partial_qsort */ + for (auto _ : state) { + avx512_partial_qsort(arr.data(), k, ARRSIZE); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +template +static void stdpartialsort(benchmark::State& state) { + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* Choose random index to sort up until */ + int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + + /* call std::partial_sort */ + for (auto _ : state) { + std::partial_sort(arr.begin(), arr.begin() + k, arr.end()); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +// Register the function as a benchmark +BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); + +BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdpartialsort)->Arg(10000)->Arg(10000000); + +//BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdpartialsort)->Arg(10000)->Arg(10000000); diff --git a/benchmarks/bench_qselect.hpp b/benchmarks/bench_qselect.hpp new file mode 100644 index 00000000..3966632a --- /dev/null +++ b/benchmarks/bench_qselect.hpp @@ -0,0 +1,76 @@ +#include "bench-qsort-common.h" + +template +static void avx512_qselect(benchmark::State& state) { + if (!cpu_has_avx512bw()) { + state.SkipWithMessage("Requires AVX512 BW ISA"); + } + if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) { + state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); + } + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* Choose random index to make sorted */ + int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + + /* call avx512 quickselect */ + for (auto _ : state) { + avx512_qselect(arr.data(), k, ARRSIZE); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +template +static void stdnthelement(benchmark::State& state) { + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* Choose random index to make sorted */ + int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + + /* call std::nth_element */ + for (auto _ : state) { + std::nth_element(arr.begin(), arr.begin() + k, arr.end()); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +// Register the function as a benchmark +BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); +BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); +BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); +BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); + +BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); +BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); +BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); +BENCHMARK(stdnthelement)->Arg(10000)->Arg(10000000); + +//BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); +BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); +BENCHMARK(stdnthelement)->Arg(10000)->Arg(10000000); diff --git a/benchmarks/bench_qsort.cpp b/benchmarks/bench_qsort.cpp index 3f622fc3..d14bf218 100644 --- a/benchmarks/bench_qsort.cpp +++ b/benchmarks/bench_qsort.cpp @@ -1,73 +1,3 @@ -#include -#include "rand_array.h" -#include "cpuinfo.h" -#include "avx512-16bit-qsort.hpp" -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-qsort.hpp" - -template -static void avx512_qsort(benchmark::State& state) { - if (!cpu_has_avx512bw()) { - state.SkipWithMessage("Requires AVX512 BW ISA"); - } - if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) { - state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); - } - // Perform setup here - size_t ARRSIZE = state.range(0); - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements is reverse order */ - arr = get_uniform_rand_array(ARRSIZE); - arr_bkp = arr; - - /* call avx512 quicksort */ - for (auto _ : state) { - avx512_qsort(arr.data(), ARRSIZE); - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } -} - -template -static void stdsort(benchmark::State& state) { - // Perform setup here - size_t ARRSIZE = state.range(0); - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements is reverse order */ - arr = get_uniform_rand_array(ARRSIZE); - arr_bkp = arr; - - /* call avx512 quicksort */ - for (auto _ : state) { - std::sort(arr.begin(), arr.end()); - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } -} - -// Register the function as a benchmark -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); - -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(10000000); - -//BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(10000000); +#include "bench_qsort.hpp" +#include "bench_qselect.hpp" +#include "bench_partial_qsort.hpp" diff --git a/benchmarks/bench_qsort.hpp b/benchmarks/bench_qsort.hpp new file mode 100644 index 00000000..bd652993 --- /dev/null +++ b/benchmarks/bench_qsort.hpp @@ -0,0 +1,68 @@ +#include "bench-qsort-common.h" + +template +static void avx512_qsort(benchmark::State& state) { + if (!cpu_has_avx512bw()) { + state.SkipWithMessage("Requires AVX512 BW ISA"); + } + if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) { + state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); + } + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* call avx512 quicksort */ + for (auto _ : state) { + avx512_qsort(arr.data(), ARRSIZE); + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +template +static void stdsort(benchmark::State& state) { + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* call std::sort */ + for (auto _ : state) { + std::sort(arr.begin(), arr.end()); + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +// Register the function as a benchmark +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); + +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(10000000); + +//BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(10000000); diff --git a/benchmarks/bench_qsortfp16.cpp b/benchmarks/bench_qsortfp16.cpp index 0b454f7e..f39eeca9 100644 --- a/benchmarks/bench_qsortfp16.cpp +++ b/benchmarks/bench_qsortfp16.cpp @@ -45,7 +45,7 @@ static void stdsort(benchmark::State& state) { } arr_bkp = arr; - /* call avx512 quicksort */ + /* call std::sort */ for (auto _ : state) { std::sort(arr.begin(), arr.end()); state.PauseTiming(); @@ -61,3 +61,139 @@ static void stdsort(benchmark::State& state) { // Register the function as a benchmark BENCHMARK(avx512_qsort<_Float16>)->Arg(10000)->Arg(1000000); BENCHMARK(stdsort<_Float16>)->Arg(10000)->Arg(1000000); + +template +static void avx512_qselect(benchmark::State& state) { + if (cpu_has_avx512fp16()) { + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + for (size_t jj = 0; jj < ARRSIZE; ++jj) { + _Float16 temp = (float) rand() / (float)(RAND_MAX); + arr.push_back(temp); + } + arr_bkp = arr; + + /* Choose random index to make sorted */ + int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + + /* call avx512 quickselect */ + for (auto _ : state) { + avx512_qselect(arr.data(), k, ARRSIZE); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } + } + else { + state.SkipWithMessage("Requires AVX512-FP16 ISA"); + } +} + +template +static void stdnthelement(benchmark::State& state) { + if (cpu_has_avx512fp16()) { + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + for (size_t jj = 0; jj < ARRSIZE; ++jj) { + _Float16 temp = (float) rand() / (float)(RAND_MAX); + arr.push_back(temp); + } + arr_bkp = arr; + + /* Choose random index to sort until */ + int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + + /* call std::nth_element */ + for (auto _ : state) { + std::nth_element(arr.begin(), arr.begin() + k, arr.end()); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } + } + else { + state.SkipWithMessage("Requires AVX512-FP16 ISA"); + } +} + +// Register the function as a benchmark +BENCHMARK(avx512_qselect<_Float16>)->Arg(10000)->Arg(1000000); +BENCHMARK(stdnthelement<_Float16>)->Arg(10000)->Arg(1000000); + +template +static void avx512_partial_qsort(benchmark::State& state) { + if (cpu_has_avx512fp16()) { + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + for (size_t jj = 0; jj < ARRSIZE; ++jj) { + _Float16 temp = (float) rand() / (float)(RAND_MAX); + arr.push_back(temp); + } + arr_bkp = arr; + + /* Choose random index to sort up until */ + int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + + /* call avx512_partial_qsort */ + for (auto _ : state) { + avx512_partial_qsort(arr.data(), k, ARRSIZE); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } + } + else { + state.SkipWithMessage("Requires AVX512-FP16 ISA"); + } +} + +template +static void stdpartialsort(benchmark::State& state) { + if (cpu_has_avx512fp16()) { + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + for (size_t jj = 0; jj < ARRSIZE; ++jj) { + _Float16 temp = (float) rand() / (float)(RAND_MAX); + arr.push_back(temp); + } + arr_bkp = arr; + + /* Choose random index to sort up until */ + int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + + /* call std::partial_sort */ + for (auto _ : state) { + std::partial_sort(arr.begin(), arr.begin() + k, arr.end()); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } + } + else { + state.SkipWithMessage("Requires AVX512-FP16 ISA"); + } +} + +// Register the function as a benchmark +BENCHMARK(avx512_partial_qsort<_Float16>)->Arg(10000)->Arg(1000000); +BENCHMARK(stdpartialsort<_Float16>)->Arg(10000)->Arg(1000000); From 9029f61ae10b150615c4357ed56b2b16b31161cd Mon Sep 17 00:00:00 2001 From: Mitchell O'Sullivan Date: Mon, 10 Apr 2023 14:12:12 +1000 Subject: [PATCH 5/7] Align qselect parameter meaning with nth_element The QuickSelect internal method is now phrased such that the position to be sorted is given as an offset (in the same way that left points to the first element and right points to the last element). Similarly, the avx512_qselect method also now uses this interpretation. --- benchmarks/bench_qselect.hpp | 4 ++-- benchmarks/bench_qsortfp16.cpp | 4 ++-- src/avx512-16bit-common.h | 24 ++++++++++++------------ src/avx512-32bit-qsort.hpp | 24 ++++++++++++------------ src/avx512-64bit-qsort.hpp | 12 ++++++------ src/avx512-common-qsort.h | 10 +++++----- tests/test_qselect.hpp | 11 ++++++++++- tests/test_qsortfp16.cpp | 11 ++++++++++- 8 files changed, 59 insertions(+), 41 deletions(-) diff --git a/benchmarks/bench_qselect.hpp b/benchmarks/bench_qselect.hpp index 3966632a..487079aa 100644 --- a/benchmarks/bench_qselect.hpp +++ b/benchmarks/bench_qselect.hpp @@ -18,7 +18,7 @@ static void avx512_qselect(benchmark::State& state) { arr_bkp = arr; /* Choose random index to make sorted */ - int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + int k = get_uniform_rand_array(1, ARRSIZE - 1, 0).front(); /* call avx512 quickselect */ for (auto _ : state) { @@ -42,7 +42,7 @@ static void stdnthelement(benchmark::State& state) { arr_bkp = arr; /* Choose random index to make sorted */ - int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + int k = get_uniform_rand_array(1, ARRSIZE - 1, 0).front(); /* call std::nth_element */ for (auto _ : state) { diff --git a/benchmarks/bench_qsortfp16.cpp b/benchmarks/bench_qsortfp16.cpp index f39eeca9..4f700cb4 100644 --- a/benchmarks/bench_qsortfp16.cpp +++ b/benchmarks/bench_qsortfp16.cpp @@ -78,7 +78,7 @@ static void avx512_qselect(benchmark::State& state) { arr_bkp = arr; /* Choose random index to make sorted */ - int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + int k = get_uniform_rand_array(1, ARRSIZE - 1, 0).front(); /* call avx512 quickselect */ for (auto _ : state) { @@ -110,7 +110,7 @@ static void stdnthelement(benchmark::State& state) { arr_bkp = arr; /* Choose random index to sort until */ - int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); + int k = get_uniform_rand_array(1, ARRSIZE - 1, 0).front(); /* call std::nth_element */ for (auto _ : state) { diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h index b5ee6e28..6e0743d6 100644 --- a/src/avx512-16bit-common.h +++ b/src/avx512-16bit-common.h @@ -261,9 +261,7 @@ X86_SIMD_SORT_INLINE type_t get_pivot_16bit(type_t *arr, template static void -qselect_16bit_(type_t *arr, int64_t k, - int64_t left, int64_t right, - int64_t max_iters) +qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -285,15 +283,17 @@ qselect_16bit_(type_t *arr, int64_t k, type_t biggest = vtype::type_min(); int64_t pivot_index = partition_avx512( arr, left, right + 1, pivot, &smallest, &biggest); - if ((pivot != smallest) && (k <= pivot_index)) - qselect_16bit_(arr, k, left, pivot_index - 1, max_iters - 1); - else if ((pivot != biggest) && (k > pivot_index)) - qselect_16bit_(arr, k, pivot_index, right, max_iters - 1); + if (pivot != smallest) + qsort_16bit_(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_16bit_(arr, pivot_index, right, max_iters - 1); } template static void -qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) +qselect_16bit_(type_t *arr, int64_t pos, + int64_t left, int64_t right, + int64_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -315,10 +315,10 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) type_t biggest = vtype::type_min(); int64_t pivot_index = partition_avx512( arr, left, right + 1, pivot, &smallest, &biggest); - if (pivot != smallest) - qsort_16bit_(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_16bit_(arr, pivot_index, right, max_iters - 1); + if ((pivot != smallest) && (pos < pivot_index)) + qselect_16bit_(arr, pos, left, pivot_index - 1, max_iters - 1); + else if ((pivot != biggest) && (pos >= pivot_index)) + qselect_16bit_(arr, pos, pivot_index, right, max_iters - 1); } #endif // AVX512_16BIT_COMMON diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp index 5c4bc7af..e9e97aa1 100644 --- a/src/avx512-32bit-qsort.hpp +++ b/src/avx512-32bit-qsort.hpp @@ -628,9 +628,7 @@ X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr, template static void -qselect_32bit_(type_t *arr, int64_t k, - int64_t left, int64_t right, - int64_t max_iters) +qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -652,15 +650,17 @@ qselect_32bit_(type_t *arr, int64_t k, type_t biggest = vtype::type_min(); int64_t pivot_index = partition_avx512( arr, left, right + 1, pivot, &smallest, &biggest); - if ((pivot != smallest) && (k <= pivot_index)) - qselect_32bit_(arr, k, left, pivot_index - 1, max_iters - 1); - else if ((pivot != biggest) && (k > pivot_index)) - qselect_32bit_(arr, k, pivot_index, right, max_iters - 1); + if (pivot != smallest) + qsort_32bit_(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) + qsort_32bit_(arr, pivot_index, right, max_iters - 1); } template static void -qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) +qselect_32bit_(type_t *arr, int64_t pos, + int64_t left, int64_t right, + int64_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -682,10 +682,10 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) type_t biggest = vtype::type_min(); int64_t pivot_index = partition_avx512( arr, left, right + 1, pivot, &smallest, &biggest); - if (pivot != smallest) - qsort_32bit_(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_32bit_(arr, pivot_index, right, max_iters - 1); + if ((pivot != smallest) && (pos < pivot_index)) + qselect_32bit_(arr, pos, left, pivot_index - 1, max_iters - 1); + else if ((pivot != biggest) && (pos >= pivot_index)) + qselect_32bit_(arr, pos, pivot_index, right, max_iters - 1); } X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) diff --git a/src/avx512-64bit-qsort.hpp b/src/avx512-64bit-qsort.hpp index e451634c..dfb5376f 100644 --- a/src/avx512-64bit-qsort.hpp +++ b/src/avx512-64bit-qsort.hpp @@ -403,7 +403,7 @@ qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) template static void -qselect_64bit_(type_t *arr, int64_t k, +qselect_64bit_(type_t *arr, int64_t pos, int64_t left, int64_t right, int64_t max_iters) { @@ -427,10 +427,10 @@ qselect_64bit_(type_t *arr, int64_t k, type_t biggest = vtype::type_min(); int64_t pivot_index = partition_avx512( arr, left, right + 1, pivot, &smallest, &biggest); - if ((pivot != smallest) && (k <= pivot_index)) - qselect_64bit_(arr, k, left, pivot_index - 1, max_iters - 1); - else if ((pivot != biggest) && (k > pivot_index)) - qselect_64bit_(arr, k, pivot_index, right, max_iters - 1); + if ((pivot != smallest) && (pos < pivot_index)) + qselect_64bit_(arr, pos, left, pivot_index - 1, max_iters - 1); + else if ((pivot != biggest) && (pos >= pivot_index)) + qselect_64bit_(arr, pos, pivot_index, right, max_iters - 1); } template <> @@ -438,7 +438,7 @@ void avx512_qselect(int64_t *arr, int64_t k, int64_t arrsize) { if (arrsize > 1) { qselect_64bit_, int64_t>( - arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); } } diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index bd57f44f..6ef2c584 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -95,14 +95,14 @@ void avx512_qselect(T *arr, int64_t k, int64_t arrsize); void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize); template -void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) { - avx512_qselect(arr, k, arrsize); - avx512_qsort(arr, k); +inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) { + avx512_qselect(arr, k - 1, arrsize); + avx512_qsort(arr, k - 1); } inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k, int64_t arrsize) { - avx512_qselect_fp16(arr, k, arrsize); - avx512_qsort_fp16(arr, k); + avx512_qselect_fp16(arr, k - 1, arrsize); + avx512_qsort_fp16(arr, k - 1); } template diff --git a/tests/test_qselect.hpp b/tests/test_qselect.hpp index a697c137..cad017bb 100644 --- a/tests/test_qselect.hpp +++ b/tests/test_qselect.hpp @@ -26,8 +26,17 @@ TYPED_TEST_P(avx512_select, test_arrsizes) std::sort(sortedarr.begin(), sortedarr.end()); for (size_t k = 0; k < arr.size(); ++k) { psortedarr = arr; - avx512_qselect(psortedarr.data(), k+1, psortedarr.size()); + avx512_qselect(psortedarr.data(), k, psortedarr.size()); + /* index k is correct */ ASSERT_EQ(sortedarr[k], psortedarr[k]); + /* Check left partition */ + for (size_t jj = 0; jj < k; jj++) { + ASSERT_LE(psortedarr[jj], psortedarr[k]); + } + /* Check right partition */ + for (size_t jj = k+1; jj < arr.size(); jj++) { + ASSERT_GE(psortedarr[jj], psortedarr[k]); + } psortedarr.clear(); } arr.clear(); diff --git a/tests/test_qsortfp16.cpp b/tests/test_qsortfp16.cpp index 094d4cdc..f86d77df 100644 --- a/tests/test_qsortfp16.cpp +++ b/tests/test_qsortfp16.cpp @@ -95,8 +95,17 @@ TEST(avx512_qselect_float16, test_arrsizes) std::sort(sortedarr.begin(), sortedarr.end()); for (size_t k = 0; k < arr.size(); ++k) { psortedarr = arr; - avx512_qselect<_Float16>(psortedarr.data(), k+1, psortedarr.size()); + avx512_qselect<_Float16>(psortedarr.data(), k, psortedarr.size()); + /* index k is correct */ ASSERT_EQ(sortedarr[k], psortedarr[k]); + /* Check left partition */ + for (size_t jj = 0; jj < k; jj++) { + ASSERT_LE(psortedarr[jj], psortedarr[k]); + } + /* Check right partition */ + for (size_t jj = k+1; jj < arr.size(); jj++) { + ASSERT_GE(psortedarr[jj], psortedarr[k]); + } psortedarr.clear(); } arr.clear(); From f4bca13484e6bbf888fd439fd5c0f54b1f2fe1af Mon Sep 17 00:00:00 2001 From: Mitchell O'Sullivan Date: Sun, 9 Apr 2023 19:40:25 +1000 Subject: [PATCH 6/7] Correct language on pivot and partition functions The comment and variable names appear misleading as the function actually returns the position of the element immediately following the last which is less than the pivot. --- src/avx512-common-qsort.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index 6ef2c584..5b6591f0 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -131,8 +131,8 @@ static inline zmm_t cmp_merge(zmm_t in1, zmm_t in2, opmask_t mask) return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max } /* - * Parition one ZMM register based on the pivot and returns the index of the - * last element that is less than equal to the pivot. + * Parition one ZMM register based on the pivot and returns the + * number of elements that are greater than or equal to the pivot. */ template static inline int32_t partition_vec(type_t *arr, @@ -143,20 +143,20 @@ static inline int32_t partition_vec(type_t *arr, zmm_t *smallest_vec, zmm_t *biggest_vec) { - /* which elements are larger than the pivot */ - typename vtype::opmask_t gt_mask = vtype::ge(curr_vec, pivot_vec); - int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask); + /* which elements are larger than or equal to the pivot */ + typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec); + int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)ge_mask); vtype::mask_compressstoreu( - arr + left, vtype::knot_opmask(gt_mask), curr_vec); + arr + left, vtype::knot_opmask(ge_mask), curr_vec); vtype::mask_compressstoreu( - arr + right - amount_gt_pivot, gt_mask, curr_vec); + arr + right - amount_ge_pivot, ge_mask, curr_vec); *smallest_vec = vtype::min(curr_vec, *smallest_vec); *biggest_vec = vtype::max(curr_vec, *biggest_vec); - return amount_gt_pivot; + return amount_ge_pivot; } /* * Parition an array based on the pivot and returns the index of the - * last element that is less than equal to the pivot. + * first element that is greater than or equal to the pivot. */ template static inline int64_t partition_avx512(type_t *arr, @@ -188,7 +188,7 @@ static inline int64_t partition_avx512(type_t *arr, if (right - left == vtype::numlanes) { zmm_t vec = vtype::loadu(arr + left); - int32_t amount_gt_pivot = partition_vec(arr, + int32_t amount_ge_pivot = partition_vec(arr, left, left + vtype::numlanes, vec, @@ -197,7 +197,7 @@ static inline int64_t partition_avx512(type_t *arr, &max_vec); *smallest = vtype::reducemin(min_vec); *biggest = vtype::reducemax(max_vec); - return left + (vtype::numlanes - amount_gt_pivot); + return left + (vtype::numlanes - amount_ge_pivot); } // first and last vtype::numlanes values are partitioned at the end @@ -225,7 +225,7 @@ static inline int64_t partition_avx512(type_t *arr, left += vtype::numlanes; } // partition the current vector and save it on both sides of the array - int32_t amount_gt_pivot + int32_t amount_ge_pivot = partition_vec(arr, l_store, r_store + vtype::numlanes, @@ -234,27 +234,27 @@ static inline int64_t partition_avx512(type_t *arr, &min_vec, &max_vec); ; - r_store -= amount_gt_pivot; - l_store += (vtype::numlanes - amount_gt_pivot); + r_store -= amount_ge_pivot; + l_store += (vtype::numlanes - amount_ge_pivot); } /* partition and save vec_left and vec_right */ - int32_t amount_gt_pivot = partition_vec(arr, + int32_t amount_ge_pivot = partition_vec(arr, l_store, r_store + vtype::numlanes, vec_left, pivot_vec, &min_vec, &max_vec); - l_store += (vtype::numlanes - amount_gt_pivot); - amount_gt_pivot = partition_vec(arr, + l_store += (vtype::numlanes - amount_ge_pivot); + amount_ge_pivot = partition_vec(arr, l_store, l_store + vtype::numlanes, vec_right, pivot_vec, &min_vec, &max_vec); - l_store += (vtype::numlanes - amount_gt_pivot); + l_store += (vtype::numlanes - amount_ge_pivot); *smallest = vtype::reducemin(min_vec); *biggest = vtype::reducemax(max_vec); return l_store; From f8d1c2274c7d27cdf1a62d5a224cb5ff82e92077 Mon Sep 17 00:00:00 2001 From: Mitchell O'Sullivan Date: Sun, 16 Apr 2023 20:01:41 +1000 Subject: [PATCH 7/7] Modify partial sort benchmarking to vary k --- benchmarks/bench_partial_qsort.hpp | 50 ++++++++++++++---------------- benchmarks/bench_qselect.hpp | 50 ++++++++++++++---------------- benchmarks/bench_qsort.hpp | 4 +-- benchmarks/bench_qsortfp16.cpp | 40 ++++++++++-------------- 4 files changed, 64 insertions(+), 80 deletions(-) diff --git a/benchmarks/bench_partial_qsort.hpp b/benchmarks/bench_partial_qsort.hpp index 6f31040d..d54ceb31 100644 --- a/benchmarks/bench_partial_qsort.hpp +++ b/benchmarks/bench_partial_qsort.hpp @@ -9,7 +9,8 @@ static void avx512_partial_qsort(benchmark::State& state) { state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); } // Perform setup here - size_t ARRSIZE = state.range(0); + int64_t K = state.range(0); + size_t ARRSIZE = 10000; std::vector arr; std::vector arr_bkp; @@ -17,12 +18,9 @@ static void avx512_partial_qsort(benchmark::State& state) { arr = get_uniform_rand_array(ARRSIZE); arr_bkp = arr; - /* Choose random index to sort up until */ - int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); - /* call avx512_partial_qsort */ for (auto _ : state) { - avx512_partial_qsort(arr.data(), k, ARRSIZE); + avx512_partial_qsort(arr.data(), K, ARRSIZE); state.PauseTiming(); arr = arr_bkp; @@ -33,7 +31,8 @@ static void avx512_partial_qsort(benchmark::State& state) { template static void stdpartialsort(benchmark::State& state) { // Perform setup here - size_t ARRSIZE = state.range(0); + int64_t K = state.range(0); + size_t ARRSIZE = 10000; std::vector arr; std::vector arr_bkp; @@ -41,12 +40,9 @@ static void stdpartialsort(benchmark::State& state) { arr = get_uniform_rand_array(ARRSIZE); arr_bkp = arr; - /* Choose random index to sort up until */ - int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); - /* call std::partial_sort */ for (auto _ : state) { - std::partial_sort(arr.begin(), arr.begin() + k, arr.end()); + std::partial_sort(arr.begin(), arr.begin() + K, arr.end()); state.PauseTiming(); arr = arr_bkp; @@ -55,22 +51,22 @@ static void stdpartialsort(benchmark::State& state) { } // Register the function as a benchmark -BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdpartialsort)->Arg(10000)->Arg(10000000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -//BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdpartialsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_partial_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdpartialsort)->Arg(10000)->Arg(10000000); +//BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); diff --git a/benchmarks/bench_qselect.hpp b/benchmarks/bench_qselect.hpp index 487079aa..fea5bea4 100644 --- a/benchmarks/bench_qselect.hpp +++ b/benchmarks/bench_qselect.hpp @@ -9,7 +9,8 @@ static void avx512_qselect(benchmark::State& state) { state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); } // Perform setup here - size_t ARRSIZE = state.range(0); + int64_t K = state.range(0); + size_t ARRSIZE = 10000; std::vector arr; std::vector arr_bkp; @@ -17,12 +18,9 @@ static void avx512_qselect(benchmark::State& state) { arr = get_uniform_rand_array(ARRSIZE); arr_bkp = arr; - /* Choose random index to make sorted */ - int k = get_uniform_rand_array(1, ARRSIZE - 1, 0).front(); - /* call avx512 quickselect */ for (auto _ : state) { - avx512_qselect(arr.data(), k, ARRSIZE); + avx512_qselect(arr.data(), K, ARRSIZE); state.PauseTiming(); arr = arr_bkp; @@ -33,7 +31,8 @@ static void avx512_qselect(benchmark::State& state) { template static void stdnthelement(benchmark::State& state) { // Perform setup here - size_t ARRSIZE = state.range(0); + int64_t K = state.range(0); + size_t ARRSIZE = 10000; std::vector arr; std::vector arr_bkp; @@ -41,12 +40,9 @@ static void stdnthelement(benchmark::State& state) { arr = get_uniform_rand_array(ARRSIZE); arr_bkp = arr; - /* Choose random index to make sorted */ - int k = get_uniform_rand_array(1, ARRSIZE - 1, 0).front(); - /* call std::nth_element */ for (auto _ : state) { - std::nth_element(arr.begin(), arr.begin() + k, arr.end()); + std::nth_element(arr.begin(), arr.begin() + K, arr.end()); state.PauseTiming(); arr = arr_bkp; @@ -55,22 +51,22 @@ static void stdnthelement(benchmark::State& state) { } // Register the function as a benchmark -BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); -BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); -BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); -BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); -BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); -BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); -BENCHMARK(stdnthelement)->Arg(10000)->Arg(10000000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -//BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); -BENCHMARK(stdnthelement)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qselect)->Arg(10000)->Arg(1000000); -BENCHMARK(stdnthelement)->Arg(10000)->Arg(10000000); +//BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); diff --git a/benchmarks/bench_qsort.hpp b/benchmarks/bench_qsort.hpp index bd652993..6659fdae 100644 --- a/benchmarks/bench_qsort.hpp +++ b/benchmarks/bench_qsort.hpp @@ -59,10 +59,10 @@ BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(10000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); //BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(10000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); diff --git a/benchmarks/bench_qsortfp16.cpp b/benchmarks/bench_qsortfp16.cpp index 4f700cb4..eddd876d 100644 --- a/benchmarks/bench_qsortfp16.cpp +++ b/benchmarks/bench_qsortfp16.cpp @@ -66,7 +66,8 @@ template static void avx512_qselect(benchmark::State& state) { if (cpu_has_avx512fp16()) { // Perform setup here - size_t ARRSIZE = state.range(0); + int64_t K = state.range(0); + size_t ARRSIZE = 10000; std::vector arr; std::vector arr_bkp; @@ -77,12 +78,9 @@ static void avx512_qselect(benchmark::State& state) { } arr_bkp = arr; - /* Choose random index to make sorted */ - int k = get_uniform_rand_array(1, ARRSIZE - 1, 0).front(); - /* call avx512 quickselect */ for (auto _ : state) { - avx512_qselect(arr.data(), k, ARRSIZE); + avx512_qselect(arr.data(), K, ARRSIZE); state.PauseTiming(); arr = arr_bkp; @@ -98,7 +96,8 @@ template static void stdnthelement(benchmark::State& state) { if (cpu_has_avx512fp16()) { // Perform setup here - size_t ARRSIZE = state.range(0); + int64_t K = state.range(0); + size_t ARRSIZE = 10000; std::vector arr; std::vector arr_bkp; @@ -109,12 +108,9 @@ static void stdnthelement(benchmark::State& state) { } arr_bkp = arr; - /* Choose random index to sort until */ - int k = get_uniform_rand_array(1, ARRSIZE - 1, 0).front(); - /* call std::nth_element */ for (auto _ : state) { - std::nth_element(arr.begin(), arr.begin() + k, arr.end()); + std::nth_element(arr.begin(), arr.begin() + K, arr.end()); state.PauseTiming(); arr = arr_bkp; @@ -127,14 +123,15 @@ static void stdnthelement(benchmark::State& state) { } // Register the function as a benchmark -BENCHMARK(avx512_qselect<_Float16>)->Arg(10000)->Arg(1000000); -BENCHMARK(stdnthelement<_Float16>)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qselect<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); template static void avx512_partial_qsort(benchmark::State& state) { if (cpu_has_avx512fp16()) { // Perform setup here - size_t ARRSIZE = state.range(0); + int64_t K = state.range(0); + size_t ARRSIZE = 10000; std::vector arr; std::vector arr_bkp; @@ -145,12 +142,9 @@ static void avx512_partial_qsort(benchmark::State& state) { } arr_bkp = arr; - /* Choose random index to sort up until */ - int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); - /* call avx512_partial_qsort */ for (auto _ : state) { - avx512_partial_qsort(arr.data(), k, ARRSIZE); + avx512_partial_qsort(arr.data(), K, ARRSIZE); state.PauseTiming(); arr = arr_bkp; @@ -166,7 +160,8 @@ template static void stdpartialsort(benchmark::State& state) { if (cpu_has_avx512fp16()) { // Perform setup here - size_t ARRSIZE = state.range(0); + int64_t K = state.range(0); + size_t ARRSIZE = 10000; std::vector arr; std::vector arr_bkp; @@ -177,12 +172,9 @@ static void stdpartialsort(benchmark::State& state) { } arr_bkp = arr; - /* Choose random index to sort up until */ - int k = get_uniform_rand_array(1, ARRSIZE, 1).front(); - /* call std::partial_sort */ for (auto _ : state) { - std::partial_sort(arr.begin(), arr.begin() + k, arr.end()); + std::partial_sort(arr.begin(), arr.begin() + K, arr.end()); state.PauseTiming(); arr = arr_bkp; @@ -195,5 +187,5 @@ static void stdpartialsort(benchmark::State& state) { } // Register the function as a benchmark -BENCHMARK(avx512_partial_qsort<_Float16>)->Arg(10000)->Arg(1000000); -BENCHMARK(stdpartialsort<_Float16>)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_partial_qsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);