Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions src/avx512-16bit-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,10 +290,11 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
}

template <typename vtype, typename type_t>
static void
qselect_16bit_(type_t *arr, int64_t pos,
int64_t left, int64_t right,
int64_t max_iters)
static void qselect_16bit_(type_t *arr,
int64_t pos,
int64_t left,
int64_t right,
int64_t max_iters)
{
/*
* Resort to std::sort if quicksort isnt making any progress
Expand Down
13 changes: 7 additions & 6 deletions src/avx512-32bit-qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,7 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
type_t pivot = get_pivot_32bit<vtype>(arr, left, right);
type_t smallest = vtype::type_max();
type_t biggest = vtype::type_min();
int64_t pivot_index = partition_avx512<vtype>(
int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
arr, left, right + 1, pivot, &smallest, &biggest);
if (pivot != smallest)
qsort_32bit_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
Expand All @@ -657,10 +657,11 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
}

template <typename vtype, typename type_t>
static void
qselect_32bit_(type_t *arr, int64_t pos,
int64_t left, int64_t right,
int64_t max_iters)
static void qselect_32bit_(type_t *arr,
int64_t pos,
int64_t left,
int64_t right,
int64_t max_iters)
{
/*
* Resort to std::sort if quicksort isnt making any progress
Expand All @@ -680,7 +681,7 @@ qselect_32bit_(type_t *arr, int64_t pos,
type_t pivot = get_pivot_32bit<vtype>(arr, left, right);
type_t smallest = vtype::type_max();
type_t biggest = vtype::type_min();
int64_t pivot_index = partition_avx512<vtype>(
int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
arr, left, right + 1, pivot, &smallest, &biggest);
if ((pivot != smallest) && (pos < pivot_index))
qselect_32bit_<vtype>(arr, pos, left, pivot_index - 1, max_iters - 1);
Expand Down
366 changes: 358 additions & 8 deletions src/avx512-64bit-qsort.hpp

Large diffs are not rendered by default.

122 changes: 121 additions & 1 deletion src/avx512-common-qsort.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ void avx512_qselect(T *arr, int64_t k, int64_t arrsize);
void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize);

template <typename T>
inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) {
inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize)
{
avx512_qselect<T>(arr, k - 1, arrsize);
avx512_qsort<T>(arr, k - 1);
}
Expand Down Expand Up @@ -259,4 +260,123 @@ static inline int64_t partition_avx512(type_t *arr,
*biggest = vtype::reducemax(max_vec);
return l_store;
}

/*
 * Partition arr[left, right) around `pivot`, processing `num_unroll` vector
 * registers per loop iteration.
 *
 * Template parameters:
 *   vtype      - vector-traits type; this function uses its zmm_t, numlanes,
 *                set1, loadu, reducemin and reducemax members.
 *   num_unroll - number of vectors loaded from one side of the array and
 *                partitioned per iteration of the main loop.
 *   type_t     - scalar element type, defaults to vtype::type_t.
 *
 * Parameters:
 *   arr      - array being partitioned in place.
 *   left     - index of the first element of the range.
 *   right    - index one past the last element of the range (the vector
 *              loads below end at arr + right).
 *   pivot    - pivot value.
 *   smallest - in/out: read as the initial minimum, overwritten on return
 *              with the reduced minimum tracked during partitioning.
 *   biggest  - in/out: read as the initial maximum, overwritten on return
 *              with the reduced maximum tracked during partitioning.
 *
 * Returns l_store, the index one past the last element written to the left
 * side of the array. Ranges of at most 2 * num_unroll * vtype::numlanes
 * elements are delegated to partition_avx512.
 *
 * NOTE(review): num_unroll is used as a modulus below, so it is assumed to
 * be >= 1 - confirm no caller instantiates this with 0.
 */
template <typename vtype,
int num_unroll,
typename type_t = typename vtype::type_t>
static inline int64_t partition_avx512_unrolled(type_t *arr,
int64_t left,
int64_t right,
type_t pivot,
type_t *smallest,
type_t *biggest)
{
/* too few elements to fill the left and right register sets: use the
 * non-unrolled partition instead */
if (right - left <= 2 * num_unroll * vtype::numlanes) {
return partition_avx512<vtype>(
arr, left, right, pivot, smallest, biggest);
}
/* make array length divisible by num_unroll*vtype::numlanes, shortening the
 * array one scalar element at a time */
for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0;
--i) {
/* comparison_func is passed as the comparator to std::min/std::max, i.e.
 * it is used here as a less-than style predicate */
*smallest = std::min(*smallest, arr[left], comparison_func<vtype>);
*biggest = std::max(*biggest, arr[left], comparison_func<vtype>);
/* element does not compare less than the pivot: move it to the right end
 * and shrink the range from the right; otherwise shrink from the left */
if (!comparison_func<vtype>(arr[left], pivot)) {
std::swap(arr[left], arr[--right]);
}
else {
++left;
}
}

/* NOTE(review): given the size guard at the top, the range still holds at
 * least 2*num_unroll*vtype::numlanes elements here, so this branch looks
 * unreachable for num_unroll >= 1 - confirm before relying on it */
if (left == right)
return left; /* nothing left to partition */

using zmm_t = typename vtype::zmm_t;
zmm_t pivot_vec = vtype::set1(pivot);
/* running min/max vectors, seeded from the caller-supplied values */
zmm_t min_vec = vtype::set1(*smallest);
zmm_t max_vec = vtype::set1(*biggest);

// We will now have at least 2*num_unroll registers worth of data to process:
// the leftmost and rightmost num_unroll vectors are loaded up front and
// partitioned at the end
zmm_t vec_left[num_unroll], vec_right[num_unroll];
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
vec_right[ii] = vtype::loadu(
arr + (right - vtype::numlanes * (num_unroll - ii)));
}
// store points of the vectors
int64_t r_store = right - vtype::numlanes;
int64_t l_store = left;
// indices for loading the elements
left += num_unroll * vtype::numlanes;
right -= num_unroll * vtype::numlanes;
/* the remaining length is a multiple of num_unroll*vtype::numlanes, and
 * each iteration consumes exactly that many elements from one side */
while (right - left != 0) {
zmm_t curr_vec[num_unroll];
/*
 * if fewer elements are stored on the right side of the array,
 * then next elements are loaded from the right side,
 * otherwise from the left side
 */
if ((r_store + vtype::numlanes) - right < left - l_store) {
right -= num_unroll * vtype::numlanes;
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
}
}
else {
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
}
left += num_unroll * vtype::numlanes;
}
// partition the current vectors and save them on both sides of the array
// NOTE(review): judging by the variable name, partition_vec returns the
// number of lanes that were >= pivot - confirm against partition_vec
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
l_store,
r_store + vtype::numlanes,
curr_vec[ii],
pivot_vec,
&min_vec,
&max_vec);
/* advance the left store point by the lanes kept on the left and pull
 * the right store point back by the lanes sent to the right */
l_store += (vtype::numlanes - amount_ge_pivot);
r_store -= amount_ge_pivot;
}
}

/* partition and save the vec_left[] and vec_right[] registers that were
 * loaded before the main loop */
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
l_store,
r_store + vtype::numlanes,
vec_left[ii],
pivot_vec,
&min_vec,
&max_vec);
l_store += (vtype::numlanes - amount_ge_pivot);
r_store -= amount_ge_pivot;
}
#pragma GCC unroll 8
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
l_store,
r_store + vtype::numlanes,
vec_right[ii],
pivot_vec,
&min_vec,
&max_vec);
l_store += (vtype::numlanes - amount_ge_pivot);
r_store -= amount_ge_pivot;
}
/* collapse the running min/max vectors into the scalar outputs */
*smallest = vtype::reducemin(min_vec);
*biggest = vtype::reducemax(max_vec);
return l_store;
}
#endif // AVX512_QSORT_COMMON
2 changes: 1 addition & 1 deletion tests/test_keyvalue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
* *******************************************/

#include "avx512-64bit-keyvaluesort.hpp"
#include "rand_array.h"
#include "cpuinfo.h"
#include "rand_array.h"
#include <gtest/gtest.h>
#include <vector>

Expand Down
3 changes: 2 additions & 1 deletion tests/test_partial_qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ TYPED_TEST_P(avx512_partial_sort, test_ranges)
int k = get_uniform_rand_array<int64_t>(1, arrsize, 1).front();

/* Sort the range and verify all the required elements match the presorted set */
avx512_partial_qsort<TypeParam>(psortedarr.data(), k, psortedarr.size());
avx512_partial_qsort<TypeParam>(
psortedarr.data(), k, psortedarr.size());
for (size_t jj = 0; jj < k; jj++) {
ASSERT_EQ(sortedarr[jj], psortedarr[jj]);
}
Expand Down
53 changes: 49 additions & 4 deletions tests/test_qselect.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ class avx512_select : public ::testing::Test {
};
TYPED_TEST_SUITE_P(avx512_select);

TYPED_TEST_P(avx512_select, test_arrsizes)
TYPED_TEST_P(avx512_select, test_random)
{
if (cpu_has_avx512bw()) {
if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
Expand All @@ -26,15 +26,16 @@ TYPED_TEST_P(avx512_select, test_arrsizes)
std::sort(sortedarr.begin(), sortedarr.end());
for (size_t k = 0; k < arr.size(); ++k) {
psortedarr = arr;
avx512_qselect<TypeParam>(psortedarr.data(), k, psortedarr.size());
avx512_qselect<TypeParam>(
psortedarr.data(), k, psortedarr.size());
/* index k is correct */
ASSERT_EQ(sortedarr[k], psortedarr[k]);
/* Check left partition */
for (size_t jj = 0; jj < k; jj++) {
ASSERT_LE(psortedarr[jj], psortedarr[k]);
}
/* Check right partition */
for (size_t jj = k+1; jj < arr.size(); jj++) {
for (size_t jj = k + 1; jj < arr.size(); jj++) {
ASSERT_GE(psortedarr[jj], psortedarr[k]);
}
psortedarr.clear();
Expand All @@ -48,4 +49,48 @@ TYPED_TEST_P(avx512_select, test_arrsizes)
}
}

REGISTER_TYPED_TEST_SUITE_P(avx512_select, test_arrsizes);
/*
 * Runs avx512_qselect for every index k of arrays of length 0..1023 produced
 * by get_uniform_rand_array<TypeParam>(len, 20, 1) (presumably values drawn
 * from a small range, so duplicates are frequent - confirm against
 * rand_array.h). For each k it checks that position k holds the same value as
 * in the std::sort-ed array, that nothing before k is larger, and that nothing
 * after k is smaller.
 */
TYPED_TEST_P(avx512_select, test_small_range)
{
    /* GTEST_SKIP() returns from the test body, so these act as guards */
    if (!cpu_has_avx512bw()) {
        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
    }
    if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
        GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
    }

    for (int64_t len = 0; len < 1024; ++len) {
        /* Random input and its fully sorted reference */
        const std::vector<TypeParam> input
                = get_uniform_rand_array<TypeParam>(len, 20, 1);
        std::vector<TypeParam> reference = input;
        std::sort(reference.begin(), reference.end());

        for (size_t k = 0; k < input.size(); ++k) {
            /* Start every selection from the unmodified input */
            std::vector<TypeParam> selected = input;
            avx512_qselect<TypeParam>(selected.data(), k, selected.size());

            /* index k is correct */
            ASSERT_EQ(reference[k], selected[k]);

            const size_t count = input.size();
            for (size_t idx = 0; idx < count; ++idx) {
                if (idx < k) {
                    /* left partition: nothing larger than the k-th value */
                    ASSERT_LE(selected[idx], selected[k]);
                }
                else if (idx > k) {
                    /* right partition: nothing smaller than the k-th value */
                    ASSERT_GE(selected[idx], selected[k]);
                }
            }
        }
    }
}

REGISTER_TYPED_TEST_SUITE_P(avx512_select, test_random, test_small_range);
99 changes: 96 additions & 3 deletions tests/test_qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class avx512_sort : public ::testing::Test {
};
TYPED_TEST_SUITE_P(avx512_sort);

TYPED_TEST_P(avx512_sort, test_arrsizes)
TYPED_TEST_P(avx512_sort, test_random)
{
if (cpu_has_avx512bw()) {
if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
Expand All @@ -29,7 +29,7 @@ TYPED_TEST_P(avx512_sort, test_arrsizes)
/* Sort with std::sort for comparison */
std::sort(sortedarr.begin(), sortedarr.end());
avx512_qsort<TypeParam>(arr.data(), arr.size());
ASSERT_EQ(sortedarr, arr);
ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
arr.clear();
sortedarr.clear();
}
Expand All @@ -39,4 +39,97 @@ TYPED_TEST_P(avx512_sort, test_arrsizes)
}
}

REGISTER_TYPED_TEST_SUITE_P(avx512_sort, test_arrsizes);
/*
 * Sorts strictly descending arrays (size, size - 1, ..., 1) of every length
 * from 1 to 1024 with avx512_qsort and compares the result against std::sort.
 *
 * Array lengths are kept as int64_t throughout; they are deliberately not
 * routed through TypeParam, since a length is not an element value and a
 * narrow element type would otherwise truncate it.
 */
TYPED_TEST_P(avx512_sort, test_reverse)
{
    /* GTEST_SKIP() returns from the test body, so these act as guards */
    if (!cpu_has_avx512bw()) {
        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
    }
    if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
        GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
    }

    for (int64_t arrsize = 1; arrsize <= 1024; ++arrsize) {
        /* reverse array */
        std::vector<TypeParam> arr;
        arr.reserve(static_cast<size_t>(arrsize));
        for (int64_t jj = 0; jj < arrsize; ++jj) {
            arr.push_back(static_cast<TypeParam>(arrsize - jj));
        }

        /* Sort with std::sort for comparison */
        std::vector<TypeParam> sortedarr = arr;
        std::sort(sortedarr.begin(), sortedarr.end());

        avx512_qsort<TypeParam>(arr.data(), arr.size());
        ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsize;
    }
}

/*
 * Sorts arrays whose elements are all equal, for every length from 1 to 1024,
 * with avx512_qsort and compares the result against std::sort. The fill value
 * for an array of length n is n - 1 (the zero-based iteration index).
 *
 * Array lengths are kept as int64_t; only the fill value is converted to
 * TypeParam, and that conversion is explicit.
 */
TYPED_TEST_P(avx512_sort, test_constant)
{
    /* GTEST_SKIP() returns from the test body, so these act as guards */
    if (!cpu_has_avx512bw()) {
        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
    }
    if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
        GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
    }

    for (int64_t arrsize = 1; arrsize <= 1024; ++arrsize) {
        /* constant array: arrsize copies of (arrsize - 1) */
        std::vector<TypeParam> arr(static_cast<size_t>(arrsize),
                                   static_cast<TypeParam>(arrsize - 1));

        /* Sort with std::sort for comparison */
        std::vector<TypeParam> sortedarr = arr;
        std::sort(sortedarr.begin(), sortedarr.end());

        avx512_qsort<TypeParam>(arr.data(), arr.size());
        ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsize;
    }
}

/*
 * Sorts arrays produced by get_uniform_rand_array<TypeParam>(size, 20, 1)
 * (presumably values from a small range, so duplicates are frequent - confirm
 * against rand_array.h) for every length from 1 to 1024 with avx512_qsort and
 * compares the result against std::sort.
 *
 * Array lengths are kept as int64_t; they are deliberately not cast through
 * TypeParam, since a length is not an element value.
 */
TYPED_TEST_P(avx512_sort, test_small_range)
{
    /* GTEST_SKIP() returns from the test body, so these act as guards */
    if (!cpu_has_avx512bw()) {
        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
    }
    if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
        GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
    }

    for (int64_t arrsize = 1; arrsize <= 1024; ++arrsize) {
        std::vector<TypeParam> arr
                = get_uniform_rand_array<TypeParam>(arrsize, 20, 1);

        /* Sort with std::sort for comparison */
        std::vector<TypeParam> sortedarr = arr;
        std::sort(sortedarr.begin(), sortedarr.end());

        avx512_qsort<TypeParam>(arr.data(), arr.size());
        ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsize;
    }
}
REGISTER_TYPED_TEST_SUITE_P(avx512_sort,
test_random,
test_reverse,
test_constant,
test_small_range);
Loading