Skip to content
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CXX = g++-12
CXX ?= g++-12
SRCDIR = ./src
TESTDIR = ./tests
BENCHDIR = ./benchmarks
Expand Down
7 changes: 4 additions & 3 deletions benchmarks/bench-qsort-common.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#ifndef AVX512_BENCH_COMMON
#define AVX512_BENCH_COMMON

#include <benchmark/benchmark.h>
#include "rand_array.h"
#include "cpuinfo.h"
#include "avx512-16bit-qsort.hpp"
#include "avx512-32bit-qsort.hpp"
#include "avx512-64bit-argsort.hpp"
#include "avx512-64bit-qsort.hpp"
#include "cpuinfo.h"
#include "rand_array.h"
#include <benchmark/benchmark.h>

#define MY_BENCHMARK_CAPTURE(func, T, test_case_name, ...) \
BENCHMARK_PRIVATE_DECLARE(func) \
Expand Down
96 changes: 96 additions & 0 deletions benchmarks/bench_argsort.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#include "bench-qsort-common.h"

/*
 * Scalar reference argsort.
 *
 * Returns the indices 0..array.size()-1 reordered so that walking them visits
 * the elements of `array` in ascending order according to operator<. The input
 * vector itself is not modified. std::sort is not stable, so the relative
 * order of indices that refer to equal elements is unspecified.
 */
template <typename T>
std::vector<int64_t> stdargsort(const std::vector<T> &array)
{
    std::vector<int64_t> order(array.size());
    std::iota(order.begin(), order.end(), 0);

    // Compare two positions by the values they refer to in `array`.
    const auto by_value = [&array](int64_t lhs, int64_t rhs) -> bool {
        return array[lhs] < array[rhs];
    };
    std::sort(order.begin(), order.end(), by_value);

    return order;
}

/*
 * Benchmark body for the std::sort based argsort on element type T.
 *
 * `args` is expected to hold (array size, array type), where array type is
 * one of "random", "sorted", "constant" or "reverse". Any other array type
 * skips the benchmark with an error instead of timing an empty array.
 */
template <typename T, class... Args>
static void stdargsort(benchmark::State &state, Args &&...args)
{
    // Forward instead of unconditionally moving: an lvalue argument is then
    // copied into the tuple rather than left moved-from in the caller.
    auto args_tuple = std::make_tuple(std::forward<Args>(args)...);
    // Perform setup here
    size_t ARRSIZE = std::get<0>(args_tuple);
    std::vector<T> arr;
    std::vector<int64_t> inx;

    std::string arrtype = std::get<1>(args_tuple);
    if (arrtype == "random") { arr = get_uniform_rand_array<T>(ARRSIZE); }
    else if (arrtype == "sorted") {
        arr = get_uniform_rand_array<T>(ARRSIZE);
        std::sort(arr.begin(), arr.end());
    }
    else if (arrtype == "constant") {
        // One random value repeated ARRSIZE times, allocated in one step.
        T temp = get_uniform_rand_array<T>(1)[0];
        arr.assign(ARRSIZE, temp);
    }
    else if (arrtype == "reverse") {
        arr = get_uniform_rand_array<T>(ARRSIZE);
        std::sort(arr.begin(), arr.end());
        std::reverse(arr.begin(), arr.end());
    }
    else {
        state.SkipWithError("Unknown array type");
        return;
    }

    /* call the std::sort based argsort */
    for (auto _ : state) {
        inx = stdargsort(arr);
    }
}

/*
 * Benchmark body for avx512_argsort on element type T.
 *
 * `args` is expected to hold (array size, array type), where array type is
 * one of "random", "sorted", "constant" or "reverse". The benchmark is skipped
 * when cpu_has_avx512bw() reports false, and skipped with an error for any
 * other array type.
 */
template <typename T, class... Args>
static void avx512argsort(benchmark::State &state, Args &&...args)
{
    if (!cpu_has_avx512bw()) {
        state.SkipWithMessage("Requires AVX512 BW ISA");
        // Skipping does not leave the function by itself; return so no
        // input array is generated and sorted for a run that never happens.
        return;
    }
    // Forward instead of unconditionally moving: an lvalue argument is then
    // copied into the tuple rather than left moved-from in the caller.
    auto args_tuple = std::make_tuple(std::forward<Args>(args)...);
    // Perform setup here
    size_t ARRSIZE = std::get<0>(args_tuple);
    std::vector<T> arr;
    std::vector<int64_t> inx;

    std::string arrtype = std::get<1>(args_tuple);
    if (arrtype == "random") { arr = get_uniform_rand_array<T>(ARRSIZE); }
    else if (arrtype == "sorted") {
        arr = get_uniform_rand_array<T>(ARRSIZE);
        std::sort(arr.begin(), arr.end());
    }
    else if (arrtype == "constant") {
        // One random value repeated ARRSIZE times, allocated in one step.
        T temp = get_uniform_rand_array<T>(1)[0];
        arr.assign(ARRSIZE, temp);
    }
    else if (arrtype == "reverse") {
        arr = get_uniform_rand_array<T>(ARRSIZE);
        std::sort(arr.begin(), arr.end());
        std::reverse(arr.begin(), arr.end());
    }
    else {
        // Without this, `arr` would stay empty while ARRSIZE elements were
        // handed to avx512_argsort below.
        state.SkipWithError("Unknown array type");
        return;
    }

    /* call avx512 argsort */
    for (auto _ : state) {
        // Pass the real element count so the length can never exceed the
        // storage behind arr.data().
        inx = avx512_argsort<T>(arr.data(), arr.size());
    }
}

// Expands to BENCH(...) for both the AVX-512 argsort benchmark and the
// std::sort based reference, for the given element type. BENCH itself is not
// defined in this file.
// The final line has no line-continuation on purpose: the definition ends
// here and cannot absorb whatever line happens to follow it.
#define BENCH_BOTH(type) \
    BENCH(avx512argsort, type) \
    BENCH(stdargsort, type)

// Apply BENCH_BOTH (avx512argsort + stdargsort) to each 64-bit and 32-bit
// element type listed below.
BENCH_BOTH(int64_t)
BENCH_BOTH(uint64_t)
BENCH_BOTH(double)
BENCH_BOTH(int32_t)
BENCH_BOTH(uint32_t)
BENCH_BOTH(float)
48 changes: 39 additions & 9 deletions benchmarks/bench_partial_qsort.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#include "bench-qsort-common.h"

template <typename T>
static void avx512_partial_qsort(benchmark::State& state) {
static void avx512_partial_qsort(benchmark::State &state)
{
if (!cpu_has_avx512bw()) {
state.SkipWithMessage("Requires AVX512 BW ISA");
}
Expand Down Expand Up @@ -29,7 +30,8 @@ static void avx512_partial_qsort(benchmark::State& state) {
}

template <typename T>
static void stdpartialsort(benchmark::State& state) {
static void stdpartialsort(benchmark::State &state)
{
// Perform setup here
int64_t K = state.range(0);
size_t ARRSIZE = 10000;
Expand All @@ -53,20 +55,48 @@ static void stdpartialsort(benchmark::State& state) {
// Register the function as a benchmark
BENCHMARK(avx512_partial_qsort<float>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(stdpartialsort<float>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<uint32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<uint32_t>)
->Arg(10)
->Arg(100)
->Arg(1000)
->Arg(5000);
BENCHMARK(stdpartialsort<uint32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<int32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<int32_t>)
->Arg(10)
->Arg(100)
->Arg(1000)
->Arg(5000);
BENCHMARK(stdpartialsort<int32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);

BENCHMARK(avx512_partial_qsort<double>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<double>)
->Arg(10)
->Arg(100)
->Arg(1000)
->Arg(5000);
BENCHMARK(stdpartialsort<double>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<uint64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<uint64_t>)
->Arg(10)
->Arg(100)
->Arg(1000)
->Arg(5000);
BENCHMARK(stdpartialsort<uint64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<int64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<int64_t>)
->Arg(10)
->Arg(100)
->Arg(1000)
->Arg(5000);
BENCHMARK(stdpartialsort<int64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);

//BENCHMARK(avx512_partial_qsort<float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<uint16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<uint16_t>)
->Arg(10)
->Arg(100)
->Arg(1000)
->Arg(5000);
BENCHMARK(stdpartialsort<uint16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<int16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<int16_t>)
->Arg(10)
->Arg(100)
->Arg(1000)
->Arg(5000);
BENCHMARK(stdpartialsort<int16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
6 changes: 4 additions & 2 deletions benchmarks/bench_qselect.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#include "bench-qsort-common.h"

template <typename T>
static void avx512_qselect(benchmark::State& state) {
static void avx512_qselect(benchmark::State &state)
{
if (!cpu_has_avx512bw()) {
state.SkipWithMessage("Requires AVX512 BW ISA");
}
Expand Down Expand Up @@ -29,7 +30,8 @@ static void avx512_qselect(benchmark::State& state) {
}

template <typename T>
static void stdnthelement(benchmark::State& state) {
static void stdnthelement(benchmark::State &state)
{
// Perform setup here
int64_t K = state.range(0);
size_t ARRSIZE = 10000;
Expand Down
3 changes: 2 additions & 1 deletion benchmarks/bench_qsort.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include "bench_qsort.hpp"
#include "bench_qselect.hpp"
#include "bench_argsort.hpp"
#include "bench_partial_qsort.hpp"
#include "bench_qselect.hpp"
18 changes: 9 additions & 9 deletions benchmarks/bench_qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,15 @@ static void avx512qsort(benchmark::State &state, Args &&...args)
}
}

#define BENCH_ALL(type)\
#define BENCH_BOTH_QSORT(type)\
BENCH(avx512qsort, type)\
BENCH(stdsort, type)

BENCH_ALL(uint64_t)
BENCH_ALL(int64_t)
BENCH_ALL(uint32_t)
BENCH_ALL(int32_t)
BENCH_ALL(uint16_t)
BENCH_ALL(int16_t)
BENCH_ALL(float)
BENCH_ALL(double)
BENCH_BOTH_QSORT(uint64_t)
BENCH_BOTH_QSORT(int64_t)
BENCH_BOTH_QSORT(uint32_t)
BENCH_BOTH_QSORT(int32_t)
BENCH_BOTH_QSORT(uint16_t)
BENCH_BOTH_QSORT(int16_t)
BENCH_BOTH_QSORT(float)
BENCH_BOTH_QSORT(double)
42 changes: 26 additions & 16 deletions benchmarks/bench_qsortfp16.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#include <benchmark/benchmark.h>
#include "rand_array.h"
#include "cpuinfo.h"
#include "avx512fp16-16bit-qsort.hpp"
#include "cpuinfo.h"
#include "rand_array.h"
#include <benchmark/benchmark.h>

template <typename T>
static void avx512_qsort(benchmark::State& state) {
static void avx512_qsort(benchmark::State &state)
{
if (cpu_has_avx512fp16()) {
// Perform setup here
size_t ARRSIZE = state.range(0);
Expand All @@ -13,7 +14,7 @@ static void avx512_qsort(benchmark::State& state) {

/* Initialize elements */
for (size_t jj = 0; jj < ARRSIZE; ++jj) {
_Float16 temp = (float) rand() / (float)(RAND_MAX);
_Float16 temp = (float)rand() / (float)(RAND_MAX);
arr.push_back(temp);
}
arr_bkp = arr;
Expand All @@ -32,15 +33,16 @@ static void avx512_qsort(benchmark::State& state) {
}

template <typename T>
static void stdsort(benchmark::State& state) {
static void stdsort(benchmark::State &state)
{
if (cpu_has_avx512fp16()) {
// Perform setup here
size_t ARRSIZE = state.range(0);
std::vector<T> arr;
std::vector<T> arr_bkp;

for (size_t jj = 0; jj < ARRSIZE; ++jj) {
_Float16 temp = (float) rand() / (float)(RAND_MAX);
_Float16 temp = (float)rand() / (float)(RAND_MAX);
arr.push_back(temp);
}
arr_bkp = arr;
Expand All @@ -63,7 +65,8 @@ BENCHMARK(avx512_qsort<_Float16>)->Arg(10000)->Arg(1000000);
BENCHMARK(stdsort<_Float16>)->Arg(10000)->Arg(1000000);

template <typename T>
static void avx512_qselect(benchmark::State& state) {
static void avx512_qselect(benchmark::State &state)
{
if (cpu_has_avx512fp16()) {
// Perform setup here
int64_t K = state.range(0);
Expand All @@ -73,7 +76,7 @@ static void avx512_qselect(benchmark::State& state) {

/* Initialize elements */
for (size_t jj = 0; jj < ARRSIZE; ++jj) {
_Float16 temp = (float) rand() / (float)(RAND_MAX);
_Float16 temp = (float)rand() / (float)(RAND_MAX);
arr.push_back(temp);
}
arr_bkp = arr;
Expand All @@ -93,7 +96,8 @@ static void avx512_qselect(benchmark::State& state) {
}

template <typename T>
static void stdnthelement(benchmark::State& state) {
static void stdnthelement(benchmark::State &state)
{
if (cpu_has_avx512fp16()) {
// Perform setup here
int64_t K = state.range(0);
Expand All @@ -103,7 +107,7 @@ static void stdnthelement(benchmark::State& state) {

/* Initialize elements */
for (size_t jj = 0; jj < ARRSIZE; ++jj) {
_Float16 temp = (float) rand() / (float)(RAND_MAX);
_Float16 temp = (float)rand() / (float)(RAND_MAX);
arr.push_back(temp);
}
arr_bkp = arr;
Expand All @@ -127,7 +131,8 @@ BENCHMARK(avx512_qselect<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(stdnthelement<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);

template <typename T>
static void avx512_partial_qsort(benchmark::State& state) {
static void avx512_partial_qsort(benchmark::State &state)
{
if (cpu_has_avx512fp16()) {
// Perform setup here
int64_t K = state.range(0);
Expand All @@ -137,7 +142,7 @@ static void avx512_partial_qsort(benchmark::State& state) {

/* Initialize elements */
for (size_t jj = 0; jj < ARRSIZE; ++jj) {
_Float16 temp = (float) rand() / (float)(RAND_MAX);
_Float16 temp = (float)rand() / (float)(RAND_MAX);
arr.push_back(temp);
}
arr_bkp = arr;
Expand All @@ -157,7 +162,8 @@ static void avx512_partial_qsort(benchmark::State& state) {
}

template <typename T>
static void stdpartialsort(benchmark::State& state) {
static void stdpartialsort(benchmark::State &state)
{
if (cpu_has_avx512fp16()) {
// Perform setup here
int64_t K = state.range(0);
Expand All @@ -167,7 +173,7 @@ static void stdpartialsort(benchmark::State& state) {

/* Initialize elements */
for (size_t jj = 0; jj < ARRSIZE; ++jj) {
_Float16 temp = (float) rand() / (float)(RAND_MAX);
_Float16 temp = (float)rand() / (float)(RAND_MAX);
arr.push_back(temp);
}
arr_bkp = arr;
Expand All @@ -187,5 +193,9 @@ static void stdpartialsort(benchmark::State& state) {
}

// Register the function as a benchmark
BENCHMARK(avx512_partial_qsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
BENCHMARK(avx512_partial_qsort<_Float16>)
->Arg(10)
->Arg(100)
->Arg(1000)
->Arg(5000);
BENCHMARK(stdpartialsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
Loading