From 2c73c07259dff2f041b4e6fce5242e15c647d227 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 1 Nov 2023 11:53:13 -0700 Subject: [PATCH 1/5] Add meson flag to build benchmarks --- scripts/bench-compare.sh | 2 +- scripts/branch-compare.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/bench-compare.sh b/scripts/bench-compare.sh index 57347cce..85bc2756 100755 --- a/scripts/bench-compare.sh +++ b/scripts/bench-compare.sh @@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then fi compare=$(realpath .bench/google-benchmark/tools/compare.py) -meson setup --warnlevel 0 --buildtype release builddir-${branch} +meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir-${branch} cd builddir-${branch} ninja $compare filters ./benchexe $1 $2 diff --git a/scripts/branch-compare.sh b/scripts/branch-compare.sh index 91d3c681..ca833ed9 100755 --- a/scripts/branch-compare.sh +++ b/scripts/branch-compare.sh @@ -27,7 +27,7 @@ build_branch() { fi fi cd $dir_name - meson setup --warnlevel 0 --buildtype release builddir + meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir cd builddir ninja cd ../../ From 4cf96127f37e683eb2639eeeba1f01683313f52b Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 1 Nov 2023 11:54:47 -0700 Subject: [PATCH 2/5] Improve avx2 min/max for 64-bit --- src/avx2-64bit-qsort.hpp | 15 ++++++++++++--- src/avx2-emu-funcs.hpp | 4 ++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp index 6135302a..afca85ac 100644 --- a/src/avx2-64bit-qsort.hpp +++ b/src/avx2-64bit-qsort.hpp @@ -89,12 +89,15 @@ struct avx2_vector { { return _mm256_xor_si256(x, y); } + static opmask_t gt(reg_t x, reg_t y) + { + return _mm256_cmpgt_epi64(x, y); + } static opmask_t ge(reg_t x, reg_t y) { opmask_t equal = eq(x, y); opmask_t greater = _mm256_cmpgt_epi64(x, y); - return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(equal), - _mm256_castsi256_pd(greater))); + return _mm256_or_si256(equal, greater); } static opmask_t eq(reg_t x, reg_t y) { @@ -258,10 +261,16 @@ struct avx2_vector { return _mm256_i64gather_epi64( (long long int const *)base, index, scale); } + static opmask_t gt(reg_t x, reg_t y) + { + const __m256i offset = _mm256_set1_epi64x(0x8000000000000000); + x = _mm256_add_epi64(x, offset); + y = _mm256_add_epi64(y, offset); + return _mm256_cmpgt_epi64(x, y); + } static opmask_t ge(reg_t x, reg_t y) { opmask_t equal = eq(x, y); - const __m256i offset = _mm256_set1_epi64x(0x8000000000000000); x = _mm256_add_epi64(x, offset); y = _mm256_add_epi64(y, offset); diff --git a/src/avx2-emu-funcs.hpp b/src/avx2-emu-funcs.hpp index 0dd50c09..9564d13b 100644 --- a/src/avx2-emu-funcs.hpp +++ b/src/avx2-emu-funcs.hpp @@ -273,7 +273,7 @@ typename avx2_vector::reg_t avx2_emu_max(typename avx2_vector::reg_t x, typename avx2_vector::reg_t y) { using vtype = avx2_vector; - typename vtype::opmask_t nlt = vtype::ge(x, y); + typename vtype::opmask_t nlt = vtype::gt(x, y); return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(y), _mm256_castsi256_pd(x), _mm256_castsi256_pd(nlt))); @@ -284,7 +284,7 @@ typename avx2_vector::reg_t avx2_emu_min(typename avx2_vector::reg_t x, typename avx2_vector::reg_t y) { using vtype = avx2_vector; - typename vtype::opmask_t nlt = vtype::ge(x, y); + typename vtype::opmask_t nlt = vtype::gt(x, y); return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(x), _mm256_castsi256_pd(y), _mm256_castsi256_pd(nlt))); From 31dea7c421d9acf47d148d8f4cd97b06c5897d19 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 1 Nov 2023 12:48:23 -0700 Subject: [PATCH 3/5] use xor instead of add --- src/avx2-64bit-qsort.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp index afca85ac..fa80d437 100644 --- a/src/avx2-64bit-qsort.hpp +++ b/src/avx2-64bit-qsort.hpp @@ -264,16 +264,16 @@ struct avx2_vector { static opmask_t gt(reg_t x, reg_t y) { const __m256i offset = _mm256_set1_epi64x(0x8000000000000000); - x = _mm256_add_epi64(x, offset); - y = _mm256_add_epi64(y, offset); + x = _mm256_xor_si256(x, offset); + y = _mm256_xor_si256(y, offset); return _mm256_cmpgt_epi64(x, y); } static opmask_t ge(reg_t x, reg_t y) { opmask_t equal = eq(x, y); const __m256i offset = _mm256_set1_epi64x(0x8000000000000000); - x = _mm256_add_epi64(x, offset); - y = _mm256_add_epi64(y, offset); + x = _mm256_xor_si256(x, offset); + y = _mm256_xor_si256(y, offset); opmask_t greater = _mm256_cmpgt_epi64(x, y); return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(equal), From a0f93fb642fde54b07ecf6a2ef42fa800fc305a0 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 1 Nov 2023 13:06:13 -0700 Subject: [PATCH 4/5] use _mm256_or_si256 --- src/avx2-64bit-qsort.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp index fa80d437..2b9af0b9 100644 --- a/src/avx2-64bit-qsort.hpp +++ b/src/avx2-64bit-qsort.hpp @@ -274,10 +274,8 @@ struct avx2_vector { const __m256i offset = _mm256_set1_epi64x(0x8000000000000000); x = _mm256_xor_si256(x, offset); y = _mm256_xor_si256(y, offset); - opmask_t greater = _mm256_cmpgt_epi64(x, y); - return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(equal), - _mm256_castsi256_pd(greater))); + return _mm256_or_si256(equal, greater); } static opmask_t eq(reg_t x, reg_t y) { From cffafff1b98f63194c4c331144bf91d8cfabe094 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 1 Nov 2023 13:06:31 -0700 Subject: [PATCH 5/5] change unroll partition factor --- src/avx2-64bit-qsort.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp index 2b9af0b9..ef91f373 100644 --- a/src/avx2-64bit-qsort.hpp +++ b/src/avx2-64bit-qsort.hpp @@ -60,7 +60,7 @@ struct avx2_vector { #else static constexpr int network_sort_threshold = 64; #endif - static constexpr int partition_unroll_factor = 4; + static constexpr int partition_unroll_factor = 8; using swizzle_ops = avx2_64bit_swizzle_ops; @@ -224,7 +224,7 @@ struct avx2_vector { #else static constexpr int network_sort_threshold = 64; #endif - static constexpr int partition_unroll_factor = 4; + static constexpr int partition_unroll_factor = 8; using swizzle_ops = avx2_64bit_swizzle_ops; @@ -387,7 +387,7 @@ struct avx2_vector { #else static constexpr int network_sort_threshold = 64; #endif - static constexpr int partition_unroll_factor = 4; + static constexpr int partition_unroll_factor = 8; using swizzle_ops = avx2_64bit_swizzle_ops;