diff --git a/scripts/bench-compare.sh b/scripts/bench-compare.sh index 57347cce..85bc2756 100755 --- a/scripts/bench-compare.sh +++ b/scripts/bench-compare.sh @@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then fi compare=$(realpath .bench/google-benchmark/tools/compare.py) -meson setup --warnlevel 0 --buildtype release builddir-${branch} +meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir-${branch} cd builddir-${branch} ninja $compare filters ./benchexe $1 $2 diff --git a/scripts/branch-compare.sh b/scripts/branch-compare.sh index 91d3c681..ca833ed9 100755 --- a/scripts/branch-compare.sh +++ b/scripts/branch-compare.sh @@ -27,7 +27,7 @@ build_branch() { fi fi cd $dir_name - meson setup --warnlevel 0 --buildtype release builddir + meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir cd builddir ninja cd ../../ diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp index 6135302a..ef91f373 100644 --- a/src/avx2-64bit-qsort.hpp +++ b/src/avx2-64bit-qsort.hpp @@ -60,7 +60,7 @@ struct avx2_vector { #else static constexpr int network_sort_threshold = 64; #endif - static constexpr int partition_unroll_factor = 4; + static constexpr int partition_unroll_factor = 8; using swizzle_ops = avx2_64bit_swizzle_ops; @@ -89,12 +89,15 @@ struct avx2_vector { { return _mm256_xor_si256(x, y); } + static opmask_t gt(reg_t x, reg_t y) + { + return _mm256_cmpgt_epi64(x, y); + } static opmask_t ge(reg_t x, reg_t y) { opmask_t equal = eq(x, y); opmask_t greater = _mm256_cmpgt_epi64(x, y); - return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(equal), - _mm256_castsi256_pd(greater))); + return _mm256_or_si256(equal, greater); } static opmask_t eq(reg_t x, reg_t y) { @@ -221,7 +224,7 @@ struct avx2_vector { #else static constexpr int network_sort_threshold = 64; #endif - static constexpr int partition_unroll_factor = 4; + static constexpr int partition_unroll_factor = 8; using swizzle_ops = avx2_64bit_swizzle_ops; @@ -258,17 +261,21 @@ struct avx2_vector { return _mm256_i64gather_epi64( (long long int const *)base, index, scale); } + static opmask_t gt(reg_t x, reg_t y) + { + const __m256i offset = _mm256_set1_epi64x(0x8000000000000000); + x = _mm256_xor_si256(x, offset); + y = _mm256_xor_si256(y, offset); + return _mm256_cmpgt_epi64(x, y); + } static opmask_t ge(reg_t x, reg_t y) { opmask_t equal = eq(x, y); - const __m256i offset = _mm256_set1_epi64x(0x8000000000000000); - x = _mm256_add_epi64(x, offset); - y = _mm256_add_epi64(y, offset); - + x = _mm256_xor_si256(x, offset); + y = _mm256_xor_si256(y, offset); opmask_t greater = _mm256_cmpgt_epi64(x, y); - return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(equal), - _mm256_castsi256_pd(greater))); + return _mm256_or_si256(equal, greater); } static opmask_t eq(reg_t x, reg_t y) { @@ -380,7 +387,7 @@ struct avx2_vector { #else static constexpr int network_sort_threshold = 64; #endif - static constexpr int partition_unroll_factor = 4; + static constexpr int partition_unroll_factor = 8; using swizzle_ops = avx2_64bit_swizzle_ops; diff --git a/src/avx2-emu-funcs.hpp b/src/avx2-emu-funcs.hpp index 0dd50c09..9564d13b 100644 --- a/src/avx2-emu-funcs.hpp +++ b/src/avx2-emu-funcs.hpp @@ -273,7 +273,7 @@ typename avx2_vector::reg_t avx2_emu_max(typename avx2_vector::reg_t x, typename avx2_vector::reg_t y) { using vtype = avx2_vector; - typename vtype::opmask_t nlt = vtype::ge(x, y); + typename vtype::opmask_t nlt = vtype::gt(x, y); return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(y), _mm256_castsi256_pd(x), _mm256_castsi256_pd(nlt))); @@ -284,7 +284,7 @@ typename avx2_vector::reg_t avx2_emu_min(typename avx2_vector::reg_t x, typename avx2_vector::reg_t y) { using vtype = avx2_vector; - typename vtype::opmask_t nlt = vtype::ge(x, y); + typename vtype::opmask_t nlt = vtype::gt(x, y); return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(x), _mm256_castsi256_pd(y), _mm256_castsi256_pd(nlt)));