Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/bench-compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then
fi
compare=$(realpath .bench/google-benchmark/tools/compare.py)

meson setup --warnlevel 0 --buildtype release builddir-${branch}
meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir-${branch}
cd builddir-${branch}
ninja
$compare filters ./benchexe $1 $2
2 changes: 1 addition & 1 deletion scripts/branch-compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ build_branch() {
fi
fi
cd $dir_name
meson setup --warnlevel 0 --buildtype release builddir
meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir
cd builddir
ninja
cd ../../
Expand Down
29 changes: 18 additions & 11 deletions src/avx2-64bit-qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct avx2_vector<int64_t> {
#else
static constexpr int network_sort_threshold = 64;
#endif
static constexpr int partition_unroll_factor = 4;
static constexpr int partition_unroll_factor = 8;

using swizzle_ops = avx2_64bit_swizzle_ops;

Expand Down Expand Up @@ -89,12 +89,15 @@ struct avx2_vector<int64_t> {
{
return _mm256_xor_si256(x, y);
}
/*
 * Lane-wise "x > y" for the signed 64-bit vector type.
 *
 * x, y : vectors of packed signed 64-bit integers.
 * Returns a mask vector; each 64-bit lane is all-ones where x > y holds
 * and all-zeros otherwise (the result convention of _mm256_cmpgt_epi64).
 */
static opmask_t gt(reg_t x, reg_t y)
{
    // The elements are already signed, so the native signed compare is exact.
    const opmask_t x_above_y = _mm256_cmpgt_epi64(x, y);
    return x_above_y;
}
/*
 * Lane-wise "x >= y" for the signed 64-bit vector type.
 *
 * AVX2 has no packed 64-bit greater-or-equal compare, so the result is
 * assembled as (x == y) | (x > y).
 *
 * x, y : vectors of packed signed 64-bit integers.
 * Returns a mask vector; each 64-bit lane is all-ones where x >= y holds
 * and all-zeros otherwise.
 */
static opmask_t ge(reg_t x, reg_t y)
{
    opmask_t equal = eq(x, y);
    opmask_t greater = _mm256_cmpgt_epi64(x, y);
    // Both masks are integer vectors: OR them directly in the integer domain
    // instead of casting to packed double and back. The bits are identical.
    return _mm256_or_si256(equal, greater);
}
static opmask_t eq(reg_t x, reg_t y)
{
Expand Down Expand Up @@ -221,7 +224,7 @@ struct avx2_vector<uint64_t> {
#else
static constexpr int network_sort_threshold = 64;
#endif
static constexpr int partition_unroll_factor = 4;
static constexpr int partition_unroll_factor = 8;

using swizzle_ops = avx2_64bit_swizzle_ops;

Expand Down Expand Up @@ -258,17 +261,21 @@ struct avx2_vector<uint64_t> {
return _mm256_i64gather_epi64(
(long long int const *)base, index, scale);
}
/*
 * Lane-wise "x > y" for the unsigned 64-bit vector type.
 *
 * AVX2 only offers a signed 64-bit compare. Flipping the most significant
 * bit of both operands maps unsigned order onto signed order
 * (0 -> INT64_MIN, UINT64_MAX -> INT64_MAX), after which the signed compare
 * yields the unsigned answer.
 *
 * x, y : vectors of packed unsigned 64-bit integers.
 * Returns a mask vector; each 64-bit lane is all-ones where x > y holds
 * and all-zeros otherwise.
 */
static opmask_t gt(reg_t x, reg_t y)
{
    // Only the top bit of every 64-bit lane is set.
    const __m256i sign_bit = _mm256_set1_epi64x(0x8000000000000000);
    const __m256i x_biased = _mm256_xor_si256(x, sign_bit);
    const __m256i y_biased = _mm256_xor_si256(y, sign_bit);
    return _mm256_cmpgt_epi64(x_biased, y_biased);
}
/*
 * Lane-wise "x >= y" for the unsigned 64-bit vector type.
 *
 * AVX2 has neither an unsigned nor a greater-or-equal 64-bit compare, so the
 * result is assembled as (x == y) | (x > y), with the "greater" part computed
 * by the signed compare on sign-bit-flipped operands.
 *
 * x, y : vectors of packed unsigned 64-bit integers.
 * Returns a mask vector; each 64-bit lane is all-ones where x >= y holds
 * and all-zeros otherwise.
 */
static opmask_t ge(reg_t x, reg_t y)
{
    // Equality does not depend on signedness; test it on the raw inputs.
    opmask_t equal = eq(x, y);

    // Flip the top bit of each lane exactly once. Flipping it a second time
    // (adding 2^63 is the same operation as XOR-ing with it) would restore
    // the original bit patterns and turn this into a signed comparison.
    const __m256i offset = _mm256_set1_epi64x(0x8000000000000000);
    x = _mm256_xor_si256(x, offset);
    y = _mm256_xor_si256(y, offset);
    opmask_t greater = _mm256_cmpgt_epi64(x, y);

    // Both masks are integer vectors: OR them in the integer domain.
    return _mm256_or_si256(equal, greater);
}
static opmask_t eq(reg_t x, reg_t y)
{
Expand Down Expand Up @@ -380,7 +387,7 @@ struct avx2_vector<double> {
#else
static constexpr int network_sort_threshold = 64;
#endif
static constexpr int partition_unroll_factor = 4;
static constexpr int partition_unroll_factor = 8;

using swizzle_ops = avx2_64bit_swizzle_ops;

Expand Down
4 changes: 2 additions & 2 deletions src/avx2-emu-funcs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ typename avx2_vector<T>::reg_t avx2_emu_max(typename avx2_vector<T>::reg_t x,
typename avx2_vector<T>::reg_t y)
{
using vtype = avx2_vector<T>;
typename vtype::opmask_t nlt = vtype::ge(x, y);
typename vtype::opmask_t nlt = vtype::gt(x, y);
return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(y),
_mm256_castsi256_pd(x),
_mm256_castsi256_pd(nlt)));
Expand All @@ -284,7 +284,7 @@ typename avx2_vector<T>::reg_t avx2_emu_min(typename avx2_vector<T>::reg_t x,
typename avx2_vector<T>::reg_t y)
{
using vtype = avx2_vector<T>;
typename vtype::opmask_t nlt = vtype::ge(x, y);
typename vtype::opmask_t nlt = vtype::gt(x, y);
return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(x),
_mm256_castsi256_pd(y),
_mm256_castsi256_pd(nlt)));
Expand Down