[libc] Fix buggy AVX2 / AVX512 memcmp #77081

Merged · 7 commits · Jan 11, 2024
100 changes: 78 additions & 22 deletions libc/src/string/memory_utils/op_x86.h
@@ -129,7 +129,8 @@ LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
8, 9, 10, 11, 12, 13, 14, 15));
}
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
return static_cast<uint16_t>(_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
return static_cast<uint16_t>(
_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
@@ -180,15 +181,41 @@ template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
return _mm256_max_epu8(a, b);
}
LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
return _mm256_shuffle_epi8(value,
_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
16, 17, 18, 19, 20, 21, 22, 23, //
24, 25, 26, 27, 28, 29, 30, 31));
}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value)));
// Bytewise comparison of 'max' and 'value'.
const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
// Because x86 is little endian, bytes in the vector must be reversed before
// using movemask.
#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
// When AVX512VBMI is available we can completely reverse the vector through
// VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
const __m256i big_endian_byte_mask =
_mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
16, 17, 18, 19, 20, 21, 22, 23, //
24, 25, 26, 27, 28, 29, 30, 31),
little_endian_byte_mask);
// And turn the byte vector mask into a 'uint32_t' for direct scalar
// comparison.
return _mm256_movemask_epi8(big_endian_byte_mask);
#else
// We can't byte-reverse '__m256i' in a single instruction with AVX2.
// '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane
// leading to:
// ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
// 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
// So we first shuffle each 16-byte lane, leading to a half-reversed vector mask.
const __m256i half_reversed = _mm256_shuffle_epi8(
little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15));
// Then we turn the vector into a uint32_t.
const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
// And swap the lower and upper parts. This is optimized into a single `rorx`
// instruction.
return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
#endif
}
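For readers following the AVX2 fallback above, here is a minimal scalar sketch (plain C++, hypothetical helper names, no intrinsics or AVX2 hardware required) showing that "reverse bytes within each 16-byte lane, take the per-byte equality mask, then swap the two 16-bit halves of the scalar" produces the same result as building the big-endian mask directly:

```cpp
#include <cstdint>
#include <cstdio>

// Reference: build the big-endian equality mask directly, so that byte 0's
// equality bit lands in the most significant position (bit 31).
static uint32_t reference_mask(const unsigned char *a, const unsigned char *b) {
  uint32_t m = 0;
  for (int i = 0; i < 32; ++i)
    m |= uint32_t(a[i] == b[i]) << (31 - i);
  return m;
}

// Scalar model of the AVX2 path: '_mm256_shuffle_epi8' can only reverse bytes
// within each 16-byte lane, 'movemask' packs the per-byte results into a
// 32-bit scalar, and the final 16-bit rotation swaps the two lane halves.
static uint32_t lane_reverse_then_rotate(const unsigned char *a,
                                         const unsigned char *b) {
  uint32_t m = 0;
  for (int i = 0; i < 32; ++i) {
    int src = 16 * (i / 16) + (15 - i % 16); // in-lane byte reversal
    m |= uint32_t(a[src] == b[src]) << i;    // movemask: bit i <- byte i
  }
  return (m << 16) | (m >> 16);              // the 'rorx' step from the comment
}

int main() {
  unsigned char a[32], b[32];
  for (int i = 0; i < 32; ++i) a[i] = b[i] = static_cast<unsigned char>(i);
  b[20] = 0xFF; // introduce a single mismatch in the upper 16-byte lane
  std::printf("%d\n", reference_mask(a, b) == lane_reverse_then_rotate(a, b)); // 1
}
```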
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
@@ -198,7 +225,7 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
return cmp_uint32_t(ge, le);
return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX2__
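And a short sketch (assumed names; the inputs reuse the strings from the regression test added below, truncated to 32 bytes) of why `cmp_neq` can compare the two big-endian masks as plain integers: the first byte where the inputs differ is also the most significant bit position where `ge` and `le` can disagree, so the scalar comparison has the same sign as `memcmp`:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Big-endian equality mask against the bytewise max: bit (31 - i) is set when
// x[i] equals max(a[i], b[i]).
static uint32_t be_eq_mask(const unsigned char *vmax, const unsigned char *x) {
  uint32_t m = 0;
  for (int i = 0; i < 32; ++i)
    m |= uint32_t(vmax[i] == x[i]) << (31 - i);
  return m;
}

int main() {
  unsigned char a[32], b[32], vmax[32];
  std::memcpy(a, "1.069cd68bbe76eb2143a3284d27ebe2", 32);
  std::memcpy(b, "1.0500185b5d966a544e2d0fa40701b0", 32);
  for (int i = 0; i < 32; ++i) vmax[i] = a[i] > b[i] ? a[i] : b[i]; // bytewise max
  const uint32_t ge = be_eq_mask(vmax, a); // a[i] == max  <=>  a[i] >= b[i]
  const uint32_t le = be_eq_mask(vmax, b); // b[i] == max  <=>  a[i] <= b[i]
  // The inputs first differ at byte 3 ('6' vs '5'), so 'ge' has bit 28 set
  // while 'le' has it cleared, and all higher bits agree: ge > le.
  std::printf("%d %d\n", ge > le, std::memcmp(a, b, 32) > 0); // prints: 1 1
}
```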

@@ -210,19 +237,48 @@ template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
return _mm512_max_epu8(a, b);
}
LIBC_INLINE __m512i bytewise_reverse(__m512i value) {
return _mm512_shuffle_epi8(value,
_mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
16, 17, 18, 19, 20, 21, 22, 23, //
24, 25, 26, 27, 28, 29, 30, 31, //
32, 33, 34, 35, 36, 37, 38, 39, //
40, 41, 42, 43, 44, 45, 46, 47, //
48, 49, 50, 51, 52, 53, 54, 55, //
56, 57, 58, 59, 60, 61, 62, 63));
}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
return _mm512_cmpeq_epi8_mask(bytewise_reverse(max), bytewise_reverse(value));
// The AVX512VBMI version is disabled due to bad codegen.
// https://github.com/llvm/llvm-project/issues/77459
// https://github.com/llvm/llvm-project/pull/77081
// TODO: Re-enable once the minimum supported clang version contains the fix.
#if false && defined(__AVX512VBMI__)
// When AVX512VBMI is available we can completely reverse the vector through
// VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
16, 17, 18, 19, 20, 21, 22, 23, //
24, 25, 26, 27, 28, 29, 30, 31, //
32, 33, 34, 35, 36, 37, 38, 39, //
40, 41, 42, 43, 44, 45, 46, 47, //
48, 49, 50, 51, 52, 53, 54, 55, //
56, 57, 58, 59, 60, 61, 62, 63);
// Then we compute the mask for equal bytes.
return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
_mm512_permutexvar_epi8(indices, value));

[Inline review comment from the PR author] It appears that this code is not well optimized by the compiler, using GPRs instead of vector registers (#77459). The vector code is 21 cycles whereas the GPR version is 25 according to llvm-mca. I'll benchmark both of them and report.
#else
// We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
// '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
// So we only reverse groups of 8 bytes; these groups are necessarily within a
// 16-byte lane.
// zmm = | 16 bytes | 16 bytes | 16 bytes | 16 bytes |
// zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
const __m512i indices = _mm512_set_epi8(56, 57, 58, 59, 60, 61, 62, 63, //
48, 49, 50, 51, 52, 53, 54, 55, //
40, 41, 42, 43, 44, 45, 46, 47, //
32, 33, 34, 35, 36, 37, 38, 39, //
24, 25, 26, 27, 28, 29, 30, 31, //
16, 17, 18, 19, 20, 21, 22, 23, //
8, 9, 10, 11, 12, 13, 14, 15, //
0, 1, 2, 3, 4, 5, 6, 7);
// Then we compute the mask for equal bytes. In this mask the bits within each
// byte are already in reversed order, but the bytes of the mask themselves
// still need to be reversed; this is done with a bswap instruction.
return __builtin_bswap64(
_mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
_mm512_shuffle_epi8(value, indices)));

#endif
}
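Analogously, a scalar sketch of the __AVX512BW__ fallback (assumed names; uses the same GCC/Clang `__builtin_bswap64` builtin as the code above): reversing each 8-byte group before the compare and then byte-swapping the 64-bit mask is equivalent to building the big-endian mask directly:

```cpp
#include <cstdint>
#include <cstdio>

// Reference: byte 0's equality bit lands in the most significant position (bit 63).
static uint64_t reference_mask64(const unsigned char *a, const unsigned char *b) {
  uint64_t m = 0;
  for (int i = 0; i < 64; ++i)
    m |= uint64_t(a[i] == b[i]) << (63 - i);
  return m;
}

// Scalar model of the AVX512BW path: '_mm512_shuffle_epi8' reverses each
// 8-byte group in-lane, '_mm512_cmpeq_epi8_mask' packs the per-byte results
// into a 64-bit mask, and 'bswap' then reverses the bytes of that mask.
static uint64_t group_reverse_then_bswap(const unsigned char *a,
                                         const unsigned char *b) {
  uint64_t m = 0;
  for (int i = 0; i < 64; ++i) {
    int src = 8 * (i / 8) + (7 - i % 8);   // reverse within each 8-byte group
    m |= uint64_t(a[src] == b[src]) << i;  // cmpeq mask: bit i <- byte i
  }
  return __builtin_bswap64(m);             // reverse the bytes of the mask
}

int main() {
  unsigned char a[64], b[64];
  for (int i = 0; i < 64; ++i) a[i] = b[i] = static_cast<unsigned char>(i);
  b[37] = 0xFF; // introduce a single mismatch
  std::printf("%d\n", reference_mask64(a, b) == group_reverse_then_bswap(a, b)); // 1
}
```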
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
7 changes: 7 additions & 0 deletions libc/test/src/string/memcmp_test.cpp
@@ -37,6 +37,13 @@ TEST(LlvmLibcMemcmpTest, LhsAfterRhsLexically) {
EXPECT_GT(LIBC_NAMESPACE::memcmp(lhs, rhs, 2), 0);
}

TEST(LlvmLibcMemcmpTest, Issue77080) {
// https://github.com/llvm/llvm-project/issues/77080
constexpr char lhs[35] = "1.069cd68bbe76eb2143a3284d27ebe220";
constexpr char rhs[35] = "1.0500185b5d966a544e2d0fa40701b0f3";
ASSERT_GE(LIBC_NAMESPACE::memcmp(lhs, rhs, 34), 1);
}

// Adapt CheckMemcmp signature to memcmp.
static inline int Adaptor(cpp::span<char> p1, cpp::span<char> p2, size_t size) {
return LIBC_NAMESPACE::memcmp(p1.begin(), p2.begin(), size);