502 changes: 304 additions & 198 deletions libc/src/string/memory_utils/op_generic.h

Large diffs are not rendered by default.

330 changes: 127 additions & 203 deletions libc/src/string/memory_utils/op_x86.h
@@ -40,11 +40,13 @@
namespace __llvm_libc::x86 {

// A set of constants to check compile time features.
static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
static LIBC_INLINE constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
static LIBC_INLINE constexpr bool kSse41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
static LIBC_INLINE constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
static LIBC_INLINE constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
static LIBC_INLINE constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
static LIBC_INLINE constexpr bool kAvx512BW =
LLVM_LIBC_IS_DEFINED(__AVX512BW__);

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
@@ -54,220 +56,142 @@ struct Memcpy {
}
};

///////////////////////////////////////////////////////////////////////////////
// Bcmp

// Base implementation for the Bcmp specializations.
// - BlockSize is either 16, 32 or 64 depending on the available compile-time
//   features; it is used to switch between a "single native operation" and a
//   "sequence of native operations".
// - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
static constexpr size_t SIZE = Size;
LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
if constexpr (Size == BlockSize) {
return BlockBcmp(p1, p2);
} else if constexpr (Size % BlockSize == 0) {
for (size_t offset = 0; offset < Size; offset += BlockSize)
if (auto value = BlockBcmp(p1 + offset, p2 + offset))
return value;
} else {
deferred_static_assert("SIZE not implemented");
}
return BcmpReturnType::ZERO();
}

LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
return block(p1 + count - Size, p2 + count - Size);
}

LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
return block(p1, p2) | tail(p1, p2, count);
}
} // namespace __llvm_libc::x86

LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
size_t count) {
static_assert(Size > 1, "a loop of size 1 does not need tail");
size_t offset = 0;
do {
if (auto value = block(p1 + offset, p2 + offset))
return value;
offset += Size;
} while (offset < count - Size);
return tail(p1, p2, count);
}
};
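// For example, with SSE2 (BlockSize == 16), x86::sse2::Bcmp<64> instantiates
// BcmpImpl<64, 16, bcmp16>: block() unrolls into bcmp16 calls at offsets 0,
// 16, 32 and 48 and returns at the first non-zero result.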
namespace __llvm_libc::generic {

namespace sse2 {
LIBC_INLINE BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
#if defined(__SSE2__)
using T = char __attribute__((__vector_size__(16)));
// A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
const int mask =
_mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2)));
return static_cast<uint32_t>(mask);
#else
(void)p1;
(void)p2;
return BcmpReturnType::ZERO();
#endif // defined(__SSE2__)
///////////////////////////////////////////////////////////////////////////////
// Specializations for __m128i
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
return _mm_max_epu8(a, b);
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
} // namespace sse2

namespace avx2 {
LIBC_INLINE BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
#if defined(__AVX2__)
using T = char __attribute__((__vector_size__(32)));
// A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
const int mask =
_mm256_movemask_epi8(cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2)));
// _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit
// mask.
return static_cast<uint32_t>(mask);
#else
(void)p1;
(void)p2;
return BcmpReturnType::ZERO();
#endif // defined(__AVX2__)
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15));
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
} // namespace avx2

namespace avx512bw {
LIBC_INLINE BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
#if defined(__AVX512BW__)
using T = char __attribute__((__vector_size__(64)));
// A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
const uint64_t mask = _mm512_cmpneq_epi8_mask(
cpp::bit_cast<__m512i>(load<T>(p1)), cpp::bit_cast<__m512i>(load<T>(p2)));
const bool mask_is_set = mask != 0;
return static_cast<uint32_t>(mask_is_set);
#else
(void)p1;
(void)p2;
return BcmpReturnType::ZERO();
#endif // defined(__AVX512BW__)
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
return _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value)));
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
} // namespace avx512bw

// Assuming that the mask is non-zero, the index of the first mismatching byte
// is the number of trailing zeros in the mask. We count trailing zeros rather
// than leading zeros because x86 is little-endian: the lowest-addressed byte
// maps to the least significant mask bit. For example, if only byte 3
// differs, mask == 0b1000 and __builtin_ctzll(mask) == 3.
LIBC_INLINE MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
uint64_t mask) {
const size_t diff_index = __builtin_ctzll(mask);
const int16_t ca = cpp::to_integer<uint8_t>(p1[diff_index]);
const int16_t cb = cpp::to_integer<uint8_t>(p2[diff_index]);
return ca - cb;
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
const auto xored = _mm_xor_si128(a, b);
return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
const auto xored = _mm_xor_si128(a, b);
return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
#endif // __SSE4_1__
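// Illustration (standalone sketch, 'cmp_neq_scalar_model' is a hypothetical
// name): a scalar model of the bytewise_max / big_endian_cmp_mask trick used
// by cmp_neq<__m128i> above, shrunk to 4-byte "vectors". Bytes where
// max(a, b) equals an operand form that operand's "greater-or-equal" mask;
// reversing the byte order before building the mask puts the first
// (lowest-address) byte in the highest bit, so subtracting the two masks
// yields a memcmp-style lexicographic result.
LIBC_INLINE int32_t cmp_neq_scalar_model(const uint8_t *a, const uint8_t *b) {
  uint32_t ge = 0, le = 0;
  for (int i = 0; i < 4; ++i) {
    const uint8_t vmax = a[i] > b[i] ? a[i] : b[i];       // bytewise_max
    ge |= static_cast<uint32_t>(vmax == a[i]) << (3 - i); // reversed order
    le |= static_cast<uint32_t>(vmax == b[i]) << (3 - i);
  }
  // 'ge' and 'le' agree at every equal-byte position and first differ at the
  // first mismatching byte, which is also the highest-order differing bit, so
  // the subtraction takes its sign from that byte.
  return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}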

///////////////////////////////////////////////////////////////////////////////
// Memcmp

// Base implementation for the Memcmp specializations.
// - BlockSize is either 16, 32 or 64 depending on the available compile-time
//   features; it is used to switch between a "single native operation" and a
//   "sequence of native operations".
// - BlockMemcmp is the function that implements the memcmp logic.
// - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
struct MemcmpImpl {
static constexpr size_t SIZE = Size;
LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
if constexpr (Size == BlockSize) {
return BlockMemcmp(p1, p2);
} else if constexpr (Size % BlockSize == 0) {
for (size_t offset = 0; offset < Size; offset += BlockSize)
if (auto value = BlockBcmp(p1 + offset, p2 + offset))
return BlockMemcmp(p1 + offset, p2 + offset);
} else {
deferred_static_assert("SIZE not implemented");
}
return MemcmpReturnType::ZERO();
}

LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
return block(p1 + count - Size, p2 + count - Size);
}

LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2,
size_t count) {
if (auto value = block(p1, p2))
return value;
return tail(p1, p2, count);
}

LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
size_t count) {
static_assert(Size > 1, "a loop of size 1 does not need tail");
size_t offset = 0;
do {
if (auto value = block(p1 + offset, p2 + offset))
return value;
offset += Size;
} while (offset < count - Size);
return tail(p1, p2, count);
}
};

namespace sse2 {
LIBC_INLINE MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
#if defined(__SSE2__)
using T = char __attribute__((__vector_size__(16)));
// A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
if (int mask =
_mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2))))
return char_diff_no_zero(p1, p2, mask);
return MemcmpReturnType::ZERO();
#else
(void)p1;
(void)p2;
return MemcmpReturnType::ZERO();
#endif // defined(__SSE2__)
// Specializations for __m256i
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
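// AVX (without AVX2) has no 256-bit integer xor; _mm256_xor_si256 requires
// AVX2, so we bounce the values through the float domain instead.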
const auto xored = _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
} // namespace sse2
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
const auto xored = _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
}
#endif // __AVX__

namespace avx2 {
LIBC_INLINE MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
#if defined(__AVX2__)
using T = char __attribute__((__vector_size__(32)));
// A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
if (int mask = _mm256_movemask_epi8(
cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2))))
return char_diff_no_zero(p1, p2, mask);
return MemcmpReturnType::ZERO();
#else
(void)p1;
(void)p2;
return MemcmpReturnType::ZERO();
#endif // defined(__AVX2__)
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
return _mm256_max_epu8(a, b);
}
LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
return _mm256_shuffle_epi8(value,
_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
16, 17, 18, 19, 20, 21, 22, 23, //
24, 25, 26, 27, 28, 29, 30, 31));
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
} // namespace avx2
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value)));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
return cmp_uint32_t(ge, le);
}
#endif // __AVX2__

namespace avx512bw {
LIBC_INLINE MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
///////////////////////////////////////////////////////////////////////////////
// Specializations for __m512i
#if defined(__AVX512BW__)
using T = char __attribute__((__vector_size__(64)));
// A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
if (uint64_t mask =
_mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(load<T>(p1)),
cpp::bit_cast<__m512i>(load<T>(p2))))
return char_diff_no_zero(p1, p2, mask);
return MemcmpReturnType::ZERO();
#else
(void)p1;
(void)p2;
return MemcmpReturnType::ZERO();
#endif // defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
return _mm512_max_epu8(a, b);
}
LIBC_INLINE __m512i bytewise_reverse(__m512i value) {
return _mm512_shuffle_epi8(value,
_mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
16, 17, 18, 19, 20, 21, 22, 23, //
24, 25, 26, 27, 28, 29, 30, 31, //
32, 33, 34, 35, 36, 37, 38, 39, //
40, 41, 42, 43, 44, 45, 46, 47, //
48, 49, 50, 51, 52, 53, 54, 55, //
56, 57, 58, 59, 60, 61, 62, 63));
}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
return _mm512_cmpeq_epi8_mask(bytewise_reverse(max), bytewise_reverse(value));
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
} // namespace avx512bw
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
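// Only zero vs non-zero matters for 'neq': OR-ing the two 32-bit halves of
// the 64-bit mask yields a non-zero uint32_t exactly when the mask is
// non-zero.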
return (xored >> 32) | (xored & 0xFFFFFFFF);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__

} // namespace __llvm_libc::x86
} // namespace __llvm_libc::generic

#endif // LIBC_TARGET_ARCH_IS_X86_64

45 changes: 45 additions & 0 deletions libc/src/string/memory_utils/utils.h
@@ -11,10 +11,12 @@

#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/cstddef.h"
#include "src/__support/CPP/limits.h" // cpp::numeric_limits
#include "src/__support/CPP/type_traits.h"
#include "src/__support/endian.h"
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN
#include "src/__support/macros/properties/architectures.h"

#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t
@@ -149,6 +151,39 @@ template <typename T> struct StrictIntegralType {
using MemcmpReturnType = StrictIntegralType<int32_t>;
using BcmpReturnType = StrictIntegralType<uint32_t>;

// This implements the semantics of 'memcmp': returning a negative value when
// 'a' is less than 'b', '0' when 'a' equals 'b' and a positive number
// otherwise.
LIBC_INLINE MemcmpReturnType cmp_uint32_t(uint32_t a, uint32_t b) {
// We compute the difference as an int64_t so it cannot overflow.
const int64_t diff = static_cast<int64_t>(a) - static_cast<int64_t>(b);
// We then reduce the int64_t to an int32_t while preserving its sign:
// 'diff >> 1' keeps the sign bit and is non-zero whenever |diff| >= 2, while
// 'diff & 0xFFFF' catches the remaining 'diff == 1' case.
return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
}
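// Illustration (standalone sketch, 'cmp_uint32_t_model' is a hypothetical
// name): a constexpr model of the reduction above; the static_asserts
// spot-check that the sign of the 64-bit difference survives the narrowing,
// including at the extremes.
constexpr int32_t cmp_uint32_t_model(uint32_t a, uint32_t b) {
  const int64_t diff = static_cast<int64_t>(a) - static_cast<int64_t>(b);
  return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
}
static_assert(cmp_uint32_t_model(1u, 2u) < 0, "a < b is negative");
static_assert(cmp_uint32_t_model(2u, 2u) == 0, "a == b is zero");
static_assert(cmp_uint32_t_model(2u, 1u) > 0, "a > b is positive");
static_assert(cmp_uint32_t_model(0xFFFFFFFFu, 0u) > 0, "max diff fits");
static_assert(cmp_uint32_t_model(0u, 0xFFFFFFFFu) < 0, "min diff fits");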

// Returns a negative value if 'a' is less than 'b' and a positive value
// otherwise. This implements the semantics of 'memcmp' when we know that 'a'
// and 'b' differ.
LIBC_INLINE MemcmpReturnType cmp_neq_uint64_t(uint64_t a, uint64_t b) {
#if defined(LIBC_TARGET_ARCH_IS_X86_64)
// On x86, we choose the returned values so that they are just one unit apart,
// as this allows for better code generation.
static constexpr int32_t POSITIVE = cpp::numeric_limits<int32_t>::max();
static constexpr int32_t NEGATIVE = cpp::numeric_limits<int32_t>::min();
static_assert(cpp::bit_cast<uint32_t>(NEGATIVE) -
cpp::bit_cast<uint32_t>(POSITIVE) ==
1);
#else
// On RISC-V we simply use '1' and '-1' as it leads to branchless code.
// On ARMv8, both strategies lead to the same performance.
static constexpr int32_t POSITIVE = 1;
static constexpr int32_t NEGATIVE = -1;
#endif
static_assert(POSITIVE > 0);
static_assert(NEGATIVE < 0);
return a < b ? NEGATIVE : POSITIVE;
}
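// Usage sketch (mirroring cmp_neq<__m512i> in op_x86.h above): 'ge' and 'le'
// are the big-endian comparison masks, and whichever is larger as an unsigned
// integer identifies the lexicographically greater buffer:
//
//   const uint64_t le = big_endian_cmp_mask(vmax, b);
//   const uint64_t ge = big_endian_cmp_mask(vmax, a);
//   return cmp_neq_uint64_t(ge, le); // negative iff ge < le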

// Loads bytes from memory (possibly unaligned) and materializes them as
// type.
template <typename T> LIBC_INLINE T load(CPtr ptr) {
@@ -280,6 +315,16 @@
deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2");
}

template <size_t SIZE> struct AlignHelper {
AlignHelper(CPtr ptr) : offset_(distance_to_next_aligned<SIZE>(ptr)) {}

LIBC_INLINE bool not_aligned() const { return offset_ != SIZE; }
LIBC_INLINE uintptr_t offset() const { return offset_; }

private:
uintptr_t offset_;
};
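// Usage sketch (hypothetical caller, not part of the patch): process a first
// unaligned chunk, then advance both pointers so that subsequent loads from
// 'p1' are SIZE-aligned:
//
//   const AlignHelper<kBlockSize> helper(p1);
//   if (helper.not_aligned()) {
//     // ... compare the leading helper.offset() bytes unaligned ...
//     p1 += helper.offset();
//     p2 += helper.offset();
//     count -= helper.offset();
//   }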

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H
91 changes: 44 additions & 47 deletions libc/src/string/memory_utils/x86_64/memcmp_implementations.h
@@ -18,79 +18,76 @@ namespace __llvm_libc {

[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = generic::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
return generic::Memcmp<uint64_t>::loop_and_tail_align_above(384, p1, p2,
count);
}

#if defined(__SSE4_1__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
inline_memcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
return generic::Memcmp<__m128i>::loop_and_tail_align_above(384, p1, p2,
count);
}
#endif // __SSE4_1__

#if defined(__AVX2__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
return generic::Memcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
return value;
align_to_next_boundary<32, Arg::P1>(p1, p2, count);
}
return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
return generic::Memcmp<__m256i>::head_tail(p1, p2, count);
return generic::Memcmp<__m256i>::loop_and_tail_align_above(384, p1, p2,
count);
}
#endif // __AVX2__

#if defined(__AVX512BW__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
return generic::Memcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
return generic::Memcmp<__m256i>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
}
return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
return generic::Memcmp<__m512i>::head_tail(p1, p2, count);
return generic::Memcmp<__m512i>::loop_and_tail_align_above(384, p1, p2,
count);
}
#endif // __AVX512BW__

LIBC_INLINE MemcmpReturnType inline_memcmp_x86(CPtr p1, CPtr p2, size_t count) {

if (count == 0)
return MemcmpReturnType::ZERO();
if (count == 1)
return generic::Memcmp<1>::block(p1, p2);
return generic::Memcmp<uint8_t>::block(p1, p2);
if (count == 2)
return generic::Memcmp<2>::block(p1, p2);
return generic::Memcmp<uint16_t>::block(p1, p2);
if (count == 3)
return generic::Memcmp<3>::block(p1, p2);
if (count <= 8)
return generic::Memcmp<4>::head_tail(p1, p2, count);
return generic::MemcmpSequence<uint16_t, uint8_t>::block(p1, p2);
if (count == 4)
return generic::Memcmp<uint32_t>::block(p1, p2);
if (count == 5)
return generic::MemcmpSequence<uint32_t, uint8_t>::block(p1, p2);
if (count == 6)
return generic::MemcmpSequence<uint32_t, uint16_t>::block(p1, p2);
if (count == 7)
return generic::Memcmp<uint32_t>::head_tail(p1, p2, 7);
if (count == 8)
return generic::Memcmp<uint64_t>::block(p1, p2);
if (count <= 16)
return generic::Memcmp<8>::head_tail(p1, p2, count);
if constexpr (x86::kAvx512BW)
return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
else if constexpr (x86::kAvx2)
return inline_memcmp_x86_avx2_gt16(p1, p2, count);
else if constexpr (x86::kSse2)
return inline_memcmp_x86_sse2_gt16(p1, p2, count);
else
return inline_memcmp_generic_gt16(p1, p2, count);
return generic::Memcmp<uint64_t>::head_tail(p1, p2, count);
#if defined(__AVX512BW__)
return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
#elif defined(__AVX2__)
return inline_memcmp_x86_avx2_gt16(p1, p2, count);
#elif defined(__SSE4_1__)
return inline_memcmp_x86_sse41_gt16(p1, p2, count);
#else
return inline_memcmp_generic_gt16(p1, p2, count);
#endif
}
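// Illustration (standalone sketch, 'memcmp_head_tail_model' is a hypothetical
// name): the head_tail calls above cover the in-between sizes by comparing a
// window at each end of the buffer. For 8 < count <= 16 the two 8-byte
// windows may overlap, which is harmless because the overlapping bytes simply
// compare equal twice.
LIBC_INLINE int memcmp_head_tail_model(const unsigned char *p1,
                                       const unsigned char *p2, size_t count) {
  if (const int head = __builtin_memcmp(p1, p2, 8)) // bytes [0, 8)
    return head;
  return __builtin_memcmp(p1 + count - 8, p2 + count - 8, 8); // last 8 bytes
}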

} // namespace __llvm_libc

#endif // LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCMP_IMPLEMENTATIONS_H
87 changes: 43 additions & 44 deletions libc/test/src/string/memory_utils/op_tests.cpp
@@ -194,35 +194,34 @@ TYPED_TEST(LlvmLibcOpTest, Memset, MemsetImplementations) {
}

using BcmpImplementations = testing::TypeList<
#ifdef __SSE2__
x86::sse2::Bcmp<16>, //
x86::sse2::Bcmp<32>, //
x86::sse2::Bcmp<64>, //
x86::sse2::Bcmp<128>, //
#endif
#ifdef LIBC_TARGET_ARCH_IS_X86_64
#ifdef __SSE4_1__
generic::Bcmp<__m128i>,
#endif // __SSE4_1__
#ifdef __AVX2__
x86::avx2::Bcmp<32>, //
x86::avx2::Bcmp<64>, //
x86::avx2::Bcmp<128>, //
#endif
generic::Bcmp<__m256i>,
#endif // __AVX2__
#ifdef __AVX512BW__
x86::avx512bw::Bcmp<64>, //
x86::avx512bw::Bcmp<128>, //
#endif
generic::Bcmp<__m512i>,
#endif // __AVX512BW__

#endif // LIBC_TARGET_ARCH_IS_X86_64
#ifdef LIBC_TARGET_ARCH_IS_AARCH64
aarch64::Bcmp<16>, //
aarch64::Bcmp<32>, //
#endif
aarch64::Bcmp<32>,
#endif // LIBC_TARGET_ARCH_IS_AARCH64
#ifdef LLVM_LIBC_HAS_UINT64
generic::Bcmp<8>, //
generic::Bcmp<uint64_t>, //
#endif
generic::Bcmp<1>, //
generic::Bcmp<2>, //
generic::Bcmp<4>, //
generic::Bcmp<16>, //
generic::Bcmp<32>, //
generic::Bcmp<64> //
>;
generic::Bcmp<uint8_t>, //
generic::Bcmp<uint16_t>, //
generic::Bcmp<uint32_t>, //
generic::BcmpSequence<uint8_t, uint8_t>, //
generic::BcmpSequence<uint8_t, uint8_t, uint8_t>, //
generic::BcmpSequence<uint16_t, uint8_t>, //
generic::BcmpSequence<uint32_t, uint8_t>, //
generic::BcmpSequence<uint32_t, uint16_t>, //
generic::BcmpSequence<uint32_t, uint16_t, uint8_t>>;

// Adapt CheckBcmp signature to op implementation signatures.
template <auto FnImpl>
@@ -247,7 +246,8 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
ASSERT_TRUE((CheckBcmp<BlockImpl>(span1, span2, kSize)));
}
}
{ // Test head tail operations from kSize to 2 * kSize.
if constexpr (has_head_tail<Impl>::value) {
// Test head tail operations from kSize to 2 * kSize.
static constexpr auto HeadTailImpl = CmpAdaptor<Impl::head_tail>;
Buffer Buffer1(2 * kSize);
Buffer Buffer2(2 * kSize);
@@ -258,7 +258,8 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
ASSERT_TRUE((CheckBcmp<HeadTailImpl>(span1, span2, size)));
}
}
{ // Test loop operations from kSize to 3 * kSize.
if constexpr (has_loop_and_tail<Impl>::value) {
// Test loop operations from kSize to 3 * kSize.
if constexpr (kSize > 1) {
static constexpr auto LoopImpl = CmpAdaptor<Impl::loop_and_tail>;
Buffer Buffer1(3 * kSize);
@@ -274,31 +275,27 @@
}

using MemcmpImplementations = testing::TypeList<
#ifdef LIBC_TARGET_ARCH_IS_X86_64
#ifdef __SSE2__
x86::sse2::Memcmp<16>, //
x86::sse2::Memcmp<32>, //
x86::sse2::Memcmp<64>, //
x86::sse2::Memcmp<128>, //
generic::Memcmp<__m128i>, //
#endif
#ifdef __AVX2__
x86::avx2::Memcmp<32>, //
x86::avx2::Memcmp<64>, //
x86::avx2::Memcmp<128>, //
generic::Memcmp<__m256i>, //
#endif
#ifdef __AVX512BW__
x86::avx512bw::Memcmp<64>, //
x86::avx512bw::Memcmp<128>, //
generic::Memcmp<__m512i>, //
#endif
#endif // LIBC_TARGET_ARCH_IS_X86_64
#ifdef LLVM_LIBC_HAS_UINT64
generic::Memcmp<8>, //
generic::Memcmp<uint64_t>, //
#endif
generic::Memcmp<1>, //
generic::Memcmp<2>, //
generic::Memcmp<3>, //
generic::Memcmp<4>, //
generic::Memcmp<16>, //
generic::Memcmp<32>, //
generic::Memcmp<64> //
generic::Memcmp<uint8_t>, //
generic::Memcmp<uint16_t>, //
generic::Memcmp<uint32_t>, //
generic::MemcmpSequence<uint8_t, uint8_t>, //
generic::MemcmpSequence<uint8_t, uint8_t, uint8_t>, //
generic::MemcmpSequence<uint16_t, uint8_t>, //
generic::MemcmpSequence<uint32_t, uint16_t, uint8_t> //
>;

TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
@@ -314,7 +311,8 @@ TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
ASSERT_TRUE((CheckMemcmp<BlockImpl>(span1, span2, kSize)));
}
}
{ // Test head tail operations from kSize to 2 * kSize.
if constexpr (has_head_tail<Impl>::value) {
// Test head tail operations from kSize to 2 * kSize.
static constexpr auto HeadTailImpl = CmpAdaptor<Impl::head_tail>;
Buffer Buffer1(2 * kSize);
Buffer Buffer2(2 * kSize);
Expand All @@ -325,7 +323,8 @@ TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
ASSERT_TRUE((CheckMemcmp<HeadTailImpl>(span1, span2, size)));
}
}
{ // Test loop operations from kSize to 3 * kSize.
if constexpr (has_loop_and_tail<Impl>::value) {
// Test loop operations from kSize to 3 * kSize.
if constexpr (kSize > 1) {
static constexpr auto LoopImpl = CmpAdaptor<Impl::loop_and_tail>;
Buffer Buffer1(3 * kSize);
1 change: 1 addition & 0 deletions utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1857,6 +1857,7 @@ libc_support_library(
":__support_cpp_array",
":__support_cpp_bit",
":__support_cpp_cstddef",
":__support_cpp_limits",
":__support_cpp_type_traits",
":__support_macros_attributes",
":__support_macros_config",