[libc] Improve memcmp latency and codegen
This is based on ideas from @nafi to:
 - use a branchless version of 'cmp' for 'uint32_t',
 - completely resolve the lexicographic comparison through vector
   operations when wide types are available. We also get rid of byte
   reloads and serializing '__builtin_ctzll'.

I did not include the suggestion to replace comparisons of 'uint16_t'
with two 'uint8_t' comparisons, as it did not seem to help the codegen.
This can be revisited in subsequent patches.

The code has been rewritten to reduce nested function calls, making the
inliner's job easier and preventing harmful code duplication.

Reviewed By: nafi3000

Differential Revision: https://reviews.llvm.org/D148717
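
For illustration, a minimal sketch of the two ideas above. This is not the
patch's code: the diff below introduces 'cmp_uint32_t' and 'cmp_neq_uint64_t'
helpers whose exact definitions may differ, and the names used here are made
up for the example.

#include <cstdint>

// Branchless three-way compare for 'uint32_t': the bool-to-int subtraction
// yields -1, 0 or 1 without a conditional branch.
static inline int cmp_u32_sketch(uint32_t a, uint32_t b) {
  return static_cast<int>(a > b) - static_cast<int>(a < b);
}

// Lexicographic (memcmp) order on a wide word: once both operands are viewed
// as big-endian integers, a plain unsigned comparison matches byte-wise
// order, so no per-byte reload or serializing '__builtin_ctzll' is needed.
// The patch byteswaps via Endian::to_big_endian before such a compare.
static inline int cmp_be_u64_sketch(uint64_t a_be, uint64_t b_be) {
  return static_cast<int>(a_be > b_be) - static_cast<int>(a_be < b_be);
}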
gchatelet committed Jun 5, 2023
1 parent 60f06bc commit 9ec6ebd
Showing 10 changed files with 637 additions and 580 deletions.
8 changes: 7 additions & 1 deletion libc/src/string/CMakeLists.txt
@@ -450,6 +450,12 @@ function(add_implementation name impl_name)
endforeach()
endif()

if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
# Prevent warning when passing x86 SIMD types as template arguments.
# e.g. "warning: ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
list(APPEND ADD_IMPL_COMPILE_OPTIONS "-Wno-ignored-attributes")
endif()

add_entrypoint_object(${impl_name}
NAME ${name}
SRCS ${ADD_IMPL_SRCS}
@@ -564,7 +570,7 @@ endfunction()
if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memcpy(memcpy_x86_64_opt_sse2 COMPILE_OPTIONS -march=k8 REQUIRE SSE2)
add_memcpy(memcpy_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
add_memcpy(memcpy_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
add_memcpy(memcpy_x86_64_opt_avx COMPILE_OPTIONS -march=haswell REQUIRE AVX)
add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memcpy(memcpy)
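
The warning silenced above shows up when an attributed x86 vector type is
used as a template argument, as the new generic::Bcmp<__m128i> instantiations
further down do. A minimal reproducer, assuming an x86 target compiled with
GCC (the struct and variable names here are purely illustrative):

#include <immintrin.h>

template <typename T> struct Bcmp {};

// GCC emits "ignoring attributes on template argument '__m128i'
// [-Wignored-attributes]" on this instantiation, because it drops the type
// attributes attached to '__m128i' when it becomes a template argument.
Bcmp<__m128i> instance;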
1 change: 1 addition & 0 deletions libc/src/string/memory_utils/CMakeLists.txt
@@ -24,6 +24,7 @@ add_header_library(
libc.src.__support.CPP.type_traits
libc.src.__support.macros.config
libc.src.__support.macros.optimization
libc.src.__support.macros.properties.architectures
)

add_header_library(
113 changes: 51 additions & 62 deletions libc/src/string/memory_utils/bcmp_implementations.h
@@ -22,21 +22,17 @@
namespace __llvm_libc {

[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
LIBC_LOOP_NOUNROLL
for (; offset < count; ++offset)
if (p1[offset] != p2[offset])
return BcmpReturnType::NONZERO();
return BcmpReturnType::ZERO();
inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) {
return generic::Bcmp<uint8_t>::loop_and_tail_offset(p1, p2, count, offset);
}

[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
constexpr size_t kAlign = sizeof(uint64_t);
if (count <= 2 * kAlign)
return inline_bcmp_byte_per_byte(p1, p2, 0, count);
return inline_bcmp_byte_per_byte(p1, p2, count);
size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
return value;
size_t offset = bytes_to_p1_align;
size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -55,16 +51,16 @@ inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
if (a != b)
return BcmpReturnType::NONZERO();
}
return inline_bcmp_byte_per_byte(p1, p2, offset, count);
return inline_bcmp_byte_per_byte(p1, p2, count, offset);
}

[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
constexpr size_t kAlign = sizeof(uint32_t);
if (count <= 2 * kAlign)
return inline_bcmp_byte_per_byte(p1, p2, 0, count);
return inline_bcmp_byte_per_byte(p1, p2, count);
size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
return value;
size_t offset = bytes_to_p1_align;
size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -80,89 +76,82 @@ inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
if (a != b)
return BcmpReturnType::NONZERO();
}
return inline_bcmp_byte_per_byte(p1, p2, offset, count);
return inline_bcmp_byte_per_byte(p1, p2, count, offset);
}

#if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
if (count < 256)
return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
if (auto value = generic::Bcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
return generic::Bcmp<uint64_t>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // defined(LIBC_TARGET_ARCH_IS_X86) ||
// defined(LIBC_TARGET_ARCH_IS_AARCH64)

#if defined(LIBC_TARGET_ARCH_IS_X86)
#if defined(__SSE4_1__)
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
if (count < 256)
return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __SSE4_1__

#if defined(__AVX__)
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
if (LIBC_UNLIKELY(count >= 256)) {
if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
}
return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __AVX__

#if defined(__AVX512BW__)
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
if (LIBC_UNLIKELY(count >= 256)) {
if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
}
return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __AVX512BW__

[[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
size_t count) {
if (count == 0)
return BcmpReturnType::ZERO();
if (count == 1)
return generic::Bcmp<1>::block(p1, p2);
return generic::Bcmp<uint8_t>::block(p1, p2);
if (count == 2)
return generic::Bcmp<2>::block(p1, p2);
if (count <= 4)
return generic::Bcmp<2>::head_tail(p1, p2, count);
if (count <= 8)
return generic::Bcmp<4>::head_tail(p1, p2, count);
return generic::Bcmp<uint16_t>::block(p1, p2);
if (count == 3)
return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
if (count == 4)
return generic::Bcmp<uint32_t>::block(p1, p2);
if (count == 5)
return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
if (count == 6)
return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
if (count == 7)
return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
if (count == 8)
return generic::Bcmp<uint64_t>::block(p1, p2);
if (count <= 16)
return generic::Bcmp<8>::head_tail(p1, p2, count);
if constexpr (x86::kAvx512BW)
return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
else if constexpr (x86::kAvx2)
return inline_bcmp_x86_avx2_gt16(p1, p2, count);
else if constexpr (x86::kSse2)
return inline_bcmp_x86_sse2_gt16(p1, p2, count);
else
return inline_bcmp_generic_gt16(p1, p2, count);
return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
#if defined(__AVX512BW__)
return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
#elif defined(__AVX__)
return inline_bcmp_x86_avx_gt16(p1, p2, count);
#elif defined(__SSE4_1__)
return inline_bcmp_x86_sse41_gt16(p1, p2, count);
#else
return inline_bcmp_generic_gt16(p1, p2, count);
#endif
}
#endif // defined(LIBC_TARGET_ARCH_IS_X86)

@@ -225,7 +214,7 @@ LIBC_INLINE BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
return inline_bcmp_aligned_access_32bit(p1, p2, count);
#else
return inline_bcmp_byte_per_byte(p1, p2, 0, count);
return inline_bcmp_byte_per_byte(p1, p2, count);
#endif
}

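
The exact-size dispatch above composes fixed-width comparisons, e.g.
generic::BcmpSequence<uint32_t, uint16_t, uint8_t> for count == 7. A minimal
sketch of the idea, assuming the sequence simply ORs the per-chunk
differences; the patch's actual BcmpSequence may differ, and 'bcmp7_sketch'
is a made-up name:

#include <cstdint>
#include <cstring>

// Compare 7 bytes as a 4-byte, a 2-byte and a 1-byte chunk. XOR of equal
// chunks is zero, so OR-ing the XORs is nonzero exactly when some byte
// differs -- which is all bcmp needs to report.
static inline uint32_t bcmp7_sketch(const unsigned char *p1,
                                    const unsigned char *p2) {
  uint32_t a4, b4;
  uint16_t a2, b2;
  std::memcpy(&a4, p1, sizeof(a4));
  std::memcpy(&b4, p2, sizeof(b4));
  std::memcpy(&a2, p1 + 4, sizeof(a2));
  std::memcpy(&b2, p2 + 4, sizeof(b2));
  return (a4 ^ b4) | uint32_t(a2 ^ b2) | uint32_t(p1[6] ^ p2[6]);
}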
39 changes: 14 additions & 25 deletions libc/src/string/memory_utils/memcmp_implementations.h
@@ -26,21 +26,17 @@
namespace __llvm_libc {

[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
LIBC_LOOP_NOUNROLL
for (; offset < count; ++offset)
if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
return value;
return MemcmpReturnType::ZERO();
inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) {
return generic::Memcmp<uint8_t>::loop_and_tail_offset(p1, p2, count, offset);
}

[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
constexpr size_t kAlign = sizeof(uint64_t);
if (count <= 2 * kAlign)
return inline_memcmp_byte_per_byte(p1, p2, 0, count);
return inline_memcmp_byte_per_byte(p1, p2, count);
size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
if (auto value = inline_memcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
return value;
size_t offset = bytes_to_p1_align;
size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -56,21 +52,20 @@ inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
b = load64_aligned<uint8_t, uint16_t, uint16_t, uint16_t, uint8_t>(
p2, offset);
uint64_t a = load64_aligned<uint64_t>(p1, offset);
if (a != b) {
// TODO use cmp_neq_uint64_t from D148717 once it's submitted.
return Endian::to_big_endian(a) < Endian::to_big_endian(b) ? -1 : 1;
}
if (a != b)
return cmp_neq_uint64_t(Endian::to_big_endian(a),
Endian::to_big_endian(b));
}
return inline_memcmp_byte_per_byte(p1, p2, offset, count);
return inline_memcmp_byte_per_byte(p1, p2, count, offset);
}

[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
constexpr size_t kAlign = sizeof(uint32_t);
if (count <= 2 * kAlign)
return inline_memcmp_byte_per_byte(p1, p2, 0, count);
return inline_memcmp_byte_per_byte(p1, p2, count);
size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
if (auto value = inline_memcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
return value;
size_t offset = bytes_to_p1_align;
size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -83,16 +78,10 @@ inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
else
b = load32_aligned<uint8_t, uint16_t, uint8_t>(p2, offset);
uint32_t a = load32_aligned<uint32_t>(p1, offset);
if (a != b) {
// TODO use cmp_uint32_t from D148717 once it's submitted.
// We perform the difference as an uint64_t.
const int64_t diff = static_cast<int64_t>(Endian::to_big_endian(a)) -
static_cast<int64_t>(Endian::to_big_endian(b));
// And reduce the uint64_t into an uint32_t.
return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
}
if (a != b)
return cmp_uint32_t(Endian::to_big_endian(a), Endian::to_big_endian(b));
}
return inline_memcmp_byte_per_byte(p1, p2, offset, count);
return inline_memcmp_byte_per_byte(p1, p2, count, offset);
}

LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
Expand All @@ -105,7 +94,7 @@ LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
return inline_memcmp_aligned_access_32bit(p1, p2, count);
#else
return inline_memcmp_byte_per_byte(p1, p2, 0, count);
return inline_memcmp_byte_per_byte(p1, p2, count);
#endif
}

