Revert D148717 "[libc] Improve memcmp latency and codegen"
This broke the aarch64 debug buildbot: https://lab.llvm.org/buildbot/#/builders/223/builds/21703
This reverts commit bd4f978.
gchatelet committed Jun 12, 2023
1 parent 203078b commit 1ec995c
Showing 16 changed files with 643 additions and 902 deletions.
9 changes: 2 additions & 7 deletions libc/src/__support/macros/properties/architectures.h
@@ -45,10 +45,6 @@
 #define LIBC_TARGET_ARCH_IS_AARCH64
 #endif
 
-#if (defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_ARM))
-#define LIBC_TARGET_ARCH_IS_ANY_ARM
-#endif
-
 #if defined(__riscv) && (__riscv_xlen == 64)
 #define LIBC_TARGET_ARCH_IS_RISCV64
 #endif
@@ -57,9 +53,8 @@
 #define LIBC_TARGET_ARCH_IS_RISCV32
 #endif
 
-#if (defined(LIBC_TARGET_ARCH_IS_RISCV64) || \
-     defined(LIBC_TARGET_ARCH_IS_RISCV32))
-#define LIBC_TARGET_ARCH_IS_ANY_RISCV
+#if (defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_ARM))
+#define LIBC_TARGET_ARCH_IS_ANY_ARM
 #endif
 
 #endif // LLVM_LIBC_SUPPORT_MACROS_PROPERTIES_ARCHITECTURES_H
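
Note (illustration, not part of the commit): a consolidated macro like LIBC_TARGET_ARCH_IS_ANY_ARM exists so code shared by 32-bit ARM and AArch64 can be guarded with a single check. A minimal usage sketch, with a made-up function name:

    #include "src/__support/macros/properties/architectures.h"

    #if defined(LIBC_TARGET_ARCH_IS_ANY_ARM)
    // Shared ARM/AArch64 path; "yield" assembles on both targets.
    inline void cpu_relax() { asm volatile("yield"); }
    #else
    inline void cpu_relax() {}
    #endif
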
8 changes: 1 addition & 7 deletions libc/src/string/CMakeLists.txt
@@ -450,12 +450,6 @@ function(add_implementation name impl_name)
     endforeach()
   endif()
 
-  if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
-    # Prevent warning when passing x86 SIMD types as template arguments.
-    # e.g. "warning: ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
-    list(APPEND ADD_IMPL_COMPILE_OPTIONS "-Wno-ignored-attributes")
-  endif()
-
   add_entrypoint_object(${impl_name}
     NAME ${name}
     SRCS ${ADD_IMPL_SRCS}
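
Background on the block removed above (my own reproducer, not from the commit): GCC drops alignment and vector attributes when a type such as __m128i is used as a template argument, and -Wignored-attributes flags it; the removed branch silenced that warning for GCC builds. A minimal reproducer:

    // Compile with g++ -Wall on x86-64 to see -Wignored-attributes fire.
    #include <immintrin.h>

    template <typename T> struct Wrapper { T value; };

    Wrapper<__m128i> w{}; // warning: ignoring attributes on template argument '__m128i'
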
@@ -570,7 +564,7 @@ endfunction()
 if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   add_memcpy(memcpy_x86_64_opt_sse2 COMPILE_OPTIONS -march=k8 REQUIRE SSE2)
   add_memcpy(memcpy_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
-  add_memcpy(memcpy_x86_64_opt_avx COMPILE_OPTIONS -march=sandybridge REQUIRE AVX)
+  add_memcpy(memcpy_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
   add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
   add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memcpy(memcpy)
1 change: 0 additions & 1 deletion libc/src/string/memory_utils/CMakeLists.txt
@@ -24,7 +24,6 @@ add_header_library(
     libc.src.__support.CPP.type_traits
     libc.src.__support.macros.config
     libc.src.__support.macros.optimization
-    libc.src.__support.macros.properties.architectures
 )
 
 add_header_library(
33 changes: 16 additions & 17 deletions libc/src/string/memory_utils/aarch64/memcmp_implementations.h
@@ -19,48 +19,47 @@ namespace __llvm_libc {
 [[maybe_unused]] LIBC_INLINE MemcmpReturnType
 inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
   if (LIBC_UNLIKELY(count >= 384)) {
-    if (auto value = generic::Memcmp<uint8x16_t>::block(p1, p2))
+    if (auto value = generic::Memcmp<16>::block(p1, p2))
       return value;
     align_to_next_boundary<16, Arg::P1>(p1, p2, count);
   }
-  return generic::Memcmp<uint8x16_t>::loop_and_tail(p1, p2, count);
+  return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
 }
 
 [[maybe_unused]] LIBC_INLINE MemcmpReturnType
 inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) {
   if (LIBC_UNLIKELY(count >= 128)) { // [128, ∞]
-    if (auto value = generic::Memcmp<uint8x16_t>::block(p1, p2))
+    if (auto value = generic::Memcmp<16>::block(p1, p2))
       return value;
     align_to_next_boundary<16, Arg::P1>(p1, p2, count);
-    return generic::Memcmp<uint8x16x2_t>::loop_and_tail(p1, p2, count);
+    return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
   }
-  if (generic::Bcmp<uint8x16_t>::block(p1, p2)) // [16, 16]
-    return generic::Memcmp<uint8x16_t>::block(p1, p2);
+  if (generic::Bcmp<16>::block(p1, p2)) // [16, 16]
+    return generic::Memcmp<16>::block(p1, p2);
   if (count < 32) // [17, 31]
-    return generic::Memcmp<uint8x16_t>::tail(p1, p2, count);
-  if (generic::Bcmp<uint8x16_t>::block(p1 + 16, p2 + 16)) // [32, 32]
-    return generic::Memcmp<uint8x16_t>::block(p1 + 16, p2 + 16);
+    return generic::Memcmp<16>::tail(p1, p2, count);
+  if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
+    return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
   if (count < 64) // [33, 63]
-    return generic::Memcmp<uint8x16x2_t>::tail(p1, p2, count);
+    return generic::Memcmp<32>::tail(p1, p2, count);
   // [64, 127]
-  return generic::Memcmp<uint8x16_t>::loop_and_tail(p1 + 32, p2 + 32,
-                                                    count - 32);
+  return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
 }
 
 LIBC_INLINE MemcmpReturnType inline_memcmp_aarch64(CPtr p1, CPtr p2,
                                                    size_t count) {
   if (count == 0)
     return MemcmpReturnType::ZERO();
   if (count == 1)
-    return generic::Memcmp<uint8_t>::block(p1, p2);
+    return generic::Memcmp<1>::block(p1, p2);
   if (count == 2)
-    return generic::Memcmp<uint16_t>::block(p1, p2);
+    return generic::Memcmp<2>::block(p1, p2);
   if (count == 3)
-    return generic::MemcmpSequence<uint16_t, uint8_t>::block(p1, p2);
+    return generic::Memcmp<3>::block(p1, p2);
   if (count <= 8)
-    return generic::Memcmp<uint32_t>::head_tail(p1, p2, count);
+    return generic::Memcmp<4>::head_tail(p1, p2, count);
   if (count <= 16)
-    return generic::Memcmp<uint64_t>::head_tail(p1, p2, count);
+    return generic::Memcmp<8>::head_tail(p1, p2, count);
   if constexpr (aarch64::kNeon)
     return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
   else
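
A note on the head_tail pattern restored above (a sketch of the general idea, names mine, not code from this commit): for k < count <= 2*k, comparing the first k bytes and the last k bytes covers the whole buffer, with the two windows overlapping in the middle, so no loop is needed. In scalar form for k = 8:

    #include <cstdint>
    #include <cstring>

    // Returns 0 iff the two buffers are equal; assumes 8 < count <= 16.
    inline int memcmp_head_tail_8(const char *p1, const char *p2,
                                  std::size_t count) {
      std::uint64_t a, b;
      std::memcpy(&a, p1, 8); // head: first 8 bytes
      std::memcpy(&b, p2, 8);
      if (a != b)
        return std::memcmp(p1, p2, 8); // resolve ordering byte-wise
      std::memcpy(&a, p1 + count - 8, 8); // tail: last 8 bytes, overlaps head
      std::memcpy(&b, p2 + count - 8, 8);
      if (a != b)
        return std::memcmp(p1 + count - 8, p2 + count - 8, 8);
      return 0;
    }
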
128 changes: 69 additions & 59 deletions libc/src/string/memory_utils/bcmp_implementations.h
@@ -15,25 +15,28 @@
#include "src/string/memory_utils/op_aarch64.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_riscv.h"
#include "src/string/memory_utils/op_x86.h"

#include <stddef.h> // size_t

namespace __llvm_libc {

[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) {
return generic::Bcmp<uint8_t>::loop_and_tail_offset(p1, p2, count, offset);
inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
LIBC_LOOP_NOUNROLL
for (; offset < count; ++offset)
if (p1[offset] != p2[offset])
return BcmpReturnType::NONZERO();
return BcmpReturnType::ZERO();
}

[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
constexpr size_t kAlign = sizeof(uint64_t);
if (count <= 2 * kAlign)
return inline_bcmp_byte_per_byte(p1, p2, count);
return inline_bcmp_byte_per_byte(p1, p2, 0, count);
size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
return value;
size_t offset = bytes_to_p1_align;
size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -52,16 +55,16 @@ inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
     if (a != b)
       return BcmpReturnType::NONZERO();
   }
-  return inline_bcmp_byte_per_byte(p1, p2, count, offset);
+  return inline_bcmp_byte_per_byte(p1, p2, offset, count);
 }
 
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
   constexpr size_t kAlign = sizeof(uint32_t);
   if (count <= 2 * kAlign)
-    return inline_bcmp_byte_per_byte(p1, p2, count);
+    return inline_bcmp_byte_per_byte(p1, p2, 0, count);
   size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
-  if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
+  if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
     return value;
   size_t offset = bytes_to_p1_align;
   size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -77,82 +80,89 @@ inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
     if (a != b)
       return BcmpReturnType::NONZERO();
   }
-  return inline_bcmp_byte_per_byte(p1, p2, count, offset);
+  return inline_bcmp_byte_per_byte(p1, p2, offset, count);
 }
 
 #if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
-  return generic::Bcmp<uint64_t>::loop_and_tail_align_above(256, p1, p2, count);
+  if (count < 256)
+    return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
+  if (auto value = generic::Bcmp<64>::block(p1, p2))
+    return value;
+  align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
 }
 #endif // defined(LIBC_TARGET_ARCH_IS_X86) ||
        // defined(LIBC_TARGET_ARCH_IS_AARCH64)
 
 #if defined(LIBC_TARGET_ARCH_IS_X86)
-#if defined(__SSE4_1__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
+inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
-  return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+  if (count < 256)
+    return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
+  if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
+    return value;
+  align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
 }
-#endif // __SSE4_1__
 
-#if defined(__AVX__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
+inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
   if (count <= 64)
-    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
-  return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
+    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+  if (count <= 128)
+    return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
+  if (LIBC_UNLIKELY(count >= 256)) {
+    if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
+      return value;
+    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
 }
-#endif // __AVX__
 
-#if defined(__AVX512BW__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
   if (count <= 64)
-    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
+    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
   if (count <= 128)
-    return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
-  return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
+    return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
+  if (LIBC_UNLIKELY(count >= 256)) {
+    if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
+      return value;
+    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
 }
-#endif // __AVX512BW__
 
 [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
                                                             size_t count) {
   if (count == 0)
     return BcmpReturnType::ZERO();
   if (count == 1)
-    return generic::Bcmp<uint8_t>::block(p1, p2);
+    return generic::Bcmp<1>::block(p1, p2);
   if (count == 2)
-    return generic::Bcmp<uint16_t>::block(p1, p2);
-  if (count == 3)
-    return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
-  if (count == 4)
-    return generic::Bcmp<uint32_t>::block(p1, p2);
-  if (count == 5)
-    return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
-  if (count == 6)
-    return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
-  if (count == 7)
-    return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
-  if (count == 8)
-    return generic::Bcmp<uint64_t>::block(p1, p2);
+    return generic::Bcmp<2>::block(p1, p2);
+  if (count <= 4)
+    return generic::Bcmp<2>::head_tail(p1, p2, count);
+  if (count <= 8)
+    return generic::Bcmp<4>::head_tail(p1, p2, count);
   if (count <= 16)
-    return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
-#if defined(__AVX512BW__)
-  return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
-#elif defined(__AVX__)
-  return inline_bcmp_x86_avx_gt16(p1, p2, count);
-#elif defined(__SSE4_1__)
-  return inline_bcmp_x86_sse41_gt16(p1, p2, count);
-#else
-  return inline_bcmp_generic_gt16(p1, p2, count);
-#endif
+    return generic::Bcmp<8>::head_tail(p1, p2, count);
+  if constexpr (x86::kAvx512BW)
+    return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
+  else if constexpr (x86::kAvx2)
+    return inline_bcmp_x86_avx2_gt16(p1, p2, count);
+  else if constexpr (x86::kSse2)
+    return inline_bcmp_x86_sse2_gt16(p1, p2, count);
  else
+    return inline_bcmp_generic_gt16(p1, p2, count);
 }
 #endif // defined(LIBC_TARGET_ARCH_IS_X86)

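Side note on the restored dispatch above (a generic sketch with placeholder names, not the libc internals): `if constexpr` on compile-time feature constants such as x86::kAvx512BW lets the compiler discard the untaken branches at code generation while still syntax- and type-checking all of them, which is why the restored code can mention every variant unconditionally:

    #include <cstddef>

    // Placeholder feature flag and functions; illustrative only.
    inline constexpr bool kHasAvx2 =
    #if defined(__AVX2__)
        true;
    #else
        false;
    #endif

    int bcmp_wide(const char *, const char *, std::size_t);   // hypothetical
    int bcmp_narrow(const char *, const char *, std::size_t); // hypothetical

    inline int bcmp_dispatch(const char *p1, const char *p2, std::size_t count) {
      if constexpr (kHasAvx2)
        return bcmp_wide(p1, p2, count); // dead-stripped when kHasAvx2 is false
      else
        return bcmp_narrow(p1, p2, count);
    }
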
@@ -168,27 +178,27 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   case 0:
     return BcmpReturnType::ZERO();
   case 1:
-    return generic::Bcmp<uint8_t>::block(p1, p2);
+    return generic::Bcmp<1>::block(p1, p2);
   case 2:
-    return generic::Bcmp<uint16_t>::block(p1, p2);
+    return generic::Bcmp<2>::block(p1, p2);
   case 3:
-    return generic::Bcmp<uint16_t>::head_tail(p1, p2, count);
+    return generic::Bcmp<2>::head_tail(p1, p2, count);
   case 4:
-    return generic::Bcmp<uint32_t>::block(p1, p2);
+    return generic::Bcmp<4>::block(p1, p2);
   case 5:
   case 6:
   case 7:
-    return generic::Bcmp<uint32_t>::head_tail(p1, p2, count);
+    return generic::Bcmp<4>::head_tail(p1, p2, count);
   case 8:
-    return generic::Bcmp<uint64_t>::block(p1, p2);
+    return generic::Bcmp<8>::block(p1, p2);
   case 9:
   case 10:
   case 11:
   case 12:
   case 13:
   case 14:
   case 15:
-    return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
+    return generic::Bcmp<8>::head_tail(p1, p2, count);
   }
 }

@@ -215,7 +225,7 @@ LIBC_INLINE BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
 #elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
   return inline_bcmp_aligned_access_32bit(p1, p2, count);
 #else
-  return inline_bcmp_byte_per_byte(p1, p2, count);
+  return inline_bcmp_byte_per_byte(p1, p2, 0, count);
 #endif
 }

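For readers new to the aligned-access scheme restored in inline_bcmp_aligned_access_64bit above, a standalone sketch of the idea (helper names are mine; the real code uses distance_to_align_up and wide aligned loads): compare bytes until p1 reaches word alignment, compare word-sized chunks through the aligned middle, then finish byte-wise:

    #include <cstdint>
    #include <cstring>

    // Returns nonzero iff the buffers differ; illustrative only.
    inline int bcmp_aligned_sketch(const unsigned char *p1,
                                   const unsigned char *p2, std::size_t count) {
      constexpr std::size_t kAlign = sizeof(std::uint64_t);
      std::size_t offset = 0;
      // Byte-wise until p1 is 8-byte aligned.
      for (; offset < count &&
             reinterpret_cast<std::uintptr_t>(p1 + offset) % kAlign;
           ++offset)
        if (p1[offset] != p2[offset])
          return 1;
      // Word-wise over the aligned middle; memcpy keeps the (possibly
      // unaligned) p2 loads well-defined.
      for (; offset + kAlign <= count; offset += kAlign) {
        std::uint64_t a, b;
        std::memcpy(&a, p1 + offset, kAlign);
        std::memcpy(&b, p2 + offset, kAlign);
        if (a != b)
          return 1;
      }
      // Byte-wise tail.
      for (; offset < count; ++offset)
        if (p1[offset] != p2[offset])
          return 1;
      return 0;
    }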
