diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp
index 7d473afc0b42e..19e38a3c8bdbe 100644
--- a/libc/src/string/memmove.cpp
+++ b/libc/src/string/memmove.cpp
@@ -15,10 +15,16 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(void *, memmove,
                    (void *dst, const void *src, size_t count)) {
+  // Memmove may handle some small sizes as efficiently as inline_memcpy.
+  // For these sizes we may not do is_disjoint check.
+  // This both avoids additional code for the most frequent smaller sizes
+  // and removes code bloat (we don't need the memcpy logic for small sizes).
+  if (inline_memmove_small_size(dst, src, count))
+    return dst;
   if (is_disjoint(dst, src, count))
     inline_memcpy(dst, src, count);
   else
-    inline_memmove(dst, src, count);
+    inline_memmove_follow_up(dst, src, count);
   return dst;
 }
diff --git a/libc/src/string/memory_utils/inline_memmove.h b/libc/src/string/memory_utils/inline_memmove.h
index f72ea24ab538d..30c2c3ddbf1bb 100644
--- a/libc/src/string/memory_utils/inline_memmove.h
+++ b/libc/src/string/memory_utils/inline_memmove.h
@@ -13,28 +13,58 @@
 
 #if defined(LIBC_TARGET_ARCH_IS_X86)
 #include "src/string/memory_utils/x86_64/inline_memmove.h"
-#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_x86
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE                        \
+  inline_memmove_small_size_x86
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP                         \
+  inline_memmove_follow_up_x86
 #elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
 #include "src/string/memory_utils/aarch64/inline_memmove.h"
-#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_aarch64
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE                        \
+  inline_memmove_no_small_size
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP inline_memmove_aarch64
 #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
 #include "src/string/memory_utils/riscv/inline_memmove.h"
-#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_riscv
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE                        \
+  inline_memmove_no_small_size
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP inline_memmove_riscv
 #elif defined(LIBC_TARGET_ARCH_IS_ARM)
 #include "src/string/memory_utils/generic/byte_per_byte.h"
-#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_byte_per_byte
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE                        \
+  inline_memmove_no_small_size
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP                         \
+  inline_memmove_byte_per_byte
 #elif defined(LIBC_TARGET_ARCH_IS_GPU)
 #include "src/string/memory_utils/generic/builtin.h"
-#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_builtin
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE                        \
+  inline_memmove_no_small_size
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP inline_memmove_builtin
 #else
 #error "Unsupported architecture"
 #endif
 
 namespace LIBC_NAMESPACE {
 
+LIBC_INLINE constexpr bool inline_memmove_no_small_size(void *, const void *,
+                                                        size_t) {
+  return false;
+}
+
+LIBC_INLINE bool inline_memmove_small_size(void *dst, const void *src,
+                                           size_t count) {
+  return LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE(
+      reinterpret_cast<Ptr>(dst), reinterpret_cast<CPtr>(src), count);
+}
+
+LIBC_INLINE void inline_memmove_follow_up(void *dst, const void *src,
+                                          size_t count) {
+  LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP(
+      reinterpret_cast<Ptr>(dst), reinterpret_cast<CPtr>(src), count);
+}
+
 LIBC_INLINE void inline_memmove(void *dst, const void *src, size_t count) {
-  LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE(reinterpret_cast<Ptr>(dst),
-                                       reinterpret_cast<CPtr>(src), count);
+  if (inline_memmove_small_size(dst, src, count))
+    return;
+  inline_memmove_follow_up(dst, src, count);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/string/memory_utils/x86_64/inline_memmove.h b/libc/src/string/memory_utils/x86_64/inline_memmove.h
index 95ad07f752195..879b36eaa6734 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memmove.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memmove.h
@@ -18,40 +18,94 @@
 
 namespace LIBC_NAMESPACE {
 
-LIBC_INLINE void inline_memmove_x86(Ptr dst, CPtr src, size_t count) {
+LIBC_INLINE bool inline_memmove_small_size_x86(Ptr dst, CPtr src,
+                                               size_t count) {
 #if defined(__AVX512F__)
+  constexpr size_t vector_size = 64;
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
   using uint512_t = generic_v512;
 #elif defined(__AVX__)
+  constexpr size_t vector_size = 32;
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
   using uint512_t = cpp::array<generic_v256, 2>;
 #elif defined(__SSE2__)
+  constexpr size_t vector_size = 16;
   using uint128_t = generic_v128;
   using uint256_t = cpp::array<generic_v128, 2>;
   using uint512_t = cpp::array<generic_v128, 4>;
 #else
+  constexpr size_t vector_size = 8;
   using uint128_t = cpp::array<uint64_t, 2>;
   using uint256_t = cpp::array<uint64_t, 4>;
   using uint512_t = cpp::array<uint64_t, 8>;
 #endif
+  (void)vector_size;
   if (count == 0)
-    return;
-  if (count == 1)
-    return generic::Memmove<uint8_t>::block(dst, src);
-  if (count <= 4)
-    return generic::Memmove<uint16_t>::head_tail(dst, src, count);
-  if (count <= 8)
-    return generic::Memmove<uint32_t>::head_tail(dst, src, count);
-  if (count <= 16)
-    return generic::Memmove<uint64_t>::head_tail(dst, src, count);
-  if (count <= 32)
-    return generic::Memmove<uint128_t>::head_tail(dst, src, count);
-  if (count <= 64)
-    return generic::Memmove<uint256_t>::head_tail(dst, src, count);
-  if (count <= 128)
-    return generic::Memmove<uint512_t>::head_tail(dst, src, count);
+    return true;
+  if (count == 1) {
+    generic::Memmove<uint8_t>::block(dst, src);
+    return true;
+  }
+  if (count == 2) {
+    generic::Memmove<uint16_t>::block(dst, src);
+    return true;
+  }
+  if (count == 3) {
+    generic::Memmove<cpp::array<uint8_t, 3>>::block(dst, src);
+    return true;
+  }
+  if (count == 4) {
+    generic::Memmove<uint32_t>::block(dst, src);
+    return true;
+  }
+  if (count < 8) {
+    generic::Memmove<uint32_t>::head_tail(dst, src, count);
+    return true;
+  }
+  // If count is equal to a power of 2, we can handle it as head-tail
+  // of both smaller size and larger size (head-tail are either
+  // non-overlapping for smaller size, or completely collapsed
+  // for larger size). It seems to be more profitable to do the copy
+  // with the larger size, if it's natively supported (e.g. doing
+  // 2 collapsed 32-byte moves for count=64 if AVX2 is supported).
+  // But it's not profitable to use larger size if it's not natively
+  // supported: we will both use more instructions and handle fewer
+  // sizes in earlier branches.
+  if (vector_size >= 16 ? count < 16 : count <= 16) {
+    generic::Memmove<uint64_t>::head_tail(dst, src, count);
+    return true;
+  }
+  if (vector_size >= 32 ? count < 32 : count <= 32) {
+    generic::Memmove<uint128_t>::head_tail(dst, src, count);
+    return true;
+  }
+  if (vector_size >= 64 ? count < 64 : count <= 64) {
+    generic::Memmove<uint256_t>::head_tail(dst, src, count);
+    return true;
+  }
+  if (count <= 128) {
+    generic::Memmove<uint512_t>::head_tail(dst, src, count);
+    return true;
+  }
+  return false;
+}
+
+LIBC_INLINE void inline_memmove_follow_up_x86(Ptr dst, CPtr src, size_t count) {
+#if defined(__AVX512F__)
+  using uint256_t = generic_v256;
+  using uint512_t = generic_v512;
+#elif defined(__AVX__)
+  using uint256_t = generic_v256;
+  using uint512_t = cpp::array<generic_v256, 2>;
+#elif defined(__SSE2__)
+  using uint256_t = cpp::array<generic_v128, 2>;
+  using uint512_t = cpp::array<generic_v128, 4>;
+#else
+  using uint256_t = cpp::array<uint64_t, 4>;
+  using uint512_t = cpp::array<uint64_t, 8>;
+#endif
   if (dst < src) {
     generic::Memmove<uint256_t>::align_forward<Arg::Src>(dst, src, count);
     return generic::Memmove<uint512_t>::loop_and_tail_forward(dst, src, count);
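
A note on the dispatch in memmove.cpp above: once the small sizes are peeled off by inline_memmove_small_size, the remaining counts go through is_disjoint, which reduces to a pointer-distance test. The following is a hedged standalone sketch of that idea, not the actual utils.h implementation (is_disjoint_sketch is a hypothetical name):

    #include <cstddef>
    #include <cstdint>

    // The ranges [dst, dst + count) and [src, src + count) cannot overlap
    // when the distance between the pointers is at least `count`, so the
    // plain (forward-copying) memcpy path is safe in that case.
    bool is_disjoint_sketch(const void *dst, const void *src, size_t count) {
      const uintptr_t d = reinterpret_cast<uintptr_t>(dst);
      const uintptr_t s = reinterpret_cast<uintptr_t>(src);
      return (d > s ? d - s : s - d) >= count;
    }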
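
The power-of-2 reasoning in inline_memmove_small_size_x86 rests on how head_tail behaves under overlap: both halves of the buffer are loaded before either store is issued, so the copy is correct for any overlap. A minimal standalone sketch of the technique (head_tail_u64 is a hypothetical name covering 8 <= count <= 16, analogous to generic::Memmove<uint64_t>::head_tail):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Copy `count` bytes (8 <= count <= 16) as two possibly overlapping
    // 8-byte accesses: load head and tail first, then store both, which
    // keeps the copy correct even when dst and src overlap.
    void head_tail_u64(char *dst, const char *src, size_t count) {
      uint64_t head, tail;
      std::memcpy(&head, src, sizeof(head));                        // bytes [0, 8)
      std::memcpy(&tail, src + count - sizeof(tail), sizeof(tail)); // bytes [count - 8, count)
      std::memcpy(dst, &head, sizeof(head));
      std::memcpy(dst + count - sizeof(tail), &tail, sizeof(tail));
    }

At count == 16 the two accesses are exactly adjacent; at count == 8 they land on the same bytes, which is what the diff's comment calls "completely collapsed".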
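
Finally, the dst < src branch at the end of inline_memmove_follow_up_x86 picks the copy direction for the genuinely overlapping case: copying forward when the destination precedes the source (and backward otherwise) guarantees that no source byte is overwritten before it has been read. A byte-wise sketch of that invariant, not the aligned vector loop the diff actually uses:

    #include <cstddef>

    // Forward when dst < src, backward when dst > src, so unread source
    // bytes are never clobbered; equal pointers need no copy at all.
    void byte_memmove(char *dst, const char *src, size_t count) {
      if (dst < src)
        for (size_t i = 0; i < count; ++i)
          dst[i] = src[i];
      else if (dst > src)
        for (size_t i = count; i != 0; --i)
          dst[i - 1] = src[i - 1];
    }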