#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
#include "src/__support/architectures.h"
#include "src/__support/common.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/op_aarch64.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t
// Design rationale
// ================
//
// Using a profiler to observe size distributions for calls into libc
// functions, it was found most operations act on a small number of bytes.
// This makes it important to favor small sizes.
//
// The tests for `count` are in ascending order so the cost of branching is
// proportional to the cost of copying.
//
// The function is written in C++ for several reasons:
// - The compiler can __see__ the code, this is useful when performing Profile
// Guided Optimization as the optimized code can take advantage of branching
// probabilities.
// - It also allows for easier customization and favors testing multiple
// implementation parameters.
// - As compilers and processors get better, the generated code is improved
// with little change on the code side.
namespace __llvm_libc {
static inline void inline_memcpy (char *__restrict dst,
const char *__restrict src, size_t count) {
using namespace __llvm_libc ::builtin;
#if defined(LLVM_LIBC_ARCH_X86)
// ///////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_X86
// ///////////////////////////////////////////////////////////////////////////
// Whether to use only rep;movsb.
constexpr bool USE_ONLY_REP_MOVSB =
LLVM_LIBC_IS_DEFINED (LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
// kRepMovsBSize == -1 : Only CopyAligned is used.
// kRepMovsBSize == 0 : Only RepMovsb is used.
// else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
constexpr size_t REP_MOVS_B_SIZE =
#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
#else
-1 ;
#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
// Whether target supports AVX instructions.
constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED (__AVX__);
#if defined(__AVX__)
using LoopBlockSize = _64;
#else
using LoopBlockSize = _32;
#endif
if (USE_ONLY_REP_MOVSB)
return copy<x86::Accelerator>(dst, src, count);
[[maybe_unused]] static inline void
inline_memcpy_embedded_tiny (Ptr __restrict dst, CPtr __restrict src,
size_t count) {
#pragma nounroll
for (size_t offset = 0 ; offset < count; ++offset)
builtin::Memcpy<1 >::block (dst + offset, src + offset);
}
#if defined(LLVM_LIBC_ARCH_X86)
[[maybe_unused]] static inline void
inline_memcpy_x86 (Ptr __restrict dst, CPtr __restrict src, size_t count) {
if (count == 0 )
return ;
if (count == 1 )
return copy<_1> (dst, src);
return builtin::Memcpy< 1 >:: block (dst, src);
if (count == 2 )
return copy<_2> (dst, src);
return builtin::Memcpy< 2 >:: block (dst, src);
if (count == 3 )
return copy<_3> (dst, src);
return builtin::Memcpy< 3 >:: block (dst, src);
if (count == 4 )
return copy<_4> (dst, src);
return builtin::Memcpy< 4 >:: block (dst, src);
if (count < 8 )
return copy<HeadTail<_4>> (dst, src, count);
return builtin::Memcpy< 4 >:: head_tail (dst, src, count);
if (count < 16 )
return copy<HeadTail<_8>> (dst, src, count);
return builtin::Memcpy< 8 >:: head_tail (dst, src, count);
if (count < 32 )
return copy<HeadTail<_16>> (dst, src, count);
return builtin::Memcpy< 16 >:: head_tail (dst, src, count);
if (count < 64 )
return copy<HeadTail<_32>> (dst, src, count);
return builtin::Memcpy< 32 >:: head_tail (dst, src, count);
if (count < 128 )
return copy<HeadTail<_64>>(dst, src, count);
if (HAS_AVX && count < 256 )
return copy<HeadTail<_128>>(dst, src, count);
if (count <= REP_MOVS_B_SIZE)
return copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
count);
return copy<x86::Accelerator>(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
// ///////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
// ///////////////////////////////////////////////////////////////////////////
return builtin::Memcpy<64 >::head_tail (dst, src, count);
if (x86::kAvx && count < 256 )
return builtin::Memcpy<128 >::head_tail (dst, src, count);
builtin::Memcpy<32 >::block (dst, src);
align_to_next_boundary<32 , Arg::Dst>(dst, src, count);
static constexpr size_t kBlockSize = x86::kAvx ? 64 : 32 ;
return builtin::Memcpy<kBlockSize >::loop_and_tail (dst, src, count);
}
// Chooses between `inline_memcpy_x86` and `rep;movsb` depending on the
// compile-time configuration:
// - LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB  : always use rep;movsb.
// - LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE >= 0 : use rep;movsb for
//   counts at or above that threshold, the regular dispatch below it.
// - threshold of -1 (the default): never use rep;movsb.
[[maybe_unused]] static inline void
inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
                                           CPtr __restrict src, size_t count) {
  // Whether to use rep;movsb exclusively, not at all, or only above a certain
  // threshold.
  // TODO: Use only a single preprocessor definition to simplify the code.
#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1
#endif
  static constexpr bool kUseOnlyRepMovsb =
      LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
  // Evaluate the threshold in a signed type: stored directly in a `size_t`,
  // the `-1` sentinel wraps to SIZE_MAX and a `>= 0` test becomes a
  // tautology, silently enabling the runtime comparison path.
  static constexpr long long kRepMovsbSignedThreshold =
      LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
  if constexpr (kUseOnlyRepMovsb)
    return x86::Memcpy::repmovsb(dst, src, count);
  else if constexpr (kRepMovsbSignedThreshold >= 0) {
    constexpr size_t kRepMovsbThreshold =
        static_cast<size_t>(kRepMovsbSignedThreshold);
    if (unlikely(count >= kRepMovsbThreshold))
      return x86::Memcpy::repmovsb(dst, src, count);
    else
      return inline_memcpy_x86(dst, src, count);
  } else {
    return inline_memcpy_x86(dst, src, count);
  }
}
#endif // defined(LLVM_LIBC_ARCH_X86)
#if defined(LLVM_LIBC_ARCH_AARCH64)
///////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
///////////////////////////////////////////////////////////////////////////////

// AArch64 size dispatch, same ascending-`count` scheme as the x86 version:
// exact small sizes use a fixed-size block copy, mid sizes use overlapping
// head/tail copies, and large sizes align the SOURCE to 16 bytes (unlike x86,
// which aligns the destination) before a 64-byte block loop plus tail copy.
[[maybe_unused]] static inline void
inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
  if (count == 0)
    return;
  if (count == 1)
    return builtin::Memcpy<1>::block(dst, src);
  if (count == 2)
    return builtin::Memcpy<2>::block(dst, src);
  if (count == 3)
    return builtin::Memcpy<3>::block(dst, src);
  if (count == 4)
    return builtin::Memcpy<4>::block(dst, src);
  if (count < 8)
    return builtin::Memcpy<4>::head_tail(dst, src, count);
  if (count < 16)
    return builtin::Memcpy<8>::head_tail(dst, src, count);
  if (count < 32)
    return builtin::Memcpy<16>::head_tail(dst, src, count);
  if (count < 64)
    return builtin::Memcpy<32>::head_tail(dst, src, count);
  if (count < 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  // Copy the first 16 bytes unconditionally, then realign `src` (and adjust
  // `dst`/`count`) to the next 16-byte boundary before entering the loop.
  builtin::Memcpy<16>::block(dst, src);
  align_to_next_boundary<16, Arg::Src>(dst, src, count);
  return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
}
#endif // defined(LLVM_LIBC_ARCH_AARCH64)
// Per-architecture dispatcher. Ptr/CPtr are the byte-typed pointer aliases
// declared in memory_utils/utils.h. Unsupported targets fail the build
// explicitly rather than silently falling back to an untested generic path.
static inline void inline_memcpy(Ptr __restrict dst, CPtr __restrict src,
                                 size_t count) {
  using namespace __llvm_libc::builtin;
#if defined(LLVM_LIBC_ARCH_X86)
  return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
  return inline_memcpy_aarch64(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_ARM)
  return inline_memcpy_embedded_tiny(dst, src, count);
#else
#error "Unsupported platform"
#endif
}
static inline void inline_memcpy (void *__restrict dst,
const void *__restrict src, size_t count) {
inline_memcpy (reinterpret_cast <Ptr >(dst), reinterpret_cast <CPtr>(src), count);
}
} // namespace __llvm_libc
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H