182 changes: 115 additions & 67 deletions libc/src/string/memory_utils/memcmp_implementations.h
@@ -11,93 +11,141 @@

 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/utils.h"

 #include <stddef.h> // size_t

 namespace __llvm_libc {
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
+#pragma nounroll
+  for (size_t offset = 0; offset < count; ++offset)
+    if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
+      return value;
+  return MemcmpReturnType::ZERO();
+}
+
+#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (unlikely(count >= 384)) {
+    if (auto value = generic::Memcmp<16>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  }
+  return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)

-static inline int inline_memcmp(const char *lhs, const char *rhs,
-                                size_t count) {
 #if defined(LLVM_LIBC_ARCH_X86)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_X86
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace __llvm_libc::x86;
-  if (count == 0)
-    return 0;
-  if (count == 1)
-    return three_way_compare<_1>(lhs, rhs);
-  if (count == 2)
-    return three_way_compare<_2>(lhs, rhs);
-  if (count == 3)
-    return three_way_compare<_3>(lhs, rhs);
-  if (count <= 8)
-    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
-  if (count <= 16)
-    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (unlikely(count >= 384)) {
+    if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  }
+  return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
+    return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
   if (count <= 64)
-    return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
+    return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
   if (count <= 128)
-    return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
-  return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_AARCH64
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace ::__llvm_libc::aarch64;
-  if (count == 0) // [0, 0]
-    return 0;
-  if (count == 1) // [1, 1]
-    return three_way_compare<_1>(lhs, rhs);
-  if (count == 2) // [2, 2]
-    return three_way_compare<_2>(lhs, rhs);
-  if (count == 3) // [3, 3]
-    return three_way_compare<_3>(lhs, rhs);
-  if (count < 8) // [4, 7]
-    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
-  if (count < 16) // [8, 15]
-    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
-  if (unlikely(count >= 128)) // [128, ∞]
-    return three_way_compare<Align<_16>::Then<Loop<_32>>>(lhs, rhs, count);
-  if (!equals<_16>(lhs, rhs)) // [16, 16]
-    return three_way_compare<_16>(lhs, rhs);
+    return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 384)) {
+    if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
+      return value;
+    align_to_next_boundary<32, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (count <= 32)
+    return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
+  if (count <= 64)
+    return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
+  if (count <= 128)
+    return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 384)) {
+    if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
+      return value;
+    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (unlikely(count >= 128)) { // [128, ∞]
+    if (auto value = generic::Memcmp<16>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+    return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
+  }
+  if (generic::Bcmp<16>::block(p1, p2)) // [16, 16]
+    return generic::Memcmp<16>::block(p1, p2);
   if (count < 32) // [17, 31]
-    return three_way_compare<Tail<_16>>(lhs, rhs, count);
-  if (!equals<Skip<16>::Then<_16>>(lhs, rhs)) // [32, 32]
-    return three_way_compare<Skip<16>::Then<_16>>(lhs, rhs);
+    return generic::Memcmp<16>::tail(p1, p2, count);
+  if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
+    return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
   if (count < 64) // [33, 63]
-    return three_way_compare<Tail<_32>>(lhs, rhs, count);
+    return generic::Memcmp<32>::tail(p1, p2, count);
   // [64, 127]
-  return three_way_compare<Skip<32>::Then<Loop<_16>>>(lhs, rhs, count);
-#else
-  /////////////////////////////////////////////////////////////////////////////
-  // Default
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace ::__llvm_libc::scalar;
+  return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
+
+static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
   if (count == 0)
-    return 0;
+    return MemcmpReturnType::ZERO();
   if (count == 1)
-    return three_way_compare<_1>(lhs, rhs);
+    return generic::Memcmp<1>::block(p1, p2);
   if (count == 2)
-    return three_way_compare<_2>(lhs, rhs);
+    return generic::Memcmp<2>::block(p1, p2);
   if (count == 3)
-    return three_way_compare<_3>(lhs, rhs);
+    return generic::Memcmp<3>::block(p1, p2);
   if (count <= 8)
-    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
+    return generic::Memcmp<4>::head_tail(p1, p2, count);
   if (count <= 16)
-    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
-  if (count <= 32)
-    return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
-  if (count <= 64)
-    return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
-  if (count <= 128)
-    return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
-  return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
+    return generic::Memcmp<8>::head_tail(p1, p2, count);
+#if defined(LLVM_LIBC_ARCH_X86)
+  if constexpr (x86::kAvx512BW)
+    return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
+  else if constexpr (x86::kAvx2)
+    return inline_memcmp_x86_avx2_gt16(p1, p2, count);
+  else if constexpr (x86::kSse2)
+    return inline_memcmp_x86_sse2_gt16(p1, p2, count);
+  else
+    return inline_memcmp_generic_gt16(p1, p2, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  if constexpr (aarch64::kNeon)
+    return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
+  else
+    return inline_memcmp_generic_gt16(p1, p2, count);
+#endif
+#elif defined(LLVM_LIBC_ARCH_ARM)
+  return inline_memcmp_embedded_tiny(p1, p2, count);
+#else
+#error "Unsupported platform"
 #endif
 }

+static inline int inline_memcmp(const void *p1, const void *p2, size_t count) {
+  return static_cast<int>(inline_memcmp(reinterpret_cast<CPtr>(p1),
+                                        reinterpret_cast<CPtr>(p2), count));
+}
+
 } // namespace __llvm_libc
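A note on the head_tail pattern that this patch uses throughout: for a size bucket such as [9, 16], the code compares one full block at the front and one full block ending exactly at the last byte. The two loads overlap, so every byte is covered with no loop and no branch on the exact size. A minimal sketch of the idea, using plain std::memcmp on char pointers instead of the patch's MemcmpReturnType and wide vector loads (the helper name is illustrative, not a libc symbol):

#include <cstddef>
#include <cstring>

// Compares count bytes, assuming 8 <= count <= 16, with exactly two 8-byte
// comparisons. The second compare starts at p + count - 8 and overlaps the
// first whenever count < 16.
static int memcmp_head_tail_8(const char *p1, const char *p2, size_t count) {
  if (int head = std::memcmp(p1, p2, 8))
    return head;
  return std::memcmp(p1 + count - 8, p2 + count - 8, 8);
}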
197 changes: 91 additions & 106 deletions libc/src/string/memory_utils/memcpy_implementations.h
@@ -11,145 +11,130 @@

 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/utils.h"

 #include <stddef.h> // size_t

 // Design rationale
 // ================
 //
 // Using a profiler to observe size distributions for calls into libc
 // functions, it was found most operations act on a small number of bytes.
 // This makes it important to favor small sizes.
 //
 // The tests for `count` are in ascending order so the cost of branching is
 // proportional to the cost of copying.
 //
 // The function is written in C++ for several reasons:
 // - The compiler can __see__ the code, this is useful when performing Profile
 //   Guided Optimization as the optimized code can take advantage of branching
 //   probabilities.
 // - It also allows for easier customization and favors testing multiple
 //   implementation parameters.
 // - As compilers and processors get better, the generated code is improved
 //   with little change on the code side.

 namespace __llvm_libc {

-static inline void inline_memcpy(char *__restrict dst,
-                                 const char *__restrict src, size_t count) {
-  using namespace __llvm_libc::builtin;
-#if defined(LLVM_LIBC_ARCH_X86)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_X86
-  /////////////////////////////////////////////////////////////////////////////
-
-  // Whether to use only rep;movsb.
-  constexpr bool USE_ONLY_REP_MOVSB =
-      LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
-
-  // kRepMovsBSize == -1 : Only CopyAligned is used.
-  // kRepMovsBSize == 0 : Only RepMovsb is used.
-  // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
-  constexpr size_t REP_MOVS_B_SIZE =
-#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
-      LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
-#else
-      -1;
-#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-
-  // Whether target supports AVX instructions.
-  constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
-
-#if defined(__AVX__)
-  using LoopBlockSize = _64;
-#else
-  using LoopBlockSize = _32;
-#endif
-
-  if (USE_ONLY_REP_MOVSB)
-    return copy<x86::Accelerator>(dst, src, count);
+[[maybe_unused]] static inline void
+inline_memcpy_embedded_tiny(Ptr __restrict dst, CPtr __restrict src,
+                            size_t count) {
+#pragma nounroll
+  for (size_t offset = 0; offset < count; ++offset)
+    builtin::Memcpy<1>::block(dst + offset, src + offset);
+}

+#if defined(LLVM_LIBC_ARCH_X86)
+[[maybe_unused]] static inline void
+inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
   if (count == 0)
     return;
   if (count == 1)
-    return copy<_1>(dst, src);
+    return builtin::Memcpy<1>::block(dst, src);
   if (count == 2)
-    return copy<_2>(dst, src);
+    return builtin::Memcpy<2>::block(dst, src);
   if (count == 3)
-    return copy<_3>(dst, src);
+    return builtin::Memcpy<3>::block(dst, src);
   if (count == 4)
-    return copy<_4>(dst, src);
+    return builtin::Memcpy<4>::block(dst, src);
   if (count < 8)
-    return copy<HeadTail<_4>>(dst, src, count);
+    return builtin::Memcpy<4>::head_tail(dst, src, count);
   if (count < 16)
-    return copy<HeadTail<_8>>(dst, src, count);
+    return builtin::Memcpy<8>::head_tail(dst, src, count);
   if (count < 32)
-    return copy<HeadTail<_16>>(dst, src, count);
+    return builtin::Memcpy<16>::head_tail(dst, src, count);
   if (count < 64)
-    return copy<HeadTail<_32>>(dst, src, count);
+    return builtin::Memcpy<32>::head_tail(dst, src, count);
   if (count < 128)
-    return copy<HeadTail<_64>>(dst, src, count);
-  if (HAS_AVX && count < 256)
-    return copy<HeadTail<_128>>(dst, src, count);
-  if (count <= REP_MOVS_B_SIZE)
-    return copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
-                                                                 count);
-  return copy<x86::Accelerator>(dst, src, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_AARCH64
-  /////////////////////////////////////////////////////////////////////////////
+    return builtin::Memcpy<64>::head_tail(dst, src, count);
+  if (x86::kAvx && count < 256)
+    return builtin::Memcpy<128>::head_tail(dst, src, count);
+  builtin::Memcpy<32>::block(dst, src);
+  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+  static constexpr size_t kBlockSize = x86::kAvx ? 64 : 32;
+  return builtin::Memcpy<kBlockSize>::loop_and_tail(dst, src, count);
+}
+
+[[maybe_unused]] static inline void
+inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
+                                           CPtr __restrict src, size_t count) {
+  // Whether to use rep;movsb exclusively, not at all, or only above a certain
+  // threshold.
+  // TODO: Use only a single preprocessor definition to simplify the code.
+#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1
+#endif
+
+  static constexpr bool kUseOnlyRepMovsb =
+      LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
+  static constexpr size_t kRepMovsbThreshold =
+      LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+  if constexpr (kUseOnlyRepMovsb)
+    return x86::Memcpy::repmovsb(dst, src, count);
+  else if constexpr (kRepMovsbThreshold >= 0) {
+    if (unlikely(count >= kRepMovsbThreshold))
+      return x86::Memcpy::repmovsb(dst, src, count);
+    else
+      return inline_memcpy_x86(dst, src, count);
+  } else {
+    return inline_memcpy_x86(dst, src, count);
+  }
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline void
+inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
   if (count == 0)
     return;
   if (count == 1)
-    return copy<_1>(dst, src);
+    return builtin::Memcpy<1>::block(dst, src);
   if (count == 2)
-    return copy<_2>(dst, src);
+    return builtin::Memcpy<2>::block(dst, src);
   if (count == 3)
-    return copy<_3>(dst, src);
+    return builtin::Memcpy<3>::block(dst, src);
   if (count == 4)
-    return copy<_4>(dst, src);
+    return builtin::Memcpy<4>::block(dst, src);
   if (count < 8)
-    return copy<HeadTail<_4>>(dst, src, count);
+    return builtin::Memcpy<4>::head_tail(dst, src, count);
   if (count < 16)
-    return copy<HeadTail<_8>>(dst, src, count);
+    return builtin::Memcpy<8>::head_tail(dst, src, count);
   if (count < 32)
-    return copy<HeadTail<_16>>(dst, src, count);
+    return builtin::Memcpy<16>::head_tail(dst, src, count);
   if (count < 64)
-    return copy<HeadTail<_32>>(dst, src, count);
+    return builtin::Memcpy<32>::head_tail(dst, src, count);
   if (count < 128)
-    return copy<HeadTail<_64>>(dst, src, count);
-  return copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
+    return builtin::Memcpy<64>::head_tail(dst, src, count);
+  builtin::Memcpy<16>::block(dst, src);
+  align_to_next_boundary<16, Arg::Src>(dst, src, count);
+  return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
+
+static inline void inline_memcpy(Ptr __restrict dst, CPtr __restrict src,
+                                 size_t count) {
+  using namespace __llvm_libc::builtin;
+#if defined(LLVM_LIBC_ARCH_X86)
+  return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  return inline_memcpy_aarch64(dst, src, count);
+#elif defined(LLVM_LIBC_ARCH_ARM)
+  return inline_memcpy_embedded_tiny(dst, src, count);
 #else
-  /////////////////////////////////////////////////////////////////////////////
-  // Default
-  /////////////////////////////////////////////////////////////////////////////
-  if (count == 0)
-    return;
-  if (count == 1)
-    return copy<_1>(dst, src);
-  if (count == 2)
-    return copy<_2>(dst, src);
-  if (count == 3)
-    return copy<_3>(dst, src);
-  if (count == 4)
-    return copy<_4>(dst, src);
-  if (count < 8)
-    return copy<HeadTail<_4>>(dst, src, count);
-  if (count < 16)
-    return copy<HeadTail<_8>>(dst, src, count);
-  if (count < 32)
-    return copy<HeadTail<_16>>(dst, src, count);
-  if (count < 64)
-    return copy<HeadTail<_32>>(dst, src, count);
-  if (count < 128)
-    return copy<HeadTail<_64>>(dst, src, count);
-  return copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
+#error "Unsupported platform"
 #endif
 }

+static inline void inline_memcpy(void *__restrict dst,
+                                 const void *__restrict src, size_t count) {
+  inline_memcpy(reinterpret_cast<Ptr>(dst), reinterpret_cast<CPtr>(src), count);
+}
+
 } // namespace __llvm_libc

 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
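The block / align_to_next_boundary / loop_and_tail sequence at the end of inline_memcpy_x86 and inline_memcpy_aarch64 is the patch's shape for large copies: one unaligned block covers the head, the chosen pointer is rounded up to the next block boundary, full aligned blocks run in a loop, and a final overlapping block covers the tail. A sketch of that shape with plain std::memcpy, under the assumption count >= 64 (function name and block size are illustrative, not the libc API):

#include <cstddef>
#include <cstdint>
#include <cstring>

static void copy_align_then_loop(char *dst, const char *src, size_t count) {
  // Head: one unaligned 32-byte block.
  std::memcpy(dst, src, 32);
  // Align dst up to the next 32-byte boundary, advancing src to match.
  const size_t adjust = 32 - (reinterpret_cast<uintptr_t>(dst) % 32);
  dst += adjust;
  src += adjust;
  count -= adjust;
  // Loop: full aligned 32-byte blocks.
  size_t offset = 0;
  for (; offset + 32 <= count; offset += 32)
    std::memcpy(dst + offset, src + offset, 32);
  // Tail: one last block that may overlap the final loop iteration.
  std::memcpy(dst + count - 32, src + count - 32, 32);
}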
160 changes: 71 additions & 89 deletions libc/src/string/memory_utils/memset_implementations.h
@@ -10,129 +10,111 @@
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H

 #include "src/__support/architectures.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/utils.h"

 #include <stddef.h> // size_t

 namespace __llvm_libc {

-// A general purpose implementation assuming cheap unaligned writes for sizes:
-// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32
-// or 64 Bytes at a time, the compiler will expand them as needed.
-//
-// This implementation is subject to change as we benchmark more processors. We
-// may also want to customize it for processors with specialized instructions
-// that performs better (e.g. `rep stosb`).
-//
-// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
-// We want to balance two things here:
-// - The number of redundant writes (when using `SetBlockOverlap`),
-// - The number of conditionals for sizes <=128 (~90% of memset calls are for
-//   such sizes).
-//
-// For the range 64-128:
-// - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this
-//   is wasteful near 65 but efficient toward 128.
-// - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
-//   96 or 128 Bytes.
-// - Another approach could be to use an hybrid approach copy<64>+Overlap<32>
-//   for 65-96 and copy<96>+Overlap<32> for 97-128
-//
-// Benchmarks showed that redundant writes were cheap (for Intel X86) but
-// conditional were expensive, even on processor that do not support writing 64B
-// at a time (pre-AVX512F). We also want to favor short functions that allow
-// more hot code to fit in the iL1 cache.
-//
-// Above 128 we have to use conditionals since we don't know the upper bound in
-// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
-// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
-// superior for sizes that mattered.
-inline static void inline_memset(char *dst, unsigned char value, size_t count) {
+[[maybe_unused]] inline static void
+inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) {
+#pragma nounroll
+  for (size_t offset = 0; offset < count; ++offset)
+    generic::Memset<1, 1>::block(dst + offset, value);
+}

 #if defined(LLVM_LIBC_ARCH_X86)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_X86
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace __llvm_libc::x86;
+template <size_t MaxSize>
+[[maybe_unused]] inline static void inline_memset_x86(Ptr dst, uint8_t value,
+                                                      size_t count) {
   if (count == 0)
     return;
   if (count == 1)
-    return splat_set<_1>(dst, value);
+    return generic::Memset<1, MaxSize>::block(dst, value);
   if (count == 2)
-    return splat_set<_2>(dst, value);
+    return generic::Memset<2, MaxSize>::block(dst, value);
   if (count == 3)
-    return splat_set<_3>(dst, value);
+    return generic::Memset<3, MaxSize>::block(dst, value);
   if (count <= 8)
-    return splat_set<HeadTail<_4>>(dst, value, count);
+    return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
   if (count <= 16)
-    return splat_set<HeadTail<_8>>(dst, value, count);
+    return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
   if (count <= 32)
-    return splat_set<HeadTail<_16>>(dst, value, count);
+    return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
   if (count <= 64)
-    return splat_set<HeadTail<_32>>(dst, value, count);
+    return generic::Memset<32, MaxSize>::head_tail(dst, value, count);
   if (count <= 128)
-    return splat_set<HeadTail<_64>>(dst, value, count);
-  return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_AARCH64
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace __llvm_libc::aarch64_memset;
+    return generic::Memset<64, MaxSize>::head_tail(dst, value, count);
+  // Aligned loop
+  generic::Memset<32, MaxSize>::block(dst, value);
+  align_to_next_boundary<32>(dst, count);
+  return generic::Memset<32, MaxSize>::loop_and_tail(dst, value, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+template <size_t MaxSize>
+[[maybe_unused]] inline static void
+inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) {
   if (count == 0)
     return;
   if (count <= 3) {
-    splat_set<_1>(dst, value);
+    generic::Memset<1, MaxSize>::block(dst, value);
     if (count > 1)
-      splat_set<Tail<_2>>(dst, value, count);
+      generic::Memset<2, MaxSize>::tail(dst, value, count);
     return;
   }
   if (count <= 8)
-    return splat_set<HeadTail<_4>>(dst, value, count);
+    return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
   if (count <= 16)
-    return splat_set<HeadTail<_8>>(dst, value, count);
+    return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
   if (count <= 32)
-    return splat_set<HeadTail<_16>>(dst, value, count);
+    return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
   if (count <= (32 + 64)) {
-    splat_set<_32>(dst, value);
+    generic::Memset<32, MaxSize>::block(dst, value);
     if (count <= 64)
-      return splat_set<Tail<_32>>(dst, value, count);
-    splat_set<Skip<32>::Then<_32>>(dst, value);
-    splat_set<Tail<_32>>(dst, value, count);
+      return generic::Memset<32, MaxSize>::tail(dst, value, count);
+    generic::Memset<32, MaxSize>::block(dst + 32, value);
+    generic::Memset<32, MaxSize>::tail(dst, value, count);
     return;
   }
-  if (count >= 448 && value == 0 && hasZva())
-    return splat_set<Align<_64, Arg::P1>::Then<Loop<Zva64, _64>>>(dst, 0,
-                                                                  count);
-  else
-    return splat_set<Align<_16, Arg::P1>::Then<Loop<_64>>>(dst, value, count);
-#else
-  /////////////////////////////////////////////////////////////////////////////
-  // Default
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace ::__llvm_libc::scalar;
+  if (count >= 448 && value == 0 && aarch64::neon::hasZva()) {
+    generic::Memset<64, MaxSize>::block(dst, 0);
+    align_to_next_boundary<64>(dst, count);
+    return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
+  } else {
+    generic::Memset<16, MaxSize>::block(dst, value);
+    align_to_next_boundary<16>(dst, count);
+    return generic::Memset<64, MaxSize>::loop_and_tail(dst, value, count);
+  }
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)

-  if (count == 0)
-    return;
-  if (count == 1)
-    return splat_set<_1>(dst, value);
-  if (count == 2)
-    return splat_set<_2>(dst, value);
-  if (count == 3)
-    return splat_set<_3>(dst, value);
-  if (count <= 8)
-    return splat_set<HeadTail<_4>>(dst, value, count);
-  if (count <= 16)
-    return splat_set<HeadTail<_8>>(dst, value, count);
-  if (count <= 32)
-    return splat_set<HeadTail<_16>>(dst, value, count);
-  if (count <= 64)
-    return splat_set<HeadTail<_32>>(dst, value, count);
-  if (count <= 128)
-    return splat_set<HeadTail<_64>>(dst, value, count);
-  return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
+inline static void inline_memset(Ptr dst, uint8_t value, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86)
+  static constexpr size_t kMaxSize = x86::kAvx512F ? 64
+                                     : x86::kAvx   ? 32
+                                     : x86::kSse2  ? 16
+                                                   : 8;
+  return inline_memset_x86<kMaxSize>(dst, value, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
+  return inline_memset_aarch64<kMaxSize>(dst, value, count);
+#elif defined(LLVM_LIBC_ARCH_ARM)
+  return inline_memset_embedded_tiny(dst, value, count);
+#else
+#error "Unsupported platform"
 #endif
 }

+inline static void inline_memset(void *dst, uint8_t value, size_t count) {
+  inline_memset(reinterpret_cast<Ptr>(dst), value, count);
+}
+
 } // namespace __llvm_libc

 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
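The MaxSize template parameter tells generic::Memset the widest store the target handles natively (64 bytes with AVX-512F, 32 with AVX, 16 with SSE2 or NEON, 8 for scalar), presumably so that wider logical blocks decompose into several native stores rather than a compiler-expanded memset. The head_tail trick itself is the same overlapping-store idea as in memcmp and memcpy; a sketch with plain std::memset (illustrative name and block size, not the libc API):

#include <cstddef>
#include <cstring>

// Covers any count in [32, 64] with two possibly-overlapping 32-byte stores
// and no loop; the second store ends exactly at dst + count.
static void memset_head_tail_32(char *dst, unsigned char value, size_t count) {
  std::memset(dst, value, 32);
  std::memset(dst + count - 32, value, 32);
}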
2 changes: 1 addition & 1 deletion libc/src/string/memory_utils/op_x86.h
@@ -42,7 +42,7 @@ static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
 ///////////////////////////////////////////////////////////////////////////////
 // Memcpy repmovsb implementation
 struct Memcpy {
-  static void repmovsb(char *dst, const char *src, size_t count) {
+  static void repmovsb(void *dst, const void *src, size_t count) {
     asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
   }
 };
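For reference, the constraint string pins the operands to the registers rep movsb requires: "+D" is the destination index (RDI on x86-64), "+S" the source index (RSI), and "+c" the count (RCX); the "memory" clobber stops the compiler from caching loads or stores across the asm. Architecturally the instruction is the byte loop below, though the hardware may move far wider chunks internally (ERMSB):

#include <stddef.h> // size_t

// Byte-level equivalent of `rep movsb` (illustrative, not a libc symbol).
static void repmovsb_semantics(unsigned char *dst, const unsigned char *src,
                               size_t count) {
  while (count--)
    *dst++ = *src++;
}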
7 changes: 3 additions & 4 deletions libc/src/string/mempcpy.cpp
@@ -15,11 +15,10 @@
 namespace __llvm_libc {

 LLVM_LIBC_FUNCTION(void *, mempcpy,
-                   (void *__restrict dest, const void *__restrict src,
+                   (void *__restrict dst, const void *__restrict src,
                     size_t count)) {
-  char *result = reinterpret_cast<char *>(dest);
-  inline_memcpy(result, reinterpret_cast<const char *>(src), count);
-  return result + count;
+  inline_memcpy(dst, src, count);
+  return reinterpret_cast<char *>(dst) + count;
 }

 } // namespace __llvm_libc
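The simplification leans on mempcpy's contract: it returns dst + count, one past the last byte written, which is what makes chained appends cheap. A usage sketch (standard mempcpy semantics; mempcpy is a GNU extension, so glibc needs _GNU_SOURCE, and the buffer size here is illustrative):

#include <string.h> // declares mempcpy under _GNU_SOURCE on glibc

static char buf[32];

static void build_greeting(void) {
  void *p = buf;
  p = mempcpy(p, "hello ", 6); // p now points just past "hello "
  p = mempcpy(p, "world", 6);  // copies "world" plus its terminating NUL
}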
3 changes: 1 addition & 2 deletions libc/src/string/memset.cpp
@@ -13,8 +13,7 @@
 namespace __llvm_libc {

 LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) {
-  inline_memset(reinterpret_cast<char *>(dst),
-                static_cast<unsigned char>(value), count);
+  inline_memset(dst, static_cast<uint8_t>(value), count);
   return dst;
 }
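The static_cast<uint8_t> mirrors the C standard's requirement that memset store (unsigned char)value: only the low byte of the int argument matters. A one-line illustration:

#include <cstdint>

// 0x1FF truncates to 0xFF, so memset(dst, 0x1FF, n) and memset(dst, 0xFF, n)
// fill memory identically.
static_assert(static_cast<uint8_t>(0x1FF) == 0xFF, "only the low byte is kept");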