80 changes: 40 additions & 40 deletions libc/src/string/memory_utils/x86_64/inline_memcpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,19 @@ namespace LIBC_NAMESPACE {

// Tuning constants for the x86 inline memcpy implementations.
namespace x86 {

// Byte offsets equal to one, two and three 64-byte cache lines. The
// *_sw_prefetching copy routines add them to 'src' to choose prefetch
// targets, and use the two- and three-line values as copy block sizes.
LIBC_INLINE_VAR constexpr size_t kOneCacheline = 64;
LIBC_INLINE_VAR constexpr size_t kTwoCachelines = 2 * kOneCacheline;
LIBC_INLINE_VAR constexpr size_t kThreeCachelines = 3 * kOneCacheline;
LIBC_INLINE_VAR constexpr size_t K_ONE_CACHELINE = 64;
LIBC_INLINE_VAR constexpr size_t K_TWO_CACHELINES = 2 * K_ONE_CACHELINE;
LIBC_INLINE_VAR constexpr size_t K_THREE_CACHELINES = 3 * K_ONE_CACHELINE;

// Compile-time switch derived from
// LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING. inline_memcpy_x86 tests it
// with 'if constexpr' to choose between the *_sw_prefetching routines and the
// plain *_ge64 routines.
LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
LIBC_INLINE_VAR constexpr bool K_USE_SOFTWARE_PREFETCHING =
LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING);

// Whether to use rep;movsb exclusively (0), not at all (SIZE_MAX), or only
// above a certain threshold. Defaults to "do not use rep;movsb".
// For any other value, inline_memcpy_x86_maybe_interpose_repmovsb uses
// rep;movsb when 'count >= threshold' and inline_memcpy_x86 otherwise.
#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
#define LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE SIZE_MAX
#endif
LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
LIBC_INLINE_VAR constexpr size_t K_REP_MOVSB_THRESHOLD =
LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;

} // namespace x86
Expand Down Expand Up @@ -73,10 +73,10 @@ inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
prefetch_to_local_cache(src + kOneCacheline);
prefetch_to_local_cache(src + K_ONE_CACHELINE);
if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
prefetch_to_local_cache(src + kTwoCachelines);
prefetch_to_local_cache(src + K_TWO_CACHELINES);
// Aligning 'dst' on a 32B boundary.
builtin::Memcpy<32>::block(dst, src);
align_to_next_boundary<32, Arg::Dst>(dst, src, count);
Expand All @@ -89,22 +89,22 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
// - count >= 128.
if (count < 352) {
// Two cache lines at a time.
while (offset + kTwoCachelines + 32 <= count) {
prefetch_to_local_cache(src + offset + kOneCacheline);
prefetch_to_local_cache(src + offset + kTwoCachelines);
builtin::Memcpy<kTwoCachelines>::block_offset(dst, src, offset);
offset += kTwoCachelines;
while (offset + K_TWO_CACHELINES + 32 <= count) {
prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
builtin::Memcpy<K_TWO_CACHELINES>::block_offset(dst, src, offset);
offset += K_TWO_CACHELINES;
}
} else {
// Three cache lines at a time.
while (offset + kThreeCachelines + 32 <= count) {
prefetch_to_local_cache(src + offset + kOneCacheline);
prefetch_to_local_cache(src + offset + kTwoCachelines);
prefetch_to_local_cache(src + offset + kThreeCachelines);
while (offset + K_THREE_CACHELINES + 32 <= count) {
prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
// It is likely that this copy will be turned into a 'rep;movsb' on
// non-AVX machines.
builtin::Memcpy<kThreeCachelines>::block_offset(dst, src, offset);
offset += kThreeCachelines;
builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
offset += K_THREE_CACHELINES;
}
}
return builtin::Memcpy<32>::loop_and_tail_offset(dst, src, count, offset);
Expand All @@ -114,11 +114,11 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
prefetch_to_local_cache(src + kOneCacheline);
prefetch_to_local_cache(src + K_ONE_CACHELINE);
if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
prefetch_to_local_cache(src + kTwoCachelines);
prefetch_to_local_cache(src + kThreeCachelines);
prefetch_to_local_cache(src + K_TWO_CACHELINES);
prefetch_to_local_cache(src + K_THREE_CACHELINES);
if (count < 256)
return builtin::Memcpy<128>::head_tail(dst, src, count);
// Aligning 'dst' on a 32B boundary.
Expand All @@ -131,27 +131,27 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
// - we prefetched cachelines at 'src + 64', 'src + 128', and 'src + 192'
// - 'dst' is 32B aligned,
// - count >= 128.
while (offset + kThreeCachelines + 64 <= count) {
while (offset + K_THREE_CACHELINES + 64 <= count) {
// Three cache lines at a time.
prefetch_to_local_cache(src + offset + kOneCacheline);
prefetch_to_local_cache(src + offset + kTwoCachelines);
prefetch_to_local_cache(src + offset + kThreeCachelines);
builtin::Memcpy<kThreeCachelines>::block_offset(dst, src, offset);
offset += kThreeCachelines;
prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
offset += K_THREE_CACHELINES;
}
return builtin::Memcpy<64>::loop_and_tail_offset(dst, src, count, offset);
}

[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
#if defined(__AVX512F__)
constexpr size_t vector_size = 64;
constexpr size_t VECTOR_SIZE = 64;
#elif defined(__AVX__)
constexpr size_t vector_size = 32;
constexpr size_t VECTOR_SIZE = 32;
#elif defined(__SSE2__)
constexpr size_t vector_size = 16;
constexpr size_t VECTOR_SIZE = 16;
#else
constexpr size_t vector_size = 8;
constexpr size_t VECTOR_SIZE = 8;
#endif
if (count == 0)
return;
Expand All @@ -174,20 +174,20 @@ inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
// But it's not profitable to use larger size if it's not natively
// supported: we will both use more instructions and handle fewer
// sizes in earlier branches.
if (vector_size >= 16 ? count < 16 : count <= 16)
if (VECTOR_SIZE >= 16 ? count < 16 : count <= 16)
return builtin::Memcpy<8>::head_tail(dst, src, count);
if (vector_size >= 32 ? count < 32 : count <= 32)
if (VECTOR_SIZE >= 32 ? count < 32 : count <= 32)
return builtin::Memcpy<16>::head_tail(dst, src, count);
if (vector_size >= 64 ? count < 64 : count <= 64)
if (VECTOR_SIZE >= 64 ? count < 64 : count <= 64)
return builtin::Memcpy<32>::head_tail(dst, src, count);
if constexpr (x86::kAvx) {
if constexpr (x86::kUseSoftwarePrefetching) {
if constexpr (x86::K_AVX) {
if constexpr (x86::K_USE_SOFTWARE_PREFETCHING) {
return inline_memcpy_x86_avx_ge64_sw_prefetching(dst, src, count);
} else {
return inline_memcpy_x86_avx_ge64(dst, src, count);
}
} else {
if constexpr (x86::kUseSoftwarePrefetching) {
if constexpr (x86::K_USE_SOFTWARE_PREFETCHING) {
return inline_memcpy_x86_sse2_ge64_sw_prefetching(dst, src, count);
} else {
return inline_memcpy_x86_sse2_ge64(dst, src, count);
Expand All @@ -198,12 +198,12 @@ inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
if constexpr (x86::kRepMovsbThreshold == 0) {
if constexpr (x86::K_REP_MOVSB_THRESHOLD == 0) {
return x86::Memcpy::repmovsb(dst, src, count);
} else if constexpr (x86::kRepMovsbThreshold == SIZE_MAX) {
} else if constexpr (x86::K_REP_MOVSB_THRESHOLD == SIZE_MAX) {
return inline_memcpy_x86(dst, src, count);
} else {
if (LIBC_UNLIKELY(count >= x86::kRepMovsbThreshold))
if (LIBC_UNLIKELY(count >= x86::K_REP_MOVSB_THRESHOLD))
return x86::Memcpy::repmovsb(dst, src, count);
else
return inline_memcpy_x86(dst, src, count);
Expand Down
22 changes: 12 additions & 10 deletions libc/src/string/memory_utils/x86_64/inline_memset.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@
namespace LIBC_NAMESPACE {
// Tuning constants for the x86 inline memset implementation.
namespace x86 {
// Size of one cache line for software prefetching (64 bytes), plus the
// two-line and five-line multiples. inline_memset_x86_gt64_sw_prefetching
// uses the two-line value as its PREFETCH_DEGREE and the five-line value as
// its PREFETCH_DISTANCE.
LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2;
LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5;
LIBC_INLINE_VAR constexpr size_t K_ONE_CACHELINE_SIZE = 64;
LIBC_INLINE_VAR constexpr size_t K_TWO_CACHELINES_SIZE =
K_ONE_CACHELINE_SIZE * 2;
LIBC_INLINE_VAR constexpr size_t K_FIVE_CACHELINES_SIZE =
K_ONE_CACHELINE_SIZE * 5;

// Compile-time switch derived from
// LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING. When it is set,
// inline_memset_x86 hands every count above 64 to
// inline_memset_x86_gt64_sw_prefetching.
LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
LIBC_INLINE_VAR constexpr bool K_USE_SOFTWARE_PREFETCHING_MEMSET =
LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);

} // namespace x86
Expand All @@ -47,15 +49,15 @@ using uint512_t = cpp::array<uint64_t, 8>;

[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
constexpr size_t PREFETCH_DISTANCE = x86::kFiveCachelinesSize;
constexpr size_t PREFETCH_DEGREE = x86::kTwoCachelinesSize;
constexpr size_t PREFETCH_DISTANCE = x86::K_FIVE_CACHELINES_SIZE;
constexpr size_t PREFETCH_DEGREE = x86::K_TWO_CACHELINES_SIZE;
constexpr size_t SIZE = sizeof(uint256_t);
// Prefetch one cache line
prefetch_for_write(dst + x86::kOneCachelineSize);
prefetch_for_write(dst + x86::K_ONE_CACHELINE_SIZE);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
// Prefetch the second cache line
prefetch_for_write(dst + x86::kTwoCachelinesSize);
prefetch_for_write(dst + x86::K_TWO_CACHELINES_SIZE);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
Expand All @@ -67,7 +69,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
while (offset + PREFETCH_DEGREE + SIZE <= count) {
prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
prefetch_for_write(dst + offset + PREFETCH_DISTANCE +
x86::kOneCachelineSize);
x86::K_ONE_CACHELINE_SIZE);
for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
generic::Memset<uint256_t>::block(dst + offset, value);
}
Expand All @@ -93,7 +95,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
return generic::Memset<uint128_t>::head_tail(dst, value, count);
if (count <= 64)
return generic::Memset<uint256_t>::head_tail(dst, value, count);
if constexpr (x86::kUseSoftwarePrefetchingMemset)
if constexpr (x86::K_USE_SOFTWARE_PREFETCHING_MEMSET)
return inline_memset_x86_gt64_sw_prefetching(dst, value, count);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
Expand Down