-
Notifications
You must be signed in to change notification settings - Fork 12k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc][x86] Use prefetch for write for memcpy #90450
Conversation
Currently when `LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING` is set we prefetch memory for read on the source buffer. This patch adds prefetch for write on the destination buffer.
@llvm/pr-subscribers-libc Author: Guillaume Chatelet (gchatelet). Changes: Currently when `LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING` is set we prefetch memory for read on the source buffer; this patch adds prefetch for write on the destination buffer. Full diff: https://github.com/llvm/llvm-project/pull/90450.diff — 1 file affected:
diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index ae61b1235bd08c..7a4e70bdf2d150 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -69,14 +69,21 @@ inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
}
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_prefetch(Ptr __restrict dst,
+ CPtr __restrict src,
+ size_t distance) {
+ prefetch_to_local_cache(src + distance);
+ prefetch_for_write(dst + distance);
+}
+
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
- prefetch_to_local_cache(src + K_ONE_CACHELINE);
+ inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
- prefetch_to_local_cache(src + K_TWO_CACHELINES);
+ inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
// Aligning 'dst' on a 32B boundary.
builtin::Memcpy<32>::block(dst, src);
align_to_next_boundary<32, Arg::Dst>(dst, src, count);
@@ -90,17 +97,17 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
if (count < 352) {
// Two cache lines at a time.
while (offset + K_TWO_CACHELINES + 32 <= count) {
- prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
- prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
+ inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+ inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
builtin::Memcpy<K_TWO_CACHELINES>::block_offset(dst, src, offset);
offset += K_TWO_CACHELINES;
}
} else {
// Three cache lines at a time.
while (offset + K_THREE_CACHELINES + 32 <= count) {
- prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
- prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
- prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
+ inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+ inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
+ inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
// It is likely that this copy will be turned into a 'rep;movsb' on
// non-AVX machines.
builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
@@ -114,11 +121,11 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
- prefetch_to_local_cache(src + K_ONE_CACHELINE);
+ inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
- prefetch_to_local_cache(src + K_TWO_CACHELINES);
- prefetch_to_local_cache(src + K_THREE_CACHELINES);
+ inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
+ inline_memcpy_prefetch(dst, src, K_THREE_CACHELINES);
if (count < 256)
return builtin::Memcpy<128>::head_tail(dst, src, count);
// Aligning 'dst' on a 32B boundary.
@@ -133,9 +140,9 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
// - count >= 128.
while (offset + K_THREE_CACHELINES + 64 <= count) {
// Three cache lines at a time.
- prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
- prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
- prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
+ inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+ inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
+ inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
offset += K_THREE_CACHELINES;
}
                                                          CPtr __restrict src,
                                                          size_t distance) {
  prefetch_to_local_cache(src + distance);
  prefetch_for_write(dst + distance);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should there be an option to turn that off ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The prefetching behavior is controlled by LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING; I don't want to split this further down into "prefetch for write" and "prefetch for read" — we already have a lot of combinations to test.
I think the commit message should at least include the open-source benchmark numbers, and possibly hints from our internal experience with this.
Could you paste the performance study numbers?
Reviving this PR. The current microbenchmarks available (e.g., https://github.com/google/fleetbench) were used. Here are the throughput results for fleetbench on a machine with hardware prefetchers turned off (all runs sorted by increasing throughput). It shows a neutral / mild improvement.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
…0dbf23e48 Local branch amd-gfx fd70dbf Merged main:fcb3a0485857c749d04ea234a8c3d629c62ab211 into amd-gfx:425372303add Remote branch main 73ef397 [libc][x86] Use prefetch for write for memcpy (llvm#90450)
Currently when `LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING` is set we prefetch memory for read on the source buffer. This patch adds prefetch for write on the destination buffer.