Skip to content

Commit

Permalink
[libc] Optimize the RPC memory copy for the AMDGPU target (#70467)
Browse files Browse the repository at this point in the history
Summary:
We previously made the change to make the GPU target use builtin
implementations of memory copy functions. However, this had the negative
effect of massively increasing register usage when using the printing
interface. For example, a `printf` call went from using 25 VGPRs to 54
simply because of using the builtin. However, we probably still want to
export the builtin, but for the RPC interface we heavily prefer small
resource usage over the performance gains of fully unrolling this loop.
For NVPTX however, the builtin implementation causes the resource usage
to go down (36 registers total for a regular `fputs` call) so we will
maintain that implementation.

I think specializing this is the right call as we will always prefer the
implementation with the smallest resource footprint for this interface,
as performance is already going to be heavily bottlenecked by the use of
fine-grained memory.
  • Loading branch information
jhuber6 committed Oct 27, 2023
1 parent 7415799 commit 8e447a1
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 5 deletions.
9 changes: 4 additions & 5 deletions libc/src/__support/RPC/rpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/optional.h"
#include "src/__support/GPU/utils.h"
#include "src/string/memory_utils/inline_memcpy.h"

#include <stdint.h>

Expand Down Expand Up @@ -458,7 +457,7 @@ LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
? sizeof(Buffer::data) - sizeof(uint64_t)
: lane_value(size, id);
inline_memcpy(&buffer->data[1], lane_value(src, id), len);
rpc_memcpy(&buffer->data[1], lane_value(src, id), len);
});
uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
uint64_t mask = process.packet[index].header.mask;
Expand All @@ -468,7 +467,7 @@ LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
? sizeof(Buffer::data)
: lane_value(size, id) - idx;
if (idx < lane_value(size, id))
inline_memcpy(buffer->data, advance(lane_value(src, id), idx), len);
rpc_memcpy(buffer->data, advance(lane_value(src, id), idx), len);
});
idx += sizeof(Buffer::data);
}
Expand All @@ -491,7 +490,7 @@ LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
? sizeof(Buffer::data) - sizeof(uint64_t)
: lane_value(size, id);
inline_memcpy(lane_value(dst, id), &buffer->data[1], len);
rpc_memcpy(lane_value(dst, id), &buffer->data[1], len);
});
uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
uint64_t mask = process.packet[index].header.mask;
Expand All @@ -501,7 +500,7 @@ LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
? sizeof(Buffer::data)
: lane_value(size, id) - idx;
if (idx < lane_value(size, id))
inline_memcpy(advance(lane_value(dst, id), idx), buffer->data, len);
rpc_memcpy(advance(lane_value(dst, id), idx), buffer->data, len);
});
idx += sizeof(Buffer::data);
}
Expand Down
14 changes: 14 additions & 0 deletions libc/src/__support/RPC/rpc_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#include "src/__support/GPU/utils.h"
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/properties/architectures.h"
#include "src/string/memory_utils/generic/byte_per_byte.h"
#include "src/string/memory_utils/inline_memcpy.h"

namespace LIBC_NAMESPACE {
namespace rpc {
Expand Down Expand Up @@ -64,6 +66,18 @@ template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
return reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(ptr) + bytes);
}

/// Copies `count` bytes from `src` to `dst` using whichever memory copy
/// implementation is best suited to the target architecture. On AMDGPU a
/// deliberately un-unrolled byte-wise loop is chosen to keep register
/// pressure low; everywhere else the generic optimized copy is used.
LIBC_INLINE void rpc_memcpy(void *dst, const void *src, size_t count) {
#if !defined(LIBC_TARGET_ARCH_IS_AMDGPU)
  // The generic builtin-based copy is optimal on non-AMDGPU targets.
  inline_memcpy(dst, src, count);
#else
  // The builtin memcpy fully unrolls its loops, which inflates VGPR usage on
  // AMDGPU. A single non-unrolled byte loop minimizes the resource footprint;
  // throughput here is dominated by fine-grained memory anyway.
  inline_memcpy_byte_per_byte(reinterpret_cast<Ptr>(dst),
                              reinterpret_cast<CPtr>(src), count);
#endif
}

} // namespace rpc
} // namespace LIBC_NAMESPACE

Expand Down

0 comments on commit 8e447a1

Please sign in to comment.