diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 88c62dcdc340f..08c1dfd10d6d7 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -24,7 +24,6 @@
 #include "src/__support/CPP/functional.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/GPU/utils.h"
-#include "src/string/memory_utils/inline_memcpy.h"
 
 #include <stdint.h>
 
@@ -458,7 +457,7 @@ LIBC_INLINE void Port::send_n(const void *const *src, uint64_t *size) {
         lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
             ? sizeof(Buffer::data) - sizeof(uint64_t)
             : lane_value(size, id);
-    inline_memcpy(&buffer->data[1], lane_value(src, id), len);
+    rpc_memcpy(&buffer->data[1], lane_value(src, id), len);
   });
   uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
   uint64_t mask = process.packet[index].header.mask;
@@ -468,7 +467,7 @@ LIBC_INLINE void Port::send_n(const void *const *src, uint64_t *size) {
                          ? sizeof(Buffer::data)
                          : lane_value(size, id) - idx;
       if (idx < lane_value(size, id))
-        inline_memcpy(buffer->data, advance(lane_value(src, id), idx), len);
+        rpc_memcpy(buffer->data, advance(lane_value(src, id), idx), len);
     });
     idx += sizeof(Buffer::data);
   }
@@ -491,7 +490,7 @@ LIBC_INLINE void Port::recv_n(void **dst, uint64_t *size, A &&alloc) {
         lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
             ? sizeof(Buffer::data) - sizeof(uint64_t)
             : lane_value(size, id);
-    inline_memcpy(lane_value(dst, id), &buffer->data[1], len);
+    rpc_memcpy(lane_value(dst, id), &buffer->data[1], len);
   });
   uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
   uint64_t mask = process.packet[index].header.mask;
@@ -501,7 +500,7 @@ LIBC_INLINE void Port::recv_n(void **dst, uint64_t *size, A &&alloc) {
                          ? sizeof(Buffer::data)
                          : lane_value(size, id) - idx;
       if (idx < lane_value(size, id))
-        inline_memcpy(advance(lane_value(dst, id), idx), buffer->data, len);
+        rpc_memcpy(advance(lane_value(dst, id), idx), buffer->data, len);
     });
     idx += sizeof(Buffer::data);
   }
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
index 46ca841c49199..04620b0487f4a 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -13,6 +13,8 @@
 #include "src/__support/GPU/utils.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/properties/architectures.h"
+#include "src/string/memory_utils/generic/byte_per_byte.h"
+#include "src/string/memory_utils/inline_memcpy.h"
 
 namespace LIBC_NAMESPACE {
 namespace rpc {
@@ -64,6 +66,18 @@ template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
     return reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(ptr) + bytes);
 }
 
+/// Wrapper around the optimal memory copy implementation for the target.
+LIBC_INLINE void rpc_memcpy(void *dst, const void *src, size_t count) {
+  // The built-in memcpy prefers to fully unroll loops. We want to minimize
+  // resource usage so we use a single nounroll loop implementation.
+#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+  inline_memcpy_byte_per_byte(reinterpret_cast<Ptr>(dst),
+                              reinterpret_cast<CPtr>(src), count);
+#else
+  inline_memcpy(dst, src, count);
+#endif
+}
+
 } // namespace rpc
 } // namespace LIBC_NAMESPACE
 