diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index 58bbe29cb3a7d..9432b7b39f783 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -17,9 +17,6 @@
 namespace LIBC_NAMESPACE {
 namespace gpu {
 
-/// The number of threads that execute in lock-step in a lane.
-constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;
-
 /// Type aliases to the address spaces used by the AMDGPU backend.
 template <typename T> using Private = [[clang::opencl_private]] T;
 template <typename T> using Constant = [[clang::opencl_constant]] T;
@@ -108,8 +105,11 @@ LIBC_INLINE uint64_t get_thread_id() {
          get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
 }
 
-/// Returns the size of an AMD wavefront. Either 32 or 64 depending on hardware.
-LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+/// and compilation options.
+LIBC_INLINE uint32_t get_lane_size() {
+  return __builtin_amdgcn_wavefrontsize();
+}
 
 /// Returns the id of the thread inside of an AMD wavefront executing together.
 [[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
index 00b59837ccc67..58db88dce1ca8 100644
--- a/libc/src/__support/GPU/generic/utils.h
+++ b/libc/src/__support/GPU/generic/utils.h
@@ -16,8 +16,6 @@
 namespace LIBC_NAMESPACE {
 namespace gpu {
 
-constexpr const uint64_t LANE_SIZE = 1;
-
 template <typename T> using Private = T;
 template <typename T> using Constant = T;
 template <typename T> using Shared = T;
@@ -55,7 +53,7 @@ LIBC_INLINE uint32_t get_thread_id_z() { return 0; }
 
 LIBC_INLINE uint64_t get_thread_id() { return 0; }
 
-LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+LIBC_INLINE uint32_t get_lane_size() { return 1; }
 
 LIBC_INLINE uint32_t get_lane_id() { return 0; }
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index e7e297adf7ecc..6c4bb5a7720a5 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -16,9 +16,6 @@
 namespace LIBC_NAMESPACE {
 namespace gpu {
 
-/// The number of threads that execute in lock-step in a warp.
-constexpr const uint64_t LANE_SIZE = 32;
-
 /// Type aliases to the address spaces used by the NVPTX backend.
 template <typename T> using Private = [[clang::opencl_private]] T;
 template <typename T> using Constant = [[clang::opencl_constant]] T;
@@ -95,8 +92,8 @@ LIBC_INLINE uint64_t get_thread_id() {
          get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
 }
 
-/// Returns the size of a CUDA warp.
-LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+/// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
+LIBC_INLINE uint32_t get_lane_size() { return 32; }
 
 /// Returns the id of the thread inside of a CUDA warp executing together.
 [[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 7924d4cec2ac8..5ed39ae0d7f7a 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -43,19 +43,6 @@ struct Header {
   uint16_t opcode;
 };
 
-/// The data payload for the associated packet. We provide enough space for each
-/// thread in the cooperating lane to have a buffer.
-template <uint32_t lane_size> struct Payload {
-  Buffer slot[lane_size];
-};
-
-/// A packet used to share data between the client and server across an entire
-/// lane. We use a lane as the minimum granularity for execution.
-template <uint32_t lane_size> struct alignas(64) Packet {
-  Header header;
-  Payload<lane_size> payload;
-};
-
 /// The maximum number of parallel ports that the RPC interface can support.
 constexpr uint64_t MAX_PORT_COUNT = 4096;
 
@@ -71,7 +58,7 @@ constexpr uint64_t MAX_PORT_COUNT = 4096;
 ///   - The client will always start with a 'send' operation.
 ///   - The server will always start with a 'recv' operation.
 ///   - Every 'send' or 'recv' call is mirrored by the other process.
-template <bool Invert, typename Packet> struct Process {
+template <bool Invert> struct Process {
   LIBC_INLINE Process() = default;
   LIBC_INLINE Process(const Process &) = delete;
   LIBC_INLINE Process &operator=(const Process &) = delete;
@@ -82,7 +69,8 @@ template <bool Invert, typename Packet> struct Process {
   uint32_t port_count = 0;
   cpp::Atomic<uint32_t> *inbox = nullptr;
   cpp::Atomic<uint32_t> *outbox = nullptr;
-  Packet *packet = nullptr;
+  Header *header = nullptr;
+  Buffer *packet = nullptr;
 
   static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
   cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
@@ -92,7 +80,9 @@ template <bool Invert, typename Packet> struct Process {
                                     advance(buffer, inbox_offset(port_count)))),
         outbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
             advance(buffer, outbox_offset(port_count)))),
-        packet(reinterpret_cast<Packet *>(
+        header(reinterpret_cast<Header *>(
+            advance(buffer, header_offset(port_count)))),
+        packet(reinterpret_cast<Buffer *>(
             advance(buffer, buffer_offset(port_count)))) {}
 
   /// Allocate a memory buffer sufficient to store the following equivalent
   /// struct Equivalent {
   ///   Atomic<uint32_t> primary[port_count];
   ///   Atomic<uint32_t> secondary[port_count];
-  ///   Packet buffer[port_count];
+  ///   Header header[port_count];
+  ///   Buffer packet[port_count][lane_size];
   /// };
-  LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count) {
-    return buffer_offset(port_count) + buffer_bytes(port_count);
+  LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count,
+                                                        uint32_t lane_size) {
+    return buffer_offset(port_count) + buffer_bytes(port_count, lane_size);
   }
 
   /// Retrieve the inbox state from memory shared between processes.
@@ -144,6 +136,13 @@ template <bool Invert, typename Packet> struct Process {
     atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
   }
 
+  /// The packet is a linearly allocated array of buffers used to communicate
+  /// with the other process. This function returns the appropriate slot in this
+  /// array such that the process can operate on an entire warp or wavefront.
+  LIBC_INLINE Buffer *get_packet(uint32_t index, uint32_t lane_size) {
+    return &packet[index * lane_size];
+  }
+
   /// Determines if this process needs to wait for ownership of the buffer. We
   /// invert the condition on one of the processes to indicate that if one
   /// process owns the buffer then the other does not.
@@ -219,8 +218,9 @@ template <bool Invert, typename Packet> struct Process {
   }
 
   /// Number of bytes to allocate for the buffer containing the packets.
-  LIBC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count) {
-    return port_count * sizeof(Packet);
+  LIBC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count,
+                                                     uint32_t lane_size) {
+    return port_count * lane_size * sizeof(Buffer);
   }
 
   /// Offset of the inbox in memory. This is the same as the outbox if inverted.
@@ -233,9 +233,15 @@ template <bool Invert, typename Packet> struct Process {
     return Invert ? 0 : mailbox_bytes(port_count);
   }
 
+  /// Offset of the buffer containing the packets after the inbox and outbox.
+  LIBC_INLINE static constexpr uint64_t header_offset(uint32_t port_count) {
+    return align_up(2 * mailbox_bytes(port_count), alignof(Header));
+  }
+
   /// Offset of the buffer containing the packets after the inbox and outbox.
   LIBC_INLINE static constexpr uint64_t buffer_offset(uint32_t port_count) {
-    return align_up(2 * mailbox_bytes(port_count), alignof(Packet));
+    return align_up(header_offset(port_count) + port_count * sizeof(Header),
+                    alignof(Buffer));
   }
 
   /// Conditionally set the n-th bit in the atomic bitfield.
@@ -262,39 +268,39 @@ template <bool Invert, typename Packet> struct Process {
 };
 
 /// Invokes a function accross every active buffer across the total lane size.
-template <uint32_t lane_size>
 static LIBC_INLINE void invoke_rpc(cpp::function<void(Buffer *)> fn,
-                                   Packet<lane_size> &packet) {
+                                   uint32_t lane_size, uint64_t lane_mask,
+                                   Buffer *slot) {
   if constexpr (is_process_gpu()) {
-    fn(&packet.payload.slot[gpu::get_lane_id()]);
+    fn(&slot[gpu::get_lane_id()]);
   } else {
     for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size())
-      if (packet.header.mask & 1ul << i)
-        fn(&packet.payload.slot[i]);
+      if (lane_mask & (1ul << i))
+        fn(&slot[i]);
   }
 }
 
 /// Alternate version that also provides the index of the current lane.
-template <uint32_t lane_size>
 static LIBC_INLINE void invoke_rpc(cpp::function<void(Buffer *, uint32_t)> fn,
-                                   Packet<lane_size> &packet) {
+                                   uint32_t lane_size, uint64_t lane_mask,
+                                   Buffer *slot) {
   if constexpr (is_process_gpu()) {
-    fn(&packet.payload.slot[gpu::get_lane_id()], gpu::get_lane_id());
+    fn(&slot[gpu::get_lane_id()], gpu::get_lane_id());
   } else {
     for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size())
-      if (packet.header.mask & 1ul << i)
-        fn(&packet.payload.slot[i], i);
+      if (lane_mask & (1ul << i))
+        fn(&slot[i], i);
   }
 }
 
 /// The port provides the interface to communicate between the multiple
 /// processes. A port is conceptually an index into the memory provided by the
 /// underlying process that is guarded by a lock bit.
-template <bool T, typename S> struct Port {
-  LIBC_INLINE Port(Process<T, S> &process, uint64_t lane_mask, uint32_t index,
-                   uint32_t out)
-      : process(process), lane_mask(lane_mask), index(index), out(out),
-        receive(false), owns_buffer(true) {}
+template <bool T> struct Port {
+  LIBC_INLINE Port(Process<T> &process, uint64_t lane_mask, uint32_t lane_size,
+                   uint32_t index, uint32_t out)
+      : process(process), lane_mask(lane_mask), lane_size(lane_size),
+        index(index), out(out), receive(false), owns_buffer(true) {}
   LIBC_INLINE ~Port() = default;
 
 private:
@@ -305,7 +311,7 @@ template <bool T, typename S> struct Port {
   friend struct Client;
   template <uint32_t lane_size> friend struct Server;
-  friend class cpp::optional<Port<T, S>>;
+  friend class cpp::optional<Port<T>>;
 
 public:
   template <typename U> LIBC_INLINE void recv(U use);
@@ -319,7 +325,7 @@ template <bool T, typename S> struct Port {
   LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
 
   LIBC_INLINE uint16_t get_opcode() const {
-    return process.packet[index].header.opcode;
+    return process.header[index].opcode;
   }
 
   LIBC_INLINE uint16_t get_index() const { return index; }
@@ -333,8 +339,9 @@ template <bool T, typename S> struct Port {
   }
 
 private:
-  Process<T, S> &process;
+  Process<T> &process;
   uint64_t lane_mask;
+  uint32_t lane_size;
   uint32_t index;
   uint32_t out;
   bool receive;
@@ -351,15 +358,14 @@ struct Client {
   LIBC_INLINE Client(uint32_t port_count, void *buffer)
       : process(port_count, buffer) {}
 
-  using Port = rpc::Port<false, Packet<gpu::LANE_SIZE>>;
+  using Port = rpc::Port<false>;
   template <uint16_t opcode> LIBC_INLINE Port open();
 
 private:
-  Process<false, Packet<gpu::LANE_SIZE>> process;
+  Process<false> process;
 };
 static_assert(cpp::is_trivially_copyable<Client>::value &&
-                  sizeof(Process<true, Packet<1>>) ==
-                      sizeof(Process<false, Packet<1>>),
+                  sizeof(Process<true>) == sizeof(Process<false>),
               "The client is not trivially copyable from the server");
 
 /// The RPC server used to respond to the client.
@@ -372,38 +378,35 @@ template <uint32_t lane_size> struct Server {
   LIBC_INLINE Server(uint32_t port_count, void *buffer)
       : process(port_count, buffer) {}
 
-  using Port = rpc::Port<true, Packet<lane_size>>;
+  using Port = rpc::Port<true>;
   LIBC_INLINE cpp::optional<Port> try_open(uint32_t start = 0);
   LIBC_INLINE Port open();
 
   LIBC_INLINE static uint64_t allocation_size(uint32_t port_count) {
-    return Process<true, Packet<lane_size>>::allocation_size(port_count);
+    return Process<true>::allocation_size(port_count, lane_size);
   }
 
 private:
-  Process<true, Packet<lane_size>> process;
+  Process<true> process;
 };
 
 /// Applies \p fill to the shared buffer and initiates a send operation.
-template <bool T, typename S>
-template <typename F>
-LIBC_INLINE void Port<T, S>::send(F fill) {
+template <bool T> template <typename F> LIBC_INLINE void Port<T>::send(F fill) {
   uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);
 
   // We need to wait until we own the buffer before sending.
   process.wait_for_ownership(lane_mask, index, out, in);
 
   // Apply the \p fill function to initialize the buffer and release the memory.
-  invoke_rpc(fill, process.packet[index]);
+  invoke_rpc(fill, lane_size, process.header[index].mask,
+             process.get_packet(index, lane_size));
   out = process.invert_outbox(index, out);
   owns_buffer = false;
   receive = false;
 }
 
 /// Applies \p use to the shared buffer and acknowledges the send.
-template <bool T, typename S>
-template <typename U>
-LIBC_INLINE void Port<T, S>::recv(U use) {
+template <bool T> template <typename U> LIBC_INLINE void Port<T>::recv(U use) {
   // We only exchange ownership of the buffer during a receive if we are waiting
   // for a previous receive to finish.
   if (receive) {
@@ -417,15 +420,16 @@ LIBC_INLINE void Port<T, S>::recv(U use) {
   process.wait_for_ownership(lane_mask, index, out, in);
 
   // Apply the \p use function to read the memory out of the buffer.
-  invoke_rpc(use, process.packet[index]);
+  invoke_rpc(use, lane_size, process.header[index].mask,
+             process.get_packet(index, lane_size));
   receive = true;
   owns_buffer = true;
 }
 
 /// Combines a send and receive into a single function.
-template <bool T, typename S>
+template <bool T>
 template <typename F, typename U>
-LIBC_INLINE void Port<T, S>::send_and_recv(F fill, U use) {
+LIBC_INLINE void Port<T>::send_and_recv(F fill, U use) {
   send(fill);
   recv(use);
 }
@@ -433,17 +437,17 @@ LIBC_INLINE void Port<T, S>::send_and_recv(F fill, U use) {
 /// Combines a receive and send operation into a single function. The \p work
 /// function modifies the buffer in-place and the send is only used to initiate
 /// the copy back.
-template <bool T, typename S>
+template <bool T>
 template <typename W>
-LIBC_INLINE void Port<T, S>::recv_and_send(W work) {
+LIBC_INLINE void Port<T>::recv_and_send(W work) {
   recv(work);
   send([](Buffer *) { /* no-op */ });
 }
 
 /// Helper routine to simplify the interface when sending from the GPU using
 /// thread private pointers to the underlying value.
-template <bool T, typename S>
-LIBC_INLINE void Port<T, S>::send_n(const void *src, uint64_t size) {
+template <bool T>
+LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
   const void **src_ptr = &src;
   uint64_t *size_ptr = &size;
   send_n(src_ptr, size_ptr);
@@ -451,8 +455,8 @@ LIBC_INLINE void Port<T, S>::send_n(const void *src, uint64_t size) {
 
 /// Sends an arbitrarily sized data buffer \p src across the shared channel in
 /// multiples of the packet length.
-template <bool T, typename S>
-LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
+template <bool T>
+LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
   uint64_t num_sends = 0;
   send([&](Buffer *buffer, uint32_t id) {
     reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
@@ -465,7 +469,7 @@ LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
     rpc_memcpy(&buffer->data[1], lane_value(src, id), len);
   });
   uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
-  uint64_t mask = process.packet[index].header.mask;
+  uint64_t mask = process.header[index].mask;
   while (gpu::ballot(mask, idx < num_sends)) {
     send([=](Buffer *buffer, uint32_t id) {
       uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
@@ -481,9 +485,9 @@ LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
 /// Receives an arbitrarily sized data buffer across the shared channel in
 /// multiples of the packet length. The \p alloc function is called with the
 /// size of the data so that we can initialize the size of the \p dst buffer.
-template <bool T, typename S>
+template <bool T>
 template <typename A>
-LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
+LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
   uint64_t num_recvs = 0;
   recv([&](Buffer *buffer, uint32_t id) {
     lane_value(size, id) = reinterpret_cast<uint64_t *>(buffer->data)[0];
@@ -498,7 +502,7 @@ LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
     rpc_memcpy(lane_value(dst, id), &buffer->data[1], len);
   });
   uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
-  uint64_t mask = process.packet[index].header.mask;
+  uint64_t mask = process.header[index].mask;
   while (gpu::ballot(mask, idx < num_recvs)) {
     recv([=](Buffer *buffer, uint32_t id) {
       uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
@@ -515,8 +519,10 @@ LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
 /// only open a port if we find an index that is in a valid sending state. That
 /// is, there are send operations pending that haven't been serviced on this
 /// port. Each port instance uses an associated \p opcode to tell the server
-/// what to do.
-template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
+/// what to do. The Client interface provides the appropriate lane size to the
+/// port using the platform's returned value.
+template <uint16_t opcode>
+[[clang::convergent]] LIBC_INLINE Client::Port Client::open() {
   // Repeatedly perform a naive linear scan for a port that can be opened to
   // send data.
   for (uint32_t index = gpu::get_cluster_id();; ++index) {
@@ -540,11 +546,11 @@ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
     }
 
     if (gpu::is_first_lane(lane_mask)) {
-      process.packet[index].header.opcode = opcode;
-      process.packet[index].header.mask = lane_mask;
+      process.header[index].opcode = opcode;
+      process.header[index].mask = lane_mask;
     }
     gpu::sync_lane(lane_mask);
-    return Port(process, lane_mask, index, out);
+    return Port(process, lane_mask, gpu::get_lane_size(), index, out);
   }
 }
 
@@ -577,7 +583,7 @@ template <uint32_t lane_size>
       continue;
     }
 
-    return Port(process, lane_mask, index, out);
+    return Port(process, lane_mask, lane_size, index, out);
   }
   return cpp::nullopt;
 }
diff --git a/libc/test/src/__support/RPC/rpc_smoke_test.cpp b/libc/test/src/__support/RPC/rpc_smoke_test.cpp
index 54821e21f9ccf..58b318c7cfa61 100644
--- a/libc/test/src/__support/RPC/rpc_smoke_test.cpp
+++ b/libc/test/src/__support/RPC/rpc_smoke_test.cpp
@@ -13,12 +13,8 @@
 namespace {
 enum { lane_size = 8, port_count = 4 };
 
-struct Packet {
-  uint64_t unused;
-};
-
-using ProcAType = LIBC_NAMESPACE::rpc::Process<false, Packet>;
-using ProcBType = LIBC_NAMESPACE::rpc::Process<true, Packet>;
+using ProcAType = LIBC_NAMESPACE::rpc::Process<false>;
+using ProcBType = LIBC_NAMESPACE::rpc::Process<true>;
 
 static_assert(ProcAType::inbox_offset(port_count) ==
               ProcBType::outbox_offset(port_count));
@@ -26,7 +22,7 @@ static_assert(ProcAType::inbox_offset(port_count) ==
 static_assert(ProcAType::outbox_offset(port_count) ==
               ProcBType::inbox_offset(port_count));
 
-enum { alloc_size = ProcAType::allocation_size(port_count) };
+enum { alloc_size = ProcAType::allocation_size(port_count, 1) };
 
 alignas(64) char buffer[alloc_size] = {0};
 } // namespace
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index a2e5d0fd5a833..4e535a294a19e 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -396,62 +396,42 @@ const void *rpc_get_client_buffer(uint32_t device_id) {
 
 uint64_t rpc_get_client_size() { return sizeof(rpc::Client); }
 
-using ServerPort = std::variant<rpc::Server<1>::Port *, rpc::Server<32>::Port *,
-                                rpc::Server<64>::Port *>;
+using ServerPort = std::variant<rpc::Server<0>::Port *>;
 
 ServerPort get_port(rpc_port_t ref) {
-  if (ref.lane_size == 1)
-    return reinterpret_cast<rpc::Server<1>::Port *>(ref.handle);
-  else if (ref.lane_size == 32)
-    return reinterpret_cast<rpc::Server<32>::Port *>(ref.handle);
-  else if (ref.lane_size == 64)
-    return reinterpret_cast<rpc::Server<64>::Port *>(ref.handle);
-  else
-    __builtin_unreachable();
+  return reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
 }
 
 void rpc_send(rpc_port_t ref, rpc_port_callback_ty callback, void *data) {
-  auto port = get_port(ref);
-  std::visit(
-      [=](auto &port) {
-        port->send([=](rpc::Buffer *buffer) {
-          callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
-        });
-      },
-      port);
+  auto port = reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
+  port->send([=](rpc::Buffer *buffer) {
+    callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
+  });
 }
 
 void rpc_send_n(rpc_port_t ref, const void *const *src, uint64_t *size) {
-  auto port = get_port(ref);
-  std::visit([=](auto &port) { port->send_n(src, size); }, port);
+  auto port = reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
+  port->send_n(src, size);
 }
 
 void rpc_recv(rpc_port_t ref, rpc_port_callback_ty callback, void *data) {
-  auto port = get_port(ref);
-  std::visit(
-      [=](auto &port) {
-        port->recv([=](rpc::Buffer *buffer) {
-          callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
-        });
-      },
-      port);
+  auto port = reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
+  port->recv([=](rpc::Buffer *buffer) {
+    callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
+  });
}
 
 void rpc_recv_n(rpc_port_t ref, void **dst, uint64_t *size, rpc_alloc_ty alloc,
                 void *data) {
-  auto port = get_port(ref);
+  auto port = reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
   auto alloc_fn = [=](uint64_t size) { return alloc(size, data); };
-  std::visit([=](auto &port) { port->recv_n(dst, size, alloc_fn); }, port);
+  port->recv_n(dst, size, alloc_fn);
 }
 
 void rpc_recv_and_send(rpc_port_t ref, rpc_port_callback_ty callback,
                        void *data) {
-  auto port = get_port(ref);
-  std::visit(
-      [=](auto &port) {
-        port->recv_and_send([=](rpc::Buffer *buffer) {
-          callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
-        });
-      },
-      port);
+  auto port = reinterpret_cast<rpc::Server<0>::Port *>(ref.handle);
+  port->recv_and_send([=](rpc::Buffer *buffer) {
+    callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
+  });
}
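Note: as a quick sanity check on the new layout, the standalone sketch below mirrors the offset math introduced by this patch (per-port headers followed by a packet region of port_count * lane_size buffers). It is illustrative only; the Header/Buffer definitions and the mailbox size are assumed stand-ins, not copied from rpc.h.

#include <cstdint>
#include <cstdio>

// Stand-ins for the types in rpc.h; the field layout and sizes are assumptions.
struct Header { uint64_t mask; uint16_t opcode; }; // per-port metadata
struct alignas(64) Buffer { uint64_t data[8]; };   // per-lane payload slot

constexpr uint64_t align_up(uint64_t val, uint64_t align) {
  return (val + align - 1) / align * align;
}

// One mailbox word per port for each of the inbox and outbox (assumed size).
constexpr uint64_t mailbox_bytes(uint32_t port_count) {
  return port_count * sizeof(uint32_t);
}

// Mirrors the header_offset / buffer_offset / allocation_size logic above.
constexpr uint64_t header_offset(uint32_t port_count) {
  return align_up(2 * mailbox_bytes(port_count), alignof(Header));
}
constexpr uint64_t buffer_offset(uint32_t port_count) {
  return align_up(header_offset(port_count) + port_count * sizeof(Header),
                  alignof(Buffer));
}
constexpr uint64_t allocation_size(uint32_t port_count, uint32_t lane_size) {
  return buffer_offset(port_count) + port_count * lane_size * sizeof(Buffer);
}

int main() {
  // The same port count needs a larger shared buffer on a wavefront-64 device
  // than on a warp-32 device, which is why lane_size is now a runtime argument
  // to allocation_size() instead of a compile-time constant.
  std::printf("64 ports, lane size 32: %llu bytes\n",
              static_cast<unsigned long long>(allocation_size(64, 32)));
  std::printf("64 ports, lane size 64: %llu bytes\n",
              static_cast<unsigned long long>(allocation_size(64, 64)));
  return 0;
}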