15 changes: 9 additions & 6 deletions libc/utils/gpu/loader/amdgpu/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
hsa_amd_memory_fill(dev_ret, 0, sizeof(int));

// Allocate fine-grained memory for the RPC server and client to share.
uint64_t port_size = __llvm_libc::rpc::default_port_count;
uint32_t wavefront_size = 0;
if (hsa_status_t err = hsa_agent_get_info(
dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
Expand All @@ -338,26 +339,28 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
void *server_outbox;
void *buffer;
if (hsa_status_t err = hsa_amd_memory_pool_allocate(
finegrained_pool, sizeof(__llvm_libc::cpp::Atomic<int>),
finegrained_pool, port_size * sizeof(__llvm_libc::cpp::Atomic<int>),
/*flags=*/0, &server_inbox))
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_allocate(
finegrained_pool, sizeof(__llvm_libc::cpp::Atomic<int>),
finegrained_pool, port_size * sizeof(__llvm_libc::cpp::Atomic<int>),
/*flags=*/0, &server_outbox))
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_allocate(
finegrained_pool,
align_up(sizeof(__llvm_libc::rpc::Header) +
(wavefront_size * sizeof(__llvm_libc::rpc::Buffer)),
alignof(__llvm_libc::rpc::Packet)),
port_size *
align_up(sizeof(__llvm_libc::rpc::Header) +
(wavefront_size * sizeof(__llvm_libc::rpc::Buffer)),
alignof(__llvm_libc::rpc::Packet)),
/*flags=*/0, &buffer))
handle_error(err);
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_inbox);
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox);
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer);

// Initialize the RPC server's buffer for host-device communication.
server.reset(wavefront_size, &lock, server_inbox, server_outbox, buffer);
server.reset(port_size, wavefront_size, &lock, server_inbox, server_outbox,
buffer);

// Obtain a queue with the minimum (power of two) size, used to send commands
// to the HSA runtime and launch execution on the device.
Expand Down
18 changes: 11 additions & 7 deletions libc/utils/gpu/loader/nvptx/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,18 +246,22 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
handle_error(err);

uint64_t port_size = __llvm_libc::rpc::default_port_count;
uint32_t warp_size = 32;
void *server_inbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
void *server_outbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
void *buffer =
allocator(align_up(sizeof(__llvm_libc::rpc::Header) +
(warp_size * sizeof(__llvm_libc::rpc::Buffer)),
alignof(__llvm_libc::rpc::Packet)));
void *server_inbox =
allocator(port_size * sizeof(__llvm_libc::cpp::Atomic<int>));
void *server_outbox =
allocator(port_size * sizeof(__llvm_libc::cpp::Atomic<int>));
void *buffer = allocator(
port_size * align_up(sizeof(__llvm_libc::rpc::Header) +
(warp_size * sizeof(__llvm_libc::rpc::Buffer)),
alignof(__llvm_libc::rpc::Packet)));
if (!server_inbox || !server_outbox || !buffer)
handle_error("Failed to allocate memory the RPC client / server.");

// Initialize the RPC server's buffer for host-device communication.
server.reset(warp_size, &lock, server_inbox, server_outbox, buffer);
server.reset(port_size, warp_size, &lock, server_inbox, server_outbox,
buffer);

LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
// Call the kernel to
Expand Down