247 changes: 127 additions & 120 deletions libc/utils/gpu/loader/amdgpu/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,6 @@
#include <cstring>
#include <utility>

/// The name of the kernel we will launch. All AMDHSA kernels end with '.kd'.
constexpr const char *KERNEL_START = "_start.kd";

/// The arguments to the '_start' kernel.
struct kernel_args_t {
int argc;
void *argv;
void *envp;
void *ret;
void *inbox;
void *outbox;
void *buffer;
};

/// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) {
if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
Expand Down Expand Up @@ -145,6 +131,105 @@ hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
return iterate_agent_memory_pools(agent, cb);
}

template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
hsa_amd_memory_pool_t kernargs_pool,
hsa_queue_t *queue, const LaunchParameters &params,
const char *kernel_name, args_t kernel_args) {
// Look up the '_start' kernel in the loaded executable.
hsa_executable_symbol_t symbol;
if (hsa_status_t err = hsa_executable_get_symbol_by_name(
executable, kernel_name, &dev_agent, &symbol))
return err;

// Retrieve different properties of the kernel symbol used for launch.
uint64_t kernel;
uint32_t args_size;
uint32_t group_size;
uint32_t private_size;

std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

for (auto &[info, value] : symbol_infos)
if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
return err;

// Allocate space for the kernel arguments on the host and allow the GPU agent
// to access it.
void *args;
if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
/*flags=*/0, &args))
handle_error(err);
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

// Initialie all the arguments (explicit and implicit) to zero, then set the
// explicit arguments to the values created above.
std::memset(args, 0, args_size);
std::memcpy(args, &kernel_args, sizeof(args_t));

// Obtain a packet from the queue.
uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
;

const uint32_t mask = queue->size - 1;
hsa_kernel_dispatch_packet_t *packet =
static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
(packet_id & mask);

// Set up the packet for exeuction on the device. We currently only launch
// with one thread on the device, forcing the rest of the wavefront to be
// masked off.
std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
(params.num_blocks_z * params.num_threads_z != 1))
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet->workgroup_size_x = params.num_threads_x;
packet->workgroup_size_y = params.num_threads_y;
packet->workgroup_size_z = params.num_threads_z;
packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
packet->private_segment_size = private_size;
packet->group_segment_size = group_size;
packet->kernel_object = kernel;
packet->kernarg_address = args;

// Create a signal to indicate when this packet has been completed.
if (hsa_status_t err =
hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
handle_error(err);

// Initialize the packet header and set the doorbell signal to begin execution
// by the HSA runtime.
uint16_t setup = packet->setup;
uint16_t header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
__atomic_store_n(&packet->header, header | (setup << 16), __ATOMIC_RELEASE);
hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.
while (hsa_signal_wait_scacquire(
packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
/*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
handle_server();

// Destroy the resources acquired to launch the kernel and return.
if (hsa_status_t err = hsa_amd_memory_pool_free(args))
handle_error(err);
if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
handle_error(err);

return HSA_STATUS_SUCCESS;
}

int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters &params) {
// Initialize the HSA runtime used to communicate with the device.
Expand All @@ -169,18 +254,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
handle_error(err);

// Obtain a queue with the minimum (power of two) size, used to send commands
// to the HSA runtime and launch execution on the device.
uint64_t queue_size;
if (hsa_status_t err = hsa_agent_get_info(
dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
handle_error(err);
hsa_queue_t *queue = nullptr;
if (hsa_status_t err =
hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_SINGLE,
nullptr, nullptr, UINT32_MAX, UINT32_MAX, &queue))
handle_error(err);

// Load the code object's ISA information and executable data segments.
hsa_code_object_t object;
if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
Expand Down Expand Up @@ -228,36 +301,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
dev_agent, &coarsegrained_pool))
handle_error(err);

// Look up the '_start' kernel in the loaded executable.
hsa_executable_symbol_t symbol;
if (hsa_status_t err = hsa_executable_get_symbol_by_name(
executable, KERNEL_START, &dev_agent, &symbol))
handle_error(err);

// Retrieve different properties of the kernel symbol used for launch.
uint64_t kernel;
uint32_t args_size;
uint32_t group_size;
uint32_t private_size;

std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};

for (auto &[info, value] : symbol_infos)
if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
handle_error(err);

// Allocate space for the kernel arguments on the host and allow the GPU agent
// to access it.
void *args;
if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
/*flags=*/0, &args))
handle_error(err);
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);

// Allocate fine-grained memory on the host to hold the pointer array for the
// copied argv and allow the GPU agent to access it.
auto allocator = [&](uint64_t size) -> void * {
Expand Down Expand Up @@ -313,69 +356,33 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox);
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer);

// Initialie all the arguments (explicit and implicit) to zero, then set the
// explicit arguments to the values created above.
std::memset(args, 0, args_size);
kernel_args_t *kernel_args = reinterpret_cast<kernel_args_t *>(args);
kernel_args->argc = argc;
kernel_args->argv = dev_argv;
kernel_args->envp = dev_envp;
kernel_args->ret = dev_ret;
kernel_args->inbox = server_outbox;
kernel_args->outbox = server_inbox;
kernel_args->buffer = buffer;

// Obtain a packet from the queue.
uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue_size)
;

const uint32_t mask = queue_size - 1;
hsa_kernel_dispatch_packet_t *packet =
(hsa_kernel_dispatch_packet_t *)queue->base_address + (packet_id & mask);

// Set up the packet for exeuction on the device. We currently only launch
// with one thread on the device, forcing the rest of the wavefront to be
// masked off.
std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
(params.num_blocks_z * params.num_threads_z != 1))
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet->workgroup_size_x = params.num_threads_x;
packet->workgroup_size_y = params.num_threads_y;
packet->workgroup_size_z = params.num_threads_z;
packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
packet->private_segment_size = private_size;
packet->group_segment_size = group_size;
packet->kernel_object = kernel;
packet->kernarg_address = args;
// Initialize the RPC server's buffer for host-device communication.
server.reset(wavefront_size, &lock, server_inbox, server_outbox, buffer);

// Create a signal to indicate when this packet has been completed.
// Obtain a queue with the minimum (power of two) size, used to send commands
// to the HSA runtime and launch execution on the device.
uint64_t queue_size;
if (hsa_status_t err = hsa_agent_get_info(
dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
handle_error(err);
hsa_queue_t *queue = nullptr;
if (hsa_status_t err =
hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
nullptr, UINT32_MAX, UINT32_MAX, &queue))
handle_error(err);

// Initialize the RPC server's buffer for host-device communication.
server.reset(wavefront_size, &lock, server_inbox, server_outbox, buffer);

// Initialize the packet header and set the doorbell signal to begin execution
// by the HSA runtime.
uint16_t header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
__atomic_store_n(&packet->header, header | (packet->setup << 16),
__ATOMIC_RELEASE);
hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
begin_args_t init_args = {argc, dev_argv, dev_envp,
server_outbox, server_inbox, buffer};
if (hsa_status_t err =
launch_kernel(dev_agent, executable, kernargs_pool, queue,
single_threaded_params, "_begin.kd", init_args))
handle_error(err);

// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.
while (hsa_signal_wait_scacquire(
packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
/*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
handle_server();
start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
queue, params, "_start.kd", args))
handle_error(err);

// Create a memory signal and copy the return value back from the device into
// a new buffer.
Expand All @@ -402,9 +409,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
// Save the return value and perform basic clean-up.
int ret = *static_cast<int *>(host_ret);

// Free the memory allocated for the device.
if (hsa_status_t err = hsa_amd_memory_pool_free(args))
end_args_t fini_args = {ret};
if (hsa_status_t err =
launch_kernel(dev_agent, executable, kernargs_pool, queue,
single_threaded_params, "_end.kd", fini_args))
handle_error(err);

// Free the memory allocated for the device.
if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
Expand All @@ -420,10 +431,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,

if (hsa_status_t err = hsa_signal_destroy(memory_signal))
handle_error(err);

if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
handle_error(err);

if (hsa_status_t err = hsa_queue_destroy(queue))
handle_error(err);

Expand Down
85 changes: 45 additions & 40 deletions libc/utils/gpu/loader/nvptx/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,6 @@
using namespace llvm;
using namespace object;

/// The arguments to the '_start' kernel.
struct kernel_args_t {
int argc;
void *argv;
void *envp;
void *ret;
void *inbox;
void *outbox;
void *buffer;
};

static void handle_error(CUresult err) {
if (err == CUDA_SUCCESS)
return;
Expand Down Expand Up @@ -170,6 +159,36 @@ Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
return dev_memory;
}

template <typename args_t>
CUresult launch_kernel(CUmodule binary, CUstream stream,
const LaunchParameters &params, const char *kernel_name,
args_t kernel_args) {
// look up the '_start' kernel in the loaded module.
CUfunction function;
if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
handle_error(err);

// Set up the arguments to the '_start' kernel on the GPU.
uint64_t args_size = sizeof(args_t);
void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
CU_LAUNCH_PARAM_END};

// Call the kernel with the given arguments.
if (CUresult err = cuLaunchKernel(
function, params.num_blocks_x, params.num_blocks_y,
params.num_blocks_z, params.num_threads_x, params.num_threads_y,
params.num_threads_z, 0, stream, nullptr, args_config))
handle_error(err);

// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.
while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
handle_server();

return CUDA_SUCCESS;
}

int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters &params) {

Expand Down Expand Up @@ -197,11 +216,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
handle_error(err);

// look up the '_start' kernel in the loaded module.
CUfunction function;
if (CUresult err = cuModuleGetFunction(&function, binary, "_start"))
handle_error(err);

// Allocate pinned memory on the host to hold the pointer array for the
// copied argv and allow the GPU device to access it.
auto allocator = [&](uint64_t size) -> void * {
Expand Down Expand Up @@ -242,35 +256,21 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (!server_inbox || !server_outbox || !buffer)
handle_error("Failed to allocate memory the RPC client / server.");

// Set up the arguments to the '_start' kernel on the GPU.
uint64_t args_size = sizeof(kernel_args_t);
kernel_args_t args;
std::memset(&args, 0, args_size);
args.argc = argc;
args.argv = dev_argv;
args.envp = dev_envp;
args.ret = reinterpret_cast<void *>(dev_ret);
args.inbox = server_outbox;
args.outbox = server_inbox;
args.buffer = buffer;
void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
CU_LAUNCH_PARAM_END};

// Initialize the RPC server's buffer for host-device communication.
server.reset(warp_size, &lock, server_inbox, server_outbox, buffer);

// Call the kernel with the given arguments.
if (CUresult err = cuLaunchKernel(
function, params.num_blocks_x, params.num_blocks_y,
params.num_blocks_z, params.num_threads_x, params.num_threads_y,
params.num_threads_z, 0, stream, nullptr, args_config))
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
// Call the kernel to
begin_args_t init_args = {argc, dev_argv, dev_envp,
server_outbox, server_inbox, buffer};
if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
"_begin", init_args))
handle_error(err);

// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.
while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
handle_server();
start_args_t args = {argc, dev_argv, dev_envp,
reinterpret_cast<void *>(dev_ret)};
if (CUresult err = launch_kernel(binary, stream, params, "_start", args))
handle_error(err);

// Copy the return value back from the kernel and wait.
int host_ret = 0;
Expand All @@ -280,6 +280,11 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (CUresult err = cuStreamSynchronize(stream))
handle_error(err);

end_args_t fini_args = {host_ret};
if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
"_end", fini_args))
handle_error(err);

// Free the memory allocated for the device.
if (CUresult err = cuMemFreeHost(*memory_or_err))
handle_error(err);
Expand Down