diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 19da0ad29cd84..dfc5ab904fe9d 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -87,7 +87,6 @@ function(_get_hermetic_test_compile_options output_var) -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}) elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) list(APPEND compile_options - "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false" -Wno-multi-gpu --cuda-path=${LIBC_CUDA_ROOT} -nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit) endif() @@ -637,6 +636,7 @@ function(add_integration_test test_name) # makes `add_custom_target` construct the correct command and execute it. set(test_cmd ${INTEGRATION_TEST_ENV} + $<$:LIBOMPTARGET_STACK_SIZE=3072> $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${INTEGRATION_TEST_LOADER_ARGS} @@ -790,8 +790,7 @@ function(add_libc_hermetic test_name) if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) target_link_options(${fq_build_target_name} PRIVATE ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu - -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto - "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static + -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -nostdlib -static "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) target_link_options(${fq_build_target_name} PRIVATE @@ -859,6 +858,7 @@ function(add_libc_hermetic test_name) string(REPLACE " " ";" test_cmd "${test_cmd_parsed}") else() set(test_cmd ${HERMETIC_TEST_ENV} + $<$:LIBOMPTARGET_STACK_SIZE=3072> $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS} $ ${HERMETIC_TEST_ARGS}) endif() diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp index 48f095d924931..e876629e6d0e1 100644 --- a/libc/startup/gpu/amdgpu/start.cpp +++ b/libc/startup/gpu/amdgpu/start.cpp @@ -13,6 +13,9 @@ #include "src/stdlib/atexit.h" #include "src/stdlib/exit.h" +// TODO: Merge this and the NVPTX start files once the common `device_kernel` +// attribute correctly implies `amdgpu_kernel`. + extern "C" int main(int argc, char **argv, char **envp); extern "C" void __cxa_finalize(void *dso); @@ -21,45 +24,18 @@ namespace LIBC_NAMESPACE_DECL { // FIXME: Factor this out into common logic so we don't need to stub it here. void teardown_main_tls() {} -// FIXME: Touch this symbol to force this to be linked in statically. 
-volatile void *dummy = &LIBC_NAMESPACE::rpc::client; - DataEnvironment app; -extern "C" uintptr_t __init_array_start[]; -extern "C" uintptr_t __init_array_end[]; -extern "C" uintptr_t __fini_array_start[]; -extern "C" uintptr_t __fini_array_end[]; - -using InitCallback = void(int, char **, char **); -using FiniCallback = void(void); - -static void call_init_array_callbacks(int argc, char **argv, char **env) { - size_t init_array_size = __init_array_end - __init_array_start; - for (size_t i = 0; i < init_array_size; ++i) - reinterpret_cast(__init_array_start[i])(argc, argv, env); -} - -static void call_fini_array_callbacks() { - size_t fini_array_size = __fini_array_end - __fini_array_start; - for (size_t i = fini_array_size; i > 0; --i) - reinterpret_cast(__fini_array_start[i - 1])(); -} - } // namespace LIBC_NAMESPACE_DECL extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel, clang::amdgpu_flat_work_group_size(1, 1), clang::amdgpu_max_num_work_groups(1)]] void -_begin(int argc, char **argv, char **env) { +_begin(int, char **, char **env) { + // The LLVM offloading runtime will automatically call any present global + // constructors and destructors so we defer that handling. __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr, reinterpret_cast(env), __ATOMIC_RELAXED); - // We want the fini array callbacks to be run after other atexit - // callbacks are run. So, we register them before running the init - // array callbacks as they can potentially register their own atexit - // callbacks. - LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks); - LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env); } extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp index ce8f5bbb6d4eb..822930e22bc26 100644 --- a/libc/startup/gpu/nvptx/start.cpp +++ b/libc/startup/gpu/nvptx/start.cpp @@ -23,48 +23,14 @@ DataEnvironment app; // FIXME: Factor this out into common logic so we don't need to stub it here. void teardown_main_tls() {} -// FIXME: Touch this symbol to force this to be linked in statically. -volatile void *dummy = &LIBC_NAMESPACE::rpc::client; - -extern "C" { -// Nvidia's 'nvlink' linker does not provide these symbols. We instead need -// to manually create them and update the globals in the loader implememtation. -uintptr_t *__init_array_start [[gnu::visibility("protected")]]; -uintptr_t *__init_array_end [[gnu::visibility("protected")]]; -uintptr_t *__fini_array_start [[gnu::visibility("protected")]]; -uintptr_t *__fini_array_end [[gnu::visibility("protected")]]; -} - -// Nvidia requires that the signature of the function pointers match. This means -// we cannot support the extended constructor arguments. 
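For concreteness, a self-contained sketch (not part of the patch; names are illustrative) of the two constructor signatures this comment contrasts — the extended GNU form the AMDGPU path could run versus the bare form that nvlink's symbols allow:

```cpp
#include <cstdio>

using ExtendedInit = void(int, char **, char **); // what the AMDGPU path ran
using PlainInit = void();                         // all that nvlink supports

static void extended_ctor(int argc, char **, char **) {
  std::printf("extended ctor sees argc=%d\n", argc);
}
static void plain_ctor() { std::printf("plain ctor sees nothing\n"); }

int main(int argc, char **argv) {
  ExtendedInit *extended = extended_ctor;
  PlainInit *plain = plain_ctor;
  extended(argc, argv, nullptr); // can consume the program arguments
  plain();                       // the signature carries no information
  return 0;
}
```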
-using InitCallback = void(void); -using FiniCallback = void(void); - -static void call_init_array_callbacks(int, char **, char **) { - size_t init_array_size = __init_array_end - __init_array_start; - for (size_t i = 0; i < init_array_size; ++i) - reinterpret_cast(__init_array_start[i])(); -} - -static void call_fini_array_callbacks() { - size_t fini_array_size = __fini_array_end - __fini_array_start; - for (size_t i = fini_array_size; i > 0; --i) - reinterpret_cast(__fini_array_start[i - 1])(); -} - } // namespace LIBC_NAMESPACE_DECL extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void -_begin(int argc, char **argv, char **env) { +_begin(int, char **, char **env) { + // The LLVM offloading runtime will automatically call any present global + // constructors and destructors so we defer that handling. __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr, reinterpret_cast(env), __ATOMIC_RELAXED); - - // We want the fini array callbacks to be run after other atexit - // callbacks are run. So, we register them before running the init - // array callbacks as they can potentially register their own atexit - // callbacks. - LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks); - LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env); } extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void diff --git a/llvm/tools/CMakeLists.txt b/llvm/tools/CMakeLists.txt index 729797aa43f0b..b9c5a79849ec8 100644 --- a/llvm/tools/CMakeLists.txt +++ b/llvm/tools/CMakeLists.txt @@ -9,10 +9,6 @@ # traversing each directory. create_llvm_tool_options() -if(NOT LLVM_COMPILER_IS_GCC_COMPATIBLE OR NOT LLVM_LIBC_GPU_BUILD) - set(LLVM_TOOL_LLVM_GPU_LOADER_BUILD OFF) -endif() - if(NOT LLVM_BUILD_LLVM_DYLIB AND NOT LLVM_BUILD_LLVM_C_DYLIB) set(LLVM_TOOL_LLVM_SHLIB_BUILD Off) endif() diff --git a/llvm/tools/llvm-gpu-loader/CMakeLists.txt b/llvm/tools/llvm-gpu-loader/CMakeLists.txt index b35a702476ada..de276635e2713 100644 --- a/llvm/tools/llvm-gpu-loader/CMakeLists.txt +++ b/llvm/tools/llvm-gpu-loader/CMakeLists.txt @@ -1,46 +1,12 @@ set(LLVM_LINK_COMPONENTS - BinaryFormat - Object Option Support - FrontendOffloading TargetParser ) add_llvm_tool(llvm-gpu-loader llvm-gpu-loader.cpp - # TODO: We intentionally split this currently due to statically linking the - # GPU runtimes. Dynamically load the dependencies, possibly using the - # LLVM offloading API when it is complete. - PARTIAL_SOURCES_INTENDED - DEPENDS intrinsics_gen ) - -# Locate the RPC server handling interface. -include(FindLibcCommonUtils) -target_link_libraries(llvm-gpu-loader PUBLIC llvm-libc-common-utilities) - -# Check for HSA support for targeting AMD GPUs. -find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) -if(hsa-runtime64_FOUND) - target_sources(llvm-gpu-loader PRIVATE amdhsa.cpp) - target_compile_definitions(llvm-gpu-loader PRIVATE AMDHSA_SUPPORT) - target_link_libraries(llvm-gpu-loader PRIVATE hsa-runtime64::hsa-runtime64) - - # Compatibility with the old amdhsa-loader name. - add_llvm_tool_symlink(amdhsa-loader llvm-gpu-loader) -endif() - -# Check for CUDA support for targeting NVIDIA GPUs. -find_package(CUDAToolkit 11.2 QUIET) -if(CUDAToolkit_FOUND) - target_sources(llvm-gpu-loader PRIVATE nvptx.cpp) - target_compile_definitions(llvm-gpu-loader PRIVATE NVPTX_SUPPORT) - target_link_libraries(llvm-gpu-loader PRIVATE CUDA::cuda_driver) - - # Compatibility with the old nvptx-loader name. 
- add_llvm_tool_symlink(nvptx-loader llvm-gpu-loader) -endif() diff --git a/llvm/tools/llvm-gpu-loader/amdhsa.cpp b/llvm/tools/llvm-gpu-loader/amdhsa.cpp deleted file mode 100644 index 5715058d8cfac..0000000000000 --- a/llvm/tools/llvm-gpu-loader/amdhsa.cpp +++ /dev/null @@ -1,594 +0,0 @@ -//===-- Loader Implementation for AMDHSA devices --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file impelements a simple loader to run images supporting the AMDHSA -// architecture. The file launches the '_start' kernel which should be provided -// by the device application start code and call ultimately call the 'main' -// function. -// -//===----------------------------------------------------------------------===// - -#include "llvm-gpu-loader.h" -#include "server.h" - -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" - -#include "llvm/Frontend/Offloading/Utility.h" - -#include -#include -#include -#include -#include -#include -#include - -// The implicit arguments of COV5 AMDGPU kernels. -struct implicit_args_t { - uint32_t grid_size_x; - uint32_t grid_size_y; - uint32_t grid_size_z; - uint16_t workgroup_size_x; - uint16_t workgroup_size_y; - uint16_t workgroup_size_z; - uint8_t Unused0[46]; - uint16_t grid_dims; - uint8_t Unused1[190]; -}; - -/// Print the error code and exit if \p code indicates an error. -static void handle_error_impl(const char *file, int32_t line, - hsa_status_t code) { - if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK) - return; - - const char *desc; - if (hsa_status_string(code, &desc) != HSA_STATUS_SUCCESS) - desc = "Unknown error"; - fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, desc); - exit(EXIT_FAILURE); -} - -/// Generic interface for iterating using the HSA callbacks. -template -hsa_status_t iterate(func_ty func, callback_ty cb) { - auto l = [](elem_ty elem, void *data) -> hsa_status_t { - callback_ty *unwrapped = static_cast(data); - return (*unwrapped)(elem); - }; - return func(l, static_cast(&cb)); -} - -/// Generic interface for iterating using the HSA callbacks. -template -hsa_status_t iterate(func_ty func, func_arg_ty func_arg, callback_ty cb) { - auto l = [](elem_ty elem, void *data) -> hsa_status_t { - callback_ty *unwrapped = static_cast(data); - return (*unwrapped)(elem); - }; - return func(func_arg, l, static_cast(&cb)); -} - -/// Iterate through all availible agents. -template -hsa_status_t iterate_agents(callback_ty callback) { - return iterate(hsa_iterate_agents, callback); -} - -/// Iterate through all availible memory pools. -template -hsa_status_t iterate_agent_memory_pools(hsa_agent_t agent, callback_ty cb) { - return iterate(hsa_amd_agent_iterate_memory_pools, - agent, cb); -} - -template -hsa_status_t get_agent(hsa_agent_t *output_agent) { - // Find the first agent with a matching device type. - auto cb = [&](hsa_agent_t hsa_agent) -> hsa_status_t { - hsa_device_type_t type; - hsa_status_t status = - hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_DEVICE, &type); - if (status != HSA_STATUS_SUCCESS) - return status; - - if (type == flag) { - // Ensure that a GPU agent supports kernel dispatch packets. 
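The iterate() helpers deleted above smuggle a stateful C++ lambda through a C-style callback's void * user-data slot. A runtime-free sketch of that pattern, with iterate_elements standing in for the HSA iterator:

```cpp
#include <cstdio>

using c_callback_t = int (*)(int elem, void *data);

// Stand-in for an HSA-style iterator that only accepts C function pointers.
static int iterate_elements(c_callback_t cb, void *data) {
  for (int elem = 0; elem < 4; ++elem)
    if (int status = cb(elem, data))
      return status;
  return 0;
}

template <typename Callback> int iterate(Callback cb) {
  // A captureless lambda converts to a plain function pointer; the real
  // callback state travels through the void * parameter instead.
  auto thunk = +[](int elem, void *data) -> int {
    return (*static_cast<Callback *>(data))(elem);
  };
  return iterate_elements(thunk, static_cast<void *>(&cb));
}

int main() {
  int sum = 0;
  iterate([&](int elem) {
    sum += elem;
    return 0; // zero means "keep iterating"
  });
  std::printf("sum = %d\n", sum); // sum = 6
  return 0;
}
```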
- if (type == HSA_DEVICE_TYPE_GPU) { - hsa_agent_feature_t features; - status = - hsa_agent_get_info(hsa_agent, HSA_AGENT_INFO_FEATURE, &features); - if (status != HSA_STATUS_SUCCESS) - return status; - if (features & HSA_AGENT_FEATURE_KERNEL_DISPATCH) - *output_agent = hsa_agent; - } else { - *output_agent = hsa_agent; - } - return HSA_STATUS_INFO_BREAK; - } - return HSA_STATUS_SUCCESS; - }; - - return iterate_agents(cb); -} - -void print_kernel_resources(const char *kernel_name) { - fprintf(stderr, "Kernel resources on AMDGPU is not supported yet.\n"); -} - -/// Retrieve a global memory pool with a \p flag from the agent. -template -hsa_status_t get_agent_memory_pool(hsa_agent_t agent, - hsa_amd_memory_pool_t *output_pool) { - auto cb = [&](hsa_amd_memory_pool_t memory_pool) { - uint32_t flags; - hsa_amd_segment_t segment; - if (auto err = hsa_amd_memory_pool_get_info( - memory_pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment)) - return err; - if (auto err = hsa_amd_memory_pool_get_info( - memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags)) - return err; - - if (segment != HSA_AMD_SEGMENT_GLOBAL) - return HSA_STATUS_SUCCESS; - - if (flags & flag) - *output_pool = memory_pool; - - return HSA_STATUS_SUCCESS; - }; - return iterate_agent_memory_pools(agent, cb); -} - -template -hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, - hsa_amd_memory_pool_t kernargs_pool, - hsa_amd_memory_pool_t coarsegrained_pool, - hsa_queue_t *queue, rpc::Server &server, - const LaunchParameters ¶ms, - const char *kernel_name, args_t kernel_args, - uint32_t wavefront_size, bool print_resource_usage) { - // Look up the kernel in the loaded executable. - hsa_executable_symbol_t symbol; - if (hsa_status_t err = hsa_executable_get_symbol_by_name( - executable, kernel_name, &dev_agent, &symbol)) - return err; - - // Retrieve different properties of the kernel symbol used for launch. - uint64_t kernel; - uint32_t args_size; - uint32_t group_size; - uint32_t private_size; - bool dynamic_stack; - - std::pair symbol_infos[] = { - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack}, - {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}}; - - for (auto &[info, value] : symbol_infos) - if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value)) - return err; - - // Allocate space for the kernel arguments on the host and allow the GPU agent - // to access it. - void *args; - if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size, - /*flags=*/0, &args)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args); - - // Initialize all the arguments (explicit and implicit) to zero, then set the - // explicit arguments to the values created above. - std::memset(args, 0, args_size); - std::memcpy(args, &kernel_args, std::is_empty_v ? 0 : sizeof(args_t)); - - // Initialize the necessary implicit arguments to the proper values. 
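The zero-then-copy sequence above follows the kernarg convention: the runtime reports one buffer size covering both explicit and implicit arguments. A host-only sketch of that packing (the 256-byte size is an assumed stand-in for whatever the executable symbol reports):

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>

struct start_args_t { int argc; void *argv; void *envp; void *ret; };

int main() {
  // Assume the runtime reported a 256-byte kernarg segment.
  constexpr std::size_t args_size = 256;
  alignas(16) unsigned char buffer[args_size];

  start_args_t explicit_args = {1, nullptr, nullptr, nullptr};
  std::memset(buffer, 0, args_size); // implicit arguments start as zero
  std::memcpy(buffer, &explicit_args, sizeof(explicit_args));

  // The loader then patches grid/workgroup sizes into the implicit region
  // that begins at sizeof(start_args_t).
  std::printf("explicit: %zu bytes, total: %zu bytes\n",
              sizeof(explicit_args), args_size);
  return 0;
}
```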
- int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) + - (params.num_blocks_z * params.num_threads_z != 1); - implicit_args_t *implicit_args = reinterpret_cast( - reinterpret_cast(args) + sizeof(args_t)); - implicit_args->grid_dims = dims; - implicit_args->grid_size_x = params.num_blocks_x; - implicit_args->grid_size_y = params.num_blocks_y; - implicit_args->grid_size_z = params.num_blocks_z; - implicit_args->workgroup_size_x = params.num_threads_x; - implicit_args->workgroup_size_y = params.num_threads_y; - implicit_args->workgroup_size_z = params.num_threads_z; - - // Obtain a packet from the queue. - uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1); - while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size) - ; - - const uint32_t mask = queue->size - 1; - hsa_kernel_dispatch_packet_t *packet = - static_cast(queue->base_address) + - (packet_id & mask); - - // Set up the packet for exeuction on the device. We currently only launch - // with one thread on the device, forcing the rest of the wavefront to be - // masked off. - uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - packet->workgroup_size_x = params.num_threads_x; - packet->workgroup_size_y = params.num_threads_y; - packet->workgroup_size_z = params.num_threads_z; - packet->reserved0 = 0; - packet->grid_size_x = params.num_blocks_x * params.num_threads_x; - packet->grid_size_y = params.num_blocks_y * params.num_threads_y; - packet->grid_size_z = params.num_blocks_z * params.num_threads_z; - packet->private_segment_size = - dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size; - packet->group_segment_size = group_size; - packet->kernel_object = kernel; - packet->kernarg_address = args; - packet->reserved2 = 0; - // Create a signal to indicate when this packet has been completed. - if (hsa_status_t err = - hsa_signal_create(1, 0, nullptr, &packet->completion_signal)) - handle_error(err); - - if (print_resource_usage) - print_kernel_resources(kernel_name); - - // Initialize the packet header and set the doorbell signal to begin execution - // by the HSA runtime. - uint16_t header = - 1u << HSA_PACKET_HEADER_BARRIER | - (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); - uint32_t header_word = header | (setup << 16u); - __atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE); - hsa_signal_store_relaxed(queue->doorbell_signal, packet_id); - - std::atomic finished = false; - std::thread server_thread( - [](std::atomic *finished, rpc::Server *server, - uint32_t wavefront_size, hsa_agent_t dev_agent, - hsa_amd_memory_pool_t coarsegrained_pool) { - // Register RPC callbacks for the malloc and free functions on HSA. 
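The header | (setup << 16) store above publishes the AQL packet in a single 32-bit atomic write. A self-contained sketch of that bit packing; the bit positions are restated from the HSA packet-header layout rather than taken from hsa.h, so treat them as illustrative:

```cpp
#include <cstdint>
#include <cstdio>

// Bit positions as in the HSA spec (restated here so the example builds
// without hsa.h; the real code uses the hsa_packet_header_t enums).
constexpr unsigned HEADER_TYPE = 0;       // bits [7:0]
constexpr unsigned HEADER_BARRIER = 8;    // bit 8
constexpr unsigned HEADER_SCACQUIRE = 9;  // bits [10:9]
constexpr unsigned HEADER_SCRELEASE = 11; // bits [12:11]
constexpr unsigned TYPE_KERNEL_DISPATCH = 2;
constexpr unsigned FENCE_SCOPE_SYSTEM = 2;

int main() {
  uint16_t setup = 3; // three grid dimensions in the setup field's low bits
  uint16_t header = (1u << HEADER_BARRIER) |
                    (TYPE_KERNEL_DISPATCH << HEADER_TYPE) |
                    (FENCE_SCOPE_SYSTEM << HEADER_SCACQUIRE) |
                    (FENCE_SCOPE_SYSTEM << HEADER_SCRELEASE);
  // Header and setup share one 32-bit word so the packet becomes valid for
  // the packet processor in a single atomic store.
  uint32_t header_word = header | (uint32_t(setup) << 16u);
  std::printf("header word = 0x%08x\n", header_word);
  return 0;
}
```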
- auto malloc_handler = [&](size_t size) -> void * { - void *dev_ptr = nullptr; - if (hsa_amd_memory_pool_allocate(coarsegrained_pool, size, - /*flags=*/0, &dev_ptr)) - dev_ptr = nullptr; - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr); - return dev_ptr; - }; - - auto free_handler = [](void *ptr) -> void { - if (hsa_status_t err = - hsa_amd_memory_pool_free(reinterpret_cast(ptr))) - handle_error(err); - }; - - uint32_t index = 0; - while (!*finished) { - if (wavefront_size == 32) - index = - handle_server<32>(*server, index, malloc_handler, free_handler); - else - index = - handle_server<64>(*server, index, malloc_handler, free_handler); - } - }, - &finished, &server, wavefront_size, dev_agent, coarsegrained_pool); - - // Wait until the kernel has completed execution on the device. Periodically - // check the RPC client for work to be performed on the server. - while (hsa_signal_wait_scacquire(packet->completion_signal, - HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) - ; - - finished = true; - if (server_thread.joinable()) - server_thread.join(); - - // Destroy the resources acquired to launch the kernel and return. - if (hsa_status_t err = hsa_amd_memory_pool_free(args)) - handle_error(err); - if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal)) - handle_error(err); - - return HSA_STATUS_SUCCESS; -} - -/// Copies data from the source agent to the destination agent. The source -/// memory must first be pinned explicitly or allocated via HSA. -static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent, - const void *src, hsa_agent_t src_agent, - uint64_t size) { - // Create a memory signal to copy information between the host and device. - hsa_signal_t memory_signal; - if (hsa_status_t err = hsa_signal_create(1, 0, nullptr, &memory_signal)) - return err; - - if (hsa_status_t err = hsa_amd_memory_async_copy( - dst, dst_agent, src, src_agent, size, 0, nullptr, memory_signal)) - return err; - - while (hsa_signal_wait_scacquire(memory_signal, HSA_SIGNAL_CONDITION_EQ, 0, - UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) - ; - - if (hsa_status_t err = hsa_signal_destroy(memory_signal)) - return err; - - return HSA_STATUS_SUCCESS; -} - -int load_amdhsa(int argc, const char **argv, const char **envp, void *image, - size_t size, const LaunchParameters ¶ms, - bool print_resource_usage) { - // Initialize the HSA runtime used to communicate with the device. - if (hsa_status_t err = hsa_init()) - handle_error(err); - - // Register a callback when the device encounters a memory fault. - if (hsa_status_t err = hsa_amd_register_system_event_handler( - [](const hsa_amd_event_t *event, void *) -> hsa_status_t { - if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) - return HSA_STATUS_ERROR; - return HSA_STATUS_SUCCESS; - }, - nullptr)) - handle_error(err); - - // Obtain a single agent for the device and host to use the HSA memory model. - hsa_agent_t dev_agent; - hsa_agent_t host_agent; - if (hsa_status_t err = get_agent(&dev_agent)) - handle_error(err); - if (hsa_status_t err = get_agent(&host_agent)) - handle_error(err); - - // Load the code object's ISA information and executable data segments. 
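Both deleted loaders use the same host-side shutdown handshake: a worker thread polls for RPC work until an atomic flag flips after the kernel's completion signal. A runtime-free sketch:

```cpp
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

int main() {
  std::atomic<bool> finished = false;

  // Stand-in for the RPC server thread.
  std::thread server([&] {
    while (!finished)
      std::this_thread::yield(); // the real loop services RPC opcodes here
  });

  // Stand-in for hsa_signal_wait_scacquire() / cuStreamSynchronize().
  std::this_thread::sleep_for(std::chrono::milliseconds(10));

  finished = true; // only set once the device can produce no more RPC work
  if (server.joinable())
    server.join();
  std::printf("server drained\n");
  return 0;
}
```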
- hsa_code_object_reader_t reader; - if (hsa_status_t err = - hsa_code_object_reader_create_from_memory(image, size, &reader)) - handle_error(err); - - hsa_executable_t executable; - if (hsa_status_t err = hsa_executable_create_alt( - HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, "", - &executable)) - handle_error(err); - - hsa_loaded_code_object_t object; - if (hsa_status_t err = hsa_executable_load_agent_code_object( - executable, dev_agent, reader, "", &object)) - handle_error(err); - - // No modifications to the executable are allowed after this point. - if (hsa_status_t err = hsa_executable_freeze(executable, "")) - handle_error(err); - - // Check the validity of the loaded executable. If the agents ISA features do - // not match the executable's code object it will fail here. - uint32_t result; - if (hsa_status_t err = hsa_executable_validate(executable, &result)) - handle_error(err); - if (result) - handle_error(HSA_STATUS_ERROR); - - if (hsa_status_t err = hsa_code_object_reader_destroy(reader)) - handle_error(err); - - // Obtain memory pools to exchange data between the host and the device. The - // fine-grained pool acts as pinned memory on the host for DMA transfers to - // the device, the coarse-grained pool is for allocations directly on the - // device, and the kernerl-argument pool is for executing the kernel. - hsa_amd_memory_pool_t kernargs_pool; - hsa_amd_memory_pool_t finegrained_pool; - hsa_amd_memory_pool_t coarsegrained_pool; - if (hsa_status_t err = - get_agent_memory_pool( - host_agent, &kernargs_pool)) - handle_error(err); - if (hsa_status_t err = - get_agent_memory_pool( - host_agent, &finegrained_pool)) - handle_error(err); - if (hsa_status_t err = - get_agent_memory_pool( - dev_agent, &coarsegrained_pool)) - handle_error(err); - - // The AMDGPU target can change its wavefront size. There currently isn't a - // good way to look this up through the HSA API so we use the LLVM interface. - uint16_t abi_version; - llvm::StringRef image_ref(reinterpret_cast(image), size); - llvm::StringMap info_map; - if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage( - llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) { - handle_error(llvm::toString(std::move(err)).c_str()); - } - - // Allocate fine-grained memory on the host to hold the pointer array for the - // copied argv and allow the GPU agent to access it. - auto allocator = [&](uint64_t size) -> void * { - void *dev_ptr = nullptr; - if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size, - /*flags=*/0, &dev_ptr)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr); - return dev_ptr; - }; - void *dev_argv = copy_argument_vector(argc, argv, allocator); - if (!dev_argv) - handle_error("Failed to allocate device argv"); - - // Allocate fine-grained memory on the host to hold the pointer array for the - // copied environment array and allow the GPU agent to access it. - void *dev_envp = copy_environment(envp, allocator); - if (!dev_envp) - handle_error("Failed to allocate device environment"); - - // Allocate space for the return pointer and initialize it to zero. - void *dev_ret; - if (hsa_status_t err = - hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(int), - /*flags=*/0, &dev_ret)) - handle_error(err); - hsa_amd_memory_fill(dev_ret, 0, /*count=*/1); - - // Allocate finegrained memory for the RPC server and client to share. 
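The hunk just below selects the largest wavefront size recorded in the image's kernel metadata via llvm::max_element. An equivalent standard-library sketch of that selection (std::map stands in for the llvm::StringMap the real code iterates):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

struct KernelMetaData { uint32_t WavefrontSize; };

int main() {
  std::map<std::string, KernelMetaData> info_map = {
      {"_begin", {64}}, {"_start", {32}}, {"_end", {64}}};

  auto it = std::max_element(info_map.begin(), info_map.end(),
                             [](auto &&x, auto &&y) {
                               return x.second.WavefrontSize <
                                      y.second.WavefrontSize;
                             });
  std::printf("wavefront size = %u\n", it->second.WavefrontSize); // 64
  return 0;
}
```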
- uint32_t wavefront_size = - llvm::max_element(info_map, [](auto &&x, auto &&y) { - return x.second.WavefrontSize < y.second.WavefrontSize; - })->second.WavefrontSize; - - // Set up the RPC server. - void *rpc_buffer; - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, - rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT), - /*flags=*/0, &rpc_buffer)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer); - - rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer); - rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer); - - // Initialize the RPC client on the device by copying the local data to the - // device's internal pointer. - hsa_executable_symbol_t rpc_client_sym; - if (hsa_status_t err = hsa_executable_get_symbol_by_name( - executable, "__llvm_rpc_client", &dev_agent, &rpc_client_sym)) - handle_error(err); - - void *rpc_client_dev; - if (hsa_status_t err = hsa_executable_symbol_get_info( - rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, - &rpc_client_dev)) - handle_error(err); - - void *rpc_client_buffer; - if (hsa_status_t err = - hsa_amd_memory_lock(&client, sizeof(rpc::Client), - /*agents=*/nullptr, 0, &rpc_client_buffer)) - handle_error(err); - - // Copy the RPC client buffer to the address pointed to by the symbol. - if (hsa_status_t err = - hsa_memcpy(rpc_client_dev, dev_agent, rpc_client_buffer, host_agent, - sizeof(rpc::Client))) - handle_error(err); - - if (hsa_status_t err = hsa_amd_memory_unlock(&client)) - handle_error(err); - - // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. - // If the clock_freq symbol is missing, no work to do. - hsa_executable_symbol_t freq_sym; - if (HSA_STATUS_SUCCESS == - hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq", - &dev_agent, &freq_sym)) { - void *host_clock_freq; - if (hsa_status_t err = - hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t), - /*flags=*/0, &host_clock_freq)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_clock_freq); - - if (HSA_STATUS_SUCCESS == - hsa_agent_get_info(dev_agent, - static_cast( - HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY), - host_clock_freq)) { - - void *freq_addr; - if (hsa_status_t err = hsa_executable_symbol_get_info( - freq_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, - &freq_addr)) - handle_error(err); - - if (hsa_status_t err = hsa_memcpy(freq_addr, dev_agent, host_clock_freq, - host_agent, sizeof(uint64_t))) - handle_error(err); - } - } - - // Obtain a queue with the maximum (power of two) size, used to send commands - // to the HSA runtime and launch execution on the device. 
- uint64_t queue_size; - if (hsa_status_t err = hsa_agent_get_info( - dev_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size)) - handle_error(err); - hsa_queue_t *queue = nullptr; - if (hsa_status_t err = - hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, - nullptr, UINT32_MAX, UINT32_MAX, &queue)) - handle_error(err); - - LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; - begin_args_t init_args = {argc, dev_argv, dev_envp}; - if (hsa_status_t err = launch_kernel( - dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - server, single_threaded_params, "_begin.kd", init_args, - info_map["_begin"].WavefrontSize, print_resource_usage)) - handle_error(err); - - start_args_t args = {argc, dev_argv, dev_envp, dev_ret}; - if (hsa_status_t err = launch_kernel( - dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - server, params, "_start.kd", args, info_map["_start"].WavefrontSize, - print_resource_usage)) - handle_error(err); - - void *host_ret; - if (hsa_status_t err = - hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(int), - /*flags=*/0, &host_ret)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, host_ret); - - if (hsa_status_t err = - hsa_memcpy(host_ret, host_agent, dev_ret, dev_agent, sizeof(int))) - handle_error(err); - - // Save the return value and perform basic clean-up. - int ret = *static_cast(host_ret); - - end_args_t fini_args = {}; - if (hsa_status_t err = launch_kernel( - dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - server, single_threaded_params, "_end.kd", fini_args, - info_map["_end"].WavefrontSize, print_resource_usage)) - handle_error(err); - - if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer)) - handle_error(err); - - // Free the memory allocated for the device. 
- if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret)) - handle_error(err); - - if (hsa_status_t err = hsa_queue_destroy(queue)) - handle_error(err); - - if (hsa_status_t err = hsa_executable_destroy(executable)) - handle_error(err); - - if (hsa_status_t err = hsa_shut_down()) - handle_error(err); - - return ret; -} diff --git a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp index a8204664e85eb..d66f2b8d69b62 100644 --- a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp +++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp @@ -35,121 +35,255 @@ using namespace llvm; -static cl::OptionCategory loader_category("loader options"); +static cl::OptionCategory LoaderCategory("loader options"); -static cl::opt help("h", cl::desc("Alias for -help"), cl::Hidden, - cl::cat(loader_category)); +static cl::opt Help("h", cl::desc("Alias for -help"), cl::Hidden, + cl::cat(LoaderCategory)); static cl::opt - threads_x("threads-x", cl::desc("Number of threads in the 'x' dimension"), - cl::init(1), cl::cat(loader_category)); + Threads_x("threads-x", cl::desc("Number of threads in the 'x' dimension"), + cl::init(1), cl::cat(LoaderCategory)); static cl::opt - threads_y("threads-y", cl::desc("Number of threads in the 'y' dimension"), - cl::init(1), cl::cat(loader_category)); + Threads_y("threads-y", cl::desc("Number of threads in the 'y' dimension"), + cl::init(1), cl::cat(LoaderCategory)); static cl::opt - threads_z("threads-z", cl::desc("Number of threads in the 'z' dimension"), - cl::init(1), cl::cat(loader_category)); -static cl::alias threads("threads", cl::aliasopt(threads_x), + Threads_z("threads-z", cl::desc("Number of threads in the 'z' dimension"), + cl::init(1), cl::cat(LoaderCategory)); +static cl::alias threads("threads", cl::aliasopt(Threads_x), cl::desc("Alias for --threads-x"), - cl::cat(loader_category)); + cl::cat(LoaderCategory)); static cl::opt - blocks_x("blocks-x", cl::desc("Number of blocks in the 'x' dimension"), - cl::init(1), cl::cat(loader_category)); + Blocks_x("blocks-x", cl::desc("Number of blocks in the 'x' dimension"), + cl::init(1), cl::cat(LoaderCategory)); static cl::opt - blocks_y("blocks-y", cl::desc("Number of blocks in the 'y' dimension"), - cl::init(1), cl::cat(loader_category)); + Blocks_y("blocks-y", cl::desc("Number of blocks in the 'y' dimension"), + cl::init(1), cl::cat(LoaderCategory)); static cl::opt - blocks_z("blocks-z", cl::desc("Number of blocks in the 'z' dimension"), - cl::init(1), cl::cat(loader_category)); -static cl::alias blocks("blocks", cl::aliasopt(blocks_x), + Blocks_z("blocks-z", cl::desc("Number of blocks in the 'z' dimension"), + cl::init(1), cl::cat(LoaderCategory)); +static cl::alias Blocks("blocks", cl::aliasopt(Blocks_x), cl::desc("Alias for --blocks-x"), - cl::cat(loader_category)); + cl::cat(LoaderCategory)); -static cl::opt - print_resource_usage("print-resource-usage", - cl::desc("Output resource usage of launched kernels"), - cl::init(false), cl::cat(loader_category)); - -static cl::opt file(cl::Positional, cl::Required, +static cl::opt File(cl::Positional, cl::Required, cl::desc(""), - cl::cat(loader_category)); -static cl::list args(cl::ConsumeAfter, + cl::cat(LoaderCategory)); +static cl::list Args(cl::ConsumeAfter, cl::desc("..."), - cl::cat(loader_category)); + cl::cat(LoaderCategory)); + +// The arguments to the '_begin' 
kernel.
+struct BeginArgs {
+  int Argc;
+  void *Argv;
+  void *Envp;
+};
+
+// The arguments to the '_start' kernel.
+struct StartArgs {
+  int Argc;
+  void *Argv;
+  void *Envp;
+  void *Ret;
+};
 
-[[noreturn]] void report_error(Error E) {
+// The arguments to the '_end' kernel.
+struct EndArgs {};
+
+[[noreturn]] static void handleError(Error E) {
   outs().flush();
   logAllUnhandledErrors(std::move(E), WithColor::error(errs(), "loader"));
   exit(EXIT_FAILURE);
 }
 
-std::string get_main_executable(const char *name) {
-  void *ptr = (void *)(intptr_t)&get_main_executable;
-  auto cow_path = sys::fs::getMainExecutable(name, ptr);
-  return sys::path::parent_path(cow_path).str();
+[[noreturn]] static void handleError(ol_result_t Err, unsigned Line) {
+  fprintf(stderr, "%s:%d %s\n", __FILE__, Line, Err->Details);
+  exit(EXIT_FAILURE);
+}
+
+#define OFFLOAD_ERR(X)                                                         \
+  if (ol_result_t Err = X)                                                     \
+    handleError(Err, __LINE__);
+
+static void *copyArgumentVector(int Argc, const char **Argv,
+                                ol_device_handle_t Device) {
+  size_t ArgSize = sizeof(char *) * (Argc + 1);
+  size_t StringLen = 0;
+  for (int i = 0; i < Argc; ++i)
+    StringLen += strlen(Argv[i]) + 1;
+
+  // We allocate enough space for a null-terminated array and all the strings.
+  void *DevArgv;
+  OFFLOAD_ERR(
+      olMemAlloc(Device, OL_ALLOC_TYPE_HOST, ArgSize + StringLen, &DevArgv));
+  if (!DevArgv)
+    handleError(
+        createStringError("Failed to allocate memory for the argument vector."));
+
+  // Store the strings linearly in the same memory buffer.
+  void *DevString = reinterpret_cast<uint8_t *>(DevArgv) + ArgSize;
+  for (int i = 0; i < Argc; ++i) {
+    size_t size = strlen(Argv[i]) + 1;
+    std::memcpy(DevString, Argv[i], size);
+    static_cast<void **>(DevArgv)[i] = DevString;
+    DevString = reinterpret_cast<uint8_t *>(DevString) + size;
+  }
+
+  // Ensure the vector is null terminated.
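The assignment that follows null-terminates the pointer table. For reference, a host-only sketch of the layout copyArgumentVector builds — one allocation holding the pointer array followed by the packed strings — with plain malloc standing in for olMemAlloc:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>

static void *pack_argv(int argc, const char **argv) {
  size_t arg_size = sizeof(char *) * (argc + 1);
  size_t string_len = 0;
  for (int i = 0; i < argc; ++i)
    string_len += std::strlen(argv[i]) + 1;

  // One buffer: [ptr0, ptr1, ..., nullptr][str0\0 str1\0 ...]
  void *buf = std::malloc(arg_size + string_len);
  char *str = static_cast<char *>(buf) + arg_size;
  for (int i = 0; i < argc; ++i) {
    size_t size = std::strlen(argv[i]) + 1;
    std::memcpy(str, argv[i], size);
    static_cast<char **>(buf)[i] = str;
    str += size;
  }
  static_cast<char **>(buf)[argc] = nullptr;
  return buf;
}

int main() {
  const char *argv[] = {"a.out", "--flag"};
  char **packed = static_cast<char **>(pack_argv(2, argv));
  for (char **p = packed; *p; ++p)
    std::printf("%s\n", *p);
  std::free(packed);
  return 0;
}
```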
+ reinterpret_cast(DevArgv)[Argc] = nullptr; + return DevArgv; +} + +void *copyEnvironment(const char **Envp, ol_device_handle_t Device) { + int Envc = 0; + for (const char **Env = Envp; *Env != 0; ++Env) + ++Envc; + + return copyArgumentVector(Envc, Envp, Device); +} + +ol_device_handle_t findDevice(MemoryBufferRef Binary) { + ol_device_handle_t Device; + std::tuple Data = std::make_tuple(&Device, &Binary); + OFFLOAD_ERR(olIterateDevices( + [](ol_device_handle_t Device, void *UserData) { + auto &[Output, Binary] = *reinterpret_cast(UserData); + bool IsValid = false; + OFFLOAD_ERR(olIsValidBinary(Device, Binary->getBufferStart(), + Binary->getBufferSize(), &IsValid)); + if (!IsValid) + return true; + + *Output = Device; + return false; + }, + &Data)); + return Device; +} + +ol_device_handle_t getHostDevice() { + ol_device_handle_t Device; + OFFLOAD_ERR(olIterateDevices( + [](ol_device_handle_t Device, void *UserData) { + ol_platform_handle_t Platform; + olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), + &Platform); + ol_platform_backend_t Backend; + olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend), + &Backend); + + auto &Output = *reinterpret_cast(UserData); + if (Backend == OL_PLATFORM_BACKEND_HOST) { + Output = Device; + return false; + } + return true; + }, + &Device)); + return Device; +} + +ol_program_handle_t loadBinary(std::vector &Binary, + std::vector &Devices) { + for (ol_device_handle_t &Device : Devices) { + bool IsValid = false; + OFFLOAD_ERR( + olIsValidBinary(Device, Binary.data(), Binary.size(), &IsValid)); + if (!IsValid) + continue; + + ol_program_handle_t Program; + OFFLOAD_ERR( + olCreateProgram(Device, Binary.data(), Binary.size(), &Program)); + return Program; + } + handleError( + createStringError("No valid device found for '%s'", File.c_str())); +} + +template +void launchKernel(ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_program_handle_t Program, const char *Name, + ol_kernel_launch_size_args_t LaunchArgs, Args KernelArgs) { + ol_symbol_handle_t Kernel; + OFFLOAD_ERR(olGetSymbol(Program, Name, OL_SYMBOL_KIND_KERNEL, &Kernel)); + + OFFLOAD_ERR(olLaunchKernel(Queue, Device, Kernel, &KernelArgs, + std::is_empty_v ? 0 : sizeof(Args), + &LaunchArgs)); } int main(int argc, const char **argv, const char **envp) { sys::PrintStackTraceOnErrorSignal(argv[0]); - cl::HideUnrelatedOptions(loader_category); + cl::HideUnrelatedOptions(LoaderCategory); cl::ParseCommandLineOptions( argc, argv, "A utility used to launch unit tests built for a GPU target. 
This is\n"
      "intended to provide an interface similar to cross-compiling emulators\n");
 
-  if (help) {
+  if (Help) {
     cl::PrintHelpMessage();
     return EXIT_SUCCESS;
   }
 
-  ErrorOr<std::unique_ptr<MemoryBuffer>> image_or_err =
-      MemoryBuffer::getFileOrSTDIN(file);
-  if (std::error_code ec = image_or_err.getError())
-    report_error(errorCodeToError(ec));
-  MemoryBufferRef image = **image_or_err;
-
-  SmallVector<const char *> new_argv = {file.c_str()};
-  llvm::transform(args, std::back_inserter(new_argv),
-                  [](const std::string &arg) { return arg.c_str(); });
-
-  Expected<llvm::object::ELF64LEObjectFile> elf_or_err =
-      llvm::object::ELF64LEObjectFile::create(image);
-  if (!elf_or_err)
-    report_error(elf_or_err.takeError());
-
-  int ret = 1;
-  if (elf_or_err->getArch() == Triple::amdgcn) {
-#ifdef AMDHSA_SUPPORT
-    LaunchParameters params{threads_x, threads_y, threads_z,
-                            blocks_x,  blocks_y,  blocks_z};
-
-    ret = load_amdhsa(new_argv.size(), new_argv.data(), envp,
-                      const_cast<char *>(image.getBufferStart()),
-                      image.getBufferSize(), params, print_resource_usage);
-#else
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-#endif
-  } else if (elf_or_err->getArch() == Triple::nvptx64) {
-#ifdef NVPTX_SUPPORT
-    LaunchParameters params{threads_x, threads_y, threads_z,
-                            blocks_x,  blocks_y,  blocks_z};
-
-    ret = load_nvptx(new_argv.size(), new_argv.data(), envp,
-                     const_cast<char *>(image.getBufferStart()),
-                     image.getBufferSize(), params, print_resource_usage);
-#else
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-#endif
-  } else {
-    report_error(createStringError(
-        "Unsupported architecture; %s",
-        Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin()));
-  }
+  if (Error Err = loadLLVMOffload())
+    handleError(std::move(Err));
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrErr =
+      MemoryBuffer::getFileOrSTDIN(File);
+  if (std::error_code EC = ImageOrErr.getError())
+    handleError(errorCodeToError(EC));
+  MemoryBufferRef Image = **ImageOrErr;
+
+  SmallVector<const char *> NewArgv = {File.c_str()};
+  llvm::transform(Args, std::back_inserter(NewArgv),
+                  [](const std::string &Arg) { return Arg.c_str(); });
+
+  OFFLOAD_ERR(olInit());
+  ol_device_handle_t Device = findDevice(Image);
+  ol_device_handle_t Host = getHostDevice();
+
+  ol_program_handle_t Program;
+  OFFLOAD_ERR(olCreateProgram(Device, Image.getBufferStart(),
+                              Image.getBufferSize(), &Program));
+
+  ol_queue_handle_t Queue;
+  OFFLOAD_ERR(olCreateQueue(Device, &Queue));
+
+  int DevArgc = static_cast<int>(NewArgv.size());
+  void *DevArgv = copyArgumentVector(NewArgv.size(), NewArgv.begin(), Device);
+  void *DevEnvp = copyEnvironment(envp, Device);
+
+  void *DevRet;
+  OFFLOAD_ERR(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, sizeof(int), &DevRet));
+
+  ol_kernel_launch_size_args_t BeginLaunch{1, {1, 1, 1}, {1, 1, 1}, 0};
+  BeginArgs BeginArgs = {DevArgc, DevArgv, DevEnvp};
+  launchKernel(Queue, Device, Program, "_begin", BeginLaunch, BeginArgs);
+  OFFLOAD_ERR(olSyncQueue(Queue));
+
+  uint32_t Dims = (Blocks_z > 1) ? 3 : (Blocks_y > 1) ?
2 : 1; + ol_kernel_launch_size_args_t StartLaunch{Dims, + {Blocks_x, Blocks_y, Blocks_z}, + {Threads_x, Threads_y, Threads_z}, + /*SharedMemBytes=*/0}; + StartArgs StartArgs = {DevArgc, DevArgv, DevEnvp, DevRet}; + launchKernel(Queue, Device, Program, "_start", StartLaunch, StartArgs); + + ol_kernel_launch_size_args_t EndLaunch{1, {1, 1, 1}, {1, 1, 1}, 0}; + EndArgs EndArgs = {}; + launchKernel(Queue, Device, Program, "_end", EndLaunch, EndArgs); + + int Ret; + OFFLOAD_ERR(olMemcpy(Queue, &Ret, Host, DevRet, Device, sizeof(int))); + OFFLOAD_ERR(olSyncQueue(Queue)); + + OFFLOAD_ERR(olMemFree(DevArgv)); + OFFLOAD_ERR(olMemFree(DevEnvp)); + OFFLOAD_ERR(olDestroyQueue(Queue)); + OFFLOAD_ERR(olDestroyProgram(Program)); + OFFLOAD_ERR(olShutDown()); - return ret; + return Ret; } diff --git a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h index 08861c29b4fa4..3990cb3911e30 100644 --- a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h +++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h @@ -1,108 +1,177 @@ -//===-- Generic device loader interface -----------------------------------===// +//===-- Dynamically loaded offload API ------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// +// Dynamically loads the API provided by the LLVMOffload library. We need to do +// this dynamically because this tool is used before it is actually built and +// should be provided even when the user did not specify the offload runtime. +// +//===----------------------------------------------------------------------===// #ifndef LLVM_TOOLS_LLVM_GPU_LOADER_LLVM_GPU_LOADER_H #define LLVM_TOOLS_LLVM_GPU_LOADER_LLVM_GPU_LOADER_H -#include -#include -#include -#include -#include - -/// Generic launch parameters for configuration the number of blocks / threads. -struct LaunchParameters { - uint32_t num_threads_x; - uint32_t num_threads_y; - uint32_t num_threads_z; - uint32_t num_blocks_x; - uint32_t num_blocks_y; - uint32_t num_blocks_z; -}; - -/// The arguments to the '_begin' kernel. -struct begin_args_t { - int argc; - void *argv; - void *envp; -}; - -/// The arguments to the '_start' kernel. -struct start_args_t { - int argc; - void *argv; - void *envp; - void *ret; -}; - -/// The arguments to the '_end' kernel. -struct end_args_t {}; - -/// Generic interface to load the \p image and launch execution of the _start -/// kernel on the target device. Copies \p argc and \p argv to the device. -/// Returns the final value of the `main` function on the device. -#ifdef AMDHSA_SUPPORT -int load_amdhsa(int argc, const char **argv, const char **evnp, void *image, - size_t size, const LaunchParameters ¶ms, - bool print_resource_usage); -#endif -#ifdef NVPTX_SUPPORT -int load_nvptx(int argc, const char **argv, const char **evnp, void *image, - size_t size, const LaunchParameters ¶ms, - bool print_resource_usage); -#endif - -/// Return \p V aligned "upwards" according to \p Align. -template inline V align_up(V val, A align) { - return ((val + V(align) - 1) / V(align)) * V(align); -} - -/// Copy the system's argument vector to GPU memory allocated using \p alloc. 
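The align_up helper deleted just above is plain round-up arithmetic; a few compile-time spot checks (constexpr is added here so static_assert can evaluate it):

```cpp
#include <cstdint>

template <typename V, typename A> constexpr V align_up(V val, A align) {
  return ((val + V(align) - 1) / V(align)) * V(align);
}

static_assert(align_up(uint64_t(13), 8) == 16);
static_assert(align_up(uint64_t(16), 8) == 16); // already aligned: unchanged
static_assert(align_up(uint64_t(1), 64) == 64);

int main() { return 0; }
```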
-template -void *copy_argument_vector(int argc, const char **argv, Allocator alloc) { - size_t argv_size = sizeof(char *) * (argc + 1); - size_t str_size = 0; - for (int i = 0; i < argc; ++i) - str_size += strlen(argv[i]) + 1; - - // We allocate enough space for a null terminated array and all the strings. - void *dev_argv = alloc(argv_size + str_size); - if (!dev_argv) - return nullptr; - - // Store the strings linerally in the same memory buffer. - void *dev_str = reinterpret_cast(dev_argv) + argv_size; - for (int i = 0; i < argc; ++i) { - size_t size = strlen(argv[i]) + 1; - std::memcpy(dev_str, argv[i], size); - static_cast(dev_argv)[i] = dev_str; - dev_str = reinterpret_cast(dev_str) + size; - } - - // Ensure the vector is null terminated. - reinterpret_cast(dev_argv)[argc] = nullptr; - return dev_argv; -} - -/// Copy the system's environment to GPU memory allocated using \p alloc. -template -void *copy_environment(const char **envp, Allocator alloc) { - int envc = 0; - for (const char **env = envp; *env != 0; ++env) - ++envc; - - return copy_argument_vector(envc, envp, alloc); -} - -inline void handle_error_impl(const char *file, int32_t line, const char *msg) { - fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, msg); - exit(EXIT_FAILURE); +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/Error.h" + +typedef enum ol_alloc_type_t { + OL_ALLOC_TYPE_HOST = 0, + OL_ALLOC_TYPE_DEVICE = 1, + OL_ALLOC_TYPE_FORCE_UINT32 = 0x7fffffff +} ol_alloc_type_t; + +typedef enum ol_device_info_t { + OL_DEVICE_INFO_TYPE = 0, + OL_DEVICE_INFO_PLATFORM = 1, + OL_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff +} ol_device_info_t; + +typedef enum ol_platform_info_t { + OL_PLATFORM_INFO_NAME = 0, + OL_PLATFORM_INFO_BACKEND = 3, + OL_PLATFORM_INFO_FORCE_UINT32 = 0x7fffffff +} ol_platform_info_t; + +typedef enum ol_symbol_kind_t { + OL_SYMBOL_KIND_KERNEL = 0, + OL_SYMBOL_KIND_GLOBAL_VARIABLE = 1, + OL_SYMBOL_KIND_FORCE_UINT32 = 0x7fffffff +} ol_symbol_kind_t; + +typedef enum ol_errc_t { + OL_ERRC_SUCCESS = 0, + OL_ERRC_FORCE_UINT32 = 0x7fffffff +} ol_errc_t; + +typedef struct ol_error_struct_t { + ol_errc_t Code; + const char *Details; +} ol_error_struct_t; + +typedef struct ol_dimensions_t { + uint32_t x; + uint32_t y; + uint32_t z; +} ol_dimensions_t; + +typedef struct ol_kernel_launch_size_args_t { + size_t Dimensions; + struct ol_dimensions_t NumGroups; + struct ol_dimensions_t GroupSize; + size_t DynSharedMemory; +} ol_kernel_launch_size_args_t; + +typedef enum ol_platform_backend_t { + OL_PLATFORM_BACKEND_UNKNOWN = 0, + OL_PLATFORM_BACKEND_CUDA = 1, + OL_PLATFORM_BACKEND_AMDGPU = 2, + OL_PLATFORM_BACKEND_HOST = 3, + OL_PLATFORM_BACKEND_LAST = 4, + OL_PLATFORM_BACKEND_FORCE_UINT32 = 0x7fffffff +} ol_platform_backend_t; + +typedef struct ol_device_impl_t *ol_device_handle_t; +typedef struct ol_platform_impl_t *ol_platform_handle_t; +typedef struct ol_program_impl_t *ol_program_handle_t; +typedef struct ol_queue_impl_t *ol_queue_handle_t; +typedef struct ol_symbol_impl_t *ol_symbol_handle_t; +typedef const struct ol_error_struct_t *ol_result_t; + +typedef bool (*ol_device_iterate_cb_t)(ol_device_handle_t Device, + void *UserData); + +ol_result_t (*olInit)(); +ol_result_t (*olShutDown)(); + +ol_result_t (*olIterateDevices)(ol_device_iterate_cb_t Callback, + void *UserData); + +ol_result_t (*olIsValidBinary)(ol_device_handle_t Device, const void *ProgData, + size_t ProgDataSize, bool *Valid); + +ol_result_t (*olCreateProgram)(ol_device_handle_t Device, const void *ProgData, + size_t ProgDataSize, + 
ol_program_handle_t *Program); + +ol_result_t (*olDestroyProgram)(ol_program_handle_t Program); + +ol_result_t (*olGetSymbol)(ol_program_handle_t Program, const char *Name, + ol_symbol_kind_t Kind, ol_symbol_handle_t *Symbol); + +ol_result_t (*olLaunchKernel)( + ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_symbol_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, + const ol_kernel_launch_size_args_t *LaunchSizeArgs); + +ol_result_t (*olCreateQueue)(ol_device_handle_t Device, + ol_queue_handle_t *Queue); + +ol_result_t (*olDestroyQueue)(ol_queue_handle_t Queue); + +ol_result_t (*olSyncQueue)(ol_queue_handle_t Queue); + +ol_result_t (*olMemAlloc)(ol_device_handle_t Device, ol_alloc_type_t Type, + size_t Size, void **AllocationOut); + +ol_result_t (*olMemFree)(void *Address); + +ol_result_t (*olMemcpy)(ol_queue_handle_t Queue, void *DstPtr, + ol_device_handle_t DstDevice, const void *SrcPtr, + ol_device_handle_t SrcDevice, size_t Size); + +ol_result_t (*olGetDeviceInfo)(ol_device_handle_t Device, + ol_device_info_t PropName, size_t PropSize, + void *PropValue); + +ol_result_t (*olGetPlatformInfo)(ol_platform_handle_t Platform, + ol_platform_info_t PropName, size_t PropSize, + void *PropValue); + +llvm::Error loadLLVMOffload() { + constexpr const char *OffloadLibrary = "libLLVMOffload.so"; + + std::string ErrMsg; + auto DynlibHandle = std::make_unique( + llvm::sys::DynamicLibrary::getPermanentLibrary(OffloadLibrary, &ErrMsg)); + + if (!DynlibHandle->isValid()) + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "Failed to dlopen %s: %s", OffloadLibrary, + ErrMsg.c_str()); + +#define DYNAMIC_INIT(SYM) \ + do { \ + void *Ptr = DynlibHandle->getAddressOfSymbol(#SYM); \ + if (!Ptr) \ + return llvm::createStringError( \ + llvm::inconvertibleErrorCode(), "Missing symbol '%s' in %s", \ + reinterpret_cast(#SYM), OffloadLibrary); \ + SYM = reinterpret_cast(Ptr); \ + } while (0) + + DYNAMIC_INIT(olInit); + DYNAMIC_INIT(olShutDown); + DYNAMIC_INIT(olIterateDevices); + DYNAMIC_INIT(olIsValidBinary); + DYNAMIC_INIT(olCreateProgram); + DYNAMIC_INIT(olDestroyProgram); + DYNAMIC_INIT(olGetSymbol); + DYNAMIC_INIT(olLaunchKernel); + DYNAMIC_INIT(olCreateQueue); + DYNAMIC_INIT(olDestroyQueue); + DYNAMIC_INIT(olSyncQueue); + DYNAMIC_INIT(olMemAlloc); + DYNAMIC_INIT(olMemFree); + DYNAMIC_INIT(olMemcpy); + DYNAMIC_INIT(olGetDeviceInfo); + DYNAMIC_INIT(olGetPlatformInfo); +#undef DYNAMIC_INIT + + return llvm::Error::success(); } -#define handle_error(X) handle_error_impl(__FILE__, __LINE__, X) #endif // LLVM_TOOLS_LLVM_GPU_LOADER_LLVM_GPU_LOADER_H diff --git a/llvm/tools/llvm-gpu-loader/nvptx.cpp b/llvm/tools/llvm-gpu-loader/nvptx.cpp deleted file mode 100644 index 82b455249ad24..0000000000000 --- a/llvm/tools/llvm-gpu-loader/nvptx.cpp +++ /dev/null @@ -1,367 +0,0 @@ -//===-- Loader Implementation for NVPTX devices --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file impelements a simple loader to run images supporting the NVPTX -// architecture. The file launches the '_start' kernel which should be provided -// by the device application start code and call ultimately call the 'main' -// function. 
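A trimmed sketch of the DYNAMIC_INIT pattern defined above: resolve one symbol from a shared library and cast it to a typed function pointer. Here libm.so.6 and cos stand in for libLLVMOffload.so and the ol* entry points:

```cpp
#include "llvm/Support/DynamicLibrary.h"
#include <cstdio>
#include <string>

int main() {
  std::string ErrMsg;
  auto Lib = llvm::sys::DynamicLibrary::getPermanentLibrary("libm.so.6",
                                                            &ErrMsg);
  if (!Lib.isValid()) {
    std::fprintf(stderr, "dlopen failed: %s\n", ErrMsg.c_str());
    return 1;
  }

  // getAddressOfSymbol returns void *; the caller supplies the real type.
  using CosFn = double (*)(double);
  auto *Cos = reinterpret_cast<CosFn>(Lib.getAddressOfSymbol("cos"));
  if (!Cos)
    return 1;
  std::printf("cos(0) = %f\n", Cos(0.0)); // 1.000000
  return 0;
}
```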
-// -//===----------------------------------------------------------------------===// - -#include "llvm-gpu-loader.h" -#include "server.h" - -#include "cuda.h" - -#include "llvm/Object/ELF.h" -#include "llvm/Object/ELFObjectFile.h" - -#include -#include -#include -#include -#include -#include -#include - -using namespace llvm; -using namespace object; - -static void handle_error_impl(const char *file, int32_t line, CUresult err) { - if (err == CUDA_SUCCESS) - return; - - const char *err_str = nullptr; - CUresult result = cuGetErrorString(err, &err_str); - if (result != CUDA_SUCCESS) - fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line); - else - fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str); - exit(1); -} - -// Gets the names of all the globals that contain functions to initialize or -// deinitialize. We need to do this manually because the NVPTX toolchain does -// not contain the necessary binary manipulation tools. -template -Expected get_ctor_dtor_array(const void *image, const size_t size, - Alloc allocator, CUmodule binary) { - auto mem_buffer = MemoryBuffer::getMemBuffer( - StringRef(reinterpret_cast(image), size), "image", - /*RequiresNullTerminator=*/false); - Expected elf_or_err = - ELF64LEObjectFile::create(*mem_buffer); - if (!elf_or_err) - handle_error(toString(elf_or_err.takeError()).c_str()); - - std::vector> ctors; - std::vector> dtors; - // CUDA has no way to iterate over all the symbols so we need to inspect the - // ELF directly using the LLVM libraries. - for (const auto &symbol : elf_or_err->symbols()) { - auto name_or_err = symbol.getName(); - if (!name_or_err) - handle_error(toString(name_or_err.takeError()).c_str()); - - // Search for all symbols that contain a constructor or destructor. - if (!name_or_err->starts_with("__init_array_object_") && - !name_or_err->starts_with("__fini_array_object_")) - continue; - - uint16_t priority; - if (name_or_err->rsplit('_').second.getAsInteger(10, priority)) - handle_error("Invalid priority for constructor or destructor"); - - if (name_or_err->starts_with("__init")) - ctors.emplace_back(std::make_pair(name_or_err->data(), priority)); - else - dtors.emplace_back(std::make_pair(name_or_err->data(), priority)); - } - // Lower priority constructors are run before higher ones. The reverse is true - // for destructors. - llvm::sort(ctors, llvm::less_second()); - llvm::sort(dtors, llvm::less_second()); - - // Allocate host pinned memory to make these arrays visible to the GPU. - CUdeviceptr *dev_memory = reinterpret_cast(allocator( - ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr))); - uint64_t global_size = 0; - - // Get the address of the global and then store the address of the constructor - // function to call in the constructor array. - CUdeviceptr *dev_ctors_start = dev_memory; - CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size(); - for (uint64_t i = 0; i < ctors.size(); ++i) { - CUdeviceptr dev_ptr; - if (CUresult err = - cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first)) - handle_error(err); - if (CUresult err = - cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t))) - handle_error(err); - } - - // Get the address of the global and then store the address of the destructor - // function to call in the destructor array. 
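The priority parsing in the deleted code above leans on StringRef::rsplit splitting at the last underscore, so the trailing digits survive even when the mangled name itself contains underscores. An isolated check (the symbol name is made up for illustration):

```cpp
#include "llvm/ADT/StringRef.h"
#include <cstdint>
#include <cstdio>

int main() {
  llvm::StringRef Name = "__init_array_object_my_ctor_101";
  uint16_t Priority;
  // rsplit('_') splits on the *last* underscore; getAsInteger returns true
  // on parse failure.
  if (Name.rsplit('_').second.getAsInteger(10, Priority))
    return 1;
  std::printf("priority = %u\n", Priority); // priority = 101
  return 0;
}
```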
- CUdeviceptr *dev_dtors_start = dev_ctors_end; - CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size(); - for (uint64_t i = 0; i < dtors.size(); ++i) { - CUdeviceptr dev_ptr; - if (CUresult err = - cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first)) - handle_error(err); - if (CUresult err = - cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t))) - handle_error(err); - } - - // Obtain the address of the pointers the startup implementation uses to - // iterate the constructors and destructors. - CUdeviceptr init_start; - if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary, - "__init_array_start")) - handle_error(err); - CUdeviceptr init_end; - if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary, - "__init_array_end")) - handle_error(err); - CUdeviceptr fini_start; - if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary, - "__fini_array_start")) - handle_error(err); - CUdeviceptr fini_end; - if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary, - "__fini_array_end")) - handle_error(err); - - // Copy the pointers to the newly written array to the symbols so the startup - // implementation can iterate them. - if (CUresult err = - cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t))) - handle_error(err); - if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t))) - handle_error(err); - if (CUresult err = - cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t))) - handle_error(err); - if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t))) - handle_error(err); - - return dev_memory; -} - -void print_kernel_resources(CUmodule binary, const char *kernel_name) { - CUfunction function; - if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) - handle_error(err); - int num_regs; - if (CUresult err = - cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function)) - handle_error(err); - printf("Executing kernel %s:\n", kernel_name); - printf("%6s registers: %d\n", kernel_name, num_regs); -} - -template -CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server, - const LaunchParameters ¶ms, const char *kernel_name, - args_t kernel_args, bool print_resource_usage) { - // look up the '_start' kernel in the loaded module. - CUfunction function; - if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) - handle_error(err); - - // Set up the arguments to the '_start' kernel on the GPU. - uint64_t args_size = std::is_empty_v ? 0 : sizeof(args_t); - void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args, - CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size, - CU_LAUNCH_PARAM_END}; - if (print_resource_usage) - print_kernel_resources(binary, kernel_name); - - // Initialize a non-blocking CUDA stream to allocate memory if needed. - // This needs to be done on a separate stream or else it will deadlock - // with the executing kernel. - CUstream memory_stream; - if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING)) - handle_error(err); - - std::atomic finished = false; - std::thread server_thread( - [](std::atomic *finished, rpc::Server *server, - CUstream memory_stream) { - auto malloc_handler = [&](size_t size) -> void * { - CUdeviceptr dev_ptr; - if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream)) - dev_ptr = 0UL; - - // Wait until the memory allocation is complete. 
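The deleted launch path above passes arguments through cuLaunchKernel's `extra` parameter rather than the usual kernelParams array; the packing is just a null-terminated key/value list. A compile-only sketch of that packing (requires the CUDA driver header):

```cpp
#include "cuda.h"
#include <cstdint>
#include <cstdio>

struct start_args_t { int argc; void *argv; void *envp; void *ret; };

int main() {
  start_args_t args = {1, nullptr, nullptr, nullptr};
  uint64_t args_size = sizeof(args);
  void *config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
                    CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                    CU_LAUNCH_PARAM_END};
  // `config` would be passed as the final `extra` argument of cuLaunchKernel
  // with kernelParams set to nullptr; the driver copies args_size bytes.
  std::printf("packed %llu bytes\n", (unsigned long long)args_size);
  (void)config;
  return 0;
}
```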
-int load_nvptx(int argc, const char **argv, const char **envp, void *image,
-               size_t size, const LaunchParameters &params,
-               bool print_resource_usage) {
-  if (CUresult err = cuInit(0))
-    handle_error(err);
-  // Obtain the first device found on the system.
-  uint32_t device_id = 0;
-  CUdevice device;
-  if (CUresult err = cuDeviceGet(&device, device_id))
-    handle_error(err);
-
-  // Initialize the CUDA context and claim it for this execution.
-  CUcontext context;
-  if (CUresult err = cuDevicePrimaryCtxRetain(&context, device))
-    handle_error(err);
-  if (CUresult err = cuCtxSetCurrent(context))
-    handle_error(err);
-
-  // Increase the stack size per thread.
-  // TODO: We should allow this to be passed in so only the tests that require
-  // a larger stack can specify it to save on memory usage.
-  if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024))
-    handle_error(err);
-
-  // Initialize a non-blocking CUDA stream to execute the kernel.
-  CUstream stream;
-  if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING))
-    handle_error(err);
-
-  // Load the image into a CUDA module.
-  CUmodule binary;
-  if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
-    handle_error(err);
-
-  // Allocate pinned memory on the host to hold the pointer array for the
-  // copied argv and allow the GPU device to access it.
-  auto allocator = [&](uint64_t size) -> void * {
-    void *dev_ptr;
-    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
-      handle_error(err);
-    return dev_ptr;
-  };
-
-  auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
-  if (!memory_or_err)
-    handle_error(toString(memory_or_err.takeError()).c_str());
-
-  void *dev_argv = copy_argument_vector(argc, argv, allocator);
-  if (!dev_argv)
-    handle_error("Failed to allocate device argv");
-
-  // Allocate pinned memory on the host to hold the pointer array for the
-  // copied environment array and allow the GPU device to access it.
-  void *dev_envp = copy_environment(envp, allocator);
-  if (!dev_envp)
-    handle_error("Failed to allocate device environment");
-
-  // Allocate space for the return pointer and initialize it to zero.
-  CUdeviceptr dev_ret;
-  if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int)))
-    handle_error(err);
-  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
-    handle_error(err);
-
-  uint32_t warp_size = 32;
-  void *rpc_buffer = nullptr;
-  if (CUresult err = cuMemAllocHost(
-          &rpc_buffer,
-          rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
-    handle_error(err);
-  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
-  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
-
-  // Initialize the RPC client on the device by copying the local data to the
-  // device's internal pointer.
-  CUdeviceptr rpc_client_dev = 0;
-  uint64_t client_ptr_size = sizeof(void *);
-  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
-                                       binary, "__llvm_rpc_client"))
-    handle_error(err);
-
-  if (CUresult err =
-          cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client)))
-    handle_error(err);
-
-  LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
-  begin_args_t init_args = {argc, dev_argv, dev_envp};
-  if (CUresult err =
-          launch_kernel(binary, stream, server, single_threaded_params,
-                        "_begin", init_args, print_resource_usage))
-    handle_error(err);
-
-  start_args_t args = {argc, dev_argv, dev_envp,
-                       reinterpret_cast<void *>(dev_ret)};
-  if (CUresult err = launch_kernel(binary, stream, server, params, "_start",
-                                   args, print_resource_usage))
-    handle_error(err);
-
-  // Copy the return value back from the kernel and wait.
-  int host_ret = 0;
-  if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int)))
-    handle_error(err);
-
-  if (CUresult err = cuStreamSynchronize(stream))
-    handle_error(err);
-
-  end_args_t fini_args = {};
-  if (CUresult err =
-          launch_kernel(binary, stream, server, single_threaded_params, "_end",
-                        fini_args, print_resource_usage))
-    handle_error(err);
-
-  // Free the memory allocated for the device.
-  if (CUresult err = cuMemFreeHost(*memory_or_err))
-    handle_error(err);
-  if (CUresult err = cuMemFree(dev_ret))
-    handle_error(err);
-  if (CUresult err = cuMemFreeHost(dev_argv))
-    handle_error(err);
-  if (CUresult err = cuMemFreeHost(rpc_buffer))
-    handle_error(err);
-
-  // Destroy the context and the loaded binary.
-  if (CUresult err = cuModuleUnload(binary))
-    handle_error(err);
-  if (CUresult err = cuDevicePrimaryCtxRelease(device))
-    handle_error(err);
-  return host_ret;
-}
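Nearly every driver-API call in the deleted file follows the same `if (CUresult err = ...) handle_error(err)` idiom, where handle_error expands to handle_error_impl with the file and line baked in. A self-contained sketch of that pattern, assuming only the CUDA driver API; the CU_CHECK name is made up for illustration and is not part of the tool.

#include <cstdio>
#include <cstdlib>
#include "cuda.h"

// Check a CUresult and abort with a diagnostic on failure, roughly what the
// deleted handle_error()/handle_error_impl() pair did.
#define CU_CHECK(expr)                                                         \
  do {                                                                         \
    CUresult err_ = (expr);                                                    \
    if (err_ != CUDA_SUCCESS) {                                                \
      const char *msg = nullptr;                                               \
      cuGetErrorString(err_, &msg);                                            \
      fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,                       \
              msg ? msg : "Unknown Error");                                    \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)

int main() {
  CU_CHECK(cuInit(0));
  CUdevice device;
  CU_CHECK(cuDeviceGet(&device, /*device_id=*/0));
  return 0;
}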
diff --git a/llvm/tools/llvm-gpu-loader/server.h b/llvm/tools/llvm-gpu-loader/server.h
deleted file mode 100644
index da73cc007f5d5..0000000000000
--- a/llvm/tools/llvm-gpu-loader/server.h
+++ /dev/null
@@ -1,55 +0,0 @@
-//===-- Common RPC server handler -----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVM_GPU_LOADER_SERVER_H
-#define LLVM_TOOLS_LLVM_GPU_LOADER_SERVER_H
-
-#include <cstddef>
-#include <cstdint>
-
-#include "shared/rpc.h"
-#include "shared/rpc_opcodes.h"
-#include "shared/rpc_server.h"
-
-template <uint32_t num_lanes, typename Alloc, typename Free>
-inline uint32_t handle_server(rpc::Server &server, uint32_t index,
-                              Alloc &&alloc, Free &&free) {
-  auto port = server.try_open(num_lanes, index);
-  if (!port)
-    return 0;
-  index = port->get_index() + 1;
-
-  int status = rpc::RPC_SUCCESS;
-  switch (port->get_opcode()) {
-  case LIBC_MALLOC: {
-    port->recv_and_send([&](rpc::Buffer *buffer, uint32_t) {
-      buffer->data[0] = reinterpret_cast<uintptr_t>(alloc(buffer->data[0]));
-    });
-    break;
-  }
-  case LIBC_FREE: {
-    port->recv([&](rpc::Buffer *buffer, uint32_t) {
-      free(reinterpret_cast<void *>(buffer->data[0]));
-    });
-    break;
-  }
-  // Handle all of the `libc` specific opcodes.
-  default:
-    status = LIBC_NAMESPACE::shared::handle_libc_opcodes(*port, num_lanes);
-    break;
-  }
-
-  if (status != rpc::RPC_SUCCESS)
-    handle_error("Error handling RPC server");
-
-  port->close();
-
-  return index;
-}
-
-#endif // LLVM_TOOLS_LLVM_GPU_LOADER_SERVER_H
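At its core, the deleted handle_server() is an opcode dispatcher over a shared request buffer: open a port, switch on the opcode, service malloc/free locally, and forward everything else to the common libc handlers. A self-contained sketch of that shape follows; the opcode values and the Request layout here are invented for illustration and do not match the real rpc::Buffer protocol.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

enum Opcode : uint16_t { OP_MALLOC = 1, OP_FREE = 2 };

struct Request {
  Opcode opcode;
  uint64_t data; // Size for OP_MALLOC, pointer for OP_FREE.
};

// Dispatch a single request, returning a value to write back to the buffer.
uint64_t handle_request(const Request &req) {
  switch (req.opcode) {
  case OP_MALLOC:
    // Service the allocation and hand the pointer back to the device.
    return reinterpret_cast<uint64_t>(std::malloc(req.data));
  case OP_FREE:
    std::free(reinterpret_cast<void *>(req.data));
    return 0;
  default:
    // The real handler forwards unknown opcodes to shared libc handlers
    // instead of failing outright.
    std::fprintf(stderr, "Unknown opcode %d\n", int(req.opcode));
    std::exit(1);
  }
}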