From f11b32f74ede722d206486a8522330731f9ff0e4 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Tue, 7 Oct 2025 15:24:55 -0500
Subject: [PATCH] [libc] Make GPU `_end` kernel only call exit callbacks

Summary:
We use this infrastructure to stand up a pretend hosted environment on the
GPU. Part of that is handling exit codes and the callbacks registered with
`atexit`. Exiting from inside a GPU region is problematic as it relies on a
lot of GPU magic behind the scenes. This is at least *correct* now, as we use
`quick_exit` on the CPU when the GPU calls `exit`. However, calling
`quick_exit` will interfere with instrumentation or benchmarking that expects
a clean teardown order. For normal execution we should take the friendly
option and let the loader utility clean everything up manually.
---
 libc/startup/gpu/amdgpu/start.cpp            | 10 +++++-----
 libc/startup/gpu/nvptx/start.cpp             | 10 +++++-----
 llvm/tools/llvm-gpu-loader/amdhsa.cpp        |  4 ++--
 llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h |  4 +---
 llvm/tools/llvm-gpu-loader/nvptx.cpp         |  4 ++--
 5 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index 8bd0c3a938d02..bf6c96ea7c617 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -14,6 +14,7 @@
 #include "src/stdlib/exit.h"
 
 extern "C" int main(int argc, char **argv, char **envp);
+extern "C" void __cxa_finalize(void *dso);
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -68,9 +69,8 @@ _start(int argc, char **argv, char **envp, int *ret) {
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel,
              clang::amdgpu_flat_work_group_size(1, 1),
              clang::amdgpu_max_num_work_groups(1)]] void
-_end(int retval) {
-  // Only a single thread should call `exit` here, the rest should gracefully
-  // return from the kernel. This is so only one thread calls the destructors
-  // registred with 'atexit' above.
-  LIBC_NAMESPACE::exit(retval);
+_end() {
+  // Only a single thread should call the destructors registered with 'atexit'.
+  // The loader utility will handle the actual exit and return code cleanly.
+  __cxa_finalize(nullptr);
 }
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index bc529b36f5097..3dd85fdf36731 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -14,6 +14,7 @@
 #include "src/stdlib/exit.h"
 
 extern "C" int main(int argc, char **argv, char **envp);
+extern "C" void __cxa_finalize(void *dso);
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -70,9 +71,8 @@ _start(int argc, char **argv, char **envp, int *ret) {
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }
 
-extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_end(int retval) {
-  // To finis the execution we invoke all the callbacks registered via 'atexit'
-  // and then exit with the appropriate return value.
-  LIBC_NAMESPACE::exit(retval);
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void _end() {
+  // Only a single thread should call the destructors registered with 'atexit'.
+  // The loader utility will handle the actual exit and return code cleanly.
+  __cxa_finalize(nullptr);
 }
diff --git a/llvm/tools/llvm-gpu-loader/amdhsa.cpp b/llvm/tools/llvm-gpu-loader/amdhsa.cpp
index be1b6b7993920..5715058d8cfac 100644
--- a/llvm/tools/llvm-gpu-loader/amdhsa.cpp
+++ b/llvm/tools/llvm-gpu-loader/amdhsa.cpp
@@ -192,7 +192,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   // Initialize all the arguments (explicit and implicit) to zero, then set the
   // explicit arguments to the values created above.
   std::memset(args, 0, args_size);
-  std::memcpy(args, &kernel_args, sizeof(args_t));
+  std::memcpy(args, &kernel_args, std::is_empty_v<args_t> ? 0 : sizeof(args_t));
 
   // Initialize the necessary implicit arguments to the proper values.
   int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
@@ -563,7 +563,7 @@ int load_amdhsa(int argc, const char **argv, const char **envp, void *image,
   // Save the return value and perform basic clean-up.
   int ret = *static_cast<int *>(host_ret);
 
-  end_args_t fini_args = {ret};
+  end_args_t fini_args = {};
   if (hsa_status_t err = launch_kernel(
           dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
           server, single_threaded_params, "_end.kd", fini_args,
diff --git a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
index ed34d0bace978..08861c29b4fa4 100644
--- a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
+++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
@@ -41,9 +41,7 @@ struct start_args_t {
 };
 
 /// The arguments to the '_end' kernel.
-struct end_args_t {
-  int argc;
-};
+struct end_args_t {};
 
 /// Generic interface to load the \p image and launch execution of the _start
 /// kernel on the target device. Copies \p argc and \p argv to the device.
diff --git a/llvm/tools/llvm-gpu-loader/nvptx.cpp b/llvm/tools/llvm-gpu-loader/nvptx.cpp
index 781a045e71a94..82b455249ad24 100644
--- a/llvm/tools/llvm-gpu-loader/nvptx.cpp
+++ b/llvm/tools/llvm-gpu-loader/nvptx.cpp
@@ -177,7 +177,7 @@ CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
     handle_error(err);
 
   // Set up the arguments to the '_start' kernel on the GPU.
-  uint64_t args_size = sizeof(args_t);
+  uint64_t args_size = std::is_empty_v<args_t> ? 0 : sizeof(args_t);
   void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
                          CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                          CU_LAUNCH_PARAM_END};
@@ -342,7 +342,7 @@ int load_nvptx(int argc, const char **argv, const char **envp, void *image,
   if (CUresult err = cuStreamSynchronize(stream))
     handle_error(err);
 
-  end_args_t fini_args = {host_ret};
+  end_args_t fini_args = {};
   if (CUresult err = launch_kernel(binary, stream, server,
                                    single_threaded_params, "_end", fini_args,
                                    print_resource_usage))
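
For reference, a minimal host-side sketch of the behavior the new `_end`
kernel depends on. The program below is illustrative and not part of the
patch; it assumes an Itanium C++ ABI runtime, where `__cxa_finalize` with a
null DSO handle runs the callbacks registered through `atexit` without
terminating the process, leaving the surrounding runtime free to tear
everything down in order afterwards:

  #include <cstdio>
  #include <cstdlib>

  // Itanium C++ ABI entry point, declared here the same way the patched
  // GPU startup files declare it.
  extern "C" void __cxa_finalize(void *dso);

  int main() {
    std::atexit([] { std::puts("atexit callback"); });

    // Runs the registered callback but keeps the process alive. This is
    // what `_end` now does on the GPU; the loader utility then performs
    // the real exit with the return code saved from `_start`.
    __cxa_finalize(nullptr);

    std::puts("still running after __cxa_finalize");
    return 0;
  }

The loaders already launch `_end` with `single_threaded_params`, so only one
thread ever runs the registered destructors, matching the comment in the
startup code.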