From f11b32f74ede722d206486a8522330731f9ff0e4 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Tue, 7 Oct 2025 15:24:55 -0500
Subject: [PATCH] [libc] Make GPU `_end` kernel only call exit callbacks

Summary:
We use this infrastructure to stand up a pretend hosted environment on the
GPU. Part of that is handling exit codes and the callbacks registered with
`atexit`. Exiting from inside a GPU region is problematic as it relies on a
lot of GPU magic behind the scenes. This is at least *correct* now, as we use
`quick_exit` on the CPU when the GPU calls `exit`. However, calling
`quick_exit` will interfere with instrumentation or benchmarking that expects
a clean teardown order. For normal execution we should take the friendly
option and let the loader utility clean everything up manually.
---
 libc/startup/gpu/amdgpu/start.cpp            | 10 +++++-----
 libc/startup/gpu/nvptx/start.cpp             | 10 +++++-----
 llvm/tools/llvm-gpu-loader/amdhsa.cpp        |  4 ++--
 llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h |  4 +---
 llvm/tools/llvm-gpu-loader/nvptx.cpp         |  4 ++--
 5 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index 8bd0c3a938d02..bf6c96ea7c617 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -14,6 +14,7 @@
 #include "src/stdlib/exit.h"
 
 extern "C" int main(int argc, char **argv, char **envp);
+extern "C" void __cxa_finalize(void *dso);
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -68,9 +69,8 @@ _start(int argc, char **argv, char **envp, int *ret) {
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel,
              clang::amdgpu_flat_work_group_size(1, 1),
              clang::amdgpu_max_num_work_groups(1)]] void
-_end(int retval) {
-  // Only a single thread should call `exit` here, the rest should gracefully
-  // return from the kernel. This is so only one thread calls the destructors
-  // registred with 'atexit' above.
-  LIBC_NAMESPACE::exit(retval);
+_end() {
+  // Only a single thread should call the destructors registered with 'atexit'.
+  // The loader utility will handle the actual exit and return code cleanly.
+  __cxa_finalize(nullptr);
 }
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index bc529b36f5097..3dd85fdf36731 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -14,6 +14,7 @@
 #include "src/stdlib/exit.h"
 
 extern "C" int main(int argc, char **argv, char **envp);
+extern "C" void __cxa_finalize(void *dso);
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -70,9 +71,8 @@ _start(int argc, char **argv, char **envp, int *ret) {
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }
 
-extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_end(int retval) {
-  // To finis the execution we invoke all the callbacks registered via 'atexit'
-  // and then exit with the appropriate return value.
-  LIBC_NAMESPACE::exit(retval);
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void _end() {
+  // Only a single thread should call the destructors registered with 'atexit'.
+  // The loader utility will handle the actual exit and return code cleanly.
+  __cxa_finalize(nullptr);
 }
diff --git a/llvm/tools/llvm-gpu-loader/amdhsa.cpp b/llvm/tools/llvm-gpu-loader/amdhsa.cpp
index be1b6b7993920..5715058d8cfac 100644
--- a/llvm/tools/llvm-gpu-loader/amdhsa.cpp
+++ b/llvm/tools/llvm-gpu-loader/amdhsa.cpp
@@ -192,7 +192,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   // Initialize all the arguments (explicit and implicit) to zero, then set the
   // explicit arguments to the values created above.
   std::memset(args, 0, args_size);
-  std::memcpy(args, &kernel_args, sizeof(args_t));
+  std::memcpy(args, &kernel_args, std::is_empty_v<args_t> ? 0 : sizeof(args_t));
 
   // Initialize the necessary implicit arguments to the proper values.
   int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
@@ -563,7 +563,7 @@ int load_amdhsa(int argc, const char **argv, const char **envp, void *image,
   // Save the return value and perform basic clean-up.
   int ret = *static_cast<int *>(host_ret);
 
-  end_args_t fini_args = {ret};
+  end_args_t fini_args = {};
   if (hsa_status_t err = launch_kernel(
           dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
           server, single_threaded_params, "_end.kd", fini_args,
diff --git a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
index ed34d0bace978..08861c29b4fa4 100644
--- a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
+++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
@@ -41,9 +41,7 @@ struct start_args_t {
 };
 
 /// The arguments to the '_end' kernel.
-struct end_args_t {
-  int argc;
-};
+struct end_args_t {};
 
 /// Generic interface to load the \p image and launch execution of the _start
 /// kernel on the target device. Copies \p argc and \p argv to the device.
diff --git a/llvm/tools/llvm-gpu-loader/nvptx.cpp b/llvm/tools/llvm-gpu-loader/nvptx.cpp
index 781a045e71a94..82b455249ad24 100644
--- a/llvm/tools/llvm-gpu-loader/nvptx.cpp
+++ b/llvm/tools/llvm-gpu-loader/nvptx.cpp
@@ -177,7 +177,7 @@ CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
     handle_error(err);
 
   // Set up the arguments to the '_start' kernel on the GPU.
-  uint64_t args_size = sizeof(args_t);
+  uint64_t args_size = std::is_empty_v<args_t> ? 0 : sizeof(args_t);
   void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
                          CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                          CU_LAUNCH_PARAM_END};
@@ -342,7 +342,7 @@ int load_nvptx(int argc, const char **argv, const char **envp, void *image,
   if (CUresult err = cuStreamSynchronize(stream))
     handle_error(err);
 
-  end_args_t fini_args = {host_ret};
+  end_args_t fini_args = {};
   if (CUresult err = launch_kernel(binary, stream, server,
                                    single_threaded_params, "_end", fini_args,
                                    print_resource_usage))
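
For reference, a minimal host-side sketch of the behavior the new `_end`
kernel depends on. The program below is illustrative and not part of the
patch; it assumes an Itanium C++ ABI runtime, where `__cxa_finalize` with a
null DSO handle runs the callbacks registered through `atexit` without
terminating the process, leaving the surrounding runtime free to tear
everything down in order afterwards:

  #include <cstdio>
  #include <cstdlib>

  // Itanium C++ ABI entry point, declared here the same way the patched
  // GPU startup files declare it.
  extern "C" void __cxa_finalize(void *dso);

  int main() {
    std::atexit([] { std::puts("atexit callback"); });

    // Runs the registered callback but keeps the process alive. This is
    // what `_end` now does on the GPU; the loader utility then performs
    // the real exit with the return code saved from `_start`.
    __cxa_finalize(nullptr);

    std::puts("still running after __cxa_finalize");
    return 0;
  }

The loaders already launch `_end` with `single_threaded_params`, so only one
thread ever runs the registered destructors, matching the comment in the
startup code.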