[libc] Add support for global ctors / dtors for AMDGPU

This patch makes the necessary changes to support calling global constructors and destructors on the GPU. The patch in D149340 allows the `lld` linker to create the symbols that point to these globals. The constructors and destructors should be executed by a single thread, which is more difficult on the GPU because all threads are active. I chose to use an atomic counter to synchronize every thread on the GPU. This is very slow if you use more than a few thousand threads, but for testing purposes it should be sufficient.

Depends on D149340 D149363

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D149398
llvm · Apr 29, 2023 · 1b823ab · 1b823ab
1 parent a1da746
commit 1b823ab
Show file tree

Hide file tree

Showing 4 changed files with 135 additions and 8 deletions.
diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -5,6 +5,8 @@ add_startup_object(
   DEPENDS
     libc.src.__support.RPC.rpc_client
     libc.src.__support.GPU.utils
+    libc.src.stdlib.exit
+    libc.src.stdlib.atexit
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin

diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
@@ -8,36 +8,91 @@
 
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
+#include "src/stdlib/atexit.h"
+#include "src/stdlib/exit.h"
 
 extern "C" int main(int argc, char **argv, char **envp);
 
 namespace __llvm_libc {
 
 static cpp::Atomic<uint32_t> lock = 0;
 
-static cpp::Atomic<uint32_t> init = 0;
+static cpp::Atomic<uint32_t> count = 0;
 
-void init_rpc(void *in, void *out, void *buffer) {
-  // Only a single thread should update the RPC data.
+// Bounds of the init and fini arrays. These symbols are not defined in this
+// file; per the commit description they are created by the linker. Each entry
+// is stored as an integer and is cast to a callback pointer before being
+// called.
+extern "C" uintptr_t __init_array_start[];
+extern "C" uintptr_t __init_array_end[];
+extern "C" uintptr_t __fini_array_start[];
+extern "C" uintptr_t __fini_array_end[];
+
+// Signature used to invoke init array entries: (argc, argv, env).
+using InitCallback = void(int, char **, char **);
+// Signature used to invoke fini array entries: no arguments.
+using FiniCallback = void(void);
+
+// Returns the total number of threads in the launch: the thread count
+// reported by gpu::get_num_threads() multiplied by the block count reported
+// by gpu::get_num_blocks().
+static uint64_t get_grid_size() {
+  const auto threads_per_block = gpu::get_num_threads();
+  const auto num_blocks = gpu::get_num_blocks();
+  return threads_per_block * num_blocks;
+}
+
+// Walks the init array from __init_array_start up to (but not including)
+// __init_array_end and calls each entry as an InitCallback, forwarding the
+// program arguments and environment unchanged.
+static void call_init_array_callbacks(int argc, char **argv, char **env) {
+  for (uintptr_t *entry = __init_array_start; entry != __init_array_end;
+       ++entry) {
+    auto *callback = reinterpret_cast<InitCallback *>(*entry);
+    callback(argc, argv, env);
+  }
+}
+
+// Walks the fini array from __fini_array_start up to (but not including)
+// __fini_array_end and calls each entry as a FiniCallback, in array order.
+static void call_fini_array_callbacks() {
+  for (uintptr_t *entry = __fini_array_start; entry != __fini_array_end;
+       ++entry) {
+    auto *callback = reinterpret_cast<FiniCallback *>(*entry);
+    callback();
+  }
+}
+
+// Performs the one-time program setup on a single GPU thread and holds every
+// launched thread at a grid-wide barrier until that setup has finished.
+//
+//   argc, argv, env  forwarded unchanged to every init array callback.
+//   in, out, buffer  passed to rpc::client.reset() together with the global
+//                    `lock` to set up the RPC client.
+void initialize(int argc, char **argv, char **env, void *in, void *out,
+                void *buffer) {
+  // We need a single GPU thread to perform the initialization of the global
+  // constructors and data. Every other thread checks in immediately; the
+  // initializing thread only checks in once its work is done, so the counter
+  // cannot reach the grid size while initialization is still in flight.
+  if (gpu::get_thread_id() != 0 || gpu::get_block_id() != 0)
+    count.fetch_add(1, cpp::MemoryOrder::RELAXED);
   if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // We need to set up the RPC client first in case any of the constructors
+    // require it.
     rpc::client.reset(&lock, in, out, buffer);
-    init.store(1, cpp::MemoryOrder::RELAXED);
+
+    // We want the fini array callbacks to be run after other atexit
+    // callbacks are run. So, we register them before running the init
+    // array callbacks as they can potentially register their own atexit
+    // callbacks.
+    atexit(&call_fini_array_callbacks);
+    call_init_array_callbacks(argc, argv, env);
+
+    // Check in last. The release ordering pairs with the acquire load in the
+    // wait loop below so that everything written above is visible to the
+    // threads that leave the barrier.
+    count.fetch_add(1, cpp::MemoryOrder::RELEASE);
   }
 
-  // Wait until the previous thread signals that the data has been written.
-  while (!init.load(cpp::MemoryOrder::RELAXED))
+  // We wait until every single thread launched on the GPU has checked in,
+  // which now implies that initialization has completed. This will get very,
+  // very slow for high thread counts, but for testing purposes it is unlikely
+  // to matter.
+  while (count.load(cpp::MemoryOrder::ACQUIRE) != get_grid_size())
     rpc::sleep_briefly();
+  gpu::sync_threads();
+}
 
-  // Wait for the threads in the block to converge and fence the write.
+// Holds every launched thread at a grid-wide barrier and then has a single
+// thread call `exit` with the given status.
+//
+//   retval  exit status passed to __llvm_libc::exit() by the exiting thread.
+void finalize(int retval) {
+  // We wait until every single thread launched on the GPU has finished
+  // executing and reached the finalize region. The release decrement paired
+  // with the acquire load orders each thread's earlier writes before the
+  // callbacks that `exit` runs below.
+  count.fetch_sub(1, cpp::MemoryOrder::RELEASE);
+  while (count.load(cpp::MemoryOrder::ACQUIRE) != 0)
+    rpc::sleep_briefly();
   gpu::sync_threads();
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    // Only a single thread should call `exit` here, the rest should gracefully
+    // return from the kernel. This is so only one thread calls the destructors
+    // registered with 'atexit' above.
+    __llvm_libc::exit(retval);
+  }
 }
 
 } // namespace __llvm_libc
 
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::init_rpc(in, out, buffer);
+  __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
 
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+
+  __llvm_libc::finalize(*ret);
 }
diff --git a/libc/test/integration/startup/gpu/CMakeLists.txt b/libc/test/integration/startup/gpu/CMakeLists.txt
@@ -25,3 +25,13 @@ add_integration_test(
     --blocks 16
     --threads 1
 )
+
+# Global constructors and destructors are currently only supported on AMDGPU,
+# so the init / fini array test is only registered for that target.
+if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
+  add_integration_test(
+    init_fini_array_test
+    SUITE libc-startup-tests
+    SRCS
+      init_fini_array_test.cpp
+  )
+endif()
diff --git a/libc/test/integration/startup/gpu/init_fini_array_test.cpp b/libc/test/integration/startup/gpu/init_fini_array_test.cpp
@@ -0,0 +1,60 @@
+//===-- Loader test to test init and fini array iteration -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/IntegrationTest/test.h"
+
+#include <stddef.h>
+
+int global_destroyed = false;
+
+// Object with a non-trivial constructor and destructor, used to observe
+// global construction and destruction. The constructor zeroes all 1024 slots
+// and then stores `a` at index `i`; the destructor sets `global_destroyed`.
+class A {
+  int val[1024];
+
+public:
+  A(int i, int a) {
+    for (int &slot : val)
+      slot = 0;
+    val[i] = a;
+  }
+
+  ~A() { global_destroyed = true; }
+
+  // Returns the value stored at index `i`.
+  int get(int i) const { return val[i]; }
+};
+
+int GLOBAL_INDEX = 512;
+int INITVAL_INITIALIZER = 0x600D;
+int BEFORE_INITIALIZER = 0xFEED;
+
+A global(GLOBAL_INDEX, INITVAL_INITIALIZER);
+
+int initval = 0;
+int before = 0;
+
+// Constructor function with explicit priority 101. Stores BEFORE_INITIALIZER
+// into `before`; run_after() asserts on that value, so this is expected to
+// execute ahead of it.
+__attribute__((constructor(101))) void run_before() {
+  before = BEFORE_INITIALIZER;
+}
+
+// Constructor function with explicit priority 65535. Asserts that `before`
+// already holds BEFORE_INITIALIZER, i.e. that run_before() has executed.
+__attribute__((constructor(65535))) void run_after() {
+  ASSERT_EQ(before, BEFORE_INITIALIZER);
+}
+
+// Constructor function without an explicit priority. Stores
+// INITVAL_INITIALIZER into `initval`, which the test entry point checks.
+__attribute__((constructor)) void set_initval() {
+  initval = INITVAL_INITIALIZER;
+}
+// Destructor function with explicit priority 1. Asserts that ~A() has already
+// set `global_destroyed`, then clears `initval`.
+// NOTE(review): relies on this callback running after the destructor of
+// `global` - confirm against the toolchain's destructor priority ordering.
+__attribute__((destructor(1))) void reset_initval() {
+  ASSERT_TRUE(global_destroyed);
+  initval = 0;
+}
+
+// Test entry point. Checks that the constructor of `global` stored
+// INITVAL_INITIALIZER at GLOBAL_INDEX and that set_initval() has already set
+// `initval`, i.e. that both ran before this point.
+TEST_MAIN() {
+  ASSERT_EQ(global.get(GLOBAL_INDEX), INITVAL_INITIALIZER);
+  ASSERT_EQ(initval, INITVAL_INITIALIZER);
+  return 0;
+}