Skip to content

Commit

Permalink
[libc] Add support for global ctors / dtors for AMDGPU
Browse files Browse the repository at this point in the history
This patch makes the necessary changes to support calling global
constructors and destructors on the GPU. The patch in D149340 allows the
`lld` linker to create the symbols pointing us to these globals. The
constructors and destructors should be executed by a single thread, which is
more difficult on the GPU because all threads are active. I chose to use an
atomic counter to
sync every thread on the GPU. This is very slow if you use more than a
few thousand threads, but for testing purposes it should be sufficient.

Depends on D149340 D149363

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D149398
  • Loading branch information
jhuber6 committed Apr 29, 2023
1 parent a1da746 commit 1b823ab
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 8 deletions.
2 changes: 2 additions & 0 deletions libc/startup/gpu/amdgpu/CMakeLists.txt
Expand Up @@ -5,6 +5,8 @@ add_startup_object(
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
libc.src.stdlib.exit
libc.src.stdlib.atexit
COMPILE_OPTIONS
-ffreestanding # To avoid compiler warnings about calling the main function.
-fno-builtin
Expand Down
71 changes: 63 additions & 8 deletions libc/startup/gpu/amdgpu/start.cpp
Expand Up @@ -8,36 +8,91 @@

#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/stdlib/atexit.h"
#include "src/stdlib/exit.h"

extern "C" int main(int argc, char **argv, char **envp);

namespace __llvm_libc {

// Lock word whose address is handed to rpc::client.reset() in initialize().
static cpp::Atomic<uint32_t> lock = 0;

// Flag that the initializing thread stores 1 into right after resetting the
// RPC client; initialize() also polls it.
// NOTE(review): this looks like a leftover from the pre-change version of the
// file (the listing interleaves removed and added diff lines) — confirm.
static cpp::Atomic<uint32_t> init = 0;
// Arrival counter: every thread adds 1 on entering initialize() and subtracts
// 1 on entering finalize(); both functions spin on its value.
static cpp::Atomic<uint32_t> count = 0;

void init_rpc(void *in, void *out, void *buffer) {
// Only a single thread should update the RPC data.
// Bounds of the init and fini arrays. Each entry is read as a uintptr_t and
// cast to a callback pointer by the helpers below. Per the commit message the
// symbols are created by the `lld` linker.
extern "C" uintptr_t __init_array_start[];
extern "C" uintptr_t __init_array_end[];
extern "C" uintptr_t __fini_array_start[];
extern "C" uintptr_t __fini_array_end[];

// Signature used to invoke init-array entries: they receive argc, argv, env.
using InitCallback = void(int, char **, char **);
// Signature used to invoke fini-array entries: no arguments.
using FiniCallback = void(void);

// Returns gpu::get_num_threads() multiplied by gpu::get_num_blocks(). The
// callers compare the arrival counter against this value, treating it as the
// number of threads launched on the GPU.
static uint64_t get_grid_size() {
  const auto threads_per_block = gpu::get_num_threads();
  const auto block_count = gpu::get_num_blocks();
  return threads_per_block * block_count;
}

static void call_init_array_callbacks(int argc, char **argv, char **env) {
size_t init_array_size = __init_array_end - __init_array_start;
for (size_t i = 0; i < init_array_size; ++i)
reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
}

static void call_fini_array_callbacks() {
size_t fini_array_size = __fini_array_end - __fini_array_start;
for (size_t i = 0; i < fini_array_size; ++i)
reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
}

// Startup step executed by every thread that enters the kernel: counts the
// calling thread in, lets the thread whose thread id and block id are both 0
// reset the RPC client and run the init-array callbacks, and then waits before
// returning.
//
// argc, argv, env: forwarded unchanged to the init-array callbacks.
// in, out, buffer: passed, together with `lock`, to rpc::client.reset().
//
// NOTE(review): this listing appears to interleave removed and added lines of
// the diff (the `init` store and the `init` wait loop look like the pre-change
// code) — confirm against the repository before relying on the exact flow.
void initialize(int argc, char **argv, char **env, void *in, void *out,
void *buffer) {
// We need a single GPU thread to perform the initialization of the global
// constructors and data. We simply mask off all but a single thread and
// execute.
// Every thread, including the initializing one, announces its arrival first.
count.fetch_add(1, cpp::MemoryOrder::RELAXED);
if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
// We need to set up the RPC client first in case any of the constructors
// require it.
rpc::client.reset(&lock, in, out, buffer);
init.store(1, cpp::MemoryOrder::RELAXED);

// We want the fini array callbacks to be run after other atexit
// callbacks are run. So, we register them before running the init
// array callbacks as they can potentially register their own atexit
// callbacks.
atexit(&call_fini_array_callbacks);
call_init_array_callbacks(argc, argv, env);
}

// Wait until the previous thread signals that the data has been written.
// NOTE(review): as written, the `count` loop below is the body of this `init`
// loop, so a thread leaves only once it observes `init` as non-zero.
while (!init.load(cpp::MemoryOrder::RELAXED))
// We wait until every single thread launched on the GPU has seen the
// initialization code. This will get very, very slow for high thread counts,
// but for testing purposes it is unlikely to matter.
while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
rpc::sleep_briefly();
// Presumably a barrier across the threads of the block — TODO confirm in
// src/__support/GPU/utils.h.
gpu::sync_threads();
}

// Wait for the threads in the block to converge and fence the write.
// Shutdown step executed by every thread after `main` returns. Each thread
// checks out of the arrival counter and waits for it to drain; afterwards the
// thread whose thread id and block id are both 0 calls `exit(retval)`.
void finalize(int retval) {
  // Check this thread out, then spin until every thread launched on the GPU
  // has finished executing and reached this point (counter back at zero).
  count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
  for (;;) {
    if (count.load(cpp::MemoryOrder::RELAXED) == 0)
      break;
    rpc::sleep_briefly();
  }
  gpu::sync_threads();

  // Every thread except one returns gracefully from the kernel here, so that
  // only a single thread runs the callbacks registered with `atexit`.
  if (gpu::get_thread_id() != 0 || gpu::get_block_id() != 0)
    return;

  __llvm_libc::exit(retval);
}

} // namespace __llvm_libc

// Kernel entry point (marked clang::amdgpu_kernel). Every thread runs the
// startup code, calls `main`, folds its return value into `*ret`, and then
// runs the shutdown code.
//
// argc, argv, envp: forwarded to initialize() and to `main`.
// ret: location that accumulates the bitwise OR of all `main` return values.
// in, out, buffer: forwarded to the RPC setup.
extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
void *buffer) {
// NOTE(review): `init_rpc` has no complete definition in this listing and
// looks like the pre-change call that `initialize` replaces — confirm.
__llvm_libc::init_rpc(in, out, buffer);
__llvm_libc::initialize(argc, argv, envp, in, out, buffer);

// OR-ing means any non-zero return value from any thread leaves `*ret`
// non-zero.
__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);

// `*ret` is read after this thread's own update; finalize() passes it to
// `exit` on the single exiting thread.
__llvm_libc::finalize(*ret);
}
10 changes: 10 additions & 0 deletions libc/test/integration/startup/gpu/CMakeLists.txt
Expand Up @@ -25,3 +25,13 @@ add_integration_test(
--blocks 16
--threads 1
)

# Constructors are currently only supported on AMDGPU, so the init/fini array
# test is only registered when targeting that architecture.
if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
# Adds init_fini_array_test.cpp to the libc-startup-tests suite.
add_integration_test(
init_fini_array_test
SUITE libc-startup-tests
SRCS
init_fini_array_test.cpp
)
endif()
60 changes: 60 additions & 0 deletions libc/test/integration/startup/gpu/init_fini_array_test.cpp
@@ -0,0 +1,60 @@
//===-- Loader test to test init and fini array iteration -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "test/IntegrationTest/test.h"

#include <stddef.h>

// Set by A's destructor; the fini-array callback `reset_initval` asserts on it
// to check that the destructor has already run.
int global_destroyed = false;

// Test object with a 1024-int payload. The constructor zeroes the payload and
// then stores a single value; the destructor records that it ran.
class A {
  int val[1024];

public:
  // Clears every slot, then writes `a` into slot `i` (no bounds check).
  A(int i, int a) {
    for (int &slot : val)
      slot = 0;
    val[i] = a;
  }

  ~A() { global_destroyed = true; }

  // Returns the value stored in slot `i` (no bounds check).
  int get(int i) const { return val[i]; }
};

// Slot of `global` that receives the marker value.
int GLOBAL_INDEX = 512;
// Marker value stored in `global` and, by `set_initval`, in `initval`.
int INITVAL_INITIALIZER = 0x600D;
// Marker value stored in `before` by `run_before`.
int BEFORE_INITIALIZER = 0xFEED;

// Global object under test. Its constructor arguments are non-const variables,
// presumably so that it requires a dynamic initializer — TODO confirm.
A global(GLOBAL_INDEX, INITVAL_INITIALIZER);

// Written by the constructor functions below; both start at zero so the test
// can tell whether those functions ran.
int initval = 0;
int before = 0;

// Constructor function with priority 101: stores BEFORE_INITIALIZER in
// `before` so that `run_after` can check it.
[[gnu::constructor(101)]] void run_before() { before = BEFORE_INITIALIZER; }

// Constructor function with priority 65535: asserts that `before` already
// holds BEFORE_INITIALIZER, i.e. that `run_before` (priority 101) has run.
__attribute__((constructor(65535))) void run_after() {
ASSERT_EQ(before, BEFORE_INITIALIZER);
}

// Constructor function without an explicit priority: stores
// INITVAL_INITIALIZER in `initval`, which TEST_MAIN then checks.
[[gnu::constructor]] void set_initval() { initval = INITVAL_INITIALIZER; }
// Destructor function with priority 1: asserts that A's destructor has already
// set `global_destroyed`, then clears `initval`.
// NOTE(review): presumably priority 1 makes this run after the destructor of
// `global` — confirm against the attribute documentation.
__attribute__((destructor(1))) void reset_initval() {
ASSERT_TRUE(global_destroyed);
initval = 0;
}

// Test body: by the time it runs, the constructor of `global` and
// `set_initval` must both have executed, so both values are expected to equal
// INITVAL_INITIALIZER.
TEST_MAIN() {
// Checks the slot written by A's constructor.
ASSERT_EQ(global.get(GLOBAL_INDEX), INITVAL_INITIALIZER);
// Checks the value written by the `set_initval` constructor function.
ASSERT_EQ(initval, INITVAL_INITIALIZER);
return 0;
}

0 comments on commit 1b823ab

Please sign in to comment.