106 changes: 105 additions & 1 deletion libc/src/__support/GPU/nvptx/utils.h
@@ -2,7 +2,7 @@
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

@@ -14,9 +14,113 @@
#include <stdint.h>

namespace __llvm_libc {
namespace gpu {

/// The number of threads that execute in lock-step in a warp.
constexpr const uint64_t LANE_SIZE = 32;

/// Returns the number of CUDA blocks in the 'x' dimension.
LIBC_INLINE uint32_t get_num_blocks_x() {
  return __nvvm_read_ptx_sreg_nctaid_x();
}

/// Returns the number of CUDA blocks in the 'y' dimension.
LIBC_INLINE uint32_t get_num_blocks_y() {
  return __nvvm_read_ptx_sreg_nctaid_y();
}

/// Returns the number of CUDA blocks in the 'z' dimension.
LIBC_INLINE uint32_t get_num_blocks_z() {
  return __nvvm_read_ptx_sreg_nctaid_z();
}

/// Returns the 'x' dimension of the current CUDA block's id.
LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); }

/// Returns the 'y' dimension of the current CUDA block's id.
LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); }

/// Returns the 'z' dimension of the current CUDA block's id.
LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); }

/// Returns the absolute id of the CUDA block.
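/// The id is linearized in row-major (x-fastest) order; for example, in a
/// 4 x 2 x 2 grid of blocks, block (1, 1, 0) gets the absolute id
/// 1 + 4 * 1 + (4 * 2) * 0 = 5.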
LIBC_INLINE uint64_t get_block_id() {
  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
}

/// Returns the number of CUDA threads in the 'x' dimension.
LIBC_INLINE uint32_t get_num_threads_x() {
  return __nvvm_read_ptx_sreg_ntid_x();
}

/// Returns the number of CUDA threads in the 'y' dimension.
LIBC_INLINE uint32_t get_num_threads_y() {
  return __nvvm_read_ptx_sreg_ntid_y();
}

/// Returns the number of CUDA threads in the 'z' dimension.
LIBC_INLINE uint32_t get_num_threads_z() {
  return __nvvm_read_ptx_sreg_ntid_z();
}

/// Returns the 'x' dimension id of the thread in the current CUDA block.
LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); }

/// Returns the 'y' dimension id of the thread in the current CUDA block.
LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); }

/// Returns the 'z' dimension id of the thread in the current CUDA block.
LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); }

/// Returns the absolute id of the thread in the current CUDA block.
LIBC_INLINE uint64_t get_thread_id() {
  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
}
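
// Illustrative sketch only (not part of this patch): at a call site, a
// grid-wide linear thread id can be built from the helpers above by scaling
// the block id by the number of threads per block, e.g.
//
//   uint64_t global_id =
//       gpu::get_block_id() * (gpu::get_num_threads_x() *
//                              gpu::get_num_threads_y() *
//                              gpu::get_num_threads_z()) +
//       gpu::get_thread_id();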

/// Returns the size of a CUDA warp.
LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

/// Returns the id of the thread inside of a CUDA warp executing together.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
  return get_thread_id() & (get_lane_size() - 1);
}

/// Returns the bit-mask of active threads in the current warp.
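/// The mask is widened to 64 bits so the generic GPU interface can also
/// represent 64-lane AMD wavefronts, even though an NVPTX warp only has 32
/// lanes.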
[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
  uint32_t mask;
  asm volatile("activemask.b32 %0;" : "=r"(mask));
  return mask;
}

/// Copies the value from the first active thread in the warp to the rest.
[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
  // NOTE: This is not sufficient in all cases on Volta hardware or later. The
  // lane mask returned here is not always the true lane mask used by the
  // intrinsics in cases of incidental or enforced divergence by the user.
  uint64_t lane_mask = get_lane_mask();
  uint64_t id = __builtin_ffsl(lane_mask) - 1;
#if __CUDA_ARCH__ >= 600
  return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);
#else
  return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
#endif
}
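
// Usage sketch (illustrative only, assumes the warp is converged at the call
// site):
//
//   uint32_t per_lane = compute_something();          // hypothetical helper
//   uint32_t agreed = gpu::broadcast_value(per_lane);
//
// Afterwards every active lane holds the value computed by the first active
// lane, which is useful for electing a single leader to do shared work.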

/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }

/// Waits for all threads in the warp to reconverge for independent scheduling.
[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
#if __CUDA_ARCH__ >= 700
  __nvvm_bar_warp_sync(mask);
#else
  (void)mask;
#endif
}
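
// Usage sketch (illustrative only): capture the lane mask before a divergent
// region and pass it here so the warp reconverges on sm_70+ before any
// warp-synchronous operation.
//
//   uint64_t mask = gpu::get_lane_mask();
//   if (gpu::get_lane_id() % 2)
//     do_odd_lane_work();                             // hypothetical helper
//   gpu::sync_lane(mask);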

} // namespace gpu
} // namespace __llvm_libc

#endif
1 change: 1 addition & 0 deletions libc/src/__support/RPC/CMakeLists.txt
@@ -20,5 +20,6 @@ add_object_library(
  HDRS
    rpc_client.h
  DEPENDS
    libc.src.__support.GPU.utils
    .rpc
)
1 change: 1 addition & 0 deletions libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -4,6 +4,7 @@ add_startup_object(
    start.cpp
  DEPENDS
    libc.src.__support.RPC.rpc_client
    libc.src.__support.GPU.utils
  COMPILE_OPTIONS
    -ffreestanding # To avoid compiler warnings about calling the main function.
    -fno-builtin
28 changes: 25 additions & 3 deletions libc/startup/gpu/amdgpu/start.cpp
@@ -6,16 +6,38 @@
//
//===----------------------------------------------------------------------===//

#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"

static __llvm_libc::cpp::Atomic<uint32_t> lock;

extern "C" int main(int argc, char **argv, char **envp);

namespace __llvm_libc {

static cpp::Atomic<uint32_t> lock = 0;

static cpp::Atomic<uint32_t> init = 0;

void init_rpc(void *in, void *out, void *buffer) {
  // Only a single thread should update the RPC data.
  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
    rpc::client.reset(&lock, in, out, buffer);
    init.store(1, cpp::MemoryOrder::RELAXED);
  }

  // Wait until the initializing thread signals that the data has been written.
  while (!init.load(cpp::MemoryOrder::RELAXED))
    rpc::sleep_briefly();

  // Wait for the threads in the block to converge and fence the write.
  gpu::sync_threads();
}

} // namespace __llvm_libc

extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
       void *buffer) {
  __llvm_libc::rpc::client.reset(&lock, in, out, buffer);
  __llvm_libc::init_rpc(in, out, buffer);

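  // OR each thread's return value from main() into the shared result so that
  // a non-zero exit from any thread is reflected in the final exit code.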
  __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
}
1 change: 1 addition & 0 deletions libc/startup/gpu/nvptx/CMakeLists.txt
@@ -5,6 +5,7 @@ add_startup_object(
    start.cpp
  DEPENDS
    libc.src.__support.RPC.rpc_client
    libc.src.__support.GPU.utils
  COMPILE_OPTIONS
    -ffreestanding # To avoid compiler warnings about calling the main function.
    -fno-builtin
30 changes: 26 additions & 4 deletions libc/startup/gpu/nvptx/start.cpp
@@ -6,16 +6,38 @@
//
//===----------------------------------------------------------------------===//

#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"

static __llvm_libc::cpp::Atomic<uint32_t> lock;

extern "C" int main(int argc, char **argv, char **envp);

extern "C" [[gnu::visibility("protected")]] __attribute__((nvptx_kernel)) void
namespace __llvm_libc {

static cpp::Atomic<uint32_t> lock = 0;

static cpp::Atomic<uint32_t> init = 0;

void init_rpc(void *in, void *out, void *buffer) {
  // Only a single thread should update the RPC data.
  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
    rpc::client.reset(&lock, in, out, buffer);
    init.store(1, cpp::MemoryOrder::RELAXED);
  }

  // Wait until the initializing thread signals that the data has been written.
  while (!init.load(cpp::MemoryOrder::RELAXED))
    rpc::sleep_briefly();

  // Wait for the threads in the block to converge and fence the write.
  gpu::sync_threads();
}

} // namespace __llvm_libc

extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
       void *buffer) {
  __llvm_libc::rpc::client.reset(&lock, in, out, buffer);
  __llvm_libc::init_rpc(in, out, buffer);

  __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
}
2 changes: 1 addition & 1 deletion libc/test/integration/startup/gpu/rpc_test.cpp
@@ -13,7 +13,7 @@
using namespace __llvm_libc;

static void test_add_simple() {
  uint32_t num_additions = 1000 + 10 * get_block_id_x();
  uint32_t num_additions = 1000 + 10 * gpu::get_block_id_x();
  uint64_t cnt = 0;
  for (uint32_t i = 0; i < num_additions; ++i) {
    rpc::Port port = rpc::client.open(rpc::TEST_INCREMENT);