Skip to content

Commit

Permalink
[libc] Add more utility functions for the GPU
Browse files Browse the repository at this point in the history
This patch adds extra intrinsics for the GPU. Some of these are unused
for now but will be used later. We use these currently to update the
`RPC` handling. Currently, every thread can update the RPC client, which
isn't correct. This patch adds code necessary to allow a single thread
to perform the write while the others wait.

Feedback is welcome for the naming of these functions. I'm copying the
OpenMP nomenclature where we call an AMD `wavefront` or NVIDIA `warp` a
`lane`.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D148810
  • Loading branch information
jhuber6 committed Apr 24, 2023
1 parent 5084ba3 commit 50445df
Show file tree
Hide file tree
Showing 9 changed files with 305 additions and 9 deletions.
103 changes: 103 additions & 0 deletions libc/src/__support/GPU/amdgpu/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,114 @@
#include <stdint.h>

namespace __llvm_libc {
namespace gpu {

/// Number of threads that execute together in lock-step on one wavefront.
constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;

/// Number of workgroups along the grid's 'x' dimension. The grid size is
/// reported in workitems, so divide by the workgroup extent.
LIBC_INLINE uint32_t get_num_blocks_x() {
  const uint32_t grid_items = __builtin_amdgcn_grid_size_x();
  const uint32_t group_items = __builtin_amdgcn_workgroup_size_x();
  return grid_items / group_items;
}

/// Number of workgroups along the grid's 'y' dimension.
LIBC_INLINE uint32_t get_num_blocks_y() {
  const uint32_t grid_items = __builtin_amdgcn_grid_size_y();
  const uint32_t group_items = __builtin_amdgcn_workgroup_size_y();
  return grid_items / group_items;
}

/// Number of workgroups along the grid's 'z' dimension.
LIBC_INLINE uint32_t get_num_blocks_z() {
  const uint32_t grid_items = __builtin_amdgcn_grid_size_z();
  const uint32_t group_items = __builtin_amdgcn_workgroup_size_z();
  return grid_items / group_items;
}

/// Id of the current AMD workgroup along the 'x' dimension.
LIBC_INLINE uint32_t get_block_id_x() {
  return __builtin_amdgcn_workgroup_id_x();
}

/// Id of the current AMD workgroup along the 'y' dimension.
LIBC_INLINE uint32_t get_block_id_y() {
  return __builtin_amdgcn_workgroup_id_y();
}

/// Id of the current AMD workgroup along the 'z' dimension.
LIBC_INLINE uint32_t get_block_id_z() {
  return __builtin_amdgcn_workgroup_id_z();
}

/// Flattened id of the current AMD workgroup within the grid, computed
/// row-major as x + nx * y + nx * ny * z.
LIBC_INLINE uint64_t get_block_id() {
  const uint32_t nx = get_num_blocks_x();
  const uint32_t ny = get_num_blocks_y();
  return get_block_id_x() + nx * get_block_id_y() +
         nx * ny * get_block_id_z();
}

/// Number of workitems along the workgroup's 'x' dimension.
LIBC_INLINE uint32_t get_num_threads_x() {
  return __builtin_amdgcn_workgroup_size_x();
}

/// Number of workitems along the workgroup's 'y' dimension.
LIBC_INLINE uint32_t get_num_threads_y() {
  return __builtin_amdgcn_workgroup_size_y();
}

/// Number of workitems along the workgroup's 'z' dimension.
LIBC_INLINE uint32_t get_num_threads_z() {
  return __builtin_amdgcn_workgroup_size_z();
}

/// Id of the workitem along the current AMD workgroup's 'x' dimension.
LIBC_INLINE uint32_t get_thread_id_x() {
  return __builtin_amdgcn_workitem_id_x();
}

/// Id of the workitem along the current AMD workgroup's 'y' dimension.
LIBC_INLINE uint32_t get_thread_id_y() {
  return __builtin_amdgcn_workitem_id_y();
}

/// Id of the workitem along the current AMD workgroup's 'z' dimension.
LIBC_INLINE uint32_t get_thread_id_z() {
  return __builtin_amdgcn_workitem_id_z();
}

/// Flattened id of the workitem within its AMD workgroup, computed
/// row-major as x + nx * y + nx * ny * z.
LIBC_INLINE uint64_t get_thread_id() {
  const uint32_t nx = get_num_threads_x();
  const uint32_t ny = get_num_threads_y();
  return get_thread_id_x() + nx * get_thread_id_y() +
         nx * ny * get_thread_id_z();
}

/// Returns the size of an AMD wavefront, either 32 or 64 depending on
/// hardware.
LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

/// Returns the id of this thread within the wavefront it executes with.
/// mbcnt counts the set bits below this lane in the given mask; with an
/// all-ones mask that count is the lane index itself.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
  if (LANE_SIZE != 64)
    return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}

/// Returns the bit-mask of active threads in the current wavefront, read
/// from the EXEC register.
[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
  return __builtin_amdgcn_read_exec();
}

/// Copies the value from the first active thread in the wavefront to the rest.
[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
  // readfirstlane yields the value held by the lowest-numbered active lane.
  return __builtin_amdgcn_readfirstlane(x);
}

/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() {
  // Order matters: first every workitem in the workgroup reaches the barrier,
  // then the workgroup-scope acquire fence orders subsequent reads against
  // writes made before the barrier.
  __builtin_amdgcn_s_barrier();
  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
}

/// Wait for all threads in the wavefront to converge, this is a noop on
/// AMDGPU because wavefront lanes execute in lock-step.
[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {}

} // namespace gpu
} // namespace __llvm_libc

#endif
42 changes: 42 additions & 0 deletions libc/src/__support/GPU/generic/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,51 @@
#include <stdint.h>

namespace __llvm_libc {
namespace gpu {

/// The generic target models a single thread running in a single block, so
/// every dimension has extent one and every id is zero.
constexpr const uint64_t LANE_SIZE = 1;

LIBC_INLINE uint32_t get_num_blocks_x() { return 1; }

// The grid is 1x1x1; an unused dimension still has extent one. Returning
// zero here would make any nx * ny * nz total-count computation zero.
LIBC_INLINE uint32_t get_num_blocks_y() { return 1; }

LIBC_INLINE uint32_t get_num_blocks_z() { return 1; }

LIBC_INLINE uint32_t get_block_id_x() { return 0; }

LIBC_INLINE uint32_t get_block_id_y() { return 0; }

LIBC_INLINE uint32_t get_block_id_z() { return 0; }

LIBC_INLINE uint64_t get_block_id() { return 0; }

LIBC_INLINE uint32_t get_num_threads_x() { return 1; }

// Likewise the single block is 1x1x1 threads.
LIBC_INLINE uint32_t get_num_threads_y() { return 1; }

LIBC_INLINE uint32_t get_num_threads_z() { return 1; }

LIBC_INLINE uint32_t get_thread_id_x() { return 0; }

LIBC_INLINE uint32_t get_thread_id_y() { return 0; }

LIBC_INLINE uint32_t get_thread_id_z() { return 0; }

LIBC_INLINE uint64_t get_thread_id() { return 0; }

LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

LIBC_INLINE uint32_t get_lane_id() { return 0; }

// The sole thread is always active.
LIBC_INLINE uint64_t get_lane_mask() { return 1; }

// With a single thread, broadcast is the identity.
LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }

// Synchronization is trivially satisfied with one thread.
LIBC_INLINE void sync_threads() {}

LIBC_INLINE void sync_lane(uint64_t) {}

} // namespace gpu
} // namespace __llvm_libc

#endif
106 changes: 105 additions & 1 deletion libc/src/__support/GPU/nvptx/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

Expand All @@ -14,9 +14,113 @@
#include <stdint.h>

namespace __llvm_libc {
namespace gpu {

/// Number of threads that execute together in lock-step on one warp.
constexpr const uint64_t LANE_SIZE = 32;

/// Number of CUDA blocks along the grid's 'x' dimension.
LIBC_INLINE uint32_t get_num_blocks_x() { return __nvvm_read_ptx_sreg_nctaid_x(); }

/// Number of CUDA blocks along the grid's 'y' dimension.
LIBC_INLINE uint32_t get_num_blocks_y() { return __nvvm_read_ptx_sreg_nctaid_y(); }

/// Number of CUDA blocks along the grid's 'z' dimension.
LIBC_INLINE uint32_t get_num_blocks_z() { return __nvvm_read_ptx_sreg_nctaid_z(); }

/// Id of the current CUDA block along the 'x' dimension.
LIBC_INLINE uint32_t get_block_id_x() {
  return __nvvm_read_ptx_sreg_ctaid_x();
}

/// Id of the current CUDA block along the 'y' dimension.
LIBC_INLINE uint32_t get_block_id_y() {
  return __nvvm_read_ptx_sreg_ctaid_y();
}

/// Id of the current CUDA block along the 'z' dimension.
LIBC_INLINE uint32_t get_block_id_z() {
  return __nvvm_read_ptx_sreg_ctaid_z();
}

/// Flattened id of the current CUDA block within the grid, computed
/// row-major as x + nx * y + nx * ny * z.
LIBC_INLINE uint64_t get_block_id() {
  const uint32_t nx = get_num_blocks_x();
  const uint32_t ny = get_num_blocks_y();
  return get_block_id_x() + nx * get_block_id_y() +
         nx * ny * get_block_id_z();
}

/// Number of CUDA threads along the block's 'x' dimension.
LIBC_INLINE uint32_t get_num_threads_x() { return __nvvm_read_ptx_sreg_ntid_x(); }

/// Number of CUDA threads along the block's 'y' dimension.
LIBC_INLINE uint32_t get_num_threads_y() { return __nvvm_read_ptx_sreg_ntid_y(); }

/// Number of CUDA threads along the block's 'z' dimension.
LIBC_INLINE uint32_t get_num_threads_z() { return __nvvm_read_ptx_sreg_ntid_z(); }

/// Id of the thread along the current CUDA block's 'x' dimension.
LIBC_INLINE uint32_t get_thread_id_x() {
  return __nvvm_read_ptx_sreg_tid_x();
}

/// Id of the thread along the current CUDA block's 'y' dimension.
LIBC_INLINE uint32_t get_thread_id_y() {
  return __nvvm_read_ptx_sreg_tid_y();
}

/// Id of the thread along the current CUDA block's 'z' dimension.
LIBC_INLINE uint32_t get_thread_id_z() {
  return __nvvm_read_ptx_sreg_tid_z();
}

/// Flattened id of the thread within its CUDA block, computed row-major as
/// x + nx * y + nx * ny * z.
LIBC_INLINE uint64_t get_thread_id() {
  const uint32_t nx = get_num_threads_x();
  const uint32_t ny = get_num_threads_y();
  return get_thread_id_x() + nx * get_thread_id_y() +
         nx * ny * get_thread_id_z();
}

/// Returns the size of a CUDA warp.
LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

/// Returns the id of this thread within the warp it executes with.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
  // LANE_SIZE is a power of two, so the modulo is a mask of the low bits.
  return get_thread_id() % LANE_SIZE;
}

/// Returns the bit-mask of active threads in the current warp. The 32-bit
/// 'activemask' result is widened to 64 bits to match the common interface.
[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
  uint32_t mask;
  asm volatile("activemask.b32 %0;" : "=r"(mask));
  return mask;
}

/// Copies the value from the first active thread in the warp to the rest.
[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
  // NOTE: This is not sufficient in all cases on Volta hardware or later. The
  // lane mask returned here is not always the true lane mask used by the
  // intrinsics in cases of incidental or enforced divergence by the user.
  uint64_t lane_mask = get_lane_mask();
  // Index of the lowest set bit, i.e. the first active lane.
  uint64_t id = __builtin_ffsl(lane_mask) - 1;
#if __CUDA_ARCH__ >= 600
  // sm_60+ provides the synchronizing shuffle that takes the member mask.
  return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);
#else
  return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
#endif
}

/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }

/// Waits for all threads in the warp to reconverge for independent scheduling.
[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
#if __CUDA_ARCH__ >= 700
  // sm_70+ schedules warp lanes independently, so an explicit warp barrier
  // over the given member mask is required.
  __nvvm_bar_warp_sync(mask);
#else
  // Older architectures execute the warp in lock-step; nothing to do.
  (void)mask;
#endif
}

} // namespace gpu
} // namespace __llvm_libc

#endif
1 change: 1 addition & 0 deletions libc/src/__support/RPC/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@ add_object_library(
HDRS
rpc_client.h
DEPENDS
libc.src.__support.GPU.utils
.rpc
)
1 change: 1 addition & 0 deletions libc/startup/gpu/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ add_startup_object(
start.cpp
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
COMPILE_OPTIONS
-ffreestanding # To avoid compiler warnings about calling the main function.
-fno-builtin
Expand Down
28 changes: 25 additions & 3 deletions libc/startup/gpu/amdgpu/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,38 @@
//
//===----------------------------------------------------------------------===//

#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"

extern "C" int main(int argc, char **argv, char **envp);

namespace __llvm_libc {

// Lock handed to the RPC client; zero-initialized means unlocked.
static cpp::Atomic<uint32_t> lock = 0;

// Set to one once the RPC client state has been written.
static cpp::Atomic<uint32_t> init = 0;

/// Initializes the RPC client exactly once and makes every other thread wait
/// until that write has been performed and fenced.
void init_rpc(void *in, void *out, void *buffer) {
  // Only a single thread should update the RPC data.
  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
    rpc::client.reset(&lock, in, out, buffer);
    init.store(1, cpp::MemoryOrder::RELAXED);
  }

  // Wait until the initializing thread signals that the data has been written.
  while (!init.load(cpp::MemoryOrder::RELAXED))
    rpc::sleep_briefly();

  // Wait for the threads in the block to converge and fence the write.
  gpu::sync_threads();
}

} // namespace __llvm_libc

extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
       void *buffer) {
  __llvm_libc::init_rpc(in, out, buffer);

  // OR-combine so a nonzero exit status from any thread is preserved.
  __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
}
1 change: 1 addition & 0 deletions libc/startup/gpu/nvptx/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ add_startup_object(
start.cpp
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
COMPILE_OPTIONS
-ffreestanding # To avoid compiler warnings about calling the main function.
-fno-builtin
Expand Down
30 changes: 26 additions & 4 deletions libc/startup/gpu/nvptx/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,38 @@
//
//===----------------------------------------------------------------------===//

#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"

static __llvm_libc::cpp::Atomic<uint32_t> lock;

extern "C" int main(int argc, char **argv, char **envp);

extern "C" [[gnu::visibility("protected")]] __attribute__((nvptx_kernel)) void
namespace __llvm_libc {

static cpp::Atomic<uint32_t> lock = 0;

static cpp::Atomic<uint32_t> init = 0;

void init_rpc(void *in, void *out, void *buffer) {
// Only a single thread should update the RPC data.
if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
rpc::client.reset(&lock, in, out, buffer);
init.store(1, cpp::MemoryOrder::RELAXED);
}

// Wait until the previous thread signals that the data has been written.
while (!init.load(cpp::MemoryOrder::RELAXED))
rpc::sleep_briefly();

// Wait for the threads in the block to converge and fence the write.
gpu::sync_threads();
}

} // namespace __llvm_libc

extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
void *buffer) {
__llvm_libc::rpc::client.reset(&lock, in, out, buffer);
__llvm_libc::init_rpc(in, out, buffer);

__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
}
Loading

0 comments on commit 50445df

Please sign in to comment.