Skip to content

Commit

Permalink
[libc] Add more utility functions for the GPU
Browse files Browse the repository at this point in the history
This patch adds extra intrinsics for the GPU. Some of these are unused
for now but will be used later. We use these currently to update the
`RPC` handling. Currently, every thread can update the RPC client, which
isn't correct. This patch adds code necessary to allow a single thread
to perform the write while the others wait.

Feedback is welcome for the naming of these functions. I'm copying the
OpenMP nomenclature where we call an AMD `wavefront` or NVIDIA `warp` a
`lane`.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D148810
  • Loading branch information
jhuber6 committed Apr 24, 2023
1 parent 5084ba3 commit 50445df
Show file tree
Hide file tree
Showing 9 changed files with 305 additions and 9 deletions.
103 changes: 103 additions & 0 deletions libc/src/__support/GPU/amdgpu/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,114 @@
#include <stdint.h>

namespace __llvm_libc {
namespace gpu {

/// Number of threads that execute together in lock-step on one wavefront.
constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;

/// Number of workgroups along the grid's 'x' dimension. The grid size is
/// reported in workitems, so divide by the workgroup extent.
LIBC_INLINE uint32_t get_num_blocks_x() {
  const uint32_t grid_items = __builtin_amdgcn_grid_size_x();
  const uint32_t group_items = __builtin_amdgcn_workgroup_size_x();
  return grid_items / group_items;
}

/// Number of workgroups along the grid's 'y' dimension.
LIBC_INLINE uint32_t get_num_blocks_y() {
  const uint32_t grid_items = __builtin_amdgcn_grid_size_y();
  const uint32_t group_items = __builtin_amdgcn_workgroup_size_y();
  return grid_items / group_items;
}

/// Number of workgroups along the grid's 'z' dimension.
LIBC_INLINE uint32_t get_num_blocks_z() {
  const uint32_t grid_items = __builtin_amdgcn_grid_size_z();
  const uint32_t group_items = __builtin_amdgcn_workgroup_size_z();
  return grid_items / group_items;
}

/// Id of the current AMD workgroup along the 'x' dimension.
LIBC_INLINE uint32_t get_block_id_x() {
  return __builtin_amdgcn_workgroup_id_x();
}

/// Id of the current AMD workgroup along the 'y' dimension.
LIBC_INLINE uint32_t get_block_id_y() {
  return __builtin_amdgcn_workgroup_id_y();
}

/// Id of the current AMD workgroup along the 'z' dimension.
LIBC_INLINE uint32_t get_block_id_z() {
  return __builtin_amdgcn_workgroup_id_z();
}

/// Flattened id of the current AMD workgroup within the grid, computed
/// row-major as x + nx * y + nx * ny * z.
LIBC_INLINE uint64_t get_block_id() {
  const uint32_t nx = get_num_blocks_x();
  const uint32_t ny = get_num_blocks_y();
  return get_block_id_x() + nx * get_block_id_y() +
         nx * ny * get_block_id_z();
}

/// Number of workitems along the workgroup's 'x' dimension.
LIBC_INLINE uint32_t get_num_threads_x() {
  return __builtin_amdgcn_workgroup_size_x();
}

/// Number of workitems along the workgroup's 'y' dimension.
LIBC_INLINE uint32_t get_num_threads_y() {
  return __builtin_amdgcn_workgroup_size_y();
}

/// Number of workitems along the workgroup's 'z' dimension.
LIBC_INLINE uint32_t get_num_threads_z() {
  return __builtin_amdgcn_workgroup_size_z();
}

/// Id of the workitem along the current AMD workgroup's 'x' dimension.
LIBC_INLINE uint32_t get_thread_id_x() {
  return __builtin_amdgcn_workitem_id_x();
}

/// Id of the workitem along the current AMD workgroup's 'y' dimension.
LIBC_INLINE uint32_t get_thread_id_y() {
  return __builtin_amdgcn_workitem_id_y();
}

/// Id of the workitem along the current AMD workgroup's 'z' dimension.
LIBC_INLINE uint32_t get_thread_id_z() {
  return __builtin_amdgcn_workitem_id_z();
}

/// Flattened id of the workitem within its AMD workgroup, computed
/// row-major as x + nx * y + nx * ny * z.
LIBC_INLINE uint64_t get_thread_id() {
  const uint32_t nx = get_num_threads_x();
  const uint32_t ny = get_num_threads_y();
  return get_thread_id_x() + nx * get_thread_id_y() +
         nx * ny * get_thread_id_z();
}

/// Returns the size of an AMD wavefront, either 32 or 64 depending on
/// hardware.
LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

/// Returns the id of this thread within the wavefront it executes with.
/// mbcnt counts the set bits below this lane in the given mask; with an
/// all-ones mask that count is the lane index itself.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
  if (LANE_SIZE != 64)
    return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}

/// Returns the bit-mask of active threads in the current wavefront, read
/// from the EXEC register.
[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
  return __builtin_amdgcn_read_exec();
}

/// Copies the value from the first active thread in the wavefront to the rest.
[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
  // readfirstlane yields the value held by the lowest-numbered active lane.
  return __builtin_amdgcn_readfirstlane(x);
}

/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() {
  // Order matters: first every workitem in the workgroup reaches the barrier,
  // then the workgroup-scope acquire fence orders subsequent reads against
  // writes made before the barrier.
  __builtin_amdgcn_s_barrier();
  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
}

/// Wait for all threads in the wavefront to converge, this is a noop on
/// AMDGPU because wavefront lanes execute in lock-step.
[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {}

} // namespace gpu
} // namespace __llvm_libc

#endif
42 changes: 42 additions & 0 deletions libc/src/__support/GPU/generic/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,51 @@
#include <stdint.h>

namespace __llvm_libc {
namespace gpu {

/// The generic target models a single thread running in a single block, so
/// every dimension has extent one and every id is zero.
constexpr const uint64_t LANE_SIZE = 1;

LIBC_INLINE uint32_t get_num_blocks_x() { return 1; }

// The grid is 1x1x1; an unused dimension still has extent one. Returning
// zero here would make any nx * ny * nz total-count computation zero.
LIBC_INLINE uint32_t get_num_blocks_y() { return 1; }

LIBC_INLINE uint32_t get_num_blocks_z() { return 1; }

LIBC_INLINE uint32_t get_block_id_x() { return 0; }

LIBC_INLINE uint32_t get_block_id_y() { return 0; }

LIBC_INLINE uint32_t get_block_id_z() { return 0; }

LIBC_INLINE uint64_t get_block_id() { return 0; }

LIBC_INLINE uint32_t get_num_threads_x() { return 1; }

// Likewise the single block is 1x1x1 threads.
LIBC_INLINE uint32_t get_num_threads_y() { return 1; }

LIBC_INLINE uint32_t get_num_threads_z() { return 1; }

LIBC_INLINE uint32_t get_thread_id_x() { return 0; }

LIBC_INLINE uint32_t get_thread_id_y() { return 0; }

LIBC_INLINE uint32_t get_thread_id_z() { return 0; }

LIBC_INLINE uint64_t get_thread_id() { return 0; }

LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

LIBC_INLINE uint32_t get_lane_id() { return 0; }

// The sole thread is always active.
LIBC_INLINE uint64_t get_lane_mask() { return 1; }

// With a single thread, broadcast is the identity.
LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }

// Synchronization is trivially satisfied with one thread.
LIBC_INLINE void sync_threads() {}

LIBC_INLINE void sync_lane(uint64_t) {}

} // namespace gpu
} // namespace __llvm_libc

#endif
106 changes: 105 additions & 1 deletion libc/src/__support/GPU/nvptx/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

Expand All @@ -14,9 +14,113 @@
#include <stdint.h>

namespace __llvm_libc {
namespace gpu {

/// Number of threads that execute together in lock-step on one warp.
constexpr const uint64_t LANE_SIZE = 32;

/// Number of CUDA blocks along the grid's 'x' dimension.
LIBC_INLINE uint32_t get_num_blocks_x() { return __nvvm_read_ptx_sreg_nctaid_x(); }

/// Number of CUDA blocks along the grid's 'y' dimension.
LIBC_INLINE uint32_t get_num_blocks_y() { return __nvvm_read_ptx_sreg_nctaid_y(); }

/// Number of CUDA blocks along the grid's 'z' dimension.
LIBC_INLINE uint32_t get_num_blocks_z() { return __nvvm_read_ptx_sreg_nctaid_z(); }

/// Id of the current CUDA block along the 'x' dimension.
LIBC_INLINE uint32_t get_block_id_x() {
  return __nvvm_read_ptx_sreg_ctaid_x();
}

/// Id of the current CUDA block along the 'y' dimension.
LIBC_INLINE uint32_t get_block_id_y() {
  return __nvvm_read_ptx_sreg_ctaid_y();
}

/// Id of the current CUDA block along the 'z' dimension.
LIBC_INLINE uint32_t get_block_id_z() {
  return __nvvm_read_ptx_sreg_ctaid_z();
}

/// Flattened id of the current CUDA block within the grid, computed
/// row-major as x + nx * y + nx * ny * z.
LIBC_INLINE uint64_t get_block_id() {
  const uint32_t nx = get_num_blocks_x();
  const uint32_t ny = get_num_blocks_y();
  return get_block_id_x() + nx * get_block_id_y() +
         nx * ny * get_block_id_z();
}

/// Number of CUDA threads along the block's 'x' dimension.
LIBC_INLINE uint32_t get_num_threads_x() { return __nvvm_read_ptx_sreg_ntid_x(); }

/// Number of CUDA threads along the block's 'y' dimension.
LIBC_INLINE uint32_t get_num_threads_y() { return __nvvm_read_ptx_sreg_ntid_y(); }

/// Number of CUDA threads along the block's 'z' dimension.
LIBC_INLINE uint32_t get_num_threads_z() { return __nvvm_read_ptx_sreg_ntid_z(); }

/// Id of the thread along the current CUDA block's 'x' dimension.
LIBC_INLINE uint32_t get_thread_id_x() {
  return __nvvm_read_ptx_sreg_tid_x();
}

/// Id of the thread along the current CUDA block's 'y' dimension.
LIBC_INLINE uint32_t get_thread_id_y() {
  return __nvvm_read_ptx_sreg_tid_y();
}

/// Id of the thread along the current CUDA block's 'z' dimension.
LIBC_INLINE uint32_t get_thread_id_z() {
  return __nvvm_read_ptx_sreg_tid_z();
}

/// Flattened id of the thread within its CUDA block, computed row-major as
/// x + nx * y + nx * ny * z.
LIBC_INLINE uint64_t get_thread_id() {
  const uint32_t nx = get_num_threads_x();
  const uint32_t ny = get_num_threads_y();
  return get_thread_id_x() + nx * get_thread_id_y() +
         nx * ny * get_thread_id_z();
}

/// Returns the size of a CUDA warp.
LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

/// Returns the id of this thread within the warp it executes with.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
  // LANE_SIZE is a power of two, so the modulo is a mask of the low bits.
  return get_thread_id() % LANE_SIZE;
}

/// Returns the bit-mask of active threads in the current warp. The 32-bit
/// 'activemask' result is widened to 64 bits to match the common interface.
[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
  uint32_t mask;
  asm volatile("activemask.b32 %0;" : "=r"(mask));
  return mask;
}

/// Copies the value from the first active thread in the warp to the rest.
[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
  // NOTE: This is not sufficient in all cases on Volta hardware or later. The
  // lane mask returned here is not always the true lane mask used by the
  // intrinsics in cases of incidental or enforced divergence by the user.
  uint64_t lane_mask = get_lane_mask();
  // Index of the lowest set bit, i.e. the first active lane.
  uint64_t id = __builtin_ffsl(lane_mask) - 1;
#if __CUDA_ARCH__ >= 600
  // sm_60+ provides the synchronizing shuffle that takes the member mask.
  return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);
#else
  return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
#endif
}

/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }

/// Waits for all threads in the warp to reconverge for independent scheduling.
[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
#if __CUDA_ARCH__ >= 700
  // sm_70+ schedules warp lanes independently, so an explicit warp barrier
  // over the given member mask is required.
  __nvvm_bar_warp_sync(mask);
#else
  // Older architectures execute the warp in lock-step; nothing to do.
  (void)mask;
#endif
}

} // namespace gpu
} // namespace __llvm_libc

#endif
1 change: 1 addition & 0 deletions libc/src/__support/RPC/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@ add_object_library(
HDRS
rpc_client.h
DEPENDS
libc.src.__support.GPU.utils
.rpc
)
1 change: 1 addition & 0 deletions libc/startup/gpu/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ add_startup_object(
start.cpp
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
COMPILE_OPTIONS
-ffreestanding # To avoid compiler warnings about calling the main function.
-fno-builtin
Expand Down
28 changes: 25 additions & 3 deletions libc/startup/gpu/amdgpu/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,38 @@
//
//===----------------------------------------------------------------------===//

#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"

extern "C" int main(int argc, char **argv, char **envp);

namespace __llvm_libc {

// Lock handed to the RPC client; zero-initialized means unlocked.
static cpp::Atomic<uint32_t> lock = 0;

// Set to one once the RPC client state has been written.
static cpp::Atomic<uint32_t> init = 0;

/// Initializes the RPC client exactly once and makes every other thread wait
/// until that write has been performed and fenced.
void init_rpc(void *in, void *out, void *buffer) {
  // Only a single thread should update the RPC data.
  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
    rpc::client.reset(&lock, in, out, buffer);
    init.store(1, cpp::MemoryOrder::RELAXED);
  }

  // Wait until the initializing thread signals that the data has been written.
  while (!init.load(cpp::MemoryOrder::RELAXED))
    rpc::sleep_briefly();

  // Wait for the threads in the block to converge and fence the write.
  gpu::sync_threads();
}

} // namespace __llvm_libc

extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
       void *buffer) {
  __llvm_libc::init_rpc(in, out, buffer);

  // OR-combine so a nonzero exit status from any thread is preserved.
  __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
}
1 change: 1 addition & 0 deletions libc/startup/gpu/nvptx/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ add_startup_object(
start.cpp
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
COMPILE_OPTIONS
-ffreestanding # To avoid compiler warnings about calling the main function.
-fno-builtin
Expand Down
30 changes: 26 additions & 4 deletions libc/startup/gpu/nvptx/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,38 @@
//
//===----------------------------------------------------------------------===//

#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"

static __llvm_libc::cpp::Atomic<uint32_t> lock;

extern "C" int main(int argc, char **argv, char **envp);

extern "C" [[gnu::visibility("protected")]] __attribute__((nvptx_kernel)) void
namespace __llvm_libc {

static cpp::Atomic<uint32_t> lock = 0;

static cpp::Atomic<uint32_t> init = 0;

void init_rpc(void *in, void *out, void *buffer) {
// Only a single thread should update the RPC data.
if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
rpc::client.reset(&lock, in, out, buffer);
init.store(1, cpp::MemoryOrder::RELAXED);
}

// Wait until the previous thread signals that the data has been written.
while (!init.load(cpp::MemoryOrder::RELAXED))
rpc::sleep_briefly();

// Wait for the threads in the block to converge and fence the write.
gpu::sync_threads();
}

} // namespace __llvm_libc

extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
void *buffer) {
__llvm_libc::rpc::client.reset(&lock, in, out, buffer);
__llvm_libc::init_rpc(in, out, buffer);

__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
}
Loading

0 comments on commit 50445df

Please sign in to comment.