[libc][rpc] Update locking to work on volta
Carefully work around not knowing the thread mask that nvptx intrinsic
functions require.

If the warp is converged when calling try_lock, a single rpc call will handle
all lanes within it. Otherwise, more than one rpc call will occur, with thread
masks that together compose the unknown one.
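A minimal sketch of the intended call pattern (illustrative, not code from this
patch; `proc` and `index` stand in for the Process instance and its lock slot,
and the names around the spin loop are assumptions):

  uint64_t lane_mask = gpu::get_lane_mask(); // converged lanes, as a bitmap
  while (!proc.try_lock(lane_mask, index))   // one shared attempt per group
    sleep_briefly();
  // ... perform the rpc transaction on behalf of every lane in lane_mask ...
  proc.unlock(lane_mask, index);             // exactly one lane clears the bit

Each convergence group composes its own lane_mask, so a diverged warp simply
makes several independent lock acquisitions.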

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D149897
JonChesterfield committed May 4, 2023
1 parent 0a53220 commit b132373
Showing 6 changed files with 71 additions and 5 deletions.
4 changes: 4 additions & 0 deletions libc/src/__support/CPP/atomic.h
@@ -90,6 +90,10 @@ template <typename T> struct Atomic {
return __atomic_fetch_or(&val, mask, int(mem_ord));
}

T fetch_and(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {
return __atomic_fetch_and(&val, mask, int(mem_ord));
}

T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {
return __atomic_fetch_sub(&val, decrement, int(mem_ord));
}
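As a usage sketch (not part of the patch), the new fetch_and clears a bit in an
atomic word in a single atomic step, which the rpc unlock path below relies on:

  cpp::Atomic<uint32_t> word = 3;
  // Atomically clear bit 0 (3 -> 2), leaving bits [1, 32) untouched;
  // the previous value is returned.
  uint32_t old = word.fetch_and(~1u, cpp::MemoryOrder::RELAXED); // old == 3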
12 changes: 11 additions & 1 deletion libc/src/__support/GPU/amdgpu/utils.h
@@ -106,7 +106,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

/// Returns the id of the thread inside of an AMD wavefront executing together.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
- if (LANE_SIZE == 64)
+ if constexpr (LANE_SIZE == 64)
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
else
return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
@@ -122,6 +122,16 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
return __builtin_amdgcn_readfirstlane(x);
}

[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
// the lane_mask & gives the nvptx semantics when lane_mask is a subset of
// the active threads
if constexpr (LANE_SIZE == 64) {
return lane_mask & __builtin_amdgcn_ballot_w64(x);
} else {
return lane_mask & __builtin_amdgcn_ballot_w32(x);
}
}

/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() {
__builtin_amdgcn_s_barrier();
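A worked example of the masking in ballot above (values illustrative): take
LANE_SIZE == 32, lane_mask covering lanes {0, 1, 3}, and a stray active lane 2
that is not part of the group.

  // lanes 1, 2, 3 pass x == true; lane 0 passes x == false
  // __builtin_amdgcn_ballot_w32(x)  == 0b1110 (every active lane votes)
  // lane_mask                       == 0b1011
  // lane_mask & ballot              == 0b1010 (the stray lane 2 is discarded,
  //                                   matching the nvptx sync ballot semantics)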
5 changes: 5 additions & 0 deletions libc/src/__support/GPU/generic/utils.h
@@ -58,6 +58,11 @@ LIBC_INLINE uint64_t get_lane_mask() { return 1; }

LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }

LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
(void)lane_mask;
return x;
}

LIBC_INLINE void sync_threads() {}

LIBC_INLINE void sync_lane(uint64_t) {}
7 changes: 7 additions & 0 deletions libc/src/__support/GPU/nvptx/utils.h
@@ -118,6 +118,13 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
#endif
}

[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
#if __CUDA_ARCH__ >= 600
return __nvvm_vote_ballot_sync(lane_mask, x);
#else
return lane_mask & __nvvm_vote_ballot(x);
#endif
}
/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }

47 changes: 43 additions & 4 deletions libc/src/__support/RPC/rpc.h
@@ -107,16 +107,55 @@ template <bool InvertInbox> struct Process {
}

/// Attempt to claim the lock at index. Return true on lock taken.
/// lane_mask is a bitmap of the threads in the warp that would hold the
/// single lock on success, e.g. the result of gpu::get_lane_mask().
/// The lock is held when the zeroth bit of the uint32_t at lock[index]
/// is set, and available when that bit is clear. Bits [1, 32) are zero.
/// OR'ing with one is a no-op when the lock is already held.
- LIBC_INLINE bool try_lock(uint64_t, uint64_t index) {
-   return lock[index].fetch_or(1, cpp::MemoryOrder::RELAXED) == 0;
+ [[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
+                                                 uint64_t index) {
// On amdgpu, test and set to lock[index] and a sync_lane would suffice
// On volta, need to handle differences between the threads running and
// the threads that were detected in the previous call to get_lane_mask()
//
// All threads in lane_mask try to claim the lock. At most one can succeed.
// There may be threads active which are not in lane_mask which must not
// succeed in taking the lock, as otherwise it will leak. This is handled
// by having threads that are not in lane_mask fetch_or with 0, a no-op.
uint32_t id = gpu::get_lane_id();
bool id_in_lane_mask = lane_mask & (1ul << id);

// All threads in the warp call fetch_or. Possibly at the same time.
bool before =
lock[index].fetch_or(id_in_lane_mask, cpp::MemoryOrder::RELAXED);
uint64_t packed = gpu::ballot(lane_mask, before);

// If every bit set in lane_mask is also set in packed, every single thread
// in the warp failed to get the lock. Ballot returns unset for threads not
// in the lane mask.
//
// Cases, per thread:
// mask==0 -> unspecified before, discarded by ballot -> 0
// mask==1 and before==0 (success), set zero by ballot -> 0
// mask==1 and before==1 (failure), set one by ballot -> 1
//
// mask != packed implies at least one of the threads got the lock
// atomic semantics of fetch_or mean at most one of the threads got the lock
return lane_mask != packed;
}
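To make the case analysis concrete, a worked example with illustrative masks:

  // lane_mask == 0b0101 (lanes 0 and 2), lock initially free, and lane 0's
  // fetch_or happens to be ordered first:
  //   lane 0: in mask, before == 0 (took the lock) -> ballot bit 0 == 0
  //   lane 2: in mask, before == 1 (lost the race) -> ballot bit 2 == 1
  //   lanes 1, 3: not in mask, or'd with 0         -> ballot bits stay 0
  // packed == 0b0100 != lane_mask == 0b0101, so every lane in lane_mask
  // returns true and the group holds the lock together. Had another warp
  // already held the lock, both lanes would see before == 1, packed would
  // equal lane_mask, and try_lock would return false.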

// Unlock the lock at index.
- LIBC_INLINE void unlock(uint64_t, uint64_t index) {
-   lock[index].store(0, cpp::MemoryOrder::RELAXED);
+ [[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
+                                               uint64_t index) {
// Wait for other threads in the warp to finish using the lock
gpu::sync_lane(lane_mask);

// Use exactly one thread to clear the bit at position 0 in lock[index]
// Must restrict to a single thread to avoid one thread dropping the lock,
// then an unrelated warp claiming the lock, then a second thread in this
// warp dropping the lock again.
uint32_t and_mask = ~(rpc::is_first_lane(lane_mask) ? 1 : 0);
lock[index].fetch_and(and_mask, cpp::MemoryOrder::RELAXED);
}
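A sketch of the interleaving the single-thread restriction rules out, had every
lane in lane_mask cleared the bit:

  // lane 0:       fetch_and(~1u)   -> lock released
  // another warp: try_lock succeeds -> lock now held elsewhere
  // lane 1:       fetch_and(~1u)   -> silently releases the other warp's lock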
};

1 change: 1 addition & 0 deletions libc/src/__support/RPC/rpc_util.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H
#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H

#include "src/__support/GPU/utils.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/properties/architectures.h"

