Skip to content

Commit

Permalink
Revert "[libc] Treat the locks array as a bitfield"
Browse files Browse the repository at this point in the history
Summary:
This caused test failures on the gfx90a buildbot. This works on my
gfx1030 and the Nvidia buildbots, so we'll need to investigate what is
going wrong here. For now revert it to get the bots green.

This reverts commit 05abcc5.
  • Loading branch information
jhuber6 committed Jul 19, 2023
1 parent 80e20c8 commit 979fb95
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 38 deletions.
53 changes: 19 additions & 34 deletions libc/src/__support/RPC/rpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,11 @@ template <uint32_t lane_size = gpu::LANE_SIZE> struct alignas(64) Packet {
Payload<lane_size> payload;
};

/// The maximum number of parallel ports that the RPC interface can support.
constexpr uint64_t MAX_PORT_COUNT = 512;
// TODO: This should be configured by the server and passed in. The general rule
// of thumb is that you should have at least as many ports as possible
// concurrent work items on the GPU to mitigate the lack of forward
// progress guarantees on the GPU.
constexpr uint64_t DEFAULT_PORT_COUNT = 64;

/// A common process used to synchronize communication between a client and a
/// server. The process contains a read-only inbox and a write-only outbox used
Expand All @@ -84,8 +87,7 @@ template <bool Invert, typename Packet> struct Process {
cpp::Atomic<uint32_t> *outbox;
Packet *packet;

static constexpr uint64_t NUM_BITS_IN_WORD = (sizeof(uint32_t) * 8);
cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
cpp::Atomic<uint32_t> lock[DEFAULT_PORT_COUNT] = {0};

/// Initialize the communication channels.
LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
Expand Down Expand Up @@ -156,9 +158,10 @@ template <bool Invert, typename Packet> struct Process {

/// Attempt to claim the lock at index. Return true on lock taken.
/// lane_mask is a bitmap of the threads in the warp that would hold the
/// single lock on success, e.g. the result of gpu::get_lane_mask().
/// The lock is held when the nth bit of the lock bitfield is set, otherwise
/// it is available.
/// single lock on success, e.g. the result of gpu::get_lane_mask()
/// The lock is held when the zeroth bit of the uint32_t at lock[index]
/// is set, and available when that bit is clear. Bits [1, 32) are zero.
/// Or with one is a no-op when the lock is already held.
[[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
uint64_t index) {
// On amdgpu, test and set to lock[index] and a sync_lane would suffice
Expand All @@ -170,10 +173,11 @@ template <bool Invert, typename Packet> struct Process {
// succeed in taking the lock, as otherwise it will leak. This is handled
// by making threads which are not in lane_mask or with 0, a no-op.
uint32_t id = gpu::get_lane_id();
bool id_in_lane_mask = lane_mask & (1u << id);
bool id_in_lane_mask = lane_mask & (1ul << id);

// All threads in the warp call fetch_or. Possibly at the same time.
bool before = set_nth(lock, index, id_in_lane_mask);
bool before =
lock[index].fetch_or(id_in_lane_mask, cpp::MemoryOrder::RELAXED);
uint64_t packed = gpu::ballot(lane_mask, before);

// If every bit set in lane_mask is also set in packed, every single thread
Expand Down Expand Up @@ -208,11 +212,12 @@ template <bool Invert, typename Packet> struct Process {
// Wait for other threads in the warp to finish using the lock
gpu::sync_lane(lane_mask);

// Use exactly one thread to clear the associated lock bit. Must restrict
// to a single thread to avoid one thread dropping the lock, then an
// unrelated warp claiming the lock, then a second thread in this warp
// dropping the lock again.
clear_nth(lock, index, rpc::is_first_lane(lane_mask));
// Use exactly one thread to clear the bit at position 0 in lock[index]
// Must restrict to a single thread to avoid one thread dropping the lock,
// then an unrelated warp claiming the lock, then a second thread in this
// warp dropping the lock again.
uint32_t and_mask = ~(rpc::is_first_lane(lane_mask) ? 1 : 0);
lock[index].fetch_and(and_mask, cpp::MemoryOrder::RELAXED);
gpu::sync_lane(lane_mask);
}

Expand Down Expand Up @@ -240,26 +245,6 @@ template <bool Invert, typename Packet> struct Process {
/// Byte offset of the packet buffer inside the shared memory region: the
/// two mailboxes (inbox and outbox) come first, and the buffer start is
/// rounded up so it is suitably aligned for Packet.
LIBC_INLINE static constexpr uint64_t buffer_offset(uint64_t port_count) {
  const uint64_t mailboxes_bytes = 2 * mailbox_bytes(port_count);
  return align_up(mailboxes_bytes, alignof(Packet));
}

/// Conditionally set the n-th bit in the atomic bitfield.
/// When \p cond is false the fetch_or ORs in zero, making the call a no-op
/// for that thread while it still participates in the atomic operation.
/// Returns nonzero iff bit n was already set before this call (the previous
/// word value masked down to that single bit), zero otherwise.
LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
uint64_t index, bool cond) {
// Word index and bit position of lock \p index within the bitfield.
const uint32_t slot = index / NUM_BITS_IN_WORD;
const uint32_t bit = index % NUM_BITS_IN_WORD;
return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
cpp::MemoryOrder::RELAXED) &
(1u << bit);
}

/// Conditionally clear the n-th bit in the atomic bitfield.
/// When \p cond is false the fetch_and ANDs with all-ones, making the call
/// a no-op for that thread while it still participates in the atomic
/// operation. Returns nonzero iff bit n was set before this call (the
/// previous word value masked down to that single bit), zero otherwise.
LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
uint64_t index, bool cond) {
// Word index and bit position of lock \p index within the bitfield.
const uint32_t slot = index / NUM_BITS_IN_WORD;
const uint32_t bit = index % NUM_BITS_IN_WORD;
return bits[slot].fetch_and(-1u ^ (static_cast<uint32_t>(cond) << bit),
cpp::MemoryOrder::RELAXED) &
(1u << bit);
}
};

/// Invokes a function across every active buffer across the total lane size.
Expand Down
2 changes: 1 addition & 1 deletion libc/startup/gpu/amdgpu/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
rpc_shared_buffer);

// We want the fini array callbacks to be run after other atexit
Expand Down
2 changes: 1 addition & 1 deletion libc/startup/gpu/nvptx/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
rpc_shared_buffer);

// We want the fini array callbacks to be run after other atexit
Expand Down
2 changes: 1 addition & 1 deletion libc/utils/gpu/server/rpc_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ using namespace __llvm_libc;
static_assert(sizeof(rpc_buffer_t) == sizeof(rpc::Buffer),
"Buffer size mismatch");

static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT,
static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::DEFAULT_PORT_COUNT,
"Incorrect maximum port count");

// The client needs to support different lane sizes for the SIMT model. Because
Expand Down
2 changes: 1 addition & 1 deletion libc/utils/gpu/server/rpc_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ extern "C" {
#endif

/// The maximum number of ports that can be opened for any server.
const uint64_t RPC_MAXIMUM_PORT_COUNT = 512;
const uint64_t RPC_MAXIMUM_PORT_COUNT = 64;

/// The symbol name associated with the client for use with the LLVM C library
/// implementation.
Expand Down

0 comments on commit 979fb95

Please sign in to comment.