Skip to content

Commit

Permalink
Revert "[libc] Treat the locks array as a bitfield"
Browse files Browse the repository at this point in the history
Summary:
This caused test failures on the gfx90a buildbot. This works on my
gfx1030 and the Nvidia buildbots, so we'll need to investigate what is
going wrong here. For now revert it to get the bots green.

This reverts commit 05abcc5.
  • Loading branch information
jhuber6 committed Jul 19, 2023
1 parent 80e20c8 commit 979fb95
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 38 deletions.
53 changes: 19 additions & 34 deletions libc/src/__support/RPC/rpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,11 @@ template <uint32_t lane_size = gpu::LANE_SIZE> struct alignas(64) Packet {
Payload<lane_size> payload;
};

/// The maximum number of parallel ports that the RPC interface can support.
constexpr uint64_t MAX_PORT_COUNT = 512;
// TODO: This should be configured by the server and passed in. The general rule
// of thumb is that you should have at least as many ports as possible
// concurrent work items on the GPU to mitigate the lack of forward
// progress guarantees on the GPU.
constexpr uint64_t DEFAULT_PORT_COUNT = 64;

/// A common process used to synchronize communication between a client and a
/// server. The process contains a read-only inbox and a write-only outbox used
Expand All @@ -84,8 +87,7 @@ template <bool Invert, typename Packet> struct Process {
cpp::Atomic<uint32_t> *outbox;
Packet *packet;

static constexpr uint64_t NUM_BITS_IN_WORD = (sizeof(uint32_t) * 8);
cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
cpp::Atomic<uint32_t> lock[DEFAULT_PORT_COUNT] = {0};

/// Initialize the communication channels.
LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
Expand Down Expand Up @@ -156,9 +158,10 @@ template <bool Invert, typename Packet> struct Process {

/// Attempt to claim the lock at index. Return true on lock taken.
/// lane_mask is a bitmap of the threads in the warp that would hold the
/// single lock on success, e.g. the result of gpu::get_lane_mask().
/// The lock is held when the nth bit of the lock bitfield is set, otherwise
/// it is available.
/// single lock on success, e.g. the result of gpu::get_lane_mask()
/// The lock is held when the zeroth bit of the uint32_t at lock[index]
/// is set, and available when that bit is clear. Bits [1, 32) are zero.
/// Or with one is a no-op when the lock is already held.
[[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
uint64_t index) {
// On amdgpu, test and set to lock[index] and a sync_lane would suffice
Expand All @@ -170,10 +173,11 @@ template <bool Invert, typename Packet> struct Process {
// succeed in taking the lock, as otherwise it will leak. This is handled
// by making threads which are not in lane_mask or with 0, a no-op.
uint32_t id = gpu::get_lane_id();
bool id_in_lane_mask = lane_mask & (1u << id);
bool id_in_lane_mask = lane_mask & (1ul << id);

// All threads in the warp call fetch_or. Possibly at the same time.
bool before = set_nth(lock, index, id_in_lane_mask);
bool before =
lock[index].fetch_or(id_in_lane_mask, cpp::MemoryOrder::RELAXED);
uint64_t packed = gpu::ballot(lane_mask, before);

// If every bit set in lane_mask is also set in packed, every single thread
Expand Down Expand Up @@ -208,11 +212,12 @@ template <bool Invert, typename Packet> struct Process {
// Wait for other threads in the warp to finish using the lock
gpu::sync_lane(lane_mask);

// Use exactly one thread to clear the associated lock bit. Must restrict
// to a single thread to avoid one thread dropping the lock, then an
// unrelated warp claiming the lock, then a second thread in this warp
// dropping the lock again.
clear_nth(lock, index, rpc::is_first_lane(lane_mask));
// Use exactly one thread to clear the bit at position 0 in lock[index]
// Must restrict to a single thread to avoid one thread dropping the lock,
// then an unrelated warp claiming the lock, then a second thread in this
// warp dropping the lock again.
uint32_t and_mask = ~(rpc::is_first_lane(lane_mask) ? 1 : 0);
lock[index].fetch_and(and_mask, cpp::MemoryOrder::RELAXED);
gpu::sync_lane(lane_mask);
}

Expand Down Expand Up @@ -240,26 +245,6 @@ template <bool Invert, typename Packet> struct Process {
/// Byte offset of the packet buffer inside the shared memory region: the
/// two mailboxes (inbox and outbox) come first, and the buffer start is
/// rounded up so it is suitably aligned for Packet.
LIBC_INLINE static constexpr uint64_t buffer_offset(uint64_t port_count) {
  const uint64_t mailboxes_bytes = 2 * mailbox_bytes(port_count);
  return align_up(mailboxes_bytes, alignof(Packet));
}

/// Conditionally set the n-th bit in the atomic bitfield.
/// When \p cond is false the fetch_or ORs in zero, making the call a no-op
/// for that thread while it still participates in the atomic operation.
/// Returns nonzero iff bit n was already set before this call (the previous
/// word value masked down to that single bit), zero otherwise.
LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
uint64_t index, bool cond) {
// Word index and bit position of lock \p index within the bitfield.
const uint32_t slot = index / NUM_BITS_IN_WORD;
const uint32_t bit = index % NUM_BITS_IN_WORD;
return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
cpp::MemoryOrder::RELAXED) &
(1u << bit);
}

/// Conditionally clear the n-th bit in the atomic bitfield.
/// When \p cond is false the fetch_and ANDs with all-ones, making the call
/// a no-op for that thread while it still participates in the atomic
/// operation. Returns nonzero iff bit n was set before this call (the
/// previous word value masked down to that single bit), zero otherwise.
LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
uint64_t index, bool cond) {
// Word index and bit position of lock \p index within the bitfield.
const uint32_t slot = index / NUM_BITS_IN_WORD;
const uint32_t bit = index % NUM_BITS_IN_WORD;
return bits[slot].fetch_and(-1u ^ (static_cast<uint32_t>(cond) << bit),
cpp::MemoryOrder::RELAXED) &
(1u << bit);
}
};

/// Invokes a function across every active buffer across the total lane size.
Expand Down
2 changes: 1 addition & 1 deletion libc/startup/gpu/amdgpu/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
rpc_shared_buffer);

// We want the fini array callbacks to be run after other atexit
Expand Down
2 changes: 1 addition & 1 deletion libc/startup/gpu/nvptx/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
rpc_shared_buffer);

// We want the fini array callbacks to be run after other atexit
Expand Down
2 changes: 1 addition & 1 deletion libc/utils/gpu/server/rpc_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ using namespace __llvm_libc;
static_assert(sizeof(rpc_buffer_t) == sizeof(rpc::Buffer),
"Buffer size mismatch");

static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT,
static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::DEFAULT_PORT_COUNT,
"Incorrect maximum port count");

// The client needs to support different lane sizes for the SIMT model. Because
Expand Down
2 changes: 1 addition & 1 deletion libc/utils/gpu/server/rpc_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ extern "C" {
#endif

/// The maximum number of ports that can be opened for any server.
const uint64_t RPC_MAXIMUM_PORT_COUNT = 512;
const uint64_t RPC_MAXIMUM_PORT_COUNT = 64;

/// The symbol name associated with the client for use with the LLVM C library
/// implementation.
Expand Down

0 comments on commit 979fb95

Please sign in to comment.