Skip to content

Commit

Permalink
[libc] Treat the locks array as a bitfield
Browse files Browse the repository at this point in the history
Currently we keep an internal buffer of device memory that is used to
indicate ownership of a port. Since we only use this as a single bit we
can simply turn this into a bitfield. I did this manually rather than
having a separate type as we need very special handling of the masks
used to interact with the locks.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D155511
  • Loading branch information
jhuber6 committed Jul 21, 2023
1 parent 9996e71 commit d3aabeb
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 22 deletions.
50 changes: 32 additions & 18 deletions libc/src/__support/RPC/rpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,8 @@ template <uint32_t lane_size = gpu::LANE_SIZE> struct alignas(64) Packet {
Payload<lane_size> payload;
};

// TODO: This should be configured by the server and passed in. The general rule
// of thumb is that you should have at least as many ports as possible
// concurrent work items on the GPU to mitigate the lack offorward
// progress guarantees on the GPU.
constexpr uint64_t DEFAULT_PORT_COUNT = 64;
/// The maximum number of parallel ports that the RPC interface can support.
constexpr uint64_t MAX_PORT_COUNT = 512;

/// A common process used to synchronize communication between a client and a
/// server. The process contains a read-only inbox and a write-only outbox used
Expand All @@ -87,7 +84,8 @@ template <bool Invert, typename Packet> struct Process {
cpp::Atomic<uint32_t> *outbox = nullptr;
Packet *packet = nullptr;

cpp::Atomic<uint32_t> lock[DEFAULT_PORT_COUNT] = {0};
static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};

/// Initialize the communication channels.
LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
Expand Down Expand Up @@ -159,12 +157,10 @@ template <bool Invert, typename Packet> struct Process {
/// Attempt to claim the lock at index. Return true on lock taken.
/// lane_mask is a bitmap of the threads in the warp that would hold the
/// single lock on success, e.g. the result of gpu::get_lane_mask()
/// The lock is held when the zeroth bit of the uint32_t at lock[index]
/// is set, and available when that bit is clear. Bits [1, 32) are zero.
/// Or with one is a no-op when the lock is already held.
/// The lock is held when the n-th bit of the lock bitfield is set.
[[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
uint64_t index) {
// On amdgpu, test and set to lock[index] and a sync_lane would suffice
// On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
// On volta, need to handle differences between the threads running and
// the threads that were detected in the previous call to get_lane_mask()
//
Expand All @@ -176,8 +172,7 @@ template <bool Invert, typename Packet> struct Process {
bool id_in_lane_mask = lane_mask & (1ul << id);

// All threads in the warp call fetch_or. Possibly at the same time.
bool before =
lock[index].fetch_or(id_in_lane_mask, cpp::MemoryOrder::RELAXED);
bool before = set_nth(lock, index, id_in_lane_mask);
uint64_t packed = gpu::ballot(lane_mask, before);

// If every bit set in lane_mask is also set in packed, every single thread
Expand Down Expand Up @@ -212,12 +207,11 @@ template <bool Invert, typename Packet> struct Process {
// Wait for other threads in the warp to finish using the lock
gpu::sync_lane(lane_mask);

// Use exactly one thread to clear the bit at position 0 in lock[index]
// Must restrict to a single thread to avoid one thread dropping the lock,
// then an unrelated warp claiming the lock, then a second thread in this
// warp dropping the lock again.
uint32_t and_mask = ~(rpc::is_first_lane(lane_mask) ? 1 : 0);
lock[index].fetch_and(and_mask, cpp::MemoryOrder::RELAXED);
// Use exactly one thread to clear the nth bit in the lock array Must
// restrict to a single thread to avoid one thread dropping the lock, then
// an unrelated warp claiming the lock, then a second thread in this warp
// dropping the lock again.
clear_nth(lock, index, rpc::is_first_lane(lane_mask));
gpu::sync_lane(lane_mask);
}

Expand Down Expand Up @@ -245,6 +239,26 @@ template <bool Invert, typename Packet> struct Process {
LIBC_INLINE static constexpr uint64_t buffer_offset(uint64_t port_count) {
return align_up(2 * mailbox_bytes(port_count), alignof(Packet));
}

/// Conditionally set the n-th bit in the atomic bitfield.
LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
uint64_t index, bool cond) {
uint64_t slot = index / NUM_BITS_IN_WORD;
uint64_t bit = index % NUM_BITS_IN_WORD;
return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
cpp::MemoryOrder::RELAXED) &
(1u << bit);
}

/// Conditionally clear the n-th bit in the atomic bitfield.
LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
uint64_t index, bool cond) {
uint64_t slot = index / NUM_BITS_IN_WORD;
uint64_t bit = index % NUM_BITS_IN_WORD;
return bits[slot].fetch_and(~0u ^ (static_cast<uint32_t>(cond) << bit),
cpp::MemoryOrder::RELAXED) &
(1u << bit);
}
};

/// Invokes a function accross every active buffer across the total lane size.
Expand Down
2 changes: 1 addition & 1 deletion libc/startup/gpu/amdgpu/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
rpc_shared_buffer);

// We want the fini array callbacks to be run after other atexit
Expand Down
2 changes: 1 addition & 1 deletion libc/startup/gpu/nvptx/start.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
rpc_shared_buffer);

// We want the fini array callbacks to be run after other atexit
Expand Down
2 changes: 1 addition & 1 deletion libc/utils/gpu/server/rpc_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ using namespace __llvm_libc;
static_assert(sizeof(rpc_buffer_t) == sizeof(rpc::Buffer),
"Buffer size mismatch");

static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::DEFAULT_PORT_COUNT,
static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT,
"Incorrect maximum port count");

// The client needs to support different lane sizes for the SIMT model. Because
Expand Down
2 changes: 1 addition & 1 deletion libc/utils/gpu/server/rpc_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ extern "C" {
#endif

/// The maxium number of ports that can be opened for any server.
const uint64_t RPC_MAXIMUM_PORT_COUNT = 64;
const uint64_t RPC_MAXIMUM_PORT_COUNT = 512;

/// The symbol name associated with the client for use with the LLVM C library
/// implementation.
Expand Down

0 comments on commit d3aabeb

Please sign in to comment.