[libc][rpc] Update locking to work on volta
Carefully work around not knowing the thread mask that nvptx intrinsic
functions require.

If the warp is converged when calling try_lock, a single rpc call will handle
all lanes within it. Otherwise, more than one rpc call will occur, with thread
masks that together compose the unknown one.
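A minimal sketch of the intended call pattern (illustrative, not code from this
patch; `proc` and `index` stand in for the Process instance and its lock slot,
and the names around the spin loop are assumptions):

  uint64_t lane_mask = gpu::get_lane_mask(); // converged lanes, as a bitmap
  while (!proc.try_lock(lane_mask, index))   // one shared attempt per group
    sleep_briefly();
  // ... perform the rpc transaction on behalf of every lane in lane_mask ...
  proc.unlock(lane_mask, index);             // exactly one lane clears the bit

Each convergence group composes its own lane_mask, so a diverged warp simply
makes several independent lock acquisitions.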

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D149897
JonChesterfield committed May 4, 2023
1 parent 0a53220 commit b132373
Showing 6 changed files with 71 additions and 5 deletions.
4 changes: 4 additions & 0 deletions libc/src/__support/CPP/atomic.h
@@ -90,6 +90,10 @@ template <typename T> struct Atomic {
return __atomic_fetch_or(&val, mask, int(mem_ord));
}

T fetch_and(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {
return __atomic_fetch_and(&val, mask, int(mem_ord));
}

T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {
return __atomic_fetch_sub(&val, decrement, int(mem_ord));
}
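As a usage sketch (not part of the patch), the new fetch_and clears a bit in an
atomic word in a single atomic step, which the rpc unlock path below relies on:

  cpp::Atomic<uint32_t> word = 3;
  // Atomically clear bit 0 (3 -> 2), leaving bits [1, 32) untouched;
  // the previous value is returned.
  uint32_t old = word.fetch_and(~1u, cpp::MemoryOrder::RELAXED); // old == 3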
12 changes: 11 additions & 1 deletion libc/src/__support/GPU/amdgpu/utils.h
@@ -106,7 +106,7 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }

/// Returns the id of the thread inside of an AMD wavefront executing together.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
- if (LANE_SIZE == 64)
+ if constexpr (LANE_SIZE == 64)
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
else
return __builtin_amdgcn_mbcnt_lo(~0u, 0u);
@@ -122,6 +122,16 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
return __builtin_amdgcn_readfirstlane(x);
}

[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
// the lane_mask & gives the nvptx semantics when lane_mask is a subset of
// the active threads
if constexpr (LANE_SIZE == 64) {
return lane_mask & __builtin_amdgcn_ballot_w64(x);
} else {
return lane_mask & __builtin_amdgcn_ballot_w32(x);
}
}

/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() {
__builtin_amdgcn_s_barrier();
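A worked example of the masking in ballot above (values illustrative): take
LANE_SIZE == 32, lane_mask covering lanes {0, 1, 3}, and a stray active lane 2
that is not part of the group.

  // lanes 1, 2, 3 pass x == true; lane 0 passes x == false
  // __builtin_amdgcn_ballot_w32(x)  == 0b1110 (every active lane votes)
  // lane_mask                       == 0b1011
  // lane_mask & ballot              == 0b1010 (the stray lane 2 is discarded,
  //                                   matching the nvptx sync ballot semantics)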
5 changes: 5 additions & 0 deletions libc/src/__support/GPU/generic/utils.h
@@ -58,6 +58,11 @@ LIBC_INLINE uint64_t get_lane_mask() { return 1; }

LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }

LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
(void)lane_mask;
return x;
}

LIBC_INLINE void sync_threads() {}

LIBC_INLINE void sync_lane(uint64_t) {}
7 changes: 7 additions & 0 deletions libc/src/__support/GPU/nvptx/utils.h
@@ -118,6 +118,13 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
#endif
}

[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
#if __CUDA_ARCH__ >= 600
return __nvvm_vote_ballot_sync(lane_mask, x);
#else
return lane_mask & __nvvm_vote_ballot(x);
#endif
}
/// Waits for all the threads in the block to converge and issues a fence.
[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }

47 changes: 43 additions & 4 deletions libc/src/__support/RPC/rpc.h
@@ -107,16 +107,55 @@ template <bool InvertInbox> struct Process {
}

/// Attempt to claim the lock at index. Return true on lock taken.
/// lane_mask is a bitmap of the threads in the warp that would hold the
/// single lock on success, e.g. the result of gpu::get_lane_mask().
/// The lock is held when the zeroth bit of the uint32_t at lock[index]
/// is set, and available when that bit is clear. Bits [1, 32) are zero.
/// OR'ing with one is a no-op when the lock is already held.
- LIBC_INLINE bool try_lock(uint64_t, uint64_t index) {
-   return lock[index].fetch_or(1, cpp::MemoryOrder::RELAXED) == 0;
+ [[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
+                                                 uint64_t index) {
// On amdgpu, test and set to lock[index] and a sync_lane would suffice
// On volta, need to handle differences between the threads running and
// the threads that were detected in the previous call to get_lane_mask()
//
// All threads in lane_mask try to claim the lock. At most one can succeed.
// There may be threads active which are not in lane_mask which must not
// succeed in taking the lock, as otherwise it will leak. This is handled
// by having threads that are not in lane_mask fetch_or with 0, a no-op.
uint32_t id = gpu::get_lane_id();
bool id_in_lane_mask = lane_mask & (1ul << id);

// All threads in the warp call fetch_or. Possibly at the same time.
bool before =
lock[index].fetch_or(id_in_lane_mask, cpp::MemoryOrder::RELAXED);
uint64_t packed = gpu::ballot(lane_mask, before);

// If every bit set in lane_mask is also set in packed, every single thread
// in the warp failed to get the lock. Ballot returns unset for threads not
// in the lane mask.
//
// Cases, per thread:
// mask==0 -> unspecified before, discarded by ballot -> 0
// mask==1 and before==0 (success), set zero by ballot -> 0
// mask==1 and before==1 (failure), set one by ballot -> 1
//
// mask != packed implies at least one of the threads got the lock
// atomic semantics of fetch_or mean at most one of the threads got the lock
return lane_mask != packed;
}
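To make the case analysis concrete, a worked example with illustrative masks:

  // lane_mask == 0b0101 (lanes 0 and 2), lock initially free, and lane 0's
  // fetch_or happens to be ordered first:
  //   lane 0: in mask, before == 0 (took the lock) -> ballot bit 0 == 0
  //   lane 2: in mask, before == 1 (lost the race) -> ballot bit 2 == 1
  //   lanes 1, 3: not in mask, or'd with 0         -> ballot bits stay 0
  // packed == 0b0100 != lane_mask == 0b0101, so every lane in lane_mask
  // returns true and the group holds the lock together. Had another warp
  // already held the lock, both lanes would see before == 1, packed would
  // equal lane_mask, and try_lock would return false.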

// Unlock the lock at index.
- LIBC_INLINE void unlock(uint64_t, uint64_t index) {
-   lock[index].store(0, cpp::MemoryOrder::RELAXED);
+ [[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
+                                               uint64_t index) {
// Wait for other threads in the warp to finish using the lock
gpu::sync_lane(lane_mask);

// Use exactly one thread to clear the bit at position 0 in lock[index]
// Must restrict to a single thread to avoid one thread dropping the lock,
// then an unrelated warp claiming the lock, then a second thread in this
// warp dropping the lock again.
uint32_t and_mask = ~(rpc::is_first_lane(lane_mask) ? 1 : 0);
lock[index].fetch_and(and_mask, cpp::MemoryOrder::RELAXED);
}
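A sketch of the interleaving the single-thread restriction rules out, had every
lane in lane_mask cleared the bit:

  // lane 0:       fetch_and(~1u)   -> lock released
  // another warp: try_lock succeeds -> lock now held elsewhere
  // lane 1:       fetch_and(~1u)   -> silently releases the other warp's lock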
};

1 change: 1 addition & 0 deletions libc/src/__support/RPC/rpc_util.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H
#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H

#include "src/__support/GPU/utils.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/properties/architectures.h"

