Skip to content

Commit

Permalink
[libc] Drop RPC port count and index to 32-bit numbers
Browse files Browse the repository at this point in the history
The port count and index into the ports was originally written as a
64-bit number. This was with an abundance of caution, however it's
highly unlikely that any configuration will exceed a 32-bit number as
most machines will require something in the low-thousands. Because GPUs
are functionally 32-bit in many of their operations this costs us some
extra time and registers to do the 64-bit operations. Doing this saves
us about four registers in most tests.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D157980
  • Loading branch information
jhuber6 committed Aug 15, 2023
1 parent 4702a94 commit 5f276d3
Showing 1 changed file with 27 additions and 27 deletions.
54 changes: 27 additions & 27 deletions libc/src/__support/RPC/rpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ template <bool Invert, typename Packet> struct Process {
LIBC_INLINE Process &operator=(Process &&) = default;
LIBC_INLINE ~Process() = default;

uint64_t port_count = 0;
uint32_t port_count = 0;
cpp::Atomic<uint32_t> *inbox = nullptr;
cpp::Atomic<uint32_t> *outbox = nullptr;
Packet *packet = nullptr;
Expand All @@ -89,7 +89,7 @@ template <bool Invert, typename Packet> struct Process {
cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};

/// Initialize the communication channels.
LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
this->port_count = port_count;
this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
advance(buffer, inbox_offset(port_count)));
Expand All @@ -111,25 +111,25 @@ template <bool Invert, typename Packet> struct Process {
/// Atomic<uint32_t> secondary[port_count];
/// Packet buffer[port_count];
/// };
LIBC_INLINE static constexpr uint64_t allocation_size(uint64_t port_count) {
LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count) {
return buffer_offset(port_count) + buffer_bytes(port_count);
}

/// Retrieve the inbox state from memory shared between processes.
LIBC_INLINE uint32_t load_inbox(uint64_t index) {
LIBC_INLINE uint32_t load_inbox(uint32_t index) {
return inbox[index].load(cpp::MemoryOrder::RELAXED);
}

/// Retrieve the outbox state from memory shared between processes.
LIBC_INLINE uint32_t load_outbox(uint64_t index) {
LIBC_INLINE uint32_t load_outbox(uint32_t index) {
return outbox[index].load(cpp::MemoryOrder::RELAXED);
}

/// Signal to the other process that this one is finished with the buffer.
/// Equivalent to loading outbox followed by store of the inverted value
/// The outbox is write only by this warp and tracking the value locally is
/// cheaper than calling load_outbox to get the value to store.
LIBC_INLINE uint32_t invert_outbox(uint64_t index, uint32_t current_outbox) {
LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
uint32_t inverted_outbox = !current_outbox;
atomic_thread_fence(cpp::MemoryOrder::RELEASE);
outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED);
Expand All @@ -138,7 +138,7 @@ template <bool Invert, typename Packet> struct Process {

// Given the current outbox and inbox values, wait until the inbox changes
// to indicate that this thread owns the buffer element.
LIBC_INLINE void wait_for_ownership(uint64_t index, uint32_t outbox,
LIBC_INLINE void wait_for_ownership(uint32_t index, uint32_t outbox,
uint32_t in) {
while (buffer_unavailable(in, outbox)) {
sleep_briefly();
Expand All @@ -160,7 +160,7 @@ template <bool Invert, typename Packet> struct Process {
/// single lock on success, e.g. the result of gpu::get_lane_mask()
/// The lock is held when the n-th bit of the lock bitfield is set.
[[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
uint64_t index) {
uint32_t index) {
// On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
// On volta, need to handle differences between the threads running and
// the threads that were detected in the previous call to get_lane_mask()
Expand Down Expand Up @@ -201,7 +201,7 @@ template <bool Invert, typename Packet> struct Process {
/// Unlock the lock at index. We need a lane sync to keep this function
/// convergent, otherwise the compiler will sink the store and deadlock.
[[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
uint64_t index) {
uint32_t index) {
// Do not move any writes past the unlock
atomic_thread_fence(cpp::MemoryOrder::RELEASE);

Expand All @@ -217,45 +217,45 @@ template <bool Invert, typename Packet> struct Process {
}

/// Number of bytes to allocate for an inbox or outbox.
LIBC_INLINE static constexpr uint64_t mailbox_bytes(uint64_t port_count) {
LIBC_INLINE static constexpr uint64_t mailbox_bytes(uint32_t port_count) {
return port_count * sizeof(cpp::Atomic<uint32_t>);
}

/// Number of bytes to allocate for the buffer containing the packets.
LIBC_INLINE static constexpr uint64_t buffer_bytes(uint64_t port_count) {
LIBC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count) {
return port_count * sizeof(Packet);
}

/// Offset of the inbox in memory. This is the same as the outbox if inverted.
LIBC_INLINE static constexpr uint64_t inbox_offset(uint64_t port_count) {
LIBC_INLINE static constexpr uint64_t inbox_offset(uint32_t port_count) {
return Invert ? mailbox_bytes(port_count) : 0;
}

/// Offset of the outbox in memory. This is the same as the inbox if inverted.
LIBC_INLINE static constexpr uint64_t outbox_offset(uint64_t port_count) {
LIBC_INLINE static constexpr uint64_t outbox_offset(uint32_t port_count) {
return Invert ? 0 : mailbox_bytes(port_count);
}

/// Offset of the buffer containing the packets after the inbox and outbox.
LIBC_INLINE static constexpr uint64_t buffer_offset(uint64_t port_count) {
LIBC_INLINE static constexpr uint64_t buffer_offset(uint32_t port_count) {
return align_up(2 * mailbox_bytes(port_count), alignof(Packet));
}

/// Conditionally set the n-th bit in the atomic bitfield.
LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
uint64_t index, bool cond) {
uint64_t slot = index / NUM_BITS_IN_WORD;
uint64_t bit = index % NUM_BITS_IN_WORD;
uint32_t index, bool cond) {
uint32_t slot = index / NUM_BITS_IN_WORD;
uint32_t bit = index % NUM_BITS_IN_WORD;
return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
cpp::MemoryOrder::RELAXED) &
(1u << bit);
}

/// Conditionally clear the n-th bit in the atomic bitfield.
LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
uint64_t index, bool cond) {
uint64_t slot = index / NUM_BITS_IN_WORD;
uint64_t bit = index % NUM_BITS_IN_WORD;
uint32_t index, bool cond) {
uint32_t slot = index / NUM_BITS_IN_WORD;
uint32_t bit = index % NUM_BITS_IN_WORD;
return bits[slot].fetch_and(~0u ^ (static_cast<uint32_t>(cond) << bit),
cpp::MemoryOrder::RELAXED) &
(1u << bit);
Expand Down Expand Up @@ -292,7 +292,7 @@ static LIBC_INLINE void invoke_rpc(cpp::function<void(Buffer *, uint32_t)> fn,
/// processes. A port is conceptually an index into the memory provided by the
/// underlying process that is guarded by a lock bit.
template <bool T, typename S> struct Port {
LIBC_INLINE Port(Process<T, S> &process, uint64_t lane_mask, uint64_t index,
LIBC_INLINE Port(Process<T, S> &process, uint64_t lane_mask, uint32_t index,
uint32_t out)
: process(process), lane_mask(lane_mask), index(index), out(out),
receive(false), owns_buffer(true) {}
Expand Down Expand Up @@ -334,7 +334,7 @@ template <bool T, typename S> struct Port {
private:
Process<T, S> &process;
uint64_t lane_mask;
uint64_t index;
uint32_t index;
uint32_t out;
bool receive;
bool owns_buffer;
Expand All @@ -351,7 +351,7 @@ struct Client {
template <uint16_t opcode> LIBC_INLINE cpp::optional<Port> try_open();
template <uint16_t opcode> LIBC_INLINE Port open();

LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
process.reset(port_count, buffer);
}

Expand All @@ -374,15 +374,15 @@ template <uint32_t lane_size> struct Server {
LIBC_INLINE cpp::optional<Port> try_open();
LIBC_INLINE Port open();

LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
process.reset(port_count, buffer);
}

LIBC_INLINE void *get_buffer_start() const {
return process.get_buffer_start();
}

LIBC_INLINE static uint64_t allocation_size(uint64_t port_count) {
LIBC_INLINE static uint64_t allocation_size(uint32_t port_count) {
return Process<true, Packet<lane_size>>::allocation_size(port_count);
}

Expand Down Expand Up @@ -525,7 +525,7 @@ template <uint16_t opcode>
[[clang::convergent]] LIBC_INLINE cpp::optional<Client::Port>
Client::try_open() {
// Perform a naive linear scan for a port that can be opened to send data.
for (uint64_t index = 0; index < process.port_count; ++index) {
for (uint32_t index = 0; index < process.port_count; ++index) {
// Attempt to acquire the lock on this index.
uint64_t lane_mask = gpu::get_lane_mask();
if (!process.try_lock(lane_mask, index))
Expand Down Expand Up @@ -566,7 +566,7 @@ template <uint32_t lane_size>
cpp::optional<typename Server<lane_size>::Port>
Server<lane_size>::try_open() {
// Perform a naive linear scan for a port that has a pending request.
for (uint64_t index = 0; index < process.port_count; ++index) {
for (uint32_t index = 0; index < process.port_count; ++index) {
uint32_t in = process.load_inbox(index);
uint32_t out = process.load_outbox(index);

Expand Down

0 comments on commit 5f276d3

Please sign in to comment.