[libomptarget][nfc] Refactor amdgpu partial barrier to simplify adding a second one

D101976 would require a second barrier instance. This NFC change to amdgpu makes it simpler to add one (an extra global, plus one more line in init). It also renames the current barrier to L0.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D102016
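
The change the message describes can be sketched concretely. After this refactor, a second barrier instance would amount to roughly the following; the name __kmpc_L1_Barrier and the entry point are hypothetical here (plausible only because this commit renames the current barrier to L0), and D101976 defines the real usage:

// Hypothetical sketch only; names are illustrative, not taken from D101976.
uint32_t __kmpc_L1_Barrier [[clang::loader_uninitialized]];
#pragma allocate(__kmpc_L1_Barrier) allocator(omp_pteam_mem_alloc)

// ...plus one more line in __kmpc_impl_target_init():
//   __atomic_store_n(&__kmpc_L1_Barrier, 0u, __ATOMIC_RELEASE);

// A caller then reuses the shared counting logic in the diff below:
EXTERN void __kmpc_impl_named_sync_L1(uint32_t num_threads) {
  pteam_mem_barrier(num_threads, &__kmpc_L1_Barrier);
}

Threading barrier_state through pteam_mem_barrier keeps the wave-counting and generation logic in one place, so each additional barrier costs only a global plus an init store.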
JonChesterfield committed May 6, 2021
1 parent 5dc1ed3 commit 44ee974
Showing 1 changed file with 17 additions and 12 deletions.
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip (17 additions, 12 deletions)

@@ -52,15 +52,8 @@ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
   return __builtin_amdgcn_read_exec();
 }
 
-uint32_t __kmpc_L1_Barrier [[clang::loader_uninitialized]];
-#pragma allocate(__kmpc_L1_Barrier) allocator(omp_pteam_mem_alloc)
-
-EXTERN void __kmpc_impl_target_init() {
-  // Don't have global ctors, and shared memory is not zero init
-  __atomic_store_n(&__kmpc_L1_Barrier, 0u, __ATOMIC_RELEASE);
-}
-
-EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
+static void pteam_mem_barrier(uint32_t num_threads, uint32_t * barrier_state)
+{
   __atomic_thread_fence(__ATOMIC_ACQUIRE);
 
   uint32_t num_waves = num_threads / WARPSIZE;
@@ -81,7 +74,7 @@ EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
   bool isLowest = GetLaneId() == lowestActiveThread;
 
   if (isLowest) {
-    uint32_t load = __atomic_fetch_add(&__kmpc_L1_Barrier, 1,
+    uint32_t load = __atomic_fetch_add(barrier_state, 1,
                                        __ATOMIC_RELAXED); // commutative
 
     // Record the number of times the barrier has been passed
@@ -94,18 +87,30 @@ EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
       load &= 0xffff0000u; // because bits zeroed second
 
       // Reset the wave counter and release the waiting waves
-      __atomic_store_n(&__kmpc_L1_Barrier, load, __ATOMIC_RELAXED);
+      __atomic_store_n(barrier_state, load, __ATOMIC_RELAXED);
     } else {
       // more waves still to go, spin until generation counter changes
       do {
         __builtin_amdgcn_s_sleep(0);
-        load = __atomic_load_n(&__kmpc_L1_Barrier, __ATOMIC_RELAXED);
+        load = __atomic_load_n(barrier_state, __ATOMIC_RELAXED);
       } while ((load & 0xffff0000u) == generation);
     }
   }
   __atomic_thread_fence(__ATOMIC_RELEASE);
 }
 
+uint32_t __kmpc_L0_Barrier [[clang::loader_uninitialized]];
+#pragma allocate(__kmpc_L0_Barrier) allocator(omp_pteam_mem_alloc)
+
+EXTERN void __kmpc_impl_target_init() {
+  // Don't have global ctors, and shared memory is not zero init
+  __atomic_store_n(&__kmpc_L0_Barrier, 0u, __ATOMIC_RELEASE);
+}
+
+EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
+  pteam_mem_barrier(num_threads, &__kmpc_L0_Barrier);
+}
+
 namespace {
 uint32_t get_grid_dim(uint32_t n, uint16_t d) {
   uint32_t q = n / d;
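
To make the diff easier to read: the barrier word packs the number of arrived waves into the low 16 bits and a generation count into the high 16 bits. The last wave to arrive bumps the generation and zeroes the arrival count in a single store, which releases every wave spinning on the old generation. Below is a minimal host-side analogue of that scheme using std::atomic and one thread per "wave"; it is a sketch for illustration only, not the device implementation (which adds fences, s_sleep, and per-wave lane selection):

#include <atomic>
#include <cstdint>
#include <thread>
#include <vector>

// Same packing as the device code above: low 16 bits count arrived waves,
// high 16 bits hold a generation number.
static void barrier_wait(std::atomic<uint32_t> &state, uint32_t num_waves) {
  uint32_t load = state.fetch_add(1, std::memory_order_acq_rel);
  uint32_t generation = load & 0xffff0000u;
  if ((load & 0x0000ffffu) == num_waves - 1) {
    // Last arrival: bump the generation and zero the wave count.
    load += 0x00010000u; // 16-bit generation; 32-bit wrap is harmless
    load &= 0xffff0000u; // because bits zeroed second
    state.store(load, std::memory_order_release);
  } else {
    // Earlier arrivals spin until the generation changes.
    while ((state.load(std::memory_order_acquire) & 0xffff0000u) == generation)
      std::this_thread::yield();
  }
}

int main() {
  std::atomic<uint32_t> state{0}; // the device code must init this explicitly,
                                  // since shared memory is not zero init
  constexpr uint32_t num_waves = 4;
  std::vector<std::thread> waves;
  for (uint32_t i = 0; i < num_waves; ++i)
    waves.emplace_back([&] { barrier_wait(state, num_waves); });
  for (auto &t : waves)
    t.join();
  return 0;
}

Note that the reset and the generation bump land as one store: if the count were zeroed separately, a fast wave re-entering the barrier could be counted against the old generation.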
