diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h index 130578ed43020..4b8068f9e4267 100644 --- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h +++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h @@ -16,6 +16,34 @@ namespace ompx { +namespace synchronize { + +/// Initialize the synchronization machinery. Must be called by all threads. +void init(bool IsSPMD); + +/// Synchronize all threads in a warp identified by \p Mask. +void warp(LaneMaskTy Mask); + +/// Synchronize all threads in a block. +void threads(); + +/// Synchronizing threads is allowed even if they all hit different instances of +/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more +/// restrictive in that it requires all threads to hit the same instance. The +/// noinline is removed by the openmp-opt pass and helps to preserve the +/// information till then. +///{ +#pragma omp begin assumes ext_aligned_barrier + +/// Synchronize all threads in a block, they are are reaching the same +/// instruction (hence all threads in the block are "aligned"). +__attribute__((noinline)) void threadsAligned(); + +#pragma omp end assumes +///} + +} // namespace synchronize + namespace atomic { enum OrderingTy { @@ -83,38 +111,6 @@ ATOMIC_FP_OP(double) } // namespace atomic -namespace synchronize { - -/// Initialize the synchronization machinery. Must be called by all threads. -void init(bool IsSPMD); - -/// Synchronize all threads in a warp identified by \p Mask. -void warp(LaneMaskTy Mask); - -/// Synchronize all threads in a block and perform a fence before and after the -/// barrier according to \p Ordering. Note that the fence might be part of the -/// barrier. -void threads(atomic::OrderingTy Ordering); - -/// Synchronizing threads is allowed even if they all hit different instances of -/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more -/// restrictive in that it requires all threads to hit the same instance. The -/// noinline is removed by the openmp-opt pass and helps to preserve the -/// information till then. -///{ -#pragma omp begin assumes ext_aligned_barrier - -/// Synchronize all threads in a block, they are reaching the same instruction -/// (hence all threads in the block are "aligned"). Also perform a fence before -/// and after the barrier according to \p Ordering. Note that the -/// fence might be part of the barrier if the target offers this. -__attribute__((noinline)) void threadsAligned(atomic::OrderingTy Ordering); - -#pragma omp end assumes -///} - -} // namespace synchronize - namespace fence { /// Memory fence with \p Ordering semantics for the team. diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp index fa615789c05cb..c88aacbf6e432 100644 --- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -40,7 +40,7 @@ static void genericStateMachine(IdentTy *Ident) { ParallelRegionFnTy WorkFn = nullptr; // Wait for the signal that we have a new work function. - synchronize::threads(atomic::seq_cst); + synchronize::threads(); // Retrieve the work function from the runtime. bool IsActive = __kmpc_kernel_parallel(&WorkFn); @@ -56,7 +56,7 @@ static void genericStateMachine(IdentTy *Ident) { __kmpc_kernel_end_parallel(); } - synchronize::threads(atomic::seq_cst); + synchronize::threads(); } while (true); } @@ -74,7 +74,7 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode, Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD; if (IsSPMD) { inititializeRuntime(/* IsSPMD */ true); - synchronize::threadsAligned(atomic::relaxed); + synchronize::threadsAligned(); } else { inititializeRuntime(/* IsSPMD */ false); // No need to wait since only the main threads will execute user @@ -83,10 +83,6 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode, if (IsSPMD) { state::assumeInitialState(IsSPMD); - - // Synchronize to ensure the assertions above are in an aligned region. - // The barrier is eliminated later. - synchronize::threadsAligned(atomic::relaxed); return -1; } @@ -136,11 +132,7 @@ void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode) { FunctionTracingRAII(); const bool IsSPMD = Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD; - - synchronize::threadsAligned(atomic::acq_rel); state::assumeInitialState(IsSPMD); - synchronize::threadsAligned(atomic::relaxed); - if (IsSPMD) return; diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp index d32dd7e4f9980..d2fee11236302 100644 --- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -113,7 +113,7 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, if (mapping::isSPMDMode()) { // Avoid the race between the read of the `icv::Level` above and the write // below by synchronizing all threads here. - synchronize::threadsAligned(atomic::seq_cst); + synchronize::threadsAligned(); { // Note that the order here is important. `icv::Level` has to be updated // last or the other updates will cause a thread specific state to be @@ -128,36 +128,28 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, // Synchronize all threads after the main thread (TId == 0) set up the // team state properly. - synchronize::threadsAligned(atomic::acq_rel); + synchronize::threadsAligned(); state::ParallelTeamSize.assert_eq(NumThreads, ident, /* ForceTeamState */ true); icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true); icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true); - // Ensure we synchronize before we run user code to avoid invalidating the - // assumptions above. - synchronize::threadsAligned(atomic::relaxed); - if (TId < NumThreads) invokeMicrotask(TId, 0, fn, args, nargs); // Synchronize all threads at the end of a parallel region. - synchronize::threadsAligned(atomic::seq_cst); + synchronize::threadsAligned(); } // Synchronize all threads to make sure every thread exits the scope above; // otherwise the following assertions and the assumption in // __kmpc_target_deinit may not hold. - synchronize::threadsAligned(atomic::acq_rel); + synchronize::threadsAligned(); state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true); icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true); icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true); - - // Ensure we synchronize to create an aligned region around the assumptions. - synchronize::threadsAligned(atomic::relaxed); - return; } @@ -251,9 +243,9 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, /* ForceTeamState */ true); // Master signals work to activate workers. - synchronize::threads(atomic::seq_cst); + synchronize::threads(); // Master waits for workers to signal. - synchronize::threads(atomic::seq_cst); + synchronize::threads(); } if (nargs) diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp index 5d106a1a4dcf3..90d03dd490b24 100644 --- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -123,8 +123,8 @@ void fenceTeam(atomic::OrderingTy Ordering); void fenceKernel(atomic::OrderingTy Ordering); void fenceSystem(atomic::OrderingTy Ordering); void syncWarp(__kmpc_impl_lanemask_t); -void syncThreads(atomic::OrderingTy Ordering); -void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); } +void syncThreads(); +void syncThreadsAligned() { syncThreads(); } void unsetLock(omp_lock_t *); int testLock(omp_lock_t *); void initLock(omp_lock_t *); @@ -261,16 +261,8 @@ void syncWarp(__kmpc_impl_lanemask_t) { // AMDGCN doesn't need to sync threads in a warp } -void syncThreads(atomic::OrderingTy Ordering) { - if (Ordering != atomic::relaxed) - fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst); - - __builtin_amdgcn_s_barrier(); - - if (Ordering != atomic::relaxed) - fenceTeam(Ordering == atomic::acq_rel ? atomic::aquire : atomic::seq_cst); -} -void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); } +void syncThreads() { __builtin_amdgcn_s_barrier(); } +void syncThreadsAligned() { syncThreads(); } // TODO: Don't have wavefront lane locks. Possibly can't have them. void unsetLock(omp_lock_t *) { __builtin_trap(); } @@ -335,12 +327,12 @@ void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); } void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); } -void syncThreads(atomic::OrderingTy Ordering) { +void syncThreads() { constexpr int BarrierNo = 8; asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory"); } -void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); } +void syncThreadsAligned() { __syncthreads(); } constexpr uint32_t OMP_SPIN = 1000; constexpr uint32_t UNSET = 0; @@ -389,13 +381,9 @@ void synchronize::init(bool IsSPMD) { void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); } -void synchronize::threads(atomic::OrderingTy Ordering) { - impl::syncThreads(Ordering); -} +void synchronize::threads() { impl::syncThreads(); } -void synchronize::threadsAligned(atomic::OrderingTy Ordering) { - impl::syncThreadsAligned(Ordering); -} +void synchronize::threadsAligned() { impl::syncThreadsAligned(); } void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); } @@ -516,13 +504,13 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) { __attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); - synchronize::threadsAligned(atomic::OrderingTy::seq_cst); + synchronize::threadsAligned(); } __attribute__((noinline)) void __kmpc_barrier_simple_generic(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); - synchronize::threads(atomic::OrderingTy::seq_cst); + synchronize::threads(); } int32_t __kmpc_master(IdentTy *Loc, int32_t TId) { diff --git a/openmp/libomptarget/test/offloading/barrier_fence.c b/openmp/libomptarget/test/offloading/barrier_fence.c deleted file mode 100644 index cf796b4301489..0000000000000 --- a/openmp/libomptarget/test/offloading/barrier_fence.c +++ /dev/null @@ -1,75 +0,0 @@ -// RUN: %libomptarget-compile-generic -fopenmp-offload-mandatory -O3 -// RUN: %libomptarget-run-generic - -#include -#include - -struct IdentTy; -void __kmpc_barrier_simple_spmd(struct IdentTy *Loc, int32_t TId); -void __kmpc_barrier_simple_generic(struct IdentTy *Loc, int32_t TId); - -#pragma omp begin declare target device_type(nohost) -static int A[512] __attribute__((address_space(3), loader_uninitialized)); -static int B[512 * 32] __attribute__((loader_uninitialized)); -#pragma omp end declare target - -int main() { - printf("Testing simple spmd barrier\n"); - for (int r = 0; r < 50; r++) { -#pragma omp target teams distribute thread_limit(512) num_teams(440) - for (int j = 0; j < 512 * 32; ++j) { -#pragma omp parallel firstprivate(j) - { - int TId = omp_get_thread_num(); - int TeamId = omp_get_team_num(); - int NT = omp_get_num_threads(); - // Sequential - for (int i = 0; i < NT; ++i) { - // Test shared memory globals - if (TId == i) - A[i] = i + j; - __kmpc_barrier_simple_spmd(0, TId); - if (A[i] != i + j) - __builtin_trap(); - __kmpc_barrier_simple_spmd(0, TId); - // Test generic globals - if (TId == i) - B[TeamId] = i; - __kmpc_barrier_simple_spmd(0, TId); - if (B[TeamId] != i) - __builtin_trap(); - __kmpc_barrier_simple_spmd(0, TId); - } - } - } - } - - printf("Testing simple generic barrier\n"); - for (int r = 0; r < 50; r++) { -#pragma omp target teams distribute thread_limit(512) num_teams(440) - for (int j = 0; j < 512 * 32; ++j) { -#pragma omp parallel firstprivate(j) - { - int TId = omp_get_thread_num(); - int TeamId = omp_get_team_num(); - int NT = omp_get_num_threads(); - // Sequential - for (int i = 0; i < NT; ++i) { - if (TId == i) - A[i] = i + j; - __kmpc_barrier_simple_generic(0, TId); - if (A[i] != i + j) - __builtin_trap(); - __kmpc_barrier_simple_generic(0, TId); - if (TId == i) - B[TeamId] = i; - __kmpc_barrier_simple_generic(0, TId); - if (B[TeamId] != i) - __builtin_trap(); - __kmpc_barrier_simple_generic(0, TId); - } - } - } - } - return 0; -}