diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index 6e8a651bd886d..228d3f6e556df 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -126,29 +126,17 @@ DEVICE unsigned GetWarpId(); DEVICE unsigned GetLaneId(); // Atomics -template INLINE T __kmpc_atomic_add(T *address, T val) { - return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST); -} - -INLINE uint32_t __kmpc_atomic_inc(uint32_t *address, uint32_t max) { - return __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, ""); -} - -template INLINE T __kmpc_atomic_max(T *address, T val) { - return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST); -} - -template INLINE T __kmpc_atomic_exchange(T *address, T val) { - T r; - __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST); - return r; -} - -template INLINE T __kmpc_atomic_cas(T *address, T compare, T val) { - (void)__atomic_compare_exchange(address, &compare, &val, false, - __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); - return compare; -} +DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); + +static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); +DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *, + unsigned long long); +DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *, + unsigned long long); // Locks DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip index 7388a29215cc0..35828cda0e06b 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -132,11 +132,13 @@ DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size, } // namespace DEVICE int GetNumberOfBlocksInKernel() { - return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x()); + return get_grid_dim(__builtin_amdgcn_grid_size_x(), + __builtin_amdgcn_workgroup_size_x()); } DEVICE int GetNumberOfThreadsInBlock() { - return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(), + return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), + __builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x()); } @@ -149,6 +151,40 @@ EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() { return GetNumberOfThreadsInBlock(); } +// Atomics +DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { + return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); +} +DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { + return __builtin_amdgcn_atomic_inc32(Address, max, __ATOMIC_SEQ_CST, ""); +} +DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { + return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST); +} + +DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { + uint32_t R; + __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); + return R; +} +DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, + uint32_t Val) { + (void)__atomic_compare_exchange(Address, &Compare, &Val, false, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); + return Compare; +} + +DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, + unsigned long long Val) { + unsigned long long R; + __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); + return R; +} +DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address, + unsigned long long Val) { + return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); +} + // Stub implementations DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; } DEVICE void __kmpc_impl_free(void *) {} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu index 75945e3cd8c4c..2bf19523ef6f4 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -140,6 +140,41 @@ DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); } DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } +// Forward declaration of atomics. Although they're template functions, we +// already have definitions for different types in CUDA internal headers with +// the right mangled names. +template DEVICE T atomicAdd(T *address, T val); +template DEVICE T atomicInc(T *address, T val); +template DEVICE T atomicMax(T *address, T val); +template DEVICE T atomicExch(T *address, T val); +template DEVICE T atomicCAS(T *address, T compare, T val); + +DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { + return atomicAdd(Address, Val); +} +DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { + return atomicInc(Address, Val); +} +DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { + return atomicMax(Address, Val); +} +DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { + return atomicExch(Address, Val); +} +DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, + uint32_t Val) { + return atomicCAS(Address, Compare, Val); +} + +DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, + unsigned long long Val) { + return atomicExch(Address, Val); +} +DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address, + unsigned long long Val) { + return atomicAdd(Address, Val); +} + #define __OMP_SPIN 1000 #define UNSET 0u #define SET 1u diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h index 1d7b649fe20e4..1828fcf594bce 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -130,35 +130,18 @@ DEVICE int GetNumberOfThreadsInBlock(); DEVICE unsigned GetWarpId(); DEVICE unsigned GetLaneId(); -// Forward declaration of atomics. Although they're template functions, we -// already have definitions for different types in CUDA internal headers with -// the right mangled names. -template DEVICE T atomicAdd(T *address, T val); -template DEVICE T atomicInc(T *address, T val); -template DEVICE T atomicMax(T *address, T val); -template DEVICE T atomicExch(T *address, T val); -template DEVICE T atomicCAS(T *address, T compare, T val); - // Atomics -template INLINE T __kmpc_atomic_add(T *address, T val) { - return atomicAdd(address, val); -} - -template INLINE T __kmpc_atomic_inc(T *address, T val) { - return atomicInc(address, val); -} - -template INLINE T __kmpc_atomic_max(T *address, T val) { - return atomicMax(address, val); -} - -template INLINE T __kmpc_atomic_exchange(T *address, T val) { - return atomicExch(address, val); -} - -template INLINE T __kmpc_atomic_cas(T *address, T compare, T val) { - return atomicCAS(address, compare, val); -} +DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); + +static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); +DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *, + unsigned long long); +DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *, + unsigned long long); // Locks DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);