@@ -29,47 +29,52 @@ namespace impl {
 ///
 ///{
 /// NOTE: This function needs to be implemented by every target.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+uint32_t atomicInc(uint32_t *Address, uint32_t Val,
+                   atomic::OrderingTy Ordering);
-uint32_t atomicLoad(uint32_t *Address, int Ordering) {
-  return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
+uint32_t atomicLoad(uint32_t *Address, atomic::OrderingTy Ordering) {
+  return __atomic_fetch_add(Address, 0U, Ordering);
 }
-void atomicStore(uint32_t *Address, uint32_t Val, int Ordering) {
+void atomicStore(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering) {
   __atomic_store_n(Address, Val, Ordering);
 }
-uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
+uint32_t atomicAdd(uint32_t *Address, uint32_t Val,
+                   atomic::OrderingTy Ordering) {
   return __atomic_fetch_add(Address, Val, Ordering);
 }
-uint32_t atomicMax(uint32_t *Address, uint32_t Val, int Ordering) {
+uint32_t atomicMax(uint32_t *Address, uint32_t Val,
+                   atomic::OrderingTy Ordering) {
   return __atomic_fetch_max(Address, Val, Ordering);
 }
-uint32_t atomicExchange(uint32_t *Address, uint32_t Val, int Ordering) {
+uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
+                        atomic::OrderingTy Ordering) {
   uint32_t R;
   __atomic_exchange(Address, &Val, &R, Ordering);
   return R;
 }
 uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val,
-                   int Ordering) {
+                   atomic::OrderingTy Ordering) {
   (void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering,
                                   Ordering);
   return Compare;
 }
-uint64_t atomicAdd(uint64_t *Address, uint64_t Val, int Ordering) {
+uint64_t atomicAdd(uint64_t *Address, uint64_t Val,
+                   atomic::OrderingTy Ordering) {
   return __atomic_fetch_add(Address, Val, Ordering);
 }
 ///}
 // Forward declarations defined to be defined for AMDGCN and NVPTX.
-uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering);
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering);
 void namedBarrierInit();
 void namedBarrier();
-void fenceTeam(int Ordering);
-void fenceKernel(int Ordering);
-void fenceSystem(int Ordering);
+void fenceTeam(atomic::OrderingTy Ordering);
+void fenceKernel(atomic::OrderingTy Ordering);
+void fenceSystem(atomic::OrderingTy Ordering);
 void syncWarp(__kmpc_impl_lanemask_t);
 void syncThreads();
 void syncThreadsAligned() { syncThreads(); }
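For reference, the atomic::OrderingTy type these signatures now take is defined outside this diff (in the DeviceRTL synchronization header). Judging from the one-to-one switch cases further down, it is presumably a plain enum whose enumerators carry the same values as the corresponding __ATOMIC_* macros, roughly:

namespace atomic {
enum OrderingTy {
  relaxed = __ATOMIC_RELAXED,
  aquire = __ATOMIC_ACQUIRE, // spelled as in the identifiers this patch uses
  release = __ATOMIC_RELEASE,
  acq_rel = __ATOMIC_ACQ_REL,
  seq_cst = __ATOMIC_SEQ_CST,
};
} // namespace atomic

With matching underlying values, forwarding an OrderingTy straight to the __atomic_* builtins, as atomicLoad, atomicStore and friends do above, remains well defined.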
@@ -84,30 +89,30 @@ void setLock(omp_lock_t *);
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
-uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering) {
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering) {
   // builtin_amdgcn_atomic_inc32 should expand to this switch when
   // passed a runtime value, but does not do so yet. Workaround here.
   switch (Ordering) {
   default:
     __builtin_unreachable();
-  case __ATOMIC_RELAXED:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELAXED, "");
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQUIRE, "");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELEASE, "");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQ_REL, "");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_SEQ_CST, "");
+  case atomic::relaxed:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "");
+  case atomic::aquire:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::aquire, "");
+  case atomic::release:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::release, "");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::acq_rel, "");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::seq_cst, "");
   }
 }
 uint32_t SHARED(namedBarrierTracker);
 void namedBarrierInit() {
   // Don't have global ctors, and shared memory is not zero init
-  atomic::store(&namedBarrierTracker, 0u, __ATOMIC_RELEASE);
+  atomic::store(&namedBarrierTracker, 0u, atomic::release);
 }
 void namedBarrier() {
@@ -117,7 +122,7 @@ void namedBarrier() {
   uint32_t WarpSize = mapping::getWarpSize();
   uint32_t NumWaves = NumThreads / WarpSize;
-  fence::team(__ATOMIC_ACQUIRE);
+  fence::team(atomic::aquire);
   // named barrier implementation for amdgcn.
   // Uses two 16 bit unsigned counters. One for the number of waves to have
@@ -133,7 +138,7 @@ void namedBarrier() {
   // Increment the low 16 bits once, using the lowest active thread.
   if (mapping::isLeaderInWarp()) {
     uint32_t load = atomic::add(&namedBarrierTracker, 1,
-                                __ATOMIC_RELAXED); // commutative
+                                atomic::relaxed); // commutative
     // Record the number of times the barrier has been passed
     uint32_t generation = load & 0xffff0000u;
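An aside on the masking here: namedBarrierTracker packs the two 16 bit counters the comment above describes into one 32 bit word, the pass ("generation") count in the high half and the wave-arrival count in the low half. A pair of hypothetical helpers, illustrative only and not part of the patch, makes the layout explicit:

// Illustrative only; the runtime manipulates the tracker word directly.
constexpr uint32_t waveCount(uint32_t Tracker) { return Tracker & 0x0000ffffu; }
constexpr uint32_t generationOf(uint32_t Tracker) { return Tracker & 0xffff0000u; }

The last wave to arrive advances the generation and zeroes the count, which is what the masking and store in the next hunk implement, while the remaining waves spin until generationOf(load) changes.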
@@ -145,61 +150,61 @@ void namedBarrier() {
       load &= 0xffff0000u; // because bits zeroed second
       // Reset the wave counter and release the waiting waves
-      atomic::store(&namedBarrierTracker, load, __ATOMIC_RELAXED);
+      atomic::store(&namedBarrierTracker, load, atomic::relaxed);
     } else {
       // more waves still to go, spin until generation counter changes
       do {
         __builtin_amdgcn_s_sleep(0);
-        load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+        load = atomic::load(&namedBarrierTracker, atomic::relaxed);
       } while ((load & 0xffff0000u) == generation);
     }
   }
-  fence::team(__ATOMIC_RELEASE);
+  fence::team(atomic::release);
 }
 // sema checking of amdgcn_fence is aggressive. Intention is to patch clang
 // so that it is usable within a template environment and so that a runtime
 // value of the memory order is expanded to this switch within clang/llvm.
-void fenceTeam(int Ordering) {
+void fenceTeam(atomic::OrderingTy Ordering) {
   switch (Ordering) {
   default:
     __builtin_unreachable();
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+  case atomic::aquire:
+    return __builtin_amdgcn_fence(atomic::aquire, "workgroup");
+  case atomic::release:
+    return __builtin_amdgcn_fence(atomic::release, "workgroup");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_fence(atomic::acq_rel, "workgroup");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_fence(atomic::seq_cst, "workgroup");
   }
 }
-void fenceKernel(int Ordering) {
+void fenceKernel(atomic::OrderingTy Ordering) {
   switch (Ordering) {
   default:
     __builtin_unreachable();
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
+  case atomic::aquire:
+    return __builtin_amdgcn_fence(atomic::aquire, "agent");
+  case atomic::release:
+    return __builtin_amdgcn_fence(atomic::release, "agent");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_fence(atomic::acq_rel, "agent");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_fence(atomic::seq_cst, "agent");
   }
 }
-void fenceSystem(int Ordering) {
+void fenceSystem(atomic::OrderingTy Ordering) {
   switch (Ordering) {
   default:
     __builtin_unreachable();
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
+  case atomic::aquire:
+    return __builtin_amdgcn_fence(atomic::aquire, "");
+  case atomic::release:
+    return __builtin_amdgcn_fence(atomic::release, "");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_fence(atomic::acq_rel, "");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_fence(atomic::seq_cst, "");
   }
 }
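These switches exist only because __builtin_amdgcn_fence insists on a compile-time-constant ordering (the aggressive sema checking the comment above mentions). For comparison, a target whose fence builtin tolerates a runtime ordering would not need them; a hedged sketch of such a generic fallback, with a hypothetical name and not code from this patch:

// __atomic_thread_fence accepts a runtime ordering value (clang lowers it to
// a switch internally), but it takes no scope argument, which is precisely
// what the amdgcn builtin adds back ("workgroup", "agent", "" for system).
void fenceTeamGeneric(atomic::OrderingTy Ordering) {
  __atomic_thread_fence(Ordering);
}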
@@ -226,7 +231,8 @@ void setLock(omp_lock_t *) { __builtin_trap(); }
 #pragma omp begin declare variant match(                                      \
     device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+uint32_t atomicInc(uint32_t *Address, uint32_t Val,
+                   atomic::OrderingTy Ordering) {
   return __nvvm_atom_inc_gen_ui(Address, Val);
 }
@@ -268,11 +274,11 @@ constexpr uint32_t SET = 1;
 // called before it is defined
 // here the overload won't happen. Investigate lalter!
 void unsetLock(omp_lock_t *Lock) {
-  (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST);
+  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
 }
 int testLock(omp_lock_t *Lock) {
-  return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST);
+  return atomicAdd((uint32_t *)Lock, 0u, atomic::seq_cst);
 }
 void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
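For context, these NVPTX-variant helpers back the OpenMP lock API, all built on the seq_cst atomics above. A minimal, purely illustrative caller (not part of the patch):

// Hypothetical usage sketch of the lock helpers declared above.
void criticalSectionSketch() {
  omp_lock_t L;
  initLock(&L);    // delegates to unsetLock, i.e. stores UNSET
  setLock(&L);     // spins on atomicCAS(UNSET -> SET, atomic::seq_cst)
  // ... protected work ...
  unsetLock(&L);   // atomicExchange back to UNSET
  destroyLock(&L); // also just unsets the lock
}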
@@ -281,7 +287,7 @@ void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
 void setLock(omp_lock_t *Lock) {
   // TODO: not sure spinning is a good idea here..
-  while (atomicCAS((uint32_t *)Lock, UNSET, SET, __ATOMIC_SEQ_CST) != UNSET) {
+  while (atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst) != UNSET) {
     int32_t start = __nvvm_read_ptx_sreg_clock();
     int32_t now;
     for (;;) {
@@ -310,29 +316,29 @@ void synchronize::threads() { impl::syncThreads(); }
 void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
-void fence::team(int Ordering) { impl::fenceTeam(Ordering); }
+void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
-void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
+void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }
-void fence::system(int Ordering) { impl::fenceSystem(Ordering); }
+void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }
-uint32_t atomic::load(uint32_t *Addr, int Ordering) {
+uint32_t atomic::load(uint32_t *Addr, atomic::OrderingTy Ordering) {
   return impl::atomicLoad(Addr, Ordering);
 }
-void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
+void atomic::store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
   impl::atomicStore(Addr, V, Ordering);
 }
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
+uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
   return impl::atomicInc(Addr, V, Ordering);
 }
-uint32_t atomic::add(uint32_t *Addr, uint32_t V, int Ordering) {
+uint32_t atomic::add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
   return impl::atomicAdd(Addr, V, Ordering);
 }
-uint64_t atomic::add(uint64_t *Addr, uint64_t V, int Ordering) {
+uint64_t atomic::add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering) {
   return impl::atomicAdd(Addr, V, Ordering);
 }
@@ -389,7 +395,7 @@ void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
 void __kmpc_flush(IdentTy *Loc) {
   FunctionTracingRAII();
-  fence::kernel(__ATOMIC_SEQ_CST);
+  fence::kernel(atomic::seq_cst);
 }
 uint64_t __kmpc_warp_active_thread_mask(void) {