91 changes: 65 additions & 26 deletions llvm/lib/Transforms/IPO/AttributorAttributes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,9 @@ static bool genericValueTraversal(
"Expected liveness in the presence of instructions!");
for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) {
BasicBlock *IncomingBB = PHI->getIncomingBlock(u);
bool UsedAssumedInformation = false;
if (A.isAssumedDead(*IncomingBB->getTerminator(), &QueryingAA,
LivenessAA,
LivenessAA, UsedAssumedInformation,
/* CheckBBLivenessOnly */ true)) {
AnyDead = true;
continue;
Expand Down Expand Up @@ -794,7 +795,9 @@ struct AANoUnwindImpl : AANoUnwind {
return false;
};

if (!A.checkForAllInstructions(CheckForNoUnwind, *this, Opcodes))
bool UsedAssumedInformation = false;
if (!A.checkForAllInstructions(CheckForNoUnwind, *this, Opcodes,
UsedAssumedInformation))
return indicatePessimisticFixpoint();

return ChangeStatus::UNCHANGED;
Expand Down Expand Up @@ -1052,7 +1055,9 @@ ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) {

// Discover returned values from all live returned instructions in the
// associated function.
if (!A.checkForAllInstructions(ReturnInstCB, *this, {Instruction::Ret}))
bool UsedAssumedInformation = false;
if (!A.checkForAllInstructions(ReturnInstCB, *this, {Instruction::Ret},
UsedAssumedInformation))
return indicatePessimisticFixpoint();
return Changed;
}
Expand Down Expand Up @@ -1185,8 +1190,11 @@ ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) {
return !cast<CallBase>(I).isConvergent();
};

if (!A.checkForAllReadWriteInstructions(CheckRWInstForNoSync, *this) ||
!A.checkForAllCallLikeInstructions(CheckForNoSync, *this))
bool UsedAssumedInformation = false;
if (!A.checkForAllReadWriteInstructions(CheckRWInstForNoSync, *this,
UsedAssumedInformation) ||
!A.checkForAllCallLikeInstructions(CheckForNoSync, *this,
UsedAssumedInformation))
return indicatePessimisticFixpoint();

return ChangeStatus::UNCHANGED;
Expand Down Expand Up @@ -1246,7 +1254,9 @@ struct AANoFreeImpl : public AANoFree {
return NoFreeAA.isAssumedNoFree();
};

if (!A.checkForAllCallLikeInstructions(CheckForNoFree, *this))
bool UsedAssumedInformation = false;
if (!A.checkForAllCallLikeInstructions(CheckForNoFree, *this,
UsedAssumedInformation))
return indicatePessimisticFixpoint();
return ChangeStatus::UNCHANGED;
}
Expand Down Expand Up @@ -1708,7 +1718,9 @@ struct AANoRecurseFunction final : AANoRecurseImpl {
return true;
};

if (!A.checkForAllCallLikeInstructions(CheckForNoRecurse, *this))
bool UsedAssumedInformation = false;
if (!A.checkForAllCallLikeInstructions(CheckForNoRecurse, *this,
UsedAssumedInformation))
return indicatePessimisticFixpoint();
return ChangeStatus::UNCHANGED;
}
Expand Down Expand Up @@ -1915,20 +1927,24 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
return true;
};

bool UsedAssumedInformation = false;
A.checkForAllInstructions(InspectMemAccessInstForUB, *this,
{Instruction::Load, Instruction::Store,
Instruction::AtomicCmpXchg,
Instruction::AtomicRMW},
UsedAssumedInformation,
/* CheckBBLivenessOnly */ true);
A.checkForAllInstructions(InspectBrInstForUB, *this, {Instruction::Br},
UsedAssumedInformation,
/* CheckBBLivenessOnly */ true);
A.checkForAllCallLikeInstructions(InspectCallSiteForUB, *this);
A.checkForAllCallLikeInstructions(InspectCallSiteForUB, *this,
UsedAssumedInformation);

// If the returned position of the anchor scope has noundef attribute, check
// all returned instructions.
if (!getAnchorScope()->getReturnType()->isVoidTy()) {
const IRPosition &ReturnIRP = IRPosition::returned(*getAnchorScope());
if (!A.isAssumedDead(ReturnIRP, this, nullptr)) {
if (!A.isAssumedDead(ReturnIRP, this, nullptr, UsedAssumedInformation)) {
auto &RetPosNoUndefAA =
A.getAAFor<AANoUndef>(*this, ReturnIRP, DepClassTy::NONE);
if (RetPosNoUndefAA.isKnownNoUndef())
Expand Down Expand Up @@ -2146,7 +2162,9 @@ struct AAWillReturnImpl : public AAWillReturn {
return NoRecurseAA.isAssumedNoRecurse();
};

if (!A.checkForAllCallLikeInstructions(CheckForWillReturn, *this))
bool UsedAssumedInformation = false;
if (!A.checkForAllCallLikeInstructions(CheckForWillReturn, *this,
UsedAssumedInformation))
return indicatePessimisticFixpoint();

return ChangeStatus::UNCHANGED;
Expand Down Expand Up @@ -2882,8 +2900,9 @@ struct AAIsDeadReturned : public AAIsDeadValueImpl {
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {

bool UsedAssumedInformation = false;
A.checkForAllInstructions([](Instruction &) { return true; }, *this,
{Instruction::Ret});
{Instruction::Ret}, UsedAssumedInformation);

auto PredForCallSite = [&](AbstractCallSite ACS) {
if (ACS.isCallbackCall() || !ACS.getInstruction())
Expand All @@ -2910,7 +2929,9 @@ struct AAIsDeadReturned : public AAIsDeadValueImpl {
AnyChange |= A.changeUseAfterManifest(RI.getOperandUse(0), UV);
return true;
};
A.checkForAllInstructions(RetInstPred, *this, {Instruction::Ret});
bool UsedAssumedInformation = false;
A.checkForAllInstructions(RetInstPred, *this, {Instruction::Ret},
UsedAssumedInformation);
return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}

Expand Down Expand Up @@ -3871,8 +3892,10 @@ struct AANoReturnImpl : public AANoReturn {
/// See AbstractAttribute::updateImpl(Attributor &A).
///
/// Asks the Attributor to run a callback over the `ret` instructions of the
/// associated function. The callback rejects every instruction it is handed,
/// so the query fails as soon as any `ret` is visited, and this attribute is
/// then driven to a pessimistic fixpoint.
///
/// \returns the result of indicatePessimisticFixpoint() if the query fails,
///          ChangeStatus::UNCHANGED otherwise.
virtual ChangeStatus updateImpl(Attributor &A) override {
  // Any return instruction that gets visited is rejected.
  auto CheckForNoReturn = [](Instruction &) { return false; };
  // Handed to the query; its value is not read afterwards in this method.
  bool UsedAssumedInformation = false;
  if (!A.checkForAllInstructions(CheckForNoReturn, *this,
                                 {(unsigned)Instruction::Ret},
                                 UsedAssumedInformation))
    return indicatePessimisticFixpoint();
  return ChangeStatus::UNCHANGED;
}
Expand Down Expand Up @@ -4134,8 +4157,10 @@ struct AACaptureUseTracker final : public CaptureTracker {
/// See CaptureTracker::shouldExplore(...).
///
/// \returns true only for uses whose user is not droppable and which the
///          Attributor does not assume to be dead.
bool shouldExplore(const Use *U) override {
  // Check liveness and ignore droppable users.
  // The flag is handed to the liveness query; it is not read afterwards here.
  bool UsedAssumedInformation = false;
  return !U->getUser()->isDroppable() &&
         !A.isAssumedDead(*U, &NoCaptureAA, &IsDeadAA,
                          UsedAssumedInformation);
}

/// Update the state according to \p CapturedInMem, \p CapturedInInt, and
Expand Down Expand Up @@ -4542,7 +4567,7 @@ struct AAValueSimplifyArgument final : AAValueSimplifyImpl {
// in other functions, e.g., we don't want to say an argument in a
// static function is actually an argument in a different function.
Value &ArgOp = ACSArgPos.getAssociatedValue();
bool UsedAssumedInformation;
bool UsedAssumedInformation = false;
Optional<Value *> SimpleArgOp =
A.getAssumedSimplified(ACSArgPos, *this, UsedAssumedInformation);
if (!SimpleArgOp.hasValue())
Expand Down Expand Up @@ -4817,7 +4842,7 @@ struct AAValueSimplifyCallSiteReturned : AAValueSimplifyImpl {
DepClassTy::REQUIRED);
auto PredForReturned =
[&](Value &RetVal, const SmallSetVector<ReturnInst *, 4> &RetInsts) {
bool UsedAssumedInformation;
bool UsedAssumedInformation = false;
Optional<Value *> CSRetVal = A.translateArgumentToCallSiteContent(
&RetVal, *cast<CallBase>(getCtxI()), *this,
UsedAssumedInformation);
Expand Down Expand Up @@ -4947,8 +4972,10 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
return true;
};

bool UsedAssumedInformation = false;
bool Success = A.checkForAllCallLikeInstructions(
AllocationIdentifierCB, *this, /* CheckBBLivenessOnly */ false,
AllocationIdentifierCB, *this, UsedAssumedInformation,
/* CheckBBLivenessOnly */ false,
/* CheckPotentiallyDead */ true);
(void)Success;
assert(Success && "Did not expect the call base visit callback to fail!");
Expand Down Expand Up @@ -5151,7 +5178,8 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) {
continue;

// No need to analyze dead calls, ignore them instead.
if (A.isAssumedDead(*DI.CB, this, &LivenessAA,
bool UsedAssumedInformation = false;
if (A.isAssumedDead(*DI.CB, this, &LivenessAA, UsedAssumedInformation,
/* CheckBBLivenessOnly */ true))
continue;

Expand Down Expand Up @@ -5776,14 +5804,15 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
// escape into tail recursion.
// TODO: Be smarter about new allocas escaping into tail calls.
SmallVector<CallInst *, 16> TailCalls;
bool UsedAssumedInformation = false;
if (!A.checkForAllInstructions(
[&](Instruction &I) {
CallInst &CI = cast<CallInst>(I);
if (CI.isTailCall())
TailCalls.push_back(&CI);
return true;
},
*this, {Instruction::Call}))
*this, {Instruction::Call}, UsedAssumedInformation))
return ChangeStatus::UNCHANGED;

Argument *Arg = getAssociatedArgument();
Expand Down Expand Up @@ -6331,7 +6360,9 @@ ChangeStatus AAMemoryBehaviorFunction::updateImpl(Attributor &A) {
return !isAtFixpoint();
};

if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
bool UsedAssumedInformation = false;
if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this,
UsedAssumedInformation))
return indicatePessimisticFixpoint();

return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
Expand Down Expand Up @@ -6383,10 +6414,13 @@ ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) {
for (unsigned i = 0; i < Uses.size() && !isAtFixpoint(); i++) {
const Use *U = Uses[i];
Instruction *UserI = cast<Instruction>(U->getUser());
bool UsedAssumedInformation = false;
LLVM_DEBUG(dbgs() << "[AAMemoryBehavior] Use: " << **U << " in " << *UserI
<< " [Dead: " << (A.isAssumedDead(*U, this, &LivenessAA))
<< " [Dead: "
<< (A.isAssumedDead(*U, this, &LivenessAA,
UsedAssumedInformation))
<< "]\n");
if (A.isAssumedDead(*U, this, &LivenessAA))
if (A.isAssumedDead(*U, this, &LivenessAA, UsedAssumedInformation))
continue;

// Droppable users, e.g., llvm::assume does not actually perform any action.
Expand Down Expand Up @@ -7003,7 +7037,9 @@ struct AAMemoryLocationFunction final : public AAMemoryLocationImpl {
return getAssumedNotAccessedLocation() != VALID_STATE;
};

if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
bool UsedAssumedInformation = false;
if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this,
UsedAssumedInformation))
return indicatePessimisticFixpoint();

Changed |= AssumedState != getAssumed();
Expand Down Expand Up @@ -8310,12 +8346,13 @@ struct AANoUndefImpl : AANoUndef {
// We don't manifest noundef attribute for dead positions because the
// associated values with dead positions would be replaced with undef
// values.
if (A.isAssumedDead(getIRPosition(), nullptr, nullptr))
bool UsedAssumedInformation = false;
if (A.isAssumedDead(getIRPosition(), nullptr, nullptr,
UsedAssumedInformation))
return ChangeStatus::UNCHANGED;
// A position whose simplified value does not have any value is
// considered to be dead. We don't manifest noundef in such positions for
// the same reason above.
bool UsedAssumedInformation = false;
if (!A.getAssumedSimplified(getIRPosition(), *this, UsedAssumedInformation)
.hasValue())
return ChangeStatus::UNCHANGED;
Expand Down Expand Up @@ -8466,7 +8503,9 @@ struct AACallEdgesFunction : public AACallEdges {
};

// Visit all callable instructions.
if (!A.checkForAllCallLikeInstructions(ProcessCallInst, *this))
bool UsedAssumedInformation = false;
if (!A.checkForAllCallLikeInstructions(ProcessCallInst, *this,
UsedAssumedInformation))
// If we haven't looked at all call like instructions, assume that there
// are unknown callees.
HasUnknownCallee = true;
Expand Down
13 changes: 11 additions & 2 deletions llvm/lib/Transforms/IPO/OpenMPOpt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2078,7 +2078,9 @@ struct AAICVTrackerFunction : public AAICVTracker {
// Track all changes of an ICV.
SetterRFI.foreachUse(TrackValues, F);

bool UsedAssumedInformation = false;
A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
UsedAssumedInformation,
/* CheckBBLivenessOnly */ true);

/// TODO: Figure out a way to avoid adding entry in
Expand Down Expand Up @@ -2261,7 +2263,9 @@ struct AAICVTrackerFunctionReturned : AAICVTracker {
return true;
};

bool UsedAssumedInformation = false;
if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
UsedAssumedInformation,
/* CheckBBLivenessOnly */ true))
UniqueICVValue = nullptr;

Expand Down Expand Up @@ -3203,7 +3207,10 @@ struct AAKernelInfoFunction : AAKernelInfo {
SPMDCompatibilityTracker.insert(&I);
return true;
};
if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))

bool UsedAssumedInformationInCheckRWInst = false;
if (!A.checkForAllReadWriteInstructions(
CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))
SPMDCompatibilityTracker.indicatePessimisticFixpoint();

// Callback to check a call instruction.
Expand All @@ -3216,7 +3223,9 @@ struct AAKernelInfoFunction : AAKernelInfo {
return true;
};

if (!A.checkForAllCallLikeInstructions(CheckCallInst, *this))
bool UsedAssumedInformationInCheckCallInst = false;
if (!A.checkForAllCallLikeInstructions(
CheckCallInst, *this, UsedAssumedInformationInCheckCallInst))
return indicatePessimisticFixpoint();

return StateBefore == getState() ? ChangeStatus::UNCHANGED
Expand Down
4 changes: 0 additions & 4 deletions openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,6 @@

#define WARPSIZE 64

// Maximum number of preallocated arguments to an outlined parallel/simd
// function. Anything more requires dynamic memory allocation.
#define MAX_SHARED_ARGS 20

// Maximum number of omp state objects per SM allocated statically in global
// memory.
#define OMP_STATE_COUNT 32
Expand Down
40 changes: 0 additions & 40 deletions openmp/libomptarget/deviceRTLs/common/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,46 +35,6 @@
#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1

// Argument list needed for L0 parallelism only.
class omptarget_nvptx_SharedArgs {
public:
  // Init, DeInit and EnsureSize must be called by the master thread only.

  // Start out on the preallocated buffer with its fixed capacity.
  INLINE void Init() {
    nArgs = MAX_SHARED_ARGS;
    args = buffer;
  }

  // Free any memory allocated for an outlined parallel function with a large
  // number of arguments and fall back to the preallocated buffer. Nothing
  // happens while the capacity is still MAX_SHARED_ARGS or less.
  INLINE void DeInit() {
    if (nArgs <= MAX_SHARED_ARGS)
      return;
    SafeFree(args, "new extended args");
    Init();
  }

  // Grow the list so that it can hold `size` pointers. Requests that fit in
  // the current capacity are ignored. The previous contents are not copied
  // into the new allocation.
  INLINE void EnsureSize(size_t size) {
    if (size <= nArgs)
      return;
    // A capacity above MAX_SHARED_ARGS means the current list came from
    // SafeMalloc, so release it before replacing it.
    if (nArgs > MAX_SHARED_ARGS) {
      SafeFree(args, "new extended args");
    }
    args = (void **)SafeMalloc(size * sizeof(void *), "new extended args");
    nArgs = size;
  }

  // Called by all threads.
  INLINE void **GetArgs() const { return args; }

private:
  // Buffer of pre-allocated arguments.
  void *buffer[MAX_SHARED_ARGS];
  // Pointer to the arguments buffer; starts off pointing at 'buffer' but can
  // be replaced by a dynamic allocation.
  void **args;
  // Current capacity; starts off as MAX_SHARED_ARGS but can increase.
  uint32_t nArgs;
};

extern omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs);

// Worker slot type which is initialized with the default worker slot
// size of 4*32 bytes.
struct __kmpc_data_sharing_slot {
Expand Down
2 changes: 1 addition & 1 deletion openmp/libomptarget/deviceRTLs/common/omptargeti.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {

// Return the top task descriptor for the logical thread id of the calling
// thread. The parameter is kept so existing callers still compile; the
// logical thread id is obtained without it.
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
  (void)isSPMDExecutionMode; // no longer needed to compute the thread id
  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
37 changes: 25 additions & 12 deletions openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@
#include "target/shuffle.h"
#include "target_impl.h"

// Return true if this is the master thread: never in SPMD mode, otherwise
// when the thread id in the block equals GetMasterThreadID().
INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
  if (isSPMDExecutionMode)
    return false;
  return GetMasterThreadID() == GetThreadIdInBlock();
}

////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -66,7 +61,8 @@ static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes,

EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
Bytes = Bytes + (Bytes % MinBytes);
if (IsMasterThread(__kmpc_is_spmd_exec_mode())) {
int TID = GetThreadIdInBlock();
if (__kmpc_is_generic_main_thread(TID)) {
// Main thread alone, use shared memory if space is available.
if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) {
void *Ptr = &MainSharedStack.Data[MainSharedStack.Usage[0]];
Expand All @@ -75,7 +71,6 @@ EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
return Ptr;
}
} else {
int TID = GetThreadIdInBlock();
int WID = GetWarpId();
unsigned WarpBytes = Bytes * WARPSIZE;
auto AllocSharedStack = [&]() {
Expand All @@ -92,7 +87,6 @@ EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
return __kmpc_alloc_for_warp(AllocSharedStack, Bytes, WarpBytes);
}
// Fallback to malloc
int TID = GetThreadIdInBlock();
unsigned WarpBytes = Bytes * WARPSIZE;
auto AllocGlobal = [&] {
return SafeMalloc(WarpBytes, "AllocGlobalFallback");
Expand Down Expand Up @@ -135,14 +129,32 @@ EXTERN void __kmpc_data_sharing_init_stack() {
}
}

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
#define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64

// Fixed-size argument list; __kmpc_begin_sharing_variables hands it out when
// the requested number of arguments does not exceed
// NUM_SHARED_VARIABLES_IN_SHARED_MEM. Placed with omp_pteam_mem_alloc.
[[clang::loader_uninitialized]] static void
*SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace) \
allocator(omp_pteam_mem_alloc)
// Argument list currently in use: either the fixed-size array above or a
// SafeMalloc'ed list. It is set by __kmpc_begin_sharing_variables and read by
// __kmpc_get_shared_variables. It is not initialized at load time, so it is
// only meaningful after a call to __kmpc_begin_sharing_variables.
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
allocator(omp_pteam_mem_alloc)

// Begin a data sharing context. Maintain a list of references to shared
// variables. This list of references to shared variables will be passed
// to one or more threads.
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
//
// GlobalArgs receives the address of a list with room for nArgs pointers.
// Up to NUM_SHARED_VARIABLES_IN_SHARED_MEM entries are served from the
// statically declared SharedMemVariableSharingSpace; larger requests are
// served by SafeMalloc.
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr =
        (void **)SafeMalloc(nArgs * sizeof(void *), "new extended args");
  }
  // Remember the active list in SharedMemVariableSharingSpacePtr and hand the
  // same pointer to the caller.
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

// End a data sharing context. There is no need to have a list of refs
Expand All @@ -152,7 +164,8 @@ EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
// Release the argument list of the current data sharing context. Only a list
// that does not alias the static SharedMemVariableSharingSpace array is
// passed to SafeFree; the static array itself is left untouched.
EXTERN void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    SafeFree(SharedMemVariableSharingSpacePtr, "new extended args");
}

// This function will return a list of references to global variables. This
Expand All @@ -161,7 +174,7 @@ EXTERN void __kmpc_end_sharing_variables() {
// preserving the order.
// Called by all workers.
// Store the address of the currently active argument list, i.e. the value of
// SharedMemVariableSharingSpacePtr, into *GlobalArgs.
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

// This function is used to init static memory manager. This manager is used to
Expand Down
4 changes: 1 addition & 3 deletions openmp/libomptarget/deviceRTLs/common/src/libcall.cu
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,7 @@ EXTERN int omp_get_thread_limit(void) {
}

// Return the value computed by GetOmpThreadId() for the calling thread and
// report it through PRINT under the LD_IO category.
EXTERN int omp_get_thread_num() {
  int rc = GetOmpThreadId();
  PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
  return rc;
}
Expand Down
4 changes: 2 additions & 2 deletions openmp/libomptarget/deviceRTLs/common/src/loop.cu
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ public:
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode.");
return;
}
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
T tripCount = ub - lb + 1; // +1 because ub is inclusive
Expand Down Expand Up @@ -453,7 +453,7 @@ public:
// ID of a thread in its own warp

// automatically selects thread or warp ID based on selected implementation
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int tid = GetLogicalThreadIdInBlock();
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
"current thread is not needed here; error");
// retrieve schedule
Expand Down
5 changes: 0 additions & 5 deletions openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,4 @@ uint32_t SHARED(execution_param);
////////////////////////////////////////////////////////////////////////////////
void *SHARED(ReductionScratchpadPtr);

////////////////////////////////////////////////////////////////////////////////
// Data sharing related variables.
////////////////////////////////////////////////////////////////////////////////
omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs);

#pragma omp end declare target
6 changes: 4 additions & 2 deletions openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,6 @@ static void __kmpc_generic_kernel_init() {
nThreads = GetNumberOfWorkersInTeam();
threadLimit = nThreads;

omptarget_nvptx_globalArgs.Init();

__kmpc_data_sharing_init_stack();
__kmpc_impl_target_init();
}
Expand Down Expand Up @@ -162,6 +160,10 @@ EXTERN int8_t __kmpc_is_spmd_exec_mode() {
return (execution_param & ModeMask) == Spmd;
}

// Return non-zero iff the kernel is not in SPMD execution mode and Tid equals
// GetMasterThreadID(). GetMasterThreadID() is not consulted in SPMD mode.
EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid) {
  if (__kmpc_is_spmd_exec_mode())
    return false;
  return GetMasterThreadID() == Tid;
}

EXTERN bool __kmpc_kernel_parallel(void**WorkFn);

static void __kmpc_target_region_state_machine(ident_t *Ident) {
Expand Down
9 changes: 4 additions & 5 deletions openmp/libomptarget/deviceRTLs/common/src/parallel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
}

// assume this is only called for nested parallel
int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int threadId = GetLogicalThreadIdInBlock();

// unlike actual parallel, threads in the same team do not share
// the workTaskDescr in this case and num threads is fixed to 1
Expand Down Expand Up @@ -227,7 +227,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
}

// pop stack
int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int threadId = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
// set new top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
Expand All @@ -249,8 +249,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
// it's cheap to recalculate this value so we never use the result
// of this call.
// Return the value computed by GetOmpThreadId() for the calling thread.
// The loc argument is not used.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
  return GetOmpThreadId();
}

////////////////////////////////////////////////////////////////////////////////
Expand All @@ -262,7 +261,7 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
num_threads;
}
Expand Down
7 changes: 4 additions & 3 deletions openmp/libomptarget/deviceRTLs/common/src/reduction.cu
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ static int32_t nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
if (NumThreads == 1)
return 1;
Expand Down Expand Up @@ -184,10 +184,11 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_ListGlobalFctPtr glredFct) {

// Terminate all threads in non-SPMD mode except for the master thread.
if (!__kmpc_is_spmd_exec_mode() && GetThreadIdInBlock() != GetMasterThreadID())
if (!__kmpc_is_spmd_exec_mode() &&
!__kmpc_is_generic_main_thread(GetThreadIdInBlock()))
return 0;

uint32_t ThreadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
uint32_t ThreadId = GetLogicalThreadIdInBlock();

// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
Expand Down
15 changes: 9 additions & 6 deletions openmp/libomptarget/deviceRTLs/common/src/support.cu
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,11 @@ int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
// or a serial region by the master. If the master (whose CUDA thread
// id is GetMasterThreadID()) calls this routine, we return 0 because
// it is a shadow for the first worker.
int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
int GetLogicalThreadIdInBlock() {
// Implemented using control flow (predication) instead of with a modulo
// operation.
int tid = GetThreadIdInBlock();
if (!isSPMDExecutionMode && tid >= GetMasterThreadID())
if (__kmpc_is_generic_main_thread(tid))
return 0;
else
return tid;
Expand All @@ -83,16 +83,19 @@ int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
//
////////////////////////////////////////////////////////////////////////////////

int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
int GetOmpThreadId() {
int tid = GetThreadIdInBlock();
if (__kmpc_is_generic_main_thread(tid))
return 0;
// omp_thread_num
int rc;
if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
rc = 0;
} else if (isSPMDExecutionMode) {
rc = GetThreadIdInBlock();
} else if (__kmpc_is_spmd_exec_mode()) {
rc = tid;
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
rc = currTaskDescr->ThreadId();
}
Expand Down
2 changes: 1 addition & 1 deletion openmp/libomptarget/deviceRTLs/common/src/sync.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
"Expected SPMD mode with uninitialized runtime.");
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
tid = GetLogicalThreadIdInBlock();
int numberOfActiveOMPThreads =
GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
if (numberOfActiveOMPThreads > 1) {
Expand Down
6 changes: 3 additions & 3 deletions openmp/libomptarget/deviceRTLs/common/src/task.cu
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
"bad assumptions");

// 2. push new context: update new task descriptor
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
Expand Down Expand Up @@ -135,7 +135,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
"bad assumptions");

// 2. push new context: update new task descriptor
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
Expand Down Expand Up @@ -163,7 +163,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
// 3... noting to call... is inline
// 4. pop context
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
parentTaskDescr);
// 5. free
Expand Down
5 changes: 2 additions & 3 deletions openmp/libomptarget/deviceRTLs/common/support.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,12 @@ bool isRuntimeInitialized();
////////////////////////////////////////////////////////////////////////////////

// get global ids to locate thread/team info (constant regardless of OMP)
int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
int GetLogicalThreadIdInBlock();
int GetMasterThreadID();
int GetNumberOfWorkersInTeam();

// get OpenMP thread and team ids
int GetOmpThreadId(int threadId,
bool isSPMDExecutionMode); // omp_thread_num
int GetOmpThreadId(); // omp_thread_num
int GetOmpTeamId(); // omp_team_num

// get OpenMP number of threads and team
Expand Down
4 changes: 4 additions & 0 deletions openmp/libomptarget/deviceRTLs/interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,10 @@ EXTERN void __kmpc_parallel_51(ident_t *ident, kmp_int32 global_tid,
// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();

/// Return true if the hardware thread id \p Tid represents the OpenMP main
/// thread in generic mode outside of a parallel region.
EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid);

EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
const void *buf, size_t size,
int16_t is_shared, const void **res);
Expand Down
4 changes: 0 additions & 4 deletions openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,6 @@

#define WARPSIZE 32

// Maximum number of preallocated arguments to an outlined parallel/simd
// function. Anything more requires dynamic memory allocation.
#define MAX_SHARED_ARGS 20

// Maximum number of omp state objects per SM allocated statically in global
// memory.
#if __CUDA_ARCH__ >= 600
Expand Down