200 changes: 193 additions & 7 deletions llvm/lib/Transforms/IPO/OpenMPOpt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/Assumptions.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/InitializePasses.h"
Expand Down Expand Up @@ -73,6 +74,9 @@ STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
"Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
"Number of OpenMP target region entry points (=kernels) identified");
STATISTIC(NumOpenMPTargetRegionKernelsSPMD,
"Number of OpenMP target region entry points (=kernels) executed in "
"SPMD-mode instead of generic-mode");
STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
"Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode without a state machines");
Expand Down Expand Up @@ -481,6 +485,10 @@ struct KernelInfoState : AbstractState {
/// State to track what parallel region we might reach.
BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;

/// State to track if we are in SPMD-mode, assumed or known, and why we
/// decided we cannot be.
BooleanStateWithPtrSetVector<Instruction> SPMDCompatibilityTracker;

/// The __kmpc_target_init call in this kernel, if any. If we find more than
/// one we abort as the kernel is malformed.
CallBase *KernelInitCB = nullptr;
Expand All @@ -507,6 +515,7 @@ struct KernelInfoState : AbstractState {
/// See AbstractState::indicatePessimisticFixpoint(...)
///
/// Drives the unknown-parallel-region tracker and the SPMD compatibility
/// tracker to their pessimistic fixpoints and marks this state as fixed.
///
/// \returns ChangeStatus::CHANGED, unconditionally.
ChangeStatus indicatePessimisticFixpoint() override {
  // Give up on the individual trackers first ...
  ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
  SPMDCompatibilityTracker.indicatePessimisticFixpoint();
  // ... then freeze the kernel info state as a whole.
  IsAtFixpoint = true;
  return ChangeStatus::CHANGED;
}
Expand All @@ -522,6 +531,8 @@ struct KernelInfoState : AbstractState {
const KernelInfoState &getAssumed() const { return *this; }

bool operator==(const KernelInfoState &RHS) const {
if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
return false;
if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
return false;
if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
Expand Down Expand Up @@ -552,6 +563,7 @@ struct KernelInfoState : AbstractState {
indicatePessimisticFixpoint();
KernelDeinitCB = KIS.KernelDeinitCB;
}
SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
return *this;
Expand Down Expand Up @@ -2669,8 +2681,10 @@ struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
const std::string getAsStr() const override {
if (!isValidState())
return "<invalid>";
return

return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
: "generic") +
std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
: "") +
std::string(" #PRs: ") +
std::to_string(ReachedKnownParallelRegions.size()) +
", #Unknown PRs: " +
Expand Down Expand Up @@ -2745,8 +2759,9 @@ struct AAKernelInfoFunction : AAKernelInfo {
assert((KernelInitCB && KernelDeinitCB) &&
"Kernel without __kmpc_target_init or __kmpc_target_deinit!");

// For kernels we might need to initialize/finalize the IsSPMD state and
// we need to register a simplification callback so that the Attributor
// knows the constant arguments to __kmpc_target_init and
// __kmpc_target_deinit might actually change.

Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
Expand All @@ -2767,10 +2782,45 @@ struct AAKernelInfoFunction : AAKernelInfo {
return FalseVal;
};

Attributor::SimplifictionCallbackTy IsSPMDModeSimplifyCB =
    [&](const IRPosition &IRP, const AbstractAttribute *AA,
        bool &UsedAssumedInformation) -> Optional<Value *> {
  // IRP represents the IsSPMD argument of an __kmpc_target_init or
  // __kmpc_target_deinit call. It is answered with the current state of
  // the SPMDCompatibilityTracker.
  if (!isValidState())
    return nullptr;

  // As long as the tracker is not fixed the answer is only assumed, so a
  // querying attribute (if any) is registered as an optional dependence.
  const bool IsFixed = SPMDCompatibilityTracker.isAtFixpoint();
  if (!IsFixed && AA)
    A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
  UsedAssumedInformation = !IsFixed;

  auto &ValueCtx = IRP.getAnchorValue().getContext();
  return ConstantInt::getBool(ValueCtx,
                              SPMDCompatibilityTracker.isAssumed());
};

constexpr const int InitIsSPMDArgNo = 1;
constexpr const int DeinitIsSPMDArgNo = 1;
constexpr const int InitUseStateMachineArgNo = 2;
A.registerSimplificationCallback(
IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
StateMachineSimplifyCB);
A.registerSimplificationCallback(
IRPosition::callsite_argument(*KernelInitCB, InitIsSPMDArgNo),
IsSPMDModeSimplifyCB);
A.registerSimplificationCallback(
IRPosition::callsite_argument(*KernelDeinitCB, DeinitIsSPMDArgNo),
IsSPMDModeSimplifyCB);

// Check if we know we are in SPMD-mode already.
ConstantInt *IsSPMDArg =
dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo));
if (IsSPMDArg && !IsSPMDArg->isZero())
SPMDCompatibilityTracker.indicateOptimisticFixpoint();
}

/// Modify the IR based on the KernelInfoState as the fixpoint iteration is
Expand All @@ -2781,11 +2831,81 @@ struct AAKernelInfoFunction : AAKernelInfo {
if (!KernelInitCB || !KernelDeinitCB)
return ChangeStatus::UNCHANGED;

buildCustomStateMachine(A);
// Known SPMD-mode kernels need no manifest changes.
if (SPMDCompatibilityTracker.isKnown())
return ChangeStatus::UNCHANGED;

// If possible, change the execution mode to SPMD-mode; otherwise, build a
// custom state machine.
if (!changeToSPMDMode(A))
buildCustomStateMachine(A);

return ChangeStatus::CHANGED;
}

/// Try to switch this kernel from generic-mode to SPMD-mode execution.
///
/// If the SPMD compatibility state is not assumed, nothing is modified.
/// Instead, an analysis remark and a debug message are emitted for every
/// non-null instruction recorded in SPMDCompatibilityTracker. Otherwise the
/// `<kernel name>_exec_mode` global is set to 0 and the IsSPMD and
/// UseStateMachine arguments of the kernel init/deinit calls are rewritten.
///
/// \param A The Attributor used to emit remarks and to register the operand
///          replacements.
/// \returns true if the kernel was changed to SPMD-mode, false if it stays
///          in generic-mode.
bool changeToSPMDMode(Attributor &A) {
  if (!SPMDCompatibilityTracker.isAssumed()) {
    // Report every recorded instruction that prevented SPMD-mode.
    for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
      if (!NonCompatibleI)
        continue;
      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        ORA << "Kernel will be executed in generic-mode due to this "
               "potential side-effect";
        // For direct calls, name the callee the user could annotate.
        if (auto *CI = dyn_cast<CallBase>(NonCompatibleI)) {
          if (Function *F = CI->getCalledFunction())
            ORA << ", consider to add "
                   "`__attribute__((assume(\"ompx_spmd_amenable\")))`"
                   " to the called function '"
                << F->getName() << "'";
        }
        return ORA << ".";
      };
      A.emitRemark<OptimizationRemarkAnalysis>(
          NonCompatibleI, "OpenMPKernelNonSPMDMode", Remark);

      LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "
                        << *NonCompatibleI << "\n");
    }

    return false;
  }

  // Adjust the global exec mode flag that tells the runtime what mode this
  // kernel is executed in.
  Function *Kernel = getAnchorScope();
  GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
      (Kernel->getName() + "_exec_mode").str());
  assert(ExecMode && "Kernel without exec mode?");
  assert(ExecMode->getInitializer() &&
         ExecMode->getInitializer()->isOneValue() &&
         "Initially non-SPMD kernel has SPMD exec mode!");
  ExecMode->setInitializer(
      ConstantInt::get(ExecMode->getInitializer()->getType(), 0));

  // Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
  constexpr int InitIsSPMDArgNo = 1;
  constexpr int DeinitIsSPMDArgNo = 1;
  constexpr int InitUseStateMachineArgNo = 2;

  auto &Ctx = getAnchorValue().getContext();
  ConstantInt *TrueVal = ConstantInt::getBool(Ctx, true);
  ConstantInt *FalseVal = ConstantInt::getBool(Ctx, false);
  // IsSPMD becomes true for init and deinit, the state machine is disabled.
  A.changeUseAfterManifest(KernelInitCB->getArgOperandUse(InitIsSPMDArgNo),
                           *TrueVal);
  A.changeUseAfterManifest(
      KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);
  A.changeUseAfterManifest(
      KernelDeinitCB->getArgOperandUse(DeinitIsSPMDArgNo), *TrueVal);
  ++NumOpenMPTargetRegionKernelsSPMD;

  auto Remark = [&](OptimizationRemark OR) {
    return OR << "Generic-mode kernel is changed to SPMD-mode.";
  };
  A.emitRemark<OptimizationRemark>(KernelInitCB, "OpenMPKernelSPMDMode",
                                   Remark);
  return true;
}

ChangeStatus buildCustomStateMachine(Attributor &A) {
assert(ReachedKnownParallelRegions.isValidState() &&
"Custom state machine with invalid parallel region states?");
Expand All @@ -2809,7 +2929,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
!IsSPMD->isZero())
return ChangeStatus::UNCHANGED;

// First, indicate we use a custom state machine now.
// If not SPMD mode, indicate we use a custom state machine now.
auto &Ctx = getAnchorValue().getContext();
auto *FalseVal = ConstantInt::getBool(Ctx, 0);
A.changeUseAfterManifest(
Expand Down Expand Up @@ -2871,7 +2991,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
return ORA
<< "State machine fallback caused by this call. If it is a "
"false positive, use "
"`__attribute__((assume(\"omp_no_openmp\"))` "
"`__attribute__((assume(\"omp_no_openmp\")))` "
"(or \"omp_no_parallelism\").";
};
A.emitRemark<OptimizationRemarkAnalysis>(
Expand Down Expand Up @@ -3064,6 +3184,28 @@ struct AAKernelInfoFunction : AAKernelInfo {
ChangeStatus updateImpl(Attributor &A) override {
KernelInfoState StateBefore = getState();

// Callback to check a read/write instruction.
auto CheckRWInst = [&](Instruction &I) {
// We handle calls later.
if (isa<CallBase>(I))
return true;
// We only care about write effects.
if (!I.mayWriteToMemory())
return true;
if (auto *SI = dyn_cast<StoreInst>(&I)) {
SmallVector<const Value *> Objects;
getUnderlyingObjects(SI->getPointerOperand(), Objects);
if (llvm::all_of(Objects,
[](const Value *Obj) { return isa<AllocaInst>(Obj); }))
return true;
}
// For now we give up on everything but stores.
SPMDCompatibilityTracker.insert(&I);
return true;
};
if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
SPMDCompatibilityTracker.indicatePessimisticFixpoint();

// Callback to check a call instruction.
auto CheckCallInst = [&](Instruction &I) {
auto &CB = cast<CallBase>(I);
Expand Down Expand Up @@ -3101,6 +3243,10 @@ struct AAKernelInfoCallSite : AAKernelInfo {
return Fn && hasAssumption(*Fn, AssumptionStr);
};

// Check for SPMD-mode assumptions.
if (HasAssumption(Callee, "ompx_spmd_amenable"))
SPMDCompatibilityTracker.indicateOptimisticFixpoint();

// First weed out calls we do not care about, that is readonly/readnone
// calls, intrinsics, and "no_openmp" calls. None of these can reach a
// parallel region or anything else we are looking for.
Expand All @@ -3125,6 +3271,11 @@ struct AAKernelInfoCallSite : AAKernelInfo {
HasAssumption(Callee, "omp_no_parallelism")))
ReachedUnknownParallelRegions.insert(&CB);

// If SPMDCompatibilityTracker is not fixed, we need to give up on the
// idea we can run something unknown in SPMD-mode.
if (!SPMDCompatibilityTracker.isAtFixpoint())
SPMDCompatibilityTracker.insert(&CB);

// We have updated the state for this unknown call properly, there won't
// be any change so we indicate a fixpoint.
indicateOptimisticFixpoint();
Expand All @@ -3137,6 +3288,37 @@ struct AAKernelInfoCallSite : AAKernelInfo {
const unsigned int WrapperFunctionArgNo = 6;
RuntimeFunction RF = It->getSecond();
switch (RF) {
// All the functions we know are compatible with SPMD mode.
case OMPRTL___kmpc_is_spmd_exec_mode:
case OMPRTL___kmpc_for_static_fini:
case OMPRTL___kmpc_global_thread_num:
case OMPRTL___kmpc_single:
case OMPRTL___kmpc_end_single:
case OMPRTL___kmpc_master:
case OMPRTL___kmpc_end_master:
case OMPRTL___kmpc_barrier:
break;
case OMPRTL___kmpc_for_static_init_4:
case OMPRTL___kmpc_for_static_init_4u:
case OMPRTL___kmpc_for_static_init_8:
case OMPRTL___kmpc_for_static_init_8u: {
// Check the schedule and allow static schedule in SPMD mode.
unsigned ScheduleArgOpNo = 2;
auto *ScheduleTypeCI =
dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo));
unsigned ScheduleTypeVal =
ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
switch (OMPScheduleType(ScheduleTypeVal)) {
case OMPScheduleType::Static:
case OMPScheduleType::StaticChunked:
case OMPScheduleType::Distribute:
case OMPScheduleType::DistributeChunked:
break;
default:
SPMDCompatibilityTracker.insert(&CB);
break;
};
} break;
case OMPRTL___kmpc_target_init:
KernelInitCB = &CB;
break;
Expand All @@ -3156,9 +3338,13 @@ struct AAKernelInfoCallSite : AAKernelInfo {
break;
case OMPRTL___kmpc_omp_task:
// We do not look into tasks right now, just give up.
SPMDCompatibilityTracker.insert(&CB);
ReachedUnknownParallelRegions.insert(&CB);
break;
default:
// Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
// generally.
SPMDCompatibilityTracker.insert(&CB);
break;
}
// All other OpenMP runtime calls will not reach parallel regions so they
Expand Down
38 changes: 2 additions & 36 deletions llvm/test/Transforms/OpenMP/custom_state_machines.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1526,51 +1526,17 @@ attributes #10 = { convergent nounwind readonly willreturn }
; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_pure_l72
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true)
; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
; CHECK: worker_state_machine.begin:
; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
; CHECK: worker_state_machine.finished:
; CHECK-NEXT: ret void
; CHECK: worker_state_machine.is_active.check:
; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; CHECK: worker_state_machine.parallel_region.check:
; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__13_wrapper
; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; CHECK: worker_state_machine.parallel_region.execute:
; CHECK-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]])
; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; CHECK: worker_state_machine.parallel_region.check1:
; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]]
; CHECK: worker_state_machine.parallel_region.execute2:
; CHECK-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]])
; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; CHECK: worker_state_machine.parallel_region.check3:
; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
; CHECK: worker_state_machine.parallel_region.end:
; CHECK-NEXT: call void @__kmpc_kernel_end_parallel()
; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
; CHECK: worker_state_machine.done.barrier:
; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
; CHECK: thread.user_code.check:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef true, i1 noundef false, i1 noundef true)
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; CHECK: user_code.entry:
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; CHECK-NEXT: call void @__omp_outlined__12(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
; CHECK-NEXT: ret void
; CHECK: worker.exit:
; CHECK-NEXT: ret void
Expand Down
16 changes: 13 additions & 3 deletions llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
target triple = "nvptx64"

; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback [1 known parallel regions, 2 unkown parallel regions] (bad)
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:13:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp"))` (or "omp_no_parallelism")
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:15:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp"))` (or "omp_no_parallelism")
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:13:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp")))` (or "omp_no_parallelism")
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:15:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp")))` (or "omp_no_parallelism")
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:20:1: Generic-mode kernel is executed with a customized state machine [1 known parallel regions] (good)

;; void unknown(void);
Expand All @@ -22,13 +22,15 @@ target triple = "nvptx64"
;; unknown();
;; }
;; }
;;
;;
;; void no_openmp(void) __attribute__((assume("omp_no_openmp")));
;; void test_no_fallback(void) {
;; #pragma omp target teams
;; {
;; known();
;; known();
;; known();
;; no_openmp(); // make it non-spmd
;; }
;; }

Expand All @@ -50,6 +52,7 @@ target triple = "nvptx64"
@__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode = weak constant i8 1
@12 = private unnamed_addr constant [73 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;known;4;1;;\00", align 1
@13 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([73 x i8], [73 x i8]* @12, i32 0, i32 0) }, align 8
@G = external global i32
@llvm.compiler.used = appending global [2 x i8*] [i8* @__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode, i8* @__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode], section "llvm.metadata"

; Function Attrs: convergent norecurse nounwind
Expand Down Expand Up @@ -124,6 +127,8 @@ user_code.entry: ; preds = %entry
%6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %6, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !43
call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !45
call void @no_openmp()
call void @no_parallelism()
call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i1 false, i1 true) #3, !dbg !46
br label %common.ret
}
Expand Down Expand Up @@ -154,13 +159,18 @@ declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5

declare void @no_openmp() #7
declare void @no_parallelism() #8

attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #2 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #3 = { nounwind }
attributes #4 = { norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #5 = { argmemonly nofree nosync nounwind willreturn }
attributes #6 = { convergent nounwind }
attributes #7 = { "llvm.assume"="omp_no_openmp" }
attributes #8 = { "llvm.assume"="omp_no_parallelism" }

!llvm.dbg.cu = !{!0}
!omp_offload.info = !{!3, !4}
Expand Down
214 changes: 214 additions & 0 deletions llvm/test/Transforms/OpenMP/spmdization.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s

;; void unknown(void);
;; void spmd_amenable(void) __attribute__((assume("ompx_spmd_amenable")));
;;
;; void sequential_loop() {
;; #pragma omp target teams
;; {
;; for (int i = 0; i < 100; ++i) {
;; #pragma omp parallel
;; {
;; unknown();
;; }
;; }
;; spmd_amenable();
;; }
;; }

target triple = "nvptx64"

%struct.ident_t = type { i32, i32, i32, i32, i8* }

@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
@__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode = weak constant i8 1
@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode], section "llvm.metadata"

; The second argument of __kmpc_target_init and __kmpc_target_deinit is set to true to indicate that we can run in SPMD mode.
; We also adjusted the global __omp_offloading_2c_38c77_sequential_loop_l4_exec_mode to have a zero initializer (which indicates SPMD mode to the runtime).
;.
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
; CHECK: @[[__OMP_OFFLOADING_2C_38C77_SEQUENTIAL_LOOP_L4_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode], section "llvm.metadata"
;.
define weak void @__omp_offloading_2c_38c77_sequential_loop_l4() #0 {
; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_38c77_sequential_loop_l4
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true)
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; CHECK: user_code.entry:
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; CHECK-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
; CHECK-NEXT: ret void
; CHECK: worker.exit:
; CHECK-NEXT: ret void
;
entry:
%.zero.addr = alloca i32, align 4
%.threadid_temp. = alloca i32, align 4
store i32 0, i32* %.zero.addr, align 4
%0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
%exec_user_code = icmp eq i32 %0, -1
br i1 %exec_user_code, label %user_code.entry, label %worker.exit

user_code.entry: ; preds = %entry
%1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
store i32 %1, i32* %.threadid_temp., align 4
call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #2
call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
ret void

worker.exit: ; preds = %entry
ret void
}

declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
; CHECK-NEXT: br label [[FOR_INC]]
; CHECK: for.inc:
; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
; CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: call void @spmd_amenable()
; CHECK-NEXT: ret void
;
entry:
%captured_vars_addrs = alloca [0 x i8*], align 8
br label %for.cond

for.cond: ; preds = %for.inc, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
%cmp = icmp slt i32 %i.0, 100
br i1 %cmp, label %for.body, label %for.end

for.body: ; preds = %for.cond
%0 = load i32, i32* %.global_tid., align 4
%1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %1, i64 0)
br label %for.inc

for.inc: ; preds = %for.body
%inc = add nsw i32 %i.0, 1
br label %for.cond, !llvm.loop !6

for.end: ; preds = %for.cond
call void @spmd_amenable()
ret void
}

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1
; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: call void @unknown() #[[ATTR4:[0-9]+]]
; CHECK-NEXT: ret void
;
entry:
call void @unknown() #3
ret void
}

; Function Attrs: convergent
declare void @unknown() #1

; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #0 {
; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
; CHECK-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
; CHECK-NEXT: ret void
;
entry:
%.addr1 = alloca i32, align 4
%.zero.addr = alloca i32, align 4
%global_args = alloca i8**, align 8
store i32 0, i32* %.zero.addr, align 4
store i32 %1, i32* %.addr1, align 4
call void @__kmpc_get_shared_variables(i8*** %global_args)
call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) #2
ret void
}

declare void @__kmpc_get_shared_variables(i8***)

declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)

; Function Attrs: nounwind
declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #2

declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)

declare void @spmd_amenable() #4

attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #2 = { nounwind }
attributes #3 = { convergent }
attributes #4 = { "llvm.assume"="ompx_spmd_amenable" }

!omp_offload.info = !{!0}
!nvvm.annotations = !{!1}
!llvm.module.flags = !{!2, !3, !4, !8, !9}
!llvm.ident = !{!5}

!0 = !{i32 0, i32 44, i32 232567, !"sequential_loop", i32 4, i32 0}
!1 = !{void ()* @__omp_offloading_2c_38c77_sequential_loop_l4, !"kernel", i32 1}
!2 = !{i32 1, !"wchar_size", i32 4}
!3 = !{i32 7, !"PIC Level", i32 2}
!4 = !{i32 7, !"frame-pointer", i32 2}
!5 = !{!"clang version 13.0.0"}
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.mustprogress"}
!8 = !{i32 7, !"openmp", i32 50}
!9 = !{i32 7, !"openmp-device", i32 50}
;.
; CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
; CHECK: attributes #[[ATTR2]] = { nounwind }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" }
; CHECK: attributes #[[ATTR4]] = { convergent }
;.
; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 44, i32 232567, !"sequential_loop", i32 4, i32 0}
; CHECK: [[META1:![0-9]+]] = !{void ()* @__omp_offloading_2c_38c77_sequential_loop_l4, !"kernel", i32 1}
; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; CHECK: [[META3:![0-9]+]] = !{i32 7, !"PIC Level", i32 2}
; CHECK: [[META4:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50}
; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
; CHECK: [[META7:![0-9]+]] = !{!"clang version 13.0.0"}
; CHECK: [[LOOP8]] = distinct !{!8, !9}
; CHECK: [[META9:![0-9]+]] = !{!"llvm.loop.mustprogress"}
;.
233 changes: 233 additions & 0 deletions llvm/test/Transforms/OpenMP/spmdization_remarks.ll

Large diffs are not rendered by default.

24 changes: 12 additions & 12 deletions openmp/libomptarget/deviceRTLs/common/src/loop.cu
Original file line number Diff line number Diff line change
Expand Up @@ -204,15 +204,15 @@ public:
INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId,
kmp_sched_t schedule, T lb, T ub, ST st,
ST chunk) {
if (checkRuntimeUninitialized(loc)) {
if (isRuntimeUninitialized()) {
// In SPMD mode no need to check parallelism level - dynamic scheduling
// may appear only in L2 parallel regions with lightweight runtime.
ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode.");
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode.");
return;
}
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc));
T tnum = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
T tripCount = ub - lb + 1; // +1 because ub is inclusive
ASSERT0(LT_FUSSY, threadId < tnum,
"current thread is not needed here; error");
Expand Down Expand Up @@ -441,10 +441,10 @@ public:

INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
T *plower, T *pupper, ST *pstride) {
if (checkRuntimeUninitialized(loc)) {
if (isRuntimeUninitialized()) {
// In SPMD mode no need to check parallelism level - dynamic scheduling
// may appear only in L2 parallel regions with lightweight runtime.
ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode.");
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode.");
if (*plast)
return DISPATCH_FINISHED;
*plast = 1;
Expand All @@ -453,8 +453,8 @@ public:
// ID of a thread in its own warp

// automatically selects thread or warp ID based on selected implementation
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)),
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
"current thread is not needed here; error");
// retrieve schedule
kmp_sched_t schedule =
Expand Down Expand Up @@ -624,7 +624,7 @@ EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc));
__kmpc_is_spmd_exec_mode());
}

EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
Expand All @@ -635,7 +635,7 @@ EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc));
__kmpc_is_spmd_exec_mode());
}

EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
Expand All @@ -646,7 +646,7 @@ EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc));
__kmpc_is_spmd_exec_mode());
}

EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
Expand All @@ -657,7 +657,7 @@ EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc));
__kmpc_is_spmd_exec_mode());
}

EXTERN
Expand Down
20 changes: 10 additions & 10 deletions openmp/libomptarget/deviceRTLs/common/src/parallel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -181,14 +181,14 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {

IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode with uninitialized runtime.");
return;
}

// assume this is only called for nested parallel
int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());

// unlike actual parallel, threads in the same team do not share
// the workTaskDescr in this case and num threads is fixed to 1
Expand Down Expand Up @@ -220,14 +220,14 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,

DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode with uninitialized runtime.");
return;
}

// pop stack
int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
// set new top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
Expand All @@ -249,8 +249,8 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
return GetOmpThreadId(tid, checkSPMDMode(loc));
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
return GetOmpThreadId(tid, __kmpc_is_spmd_exec_mode());
}

////////////////////////////////////////////////////////////////////////////////
Expand All @@ -260,9 +260,9 @@ EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
int32_t num_threads) {
PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
num_threads;
}
Expand Down
12 changes: 6 additions & 6 deletions openmp/libomptarget/deviceRTLs/common/src/reduction.cu
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,11 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(
global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
__kmpc_is_spmd_exec_mode(), isRuntimeUninitialized());
}

INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
return checkGenericMode(loc) || IsTeamMaster(ThreadId);
return !__kmpc_is_spmd_exec_mode() || IsTeamMaster(ThreadId);
}

INLINE static uint32_t roundToWarpsize(uint32_t s) {
Expand All @@ -184,16 +184,16 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_ListGlobalFctPtr glredFct) {

// Terminate all threads in non-SPMD mode except for the master thread.
if (checkGenericMode(loc) && GetThreadIdInBlock() != GetMasterThreadID())
if (!__kmpc_is_spmd_exec_mode() && GetThreadIdInBlock() != GetMasterThreadID())
return 0;

uint32_t ThreadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
uint32_t ThreadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());

// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads =
checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
__kmpc_is_spmd_exec_mode() ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
Expand Down Expand Up @@ -225,7 +225,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
}
// Synchronize
if (checkSPMDMode(loc))
if (__kmpc_is_spmd_exec_mode())
__kmpc_barrier(loc, global_tid);

// reduce_data is global or shared so before being reduced within the
Expand Down
51 changes: 0 additions & 51 deletions openmp/libomptarget/deviceRTLs/common/src/support.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,57 +34,6 @@ bool isRuntimeInitialized() {
return (execution_param & RuntimeMask) == RuntimeInitialized;
}

////////////////////////////////////////////////////////////////////////////////
// Execution Modes based on location parameter fields
////////////////////////////////////////////////////////////////////////////////

// Decide whether execution is in SPMD mode, preferring the flags stored in
// the location descriptor and falling back to the runtime query when the
// descriptor is missing or its flags are inconclusive.
//
// loc: location descriptor; may be null.
// Returns true for SPMD mode, false otherwise.
bool checkSPMDMode(kmp_Ident *loc) {
  if (loc) {
    const bool HasSPMDFlag = (loc->reserved_2 & KMP_IDENT_SPMD_MODE) != 0;
    const bool HasSimpleRTFlag =
        (loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE) != 0;

    // The SPMD flag alone is conclusive: this cannot be the UNDEFINED state.
    if (HasSPMDFlag)
      return true;

    // Non-SPMD with the simple-runtime flag clear (runtime required) is a
    // valid flag combination, so the answer is known without a query.
    if (!HasSimpleRTFlag)
      return false;
  }

  // Either there is no descriptor or the flags are in the undefined state;
  // ask the runtime instead.
  return __kmpc_is_spmd_exec_mode();
}

// Generic mode is defined here as the logical negation of checkSPMDMode(loc).
bool checkGenericMode(kmp_Ident *loc) {
  const bool InSPMDMode = checkSPMDMode(loc);
  return !InSPMDMode;
}

// Decide whether the runtime is uninitialized, preferring the flags stored in
// the location descriptor and falling back to the runtime query when the
// descriptor is missing or its flags are inconclusive.
//
// loc: location descriptor; may be null.
// Returns true when the runtime is considered uninitialized.
bool checkRuntimeUninitialized(kmp_Ident *loc) {
  if (loc) {
    // Simple-runtime flag clear means the runtime is required, so this cannot
    // be the undefined mode and the runtime is treated as initialized.
    if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE))
      return false;

    // From here on the runtime is not required. Combined with the SPMD flag
    // that is a definite "uninitialized" answer.
    if (loc->reserved_2 & KMP_IDENT_SPMD_MODE)
      return true;

    // What remains is non-SPMD + no-runtime-required, a combination that
    // cannot actually happen and is used to mark orphaned functions (the
    // UNDEFINED state); fall through to the runtime query.
  }

  return isRuntimeUninitialized();
}

// Logical negation of checkRuntimeUninitialized(loc).
bool checkRuntimeInitialized(kmp_Ident *loc) {
  const bool Uninitialized = checkRuntimeUninitialized(loc);
  return !Uninitialized;
}

////////////////////////////////////////////////////////////////////////////////
// support: get info from machine
////////////////////////////////////////////////////////////////////////////////
Expand Down
10 changes: 5 additions & 5 deletions openmp/libomptarget/deviceRTLs/common/src/sync.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,16 @@ EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) {
}

EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
if (checkRuntimeUninitialized(loc_ref)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc_ref),
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
"Expected SPMD mode with uninitialized runtime.");
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref));
tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
int numberOfActiveOMPThreads =
GetNumberOfOmpThreads(checkSPMDMode(loc_ref));
GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
if (numberOfActiveOMPThreads > 1) {
if (checkSPMDMode(loc_ref)) {
if (__kmpc_is_spmd_exec_mode()) {
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
// The #threads parameter must be rounded up to the WARPSIZE.
Expand Down
12 changes: 6 additions & 6 deletions openmp/libomptarget/deviceRTLs/common/src/task.cu
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
void *noAliasDepList) {
PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
P64(newKmpTaskDescr));
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
// 1. get explicit task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
Expand All @@ -96,7 +96,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
"bad assumptions");

// 2. push new context: update new task descriptor
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
Expand All @@ -122,7 +122,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
(unsigned long long)newKmpTaskDescr);
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
// 1. get explicit task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
Expand All @@ -135,7 +135,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
"bad assumptions");

// 2. push new context: update new task descriptor
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
Expand All @@ -148,7 +148,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
kmp_TaskDescr *newKmpTaskDescr) {
PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
(unsigned long long)newKmpTaskDescr);
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
// 1. get explicit task descr from kmp task descr
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
Expand All @@ -163,7 +163,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
// 3... nothing to call... is inline
// 4. pop context
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
parentTaskDescr);
// 5. free
Expand Down
9 changes: 0 additions & 9 deletions openmp/libomptarget/deviceRTLs/common/support.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,6 @@ bool isGenericMode();
bool isRuntimeUninitialized();
bool isRuntimeInitialized();

////////////////////////////////////////////////////////////////////////////////
// Execution Modes based on location parameter fields
////////////////////////////////////////////////////////////////////////////////

bool checkSPMDMode(kmp_Ident *loc);
bool checkGenericMode(kmp_Ident *loc);
bool checkRuntimeUninitialized(kmp_Ident *loc);
bool checkRuntimeInitialized(kmp_Ident *loc);

////////////////////////////////////////////////////////////////////////////////
// get info from machine
////////////////////////////////////////////////////////////////////////////////
Expand Down