[OpenMP] Unify the min/max thread/teams pathways
We used to pass the min/max thread/team values through different paths
from the frontend to the middle end. This patch simplifies the situation
by passing the values once, only at the point where we create the
KernelEnvironment that contains them. At that point we also manifest the
metadata, as appropriate. Some footguns have been removed as well; e.g.,
our target check is now triple-based rather than calling-convention-based,
since the latter depends on the ordering of operations. The types of the
values have been unified to int32_t.
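
A rough sketch of the new flow, assuming hypothetical names (KernelBounds, computeBounds, and createKernelEnvironment below are illustrative stand-ins, not the actual Clang/LLVM API): the four bounds are computed once, as int32_t, at the point where the kernel environment is materialized.

#include <cstdint>
#include <cstdio>

struct KernelBounds {
  int32_t MinThreads = 1, MaxThreads = -1; // -1 means "no bound known"
  int32_t MinTeams = 1, MaxTeams = -1;
};

// Computed once from the directive's clauses and attributes (sketch only).
KernelBounds computeBounds() { return {1, 256, 1, -1}; }

// The single consumer: the kernel environment, which also carries the
// bounds into the generated metadata.
void createKernelEnvironment(const KernelBounds &B) {
  std::printf("threads [%d, %d], teams [%d, %d]\n",
              static_cast<int>(B.MinThreads), static_cast<int>(B.MaxThreads),
              static_cast<int>(B.MinTeams), static_cast<int>(B.MaxTeams));
}

int main() {
  KernelBounds B = computeBounds(); // one computation ...
  createKernelEnvironment(B);       // ... and one hand-off, at kernel-init time
  return 0;
}

The -1 default mirrors the "no upper bound known" convention the patch adopts in place of the earlier mix of UINT32_MAX and signed sentinels.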
jdoerfert committed Oct 29, 2023
1 parent 8d2efd7 commit 31b9121
Showing 18 changed files with 4,850 additions and 4,920 deletions.
93 changes: 45 additions & 48 deletions clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -6002,6 +6002,42 @@ void CGOpenMPRuntime::emitUsesAllocatorsFini(CodeGenFunction &CGF,
{ThreadId, AllocatorVal});
}

void CGOpenMPRuntime::computeMinAndMaxThreadsAndTeams(
const OMPExecutableDirective &D, CodeGenFunction &CGF,
int32_t &MinThreadsVal, int32_t &MaxThreadsVal, int32_t &MinTeamsVal,
int32_t &MaxTeamsVal) {

getNumTeamsExprForTargetDirective(CGF, D, MinTeamsVal, MaxTeamsVal);
getNumThreadsExprForTargetDirective(CGF, D, MaxThreadsVal,
/*UpperBoundOnly=*/true);

for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
for (auto *A : C->getAttrs()) {
int32_t AttrMinThreadsVal = 1, AttrMaxThreadsVal = -1;
int32_t AttrMinBlocksVal = 1, AttrMaxBlocksVal = -1;
if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &AttrMaxThreadsVal,
&AttrMinBlocksVal, &AttrMaxBlocksVal);
else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
CGM.handleAMDGPUFlatWorkGroupSizeAttr(
nullptr, Attr, /*ReqdWGS=*/nullptr, &AttrMinThreadsVal,
&AttrMaxThreadsVal);
else
continue;

MinThreadsVal = std::max(MinThreadsVal, AttrMinThreadsVal);
if (AttrMaxThreadsVal > 0)
MaxThreadsVal = MaxThreadsVal > 0
? std::min(MaxThreadsVal, AttrMaxThreadsVal)
: AttrMaxThreadsVal;
MinTeamsVal = std::max(MinTeamsVal, AttrMinBlocksVal);
if (AttrMaxBlocksVal > 0)
MaxTeamsVal = MaxTeamsVal > 0 ? std::min(MaxTeamsVal, AttrMaxBlocksVal)
: AttrMaxBlocksVal;
}
}
}

void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
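
As a standalone illustration (not part of the diff), the clamping rules used by computeMinAndMaxThreadsAndTeams above can be exercised in isolation. mergeAttrBounds is a made-up helper that mimics the thread-bound merge: -1 means "no upper bound", lower bounds only grow via std::max, and upper bounds only shrink via std::min once a positive value has been seen.

#include <algorithm>
#include <cassert>
#include <cstdint>

static void mergeAttrBounds(int32_t &MinThreads, int32_t &MaxThreads,
                            int32_t AttrMinThreads, int32_t AttrMaxThreads) {
  MinThreads = std::max(MinThreads, AttrMinThreads);
  if (AttrMaxThreads > 0)
    MaxThreads = MaxThreads > 0 ? std::min(MaxThreads, AttrMaxThreads)
                                : AttrMaxThreads;
}

int main() {
  int32_t MinThreads = 1, MaxThreads = -1;          // defaults: no upper bound yet
  mergeAttrBounds(MinThreads, MaxThreads, 1, 256);  // e.g. launch_bounds(256)
  assert(MinThreads == 1 && MaxThreads == 256);
  mergeAttrBounds(MinThreads, MaxThreads, 32, 512); // a later, wider bound cannot
  assert(MinThreads == 32 && MaxThreads == 256);    // relax the existing maximum
  return 0;
}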
@@ -6020,47 +6056,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
return CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc());
};

// Get NumTeams and ThreadLimit attributes
int32_t DefaultValMinTeams = 1;
int32_t DefaultValMaxTeams = -1;
uint32_t DefaultValMinThreads = 1;
uint32_t DefaultValMaxThreads = UINT32_MAX;

getNumTeamsExprForTargetDirective(CGF, D, DefaultValMinTeams,
DefaultValMaxTeams);
getNumThreadsExprForTargetDirective(CGF, D, DefaultValMaxThreads,
/*UpperBoundOnly=*/true);

for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
for (auto *A : C->getAttrs()) {
int32_t MinThreadsVal = 1, MaxThreadsVal = 0;
int32_t MinBlocksVal = 1, MaxBlocksVal = -1;
if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &MaxThreadsVal,
&MinBlocksVal, &MaxBlocksVal);
else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
CGM.handleAMDGPUFlatWorkGroupSizeAttr(
nullptr, Attr, /*ReqdWGS=*/nullptr, &MinThreadsVal, &MaxThreadsVal);
else
continue;

DefaultValMinThreads =
std::max(DefaultValMinThreads, uint32_t(MinThreadsVal));
DefaultValMaxThreads =
DefaultValMaxThreads
? std::min(DefaultValMaxThreads, uint32_t(MaxThreadsVal))
: MaxThreadsVal;
DefaultValMinTeams = DefaultValMinTeams
? std::max(DefaultValMinTeams, MinBlocksVal)
: MinBlocksVal;
DefaultValMaxTeams = std::min(DefaultValMaxTeams, MaxBlocksVal);
}
}

OMPBuilder.emitTargetRegionFunction(
EntryInfo, GenerateOutlinedFunction, DefaultValMinTeams,
DefaultValMaxTeams, DefaultValMinThreads, DefaultValMaxThreads,
IsOffloadEntry, OutlinedFn, OutlinedFnID);
OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction,
IsOffloadEntry, OutlinedFn, OutlinedFnID);

if (!OutlinedFn)
return;
@@ -6306,7 +6303,7 @@ llvm::Value *CGOpenMPRuntime::emitNumTeamsForTargetDirective(
/// store the condition in \p CondVal. If \p E, and \p CondVal respectively, are
/// nullptr, no expression evaluation is perfomed.
static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS,
const Expr **E, uint32_t &UpperBound,
const Expr **E, int32_t &UpperBound,
bool UpperBoundOnly, llvm::Value **CondVal) {
const Stmt *Child = CGOpenMPRuntime::getSingleCompoundChild(
CGF.getContext(), CS->getCapturedStmt());
@@ -6368,10 +6365,10 @@ static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS,
UpperBound
? Constant->getZExtValue()
: std::min(UpperBound,
static_cast<uint32_t>(Constant->getZExtValue()));
static_cast<int32_t>(Constant->getZExtValue()));
// If we haven't found a upper bound, remember we saw a thread limiting
// clause.
if (UpperBound == UINT32_MAX)
if (UpperBound == -1)
UpperBound = 0;
if (!E)
return;
@@ -6397,7 +6394,7 @@ static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS,
}

const Expr *CGOpenMPRuntime::getNumThreadsExprForTargetDirective(
CodeGenFunction &CGF, const OMPExecutableDirective &D, uint32_t &UpperBound,
CodeGenFunction &CGF, const OMPExecutableDirective &D, int32_t &UpperBound,
bool UpperBoundOnly, llvm::Value **CondVal, const Expr **ThreadLimitExpr) {
assert((!CGF.getLangOpts().OpenMPIsTargetDevice || UpperBoundOnly) &&
"Clauses associated with the teams directive expected to be emitted "
@@ -6414,11 +6411,11 @@ const Expr *CGOpenMPRuntime::getNumThreadsExprForTargetDirective(
if (auto Constant = E->getIntegerConstantExpr(CGF.getContext()))
UpperBound = UpperBound ? Constant->getZExtValue()
: std::min(UpperBound,
uint32_t(Constant->getZExtValue()));
int32_t(Constant->getZExtValue()));
}
// If we haven't found a upper bound, remember we saw a thread limiting
// clause.
if (UpperBound == UINT32_MAX)
if (UpperBound == -1)
UpperBound = 0;
if (EPtr)
*EPtr = E;
@@ -6562,7 +6559,7 @@ llvm::Value *CGOpenMPRuntime::emitNumThreadsForTargetDirective(
llvm::Value *CondVal = nullptr;
llvm::Value *ThreadLimitVal = nullptr;
const Expr *ThreadLimitExpr = nullptr;
uint32_t UpperBound = -1;
int32_t UpperBound = -1;

const Expr *NT = getNumThreadsExprForTargetDirective(
CGF, D, UpperBound, /* UpperBoundOnly */ false, &CondVal,
10 changes: 9 additions & 1 deletion clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -311,6 +311,14 @@ class CGOpenMPRuntime {
/// An OpenMP-IR-Builder instance.
llvm::OpenMPIRBuilder OMPBuilder;

/// Helper to determine the min/max number of threads/teams for \p D.
void computeMinAndMaxThreadsAndTeams(const OMPExecutableDirective &D,
CodeGenFunction &CGF,
int32_t &MinThreadsVal,
int32_t &MaxThreadsVal,
int32_t &MinTeamsVal,
int32_t &MaxTeamsVal);

/// Helper to emit outlined function for 'target' directive.
/// \param D Directive to emit.
/// \param ParentName Name of the function that encloses the target region.
@@ -649,7 +657,7 @@ class CGOpenMPRuntime {
/// UpperBoundOnly is true, no expression evaluation is perfomed.
const Expr *getNumThreadsExprForTargetDirective(
CodeGenFunction &CGF, const OMPExecutableDirective &D,
uint32_t &UpperBound, bool UpperBoundOnly,
int32_t &UpperBound, bool UpperBoundOnly,
llvm::Value **CondExpr = nullptr, const Expr **ThreadLimitExpr = nullptr);

/// Emit an expression that denotes the number of threads a target region
30 changes: 20 additions & 10 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -757,13 +757,15 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
const OMPExecutableDirective &D;

public:
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
: EST(EST) {}
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
const OMPExecutableDirective &D)
: EST(EST), D(D) {}
void Enter(CodeGenFunction &CGF) override {
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
// Skip target region initialization.
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
}
@@ -772,18 +774,25 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
}
} Action(EST);
} Action(EST, D);
CodeGen.setAction(Action);
IsInTTDRegion = true;
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
IsOffloadEntry, CodeGen);
IsInTTDRegion = false;
}

void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
CodeGenFunction &CGF,
EntryFunctionState &EST, bool IsSPMD) {
int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
MaxTeamsVal = -1;
computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
MinTeamsVal, MaxTeamsVal);

CGBuilderTy &Bld = CGF.Builder;
Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD));
Bld.restoreIP(OMPBuilder.createTargetInit(
Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
if (!IsSPMD)
emitGenericVarsProlog(CGF, EST.Loc);
}
@@ -815,19 +824,20 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
bool IsBareKernel;
DataSharingMode Mode;
const OMPExecutableDirective &D;

public:
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
CGOpenMPRuntimeGPU::EntryFunctionState &EST,
bool IsBareKernel)
bool IsBareKernel, const OMPExecutableDirective &D)
: RT(RT), EST(EST), IsBareKernel(IsBareKernel),
Mode(RT.CurrentDataSharingMode) {}
Mode(RT.CurrentDataSharingMode), D(D) {}
void Enter(CodeGenFunction &CGF) override {
if (IsBareKernel) {
RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
return;
}
RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
// Skip target region initialization.
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
}
@@ -839,7 +849,7 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
}
} Action(*this, EST, IsBareKernel);
} Action(*this, EST, IsBareKernel, D);
CodeGen.setAction(Action);
IsInTTDRegion = true;
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
4 changes: 2 additions & 2 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -60,8 +60,8 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
void syncCTAThreads(CodeGenFunction &CGF);

/// Helper for target directive initialization.
void emitKernelInit(CodeGenFunction &CGF, EntryFunctionState &EST,
bool IsSPMD);
void emitKernelInit(const OMPExecutableDirective &D, CodeGenFunction &CGF,
EntryFunctionState &EST, bool IsSPMD);

/// Helper for target directive finalization.
void emitKernelDeinit(CodeGenFunction &CGF, EntryFunctionState &EST,
15 changes: 8 additions & 7 deletions clang/test/OpenMP/bug57757.cpp
@@ -32,23 +32,24 @@ void foo() {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP1]], i64 0, i32 2
// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]])
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[TBAA16:![0-9]+]], !alias.scope !13, !noalias !17
// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]])
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[TBAA18:![0-9]+]], !alias.scope !13, !noalias !16
// CHECK-NEXT: switch i32 [[TMP3]], label [[DOTOMP_OUTLINED__EXIT:%.*]] [
// CHECK-NEXT: i32 0, label [[DOTUNTIED_JMP__I:%.*]]
// CHECK-NEXT: i32 1, label [[DOTUNTIED_NEXT__I:%.*]]
// CHECK-NEXT: ]
// CHECK: .untied.jmp..i:
// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA16]], !alias.scope !13, !noalias !17
// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias !13
// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA18]], !alias.scope !13, !noalias !16
// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias !19
// CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]]
// CHECK: .untied.next..i:
// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i64 0, i32 1
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 2
// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 1
// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA19:![0-9]+]], !noalias !13
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA16]], !noalias !13
// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA20:![0-9]+]], !noalias !13
// CHECK-NEXT: tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef [[TMP10]]) #[[ATTR2:[0-9]+]], !noalias !13
// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20:![0-9]+]], !alias.scope !16, !noalias !13
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA18]], !alias.scope !16, !noalias !13
// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA21:![0-9]+]], !alias.scope !16, !noalias !13
// CHECK-NEXT: tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef [[TMP10]]) #[[ATTR2:[0-9]+]], !noalias !19
// CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]]
// CHECK: .omp_outlined..exit:
// CHECK-NEXT: ret i32 0