Skip to content

Commit

Permalink
[OpenMPIRBuilder] Implement static-chunked workshare-loop schedules.
Browse files Browse the repository at this point in the history
Add applyStaticChunkedWorkshareLoop method implementing static schedule when chunk-size is specified. Unlike a static schedule without chunk-size (where chunk-size is chosen by the runtime such that each thread receives one chunk), we need two nested loops: one for looping over the iterations of a chunk, and a second for looping over all chunks assigned to the threads.

This patch includes the following related changes:
 * Adapt applyWorkshareLoop to triage between the schedule types, now possible since all schedules have been implemented. The default schedule is assumed to be non-chunked static, as without OpenMPIRBuilder.
 * Remove the chunk parameter from applyStaticWorkshareLoop, since it is ignored by the runtime. Change the value passed to the init function to 0, as without OpenMPIRBuilder.
 * Refactor CanonicalLoopInfo::setTripCount and CanonicalLoopInfo::mapIndVar, as they are used by both applyStaticWorkshareLoop and applyStaticChunkedWorkshareLoop.
 * Enable Clang to use the OpenMPIRBuilder in the presence of the schedule clause.

Differential Revision: https://reviews.llvm.org/D114413
  • Loading branch information
Meinersbur committed Mar 1, 2022
1 parent 73b193a commit a66f776
Show file tree
Hide file tree
Showing 22 changed files with 1,785 additions and 344 deletions.
60 changes: 55 additions & 5 deletions clang/lib/CodeGen/CGStmtOpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3717,13 +3717,52 @@ static bool emitWorksharingDirective(CodeGenFunction &CGF,
/// Returns true only if none of the checks below reject the directive \p S.
/// Presumably this gates use of the OpenMPIRBuilder code path (inferred from
/// the name; the call site is not fully visible in this view).
static bool isSupportedByOpenMPIRBuilder(const OMPForDirective &S) {
// A directive for which hasCancel() is true is rejected outright.
if (S.hasCancel())
return false;
// NOTE(review): this text is a diff rendering without +/- markers. The
// three-line loop that follows looks like the pre-change version which the
// braced loop after it replaces -- confirm against the actual commit. Read
// literally, it rejects any clause that is not a nowait clause.
for (OMPClause *C : S.clauses())
if (!isa<OMPNowaitClause>(C))
return false;
// Inspect every clause; each one must be explicitly accepted via `continue`,
// otherwise the loop body falls through to `return false`.
for (OMPClause *C : S.clauses()) {
// A nowait clause is always accepted.
if (isa<OMPNowaitClause>(C))
continue;

if (auto *SC = dyn_cast<OMPScheduleClause>(C)) {
// A schedule clause is accepted only when both modifier slots are
// OMPC_SCHEDULE_MODIFIER_unknown.
if (SC->getFirstScheduleModifier() != OMPC_SCHEDULE_MODIFIER_unknown)
return false;
if (SC->getSecondScheduleModifier() != OMPC_SCHEDULE_MODIFIER_unknown)
return false;
switch (SC->getScheduleKind()) {
case OMPC_SCHEDULE_auto:
case OMPC_SCHEDULE_dynamic:
case OMPC_SCHEDULE_runtime:
case OMPC_SCHEDULE_guided:
case OMPC_SCHEDULE_static:
// `continue` inside a switch applies to the enclosing for loop, so these
// schedule kinds accept the clause and move on to the next one.
continue;
case OMPC_SCHEDULE_unknown:
return false;
}
}

// Any clause not accepted above makes the directive unsupported.
return false;
}

// Every clause was accepted.
return true;
}

/// Map a schedule-clause kind \p ScheduleClauseKind onto the corresponding
/// llvm::omp::ScheduleKind value.
///
/// OMPC_SCHEDULE_unknown is mapped to OMP_SCHEDULE_Default; each of the other
/// kinds handled here maps to the like-named OMP_SCHEDULE_* value.
static llvm::omp::ScheduleKind
convertClauseKindToSchedKind(OpenMPScheduleClauseKind ScheduleClauseKind) {
  switch (ScheduleClauseKind) {
  case OMPC_SCHEDULE_static:
    return llvm::omp::OMP_SCHEDULE_Static;
  case OMPC_SCHEDULE_dynamic:
    return llvm::omp::OMP_SCHEDULE_Dynamic;
  case OMPC_SCHEDULE_guided:
    return llvm::omp::OMP_SCHEDULE_Guided;
  case OMPC_SCHEDULE_auto:
    return llvm::omp::OMP_SCHEDULE_Auto;
  case OMPC_SCHEDULE_runtime:
    return llvm::omp::OMP_SCHEDULE_Runtime;
  case OMPC_SCHEDULE_unknown:
    return llvm::omp::OMP_SCHEDULE_Default;
  }
  // Reached only when the value matches none of the case labels above.
  llvm_unreachable("Unhandled schedule kind");
}

void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
bool HasLastprivates = false;
bool UseOMPIRBuilder =
Expand All @@ -3732,18 +3771,29 @@ void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) {
UseOMPIRBuilder](CodeGenFunction &CGF, PrePostActionTy &) {
// Use the OpenMPIRBuilder if enabled.
if (UseOMPIRBuilder) {
bool NeedsBarrier = !S.getSingleClause<OMPNowaitClause>();

llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default;
llvm::Value *ChunkSize = nullptr;
if (auto *SchedClause = S.getSingleClause<OMPScheduleClause>()) {
SchedKind =
convertClauseKindToSchedKind(SchedClause->getScheduleKind());
if (const Expr *ChunkSizeExpr = SchedClause->getChunkSize())
ChunkSize = EmitScalarExpr(ChunkSizeExpr);
}

// Emit the associated statement and get its loop representation.
const Stmt *Inner = S.getRawStmt();
llvm::CanonicalLoopInfo *CLI =
EmitOMPCollapsedCanonicalLoopNest(Inner, 1);

bool NeedsBarrier = !S.getSingleClause<OMPNowaitClause>();
llvm::OpenMPIRBuilder &OMPBuilder =
CGM.getOpenMPRuntime().getOMPBuilder();
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
AllocaInsertPt->getParent(), AllocaInsertPt->getIterator());
OMPBuilder.applyWorkshareLoop(Builder.getCurrentDebugLocation(), CLI,
AllocaIP, NeedsBarrier);
AllocaIP, NeedsBarrier, SchedKind,
ChunkSize);
return;
}

Expand Down
16 changes: 8 additions & 8 deletions clang/test/OpenMP/cancel_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1366,7 +1366,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK3-NEXT: store i32 0, i32* [[P_UPPERBOUND]], align 4
// CHECK3-NEXT: store i32 1, i32* [[P_STRIDE]], align 4
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
// CHECK3-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 0)
// CHECK3-NEXT: [[TMP0:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
// CHECK3-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[TMP0]]
Expand Down Expand Up @@ -1402,7 +1402,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK3-NEXT: store i32 1, i32* [[P_UPPERBOUND29]], align 4
// CHECK3-NEXT: store i32 1, i32* [[P_STRIDE30]], align 4
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 1)
// CHECK3-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
Expand Down Expand Up @@ -2002,7 +2002,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK4-NEXT: store i32 0, i32* [[P_UPPERBOUND]], align 4
// CHECK4-NEXT: store i32 1, i32* [[P_STRIDE]], align 4
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
// CHECK4-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 0)
// CHECK4-NEXT: [[TMP0:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
// CHECK4-NEXT: [[TMP1:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
// CHECK4-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[TMP0]]
Expand Down Expand Up @@ -2038,7 +2038,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK4-NEXT: store i32 1, i32* [[P_UPPERBOUND29]], align 4
// CHECK4-NEXT: store i32 1, i32* [[P_STRIDE30]], align 4
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 1)
// CHECK4-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
// CHECK4-NEXT: [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
// CHECK4-NEXT: [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
// CHECK4-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
Expand Down Expand Up @@ -3878,7 +3878,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK9-NEXT: store i32 0, i32* [[P_UPPERBOUND]], align 4
// CHECK9-NEXT: store i32 1, i32* [[P_STRIDE]], align 4
// CHECK9-NEXT: [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK9-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
// CHECK9-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 0)
// CHECK9-NEXT: [[TMP0:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
// CHECK9-NEXT: [[TMP1:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
// CHECK9-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[TMP0]]
Expand Down Expand Up @@ -3914,7 +3914,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK9-NEXT: store i32 1, i32* [[P_UPPERBOUND29]], align 4
// CHECK9-NEXT: store i32 1, i32* [[P_STRIDE30]], align 4
// CHECK9-NEXT: [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK9-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 1)
// CHECK9-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
// CHECK9-NEXT: [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
// CHECK9-NEXT: [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
// CHECK9-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
Expand Down Expand Up @@ -4514,7 +4514,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK10-NEXT: store i32 0, i32* [[P_UPPERBOUND]], align 4
// CHECK10-NEXT: store i32 1, i32* [[P_STRIDE]], align 4
// CHECK10-NEXT: [[OMP_GLOBAL_THREAD_NUM11:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK10-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
// CHECK10-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM11]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 0)
// CHECK10-NEXT: [[TMP0:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
// CHECK10-NEXT: [[TMP1:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
// CHECK10-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[TMP0]]
Expand Down Expand Up @@ -4550,7 +4550,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK10-NEXT: store i32 1, i32* [[P_UPPERBOUND29]], align 4
// CHECK10-NEXT: store i32 1, i32* [[P_STRIDE30]], align 4
// CHECK10-NEXT: [[OMP_GLOBAL_THREAD_NUM31:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK10-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 1)
// CHECK10-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM31]], i32 34, i32* [[P_LASTITER27]], i32* [[P_LOWERBOUND28]], i32* [[P_UPPERBOUND29]], i32* [[P_STRIDE30]], i32 1, i32 0)
// CHECK10-NEXT: [[TMP7:%.*]] = load i32, i32* [[P_LOWERBOUND28]], align 4
// CHECK10-NEXT: [[TMP8:%.*]] = load i32, i32* [[P_UPPERBOUND29]], align 4
// CHECK10-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], [[TMP7]]
Expand Down
2 changes: 1 addition & 1 deletion clang/test/OpenMP/irbuilder_for_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ extern "C" void workshareloop_iterator(float *a, float *b, float *c) {
// CHECK-NEXT: store i64 [[TMP2]], i64* [[P_UPPERBOUND]], align 8
// CHECK-NEXT: store i64 1, i64* [[P_STRIDE]], align 8
// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK-NEXT: call void @__kmpc_for_static_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 1)
// CHECK-NEXT: call void @__kmpc_for_static_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[P_LOWERBOUND]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[P_UPPERBOUND]], align 8
// CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], [[TMP3]]
Expand Down
2 changes: 1 addition & 1 deletion clang/test/OpenMP/irbuilder_for_rangefor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ extern "C" void workshareloop_rangefor(float *a, float *b, float *c) {
// CHECK-NEXT: store i64 [[TMP5]], i64* [[P_UPPERBOUND]], align 8
// CHECK-NEXT: store i64 1, i64* [[P_STRIDE]], align 8
// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK-NEXT: call void @__kmpc_for_static_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 1)
// CHECK-NEXT: call void @__kmpc_for_static_init_8u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 0)
// CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[P_LOWERBOUND]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[P_UPPERBOUND]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], [[TMP6]]
Expand Down
2 changes: 1 addition & 1 deletion clang/test/OpenMP/irbuilder_for_unsigned.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ extern "C" void workshareloop_unsigned(float *a, float *b, float *c, float *d) {
// CHECK-NEXT: store i32 [[TMP3]], i32* [[P_UPPERBOUND]], align 4
// CHECK-NEXT: store i32 1, i32* [[P_STRIDE]], align 4
// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1)
// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 0)
// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]]
Expand Down
Loading

0 comments on commit a66f776

Please sign in to comment.