Skip to content

Commit 517c3ae

Browse files
Leporacanthicus authored and kiranchandramohan committed
[OpenMP IRBuilder, MLIR] Add support for OpenMP do schedule dynamic
The existing implementation supports the static schedule for Fortran do loops; this commit implements the dynamic variant of the same concept. Reviewed By: Meinersbur. Differential Revision: https://reviews.llvm.org/D97393
1 parent 8628ed0 commit 517c3ae

File tree

5 files changed

+302
-14
lines changed

5 files changed

+302
-14
lines changed

llvm/include/llvm/Frontend/OpenMP/OMPConstants.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,17 @@ inline std::string getAllAssumeClauseOptions() {
107107
return S + "'";
108108
}
109109

110+
/// Schedule-type constants that are passed as the scheduling-type argument of
/// the OpenMP runtime's workshare-loop "init" functions.
///
/// \note This needs to be kept in sync with kmp.h enum sched_type.
111+
/// \todo Update kmp.h to include this file, and remove the enums in kmp.h.
112+
/// To complete this, more enum values will need to be moved here.
113+
enum class OMPScheduleType {
114+
Static = 34, /**< static unspecialized */
115+
DynamicChunked = 35, /**< dynamic with a chunk size. NOTE(review): presumably
                          kmp_sch_dynamic_chunked -- confirm against kmp.h */
116+
ModifierNonmonotonic =
117+
(1 << 30), /**< Set if the nonmonotonic schedule modifier was present */
118+
LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierNonmonotonic)
119+
};
120+
110121
} // end namespace omp
111122

112123
} // end namespace llvm

llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ class OpenMPIRBuilder {
355355
/// \param CLI A descriptor of the canonical loop to workshare.
356356
/// \param AllocaIP An insertion point for Alloca instructions usable in the
357357
/// preheader of the loop.
358-
/// \param NeedsBarrier Indicates whether a barrier must be insterted after
358+
/// \param NeedsBarrier Indicates whether a barrier must be inserted after
359359
/// the loop.
360360
/// \param Chunk The size of loop chunk considered as a unit when
361361
/// scheduling. If \p nullptr, defaults to 1.
@@ -367,6 +367,30 @@ class OpenMPIRBuilder {
367367
bool NeedsBarrier,
368368
Value *Chunk = nullptr);
369369

370+
/// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
371+
///
372+
/// This takes the canonical loop described by \p CLI, such as the one
373+
/// created by \p createCanonicalLoop, and emits additional instructions to
374+
/// turn it into a workshare loop. In particular, it calls an OpenMP runtime
375+
/// "init" function in the preheader and then, before each chunk of
376+
/// iterations, a "next" runtime function that supplies the chunk's bounds.
377+
/// \param Loc The source location description, the insertion location
378+
/// is not used.
379+
/// \param CLI A descriptor of the canonical loop to workshare. It no longer
/// describes a valid canonical loop after this call.
380+
/// \param AllocaIP An insertion point for Alloca instructions usable in the
381+
/// preheader of the loop.
382+
/// \param NeedsBarrier Indicates whether a barrier must be inserted after
383+
/// the loop.
384+
/// \param Chunk The size of loop chunk considered as a unit when
385+
/// scheduling. If \p nullptr, defaults to 1.
386+
///
387+
/// \returns Point where to insert code after the loop.
388+
InsertPointTy createDynamicWorkshareLoop(const LocationDescription &Loc,
389+
CanonicalLoopInfo *CLI,
390+
InsertPointTy AllocaIP,
391+
bool NeedsBarrier,
392+
Value *Chunk = nullptr);
393+
370394
/// Modifies the canonical loop to be a workshare loop.
371395
///
372396
/// This takes a \p LoopInfo representing a canonical loop, such as the one

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 145 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1168,10 +1168,8 @@ CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop(
11681168

11691169
Value *ThreadNum = getOrCreateThreadID(SrcLoc);
11701170

1171-
// TODO: extract scheduling type and map it to OMP constant. This is curently
1172-
// happening in kmp.h and its ilk and needs to be moved to OpenMP.td first.
1173-
constexpr int StaticSchedType = 34;
1174-
Constant *SchedulingType = ConstantInt::get(I32Type, StaticSchedType);
1171+
Constant *SchedulingType =
1172+
ConstantInt::get(I32Type, static_cast<int>(OMPScheduleType::Static));
11751173

11761174
// Call the "init" function and update the trip count of the loop with the
11771175
// value it produced.
@@ -1220,6 +1218,148 @@ CanonicalLoopInfo *OpenMPIRBuilder::createWorkshareLoop(
12201218
return createStaticWorkshareLoop(Loc, CLI, AllocaIP, NeedsBarrier);
12211219
}
12221220

1221+
/// Returns the OpenMP runtime function to call for initializing a dynamically
1222+
/// scheduled loop, selected by the bit width of \p Ty. Only i32 and i64 are
1223+
/// supported by the runtime; any other width is treated as unreachable. The
1224+
/// unsigned ("_4u"/"_8u") variants are always used, similarly to
/// CanonicalLoopInfo, which interprets integers as unsigned.
///
/// \param Ty         Type of the loop induction variable; it is queried with
///                   getIntegerBitWidth(), so an integer type is expected.
/// \param M          Module in which the runtime function is looked up or
///                   declared.
/// \param OMPBuilder Builder used to get or create the runtime function.
1225+
static FunctionCallee
1226+
getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
1227+
unsigned Bitwidth = Ty->getIntegerBitWidth();
1228+
if (Bitwidth == 32)
1229+
return OMPBuilder.getOrCreateRuntimeFunction(
1230+
M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
1231+
if (Bitwidth == 64)
1232+
return OMPBuilder.getOrCreateRuntimeFunction(
1233+
M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
1234+
llvm_unreachable("unknown OpenMP loop iterator bitwidth");
1235+
}
1236+
1237+
/// Returns the OpenMP runtime function to call for requesting the next chunk
1238+
/// of a dynamically scheduled loop, selected by the bit width of \p Ty. Only
1239+
/// i32 and i64 are supported by the runtime; any other width is treated as
1240+
/// unreachable. The unsigned ("_4u"/"_8u") variants are always used, similarly
/// to CanonicalLoopInfo, which interprets integers as unsigned.
///
/// \param Ty         Type of the loop induction variable; it is queried with
///                   getIntegerBitWidth(), so an integer type is expected.
/// \param M          Module in which the runtime function is looked up or
///                   declared.
/// \param OMPBuilder Builder used to get or create the runtime function.
1241+
static FunctionCallee
1242+
getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
1243+
unsigned Bitwidth = Ty->getIntegerBitWidth();
1244+
if (Bitwidth == 32)
1245+
return OMPBuilder.getOrCreateRuntimeFunction(
1246+
M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
1247+
if (Bitwidth == 64)
1248+
return OMPBuilder.getOrCreateRuntimeFunction(
1249+
M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
1250+
llvm_unreachable("unknown OpenMP loop iterator bitwidth");
1251+
}
1252+
1253+
// Converts the canonical loop described by CLI into a dynamically scheduled
// workshare loop: the "init" runtime function is called once in the
// preheader, and a new outer-condition block calls the "next" runtime function
// to fetch the bounds of each chunk before (re-)entering the original loop.
// Returns the insertion point after the loop, captured before the rewrite.
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createDynamicWorkshareLoop(
1254+
const LocationDescription &Loc, CanonicalLoopInfo *CLI,
1255+
InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) {
1256+
// Set up the source location value for the OpenMP runtime.
1257+
Builder.SetCurrentDebugLocation(Loc.DL);
1258+
1259+
Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
1260+
Value *SrcLoc = getOrCreateIdent(SrcLocStr);
1261+
1262+
// Pick the runtime "init"/"next" functions matching the induction variable's
// integer width.
1263+
Value *IV = CLI->getIndVar();
1264+
Type *IVTy = IV->getType();
1265+
FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
1266+
FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
1267+
1268+
// Allocate the slots whose addresses are handed to the "next" function: a
// last-iteration flag plus the chunk's lower bound, upper bound and stride.
1269+
Builder.restoreIP(AllocaIP);
1270+
Type *I32Type = Type::getInt32Ty(M.getContext());
1271+
Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
1272+
Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
1273+
Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
1274+
Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
1275+
1276+
// At the end of the preheader, prepare for calling the "init" function by
1277+
// storing the loop bounds into the allocated space. A canonical loop
1278+
// iterates from 0 up to (but excluding) trip-count with step 1; here the
1279+
// bounds are expressed as the 1-based inclusive range [1, trip-count], and
// the lower bound is shifted back by one when it is read in the outer
// condition below.
1280+
BasicBlock *PreHeader = CLI->getPreheader();
1281+
Builder.SetInsertPoint(PreHeader->getTerminator());
1282+
Constant *One = ConstantInt::get(IVTy, 1);
1283+
Builder.CreateStore(One, PLowerBound);
1284+
Value *UpperBound = CLI->getTripCount();
1285+
Builder.CreateStore(UpperBound, PUpperBound);
1286+
Builder.CreateStore(One, PStride);
1287+
1288+
// Capture everything needed from the CLI before the loop is rewritten.
BasicBlock *Header = CLI->getHeader();
1289+
BasicBlock *Exit = CLI->getExit();
1290+
BasicBlock *Cond = CLI->getCond();
1291+
InsertPointTy AfterIP = CLI->getAfterIP();
1292+
1293+
// The CLI will be "broken" in the code below, as the loop is no longer
1294+
// a valid canonical loop.
1295+
1296+
// Default chunk size is 1.
if (!Chunk)
1297+
Chunk = One;
1298+
1299+
Value *ThreadNum = getOrCreateThreadID(SrcLoc);
1300+
1301+
// The schedule is always requested as dynamic-chunked with the nonmonotonic
// modifier set.
OMPScheduleType DynamicSchedType =
1302+
OMPScheduleType::DynamicChunked | OMPScheduleType::ModifierNonmonotonic;
1303+
Constant *SchedulingType =
1304+
ConstantInt::get(I32Type, static_cast<int>(DynamicSchedType));
1305+
1306+
// Call the "init" function.
1307+
Builder.CreateCall(DynamicInit,
1308+
{SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
1309+
UpperBound, /* step */ One, Chunk});
1310+
1311+
// An outer loop around the existing one.
1312+
BasicBlock *OuterCond = BasicBlock::Create(
1313+
PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
1314+
PreHeader->getParent());
1315+
// The result of the "next" call is compared against an i32 zero (Zero32
// below) regardless of IVTy, so an IVTy-typed constant cannot be used.
1316+
Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
1317+
Value *Res =
1318+
Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
1319+
PLowerBound, PUpperBound, PStride});
1320+
Constant *Zero32 = ConstantInt::get(I32Type, 0);
1321+
Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
1322+
// Shift the 1-based lower bound read back from PLowerBound to the 0-based
// induction variable of the canonical loop.
Value *LowerBound =
1323+
Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
1324+
// A non-zero result means there is another chunk to run; otherwise leave the
// loop through the original exit block.
Builder.CreateCondBr(MoreWork, Header, Exit);
1325+
1326+
// Change PHI-node in loop header to use outer cond rather than preheader,
1327+
// and set IV to the LowerBound. This relies on the induction-variable PHI
// being the first instruction of the header and on incoming index 0 being
// the edge from the preheader.
1328+
Instruction *Phi = &Header->front();
1329+
auto *PI = cast<PHINode>(Phi);
1330+
PI->setIncomingBlock(0, OuterCond);
1331+
PI->setIncomingValue(0, LowerBound);
1332+
1333+
// Then set the pre-header to jump to the OuterCond
1334+
Instruction *Term = PreHeader->getTerminator();
1335+
auto *Br = cast<BranchInst>(Term);
1336+
Br->setSuccessor(0, OuterCond);
1337+
1338+
// Modify the inner condition:
1339+
// * Use the UpperBound loaded from PUpperBound after the DynamicNext call.
1340+
// * Jump to the outer loop when done with one of the inner loops.
// The load is inserted in front of the first non-PHI instruction of the
// condition block, which is expected to be the loop's compare instruction.
1341+
Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
1342+
UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
1343+
Instruction *Comp = &*Builder.GetInsertPoint();
1344+
auto *CI = cast<CmpInst>(Comp);
1345+
CI->setOperand(1, UpperBound);
1346+
// Redirect the inner exit to branch to outer condition. Successor 1 of the
// condition block's terminator is asserted to be the exit edge.
1347+
Instruction *Branch = &Cond->back();
1348+
auto *BI = cast<BranchInst>(Branch);
1349+
assert(BI->getSuccessor(1) == Exit);
1350+
BI->setSuccessor(1, OuterCond);
1351+
1352+
// Add the barrier if requested; it is placed in front of the exit block's
// last instruction.
1353+
if (NeedsBarrier) {
1354+
Builder.SetInsertPoint(&Exit->back());
1355+
createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
1356+
omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
1357+
/* CheckCancelFlag */ false);
1358+
}
1359+
1360+
return AfterIP;
1361+
}
1362+
12231363
/// Make \p Source branch to \p Target.
12241364
///
12251365
/// Handles two situations:
@@ -1901,7 +2041,7 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
19012041
llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
19022042

19032043
Function *Fn =
1904-
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
2044+
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
19052045

19062046
return Builder.CreateCall(Fn, Args);
19072047
}

llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1708,6 +1708,105 @@ TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) {
17081708
EXPECT_EQ(NumCallsInExitBlock, 3u);
17091709
}
17101710

1711+
// Checks that createDynamicWorkshareLoop, applied to an unsigned i32 canonical
// loop over [10, 52) with step 2 and chunk size 7, emits the expected allocas,
// bound stores and runtime calls, and that the resulting module verifies.
TEST_F(OpenMPIRBuilderTest, DynamicWorkShareLoop) {
1712+
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
1713+
OpenMPIRBuilder OMPBuilder(*M);
1714+
OMPBuilder.initialize();
1715+
IRBuilder<> Builder(BB);
1716+
OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
1717+
1718+
Type *LCTy = Type::getInt32Ty(Ctx);
1719+
Value *StartVal = ConstantInt::get(LCTy, 10);
1720+
Value *StopVal = ConstantInt::get(LCTy, 52);
1721+
Value *StepVal = ConstantInt::get(LCTy, 2);
1722+
Value *ChunkVal = ConstantInt::get(LCTy, 7);
1723+
// The loop body is intentionally empty; only the loop skeleton is checked.
auto LoopBodyGen = [&](InsertPointTy, llvm::Value *) {};
1724+
1725+
CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
1726+
Loc, LoopBodyGen, StartVal, StopVal, StepVal,
1727+
/*IsSigned=*/false, /*InclusiveStop=*/false);
1728+
1729+
// Allocas go to the start of the entry block.
Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
1730+
InsertPointTy AllocaIP = Builder.saveIP();
1731+
1732+
// Collect all the info from CLI, as it isn't usable after the call to
1733+
// createDynamicWorkshareLoop.
1734+
InsertPointTy AfterIP = CLI->getAfterIP();
1735+
BasicBlock *Preheader = CLI->getPreheader();
1736+
BasicBlock *ExitBlock = CLI->getExit();
1737+
Value *IV = CLI->getIndVar();
1738+
1739+
InsertPointTy EndIP =
1740+
OMPBuilder.createDynamicWorkshareLoop(Loc, CLI, AllocaIP,
1741+
/*NeedsBarrier=*/true, ChunkVal);
1742+
// The returned value should be the "after" point.
1743+
ASSERT_EQ(EndIP.getBlock(), AfterIP.getBlock());
1744+
ASSERT_EQ(EndIP.getPoint(), AfterIP.getPoint());
1745+
1746+
// The entry block must start with the four allocas: last-iteration flag,
// lower bound, upper bound and stride, in that order.
auto AllocaIter = BB->begin();
1747+
ASSERT_GE(std::distance(BB->begin(), BB->end()), 4);
1748+
AllocaInst *PLastIter = dyn_cast<AllocaInst>(&*(AllocaIter++));
1749+
AllocaInst *PLowerBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
1750+
AllocaInst *PUpperBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
1751+
AllocaInst *PStride = dyn_cast<AllocaInst>(&*(AllocaIter++));
1752+
EXPECT_NE(PLastIter, nullptr);
1753+
EXPECT_NE(PLowerBound, nullptr);
1754+
EXPECT_NE(PUpperBound, nullptr);
1755+
EXPECT_NE(PStride, nullptr);
1756+
1757+
// The preheader is expected to hold at least six instructions; the first
// five are checked below: three bound stores, the thread-ID call and the
// "init" call.
auto PreheaderIter = Preheader->begin();
1758+
ASSERT_GE(std::distance(Preheader->begin(), Preheader->end()), 6);
1759+
StoreInst *LowerBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
1760+
StoreInst *UpperBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
1761+
StoreInst *StrideStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
1762+
ASSERT_NE(LowerBoundStore, nullptr);
1763+
ASSERT_NE(UpperBoundStore, nullptr);
1764+
ASSERT_NE(StrideStore, nullptr);
1765+
1766+
CallInst *ThreadIdCall = dyn_cast<CallInst>(&*(PreheaderIter++));
1767+
ASSERT_NE(ThreadIdCall, nullptr);
1768+
EXPECT_EQ(ThreadIdCall->getCalledFunction()->getName(),
1769+
"__kmpc_global_thread_num");
1770+
1771+
CallInst *InitCall = dyn_cast<CallInst>(&*PreheaderIter);
1772+
1773+
// An i32 induction variable selects the 4-byte unsigned "init" variant; its
// last (seventh) argument is the chunk size.
ASSERT_NE(InitCall, nullptr);
1774+
EXPECT_EQ(InitCall->getCalledFunction()->getName(),
1775+
"__kmpc_dispatch_init_4u");
1776+
EXPECT_EQ(InitCall->getNumArgOperands(), 7U);
1777+
EXPECT_EQ(InitCall->getArgOperand(6),
1778+
ConstantInt::get(Type::getInt32Ty(Ctx), 7));
1779+
1780+
ConstantInt *OrigLowerBound =
1781+
dyn_cast<ConstantInt>(LowerBoundStore->getValueOperand());
1782+
ConstantInt *OrigUpperBound =
1783+
dyn_cast<ConstantInt>(UpperBoundStore->getValueOperand());
1784+
ConstantInt *OrigStride =
1785+
dyn_cast<ConstantInt>(StrideStore->getValueOperand());
1786+
ASSERT_NE(OrigLowerBound, nullptr);
1787+
ASSERT_NE(OrigUpperBound, nullptr);
1788+
ASSERT_NE(OrigStride, nullptr);
1789+
// [10, 52) with step 2 has 21 iterations; the stored bounds are the 1-based
// inclusive range [1, 21] with stride 1.
EXPECT_EQ(OrigLowerBound->getValue(), 1);
1790+
EXPECT_EQ(OrigUpperBound->getValue(), 21);
1791+
EXPECT_EQ(OrigStride->getValue(), 1);
1792+
1793+
// The original loop iterator should only be used in the condition, in the
1794+
// increment and in the statement that adds the lower bound to it.
1795+
EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3);
1796+
1797+
// The exit block should contain the barrier call, plus the call to obtain
1798+
// the thread ID.
1799+
size_t NumCallsInExitBlock =
1800+
count_if(*ExitBlock, [](Instruction &I) { return isa<CallInst>(I); });
1801+
EXPECT_EQ(NumCallsInExitBlock, 2u);
1802+
1803+
// Add a termination to our block and check that it is internally consistent.
1804+
Builder.restoreIP(EndIP);
1805+
Builder.CreateRetVoid();
1806+
OMPBuilder.finalize();
1807+
EXPECT_FALSE(verifyModule(*M, &errs()));
1808+
}
1809+
17111810
TEST_F(OpenMPIRBuilderTest, MasterDirective) {
17121811
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
17131812
OpenMPIRBuilder OMPBuilder(*M);

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -179,11 +179,17 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
179179
if (loop.getNumLoops() != 1)
180180
return opInst.emitOpError("collapsed loops not yet supported");
181181

182-
if (loop.schedule_val().hasValue() &&
183-
omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue()) !=
184-
omp::ClauseScheduleKind::Static)
185-
return opInst.emitOpError(
186-
"only static (default) loop schedule is currently supported");
182+
bool isStatic = true;
183+
184+
if (loop.schedule_val().hasValue()) {
185+
auto schedule =
186+
omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue());
187+
if (schedule != omp::ClauseScheduleKind::Static &&
188+
schedule != omp::ClauseScheduleKind::Dynamic)
189+
return opInst.emitOpError("only static (default) and dynamic loop "
190+
"schedule is currently supported");
191+
isStatic = (schedule == omp::ClauseScheduleKind::Static);
192+
}
187193

188194
// Find the loop configuration.
189195
llvm::Value *lowerBound = moduleTranslation.lookupValue(loop.lowerBound()[0]);
@@ -241,11 +247,19 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
241247
// Put them at the start of the current block for now.
242248
llvm::OpenMPIRBuilder::InsertPointTy allocaIP(
243249
insertBlock, insertBlock->getFirstInsertionPt());
244-
loopInfo = moduleTranslation.getOpenMPBuilder()->createStaticWorkshareLoop(
245-
ompLoc, loopInfo, allocaIP, !loop.nowait(), chunk);
250+
llvm::OpenMPIRBuilder::InsertPointTy afterIP;
251+
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
252+
if (isStatic) {
253+
loopInfo = ompBuilder->createStaticWorkshareLoop(ompLoc, loopInfo, allocaIP,
254+
!loop.nowait(), chunk);
255+
afterIP = loopInfo->getAfterIP();
256+
} else {
257+
afterIP = ompBuilder->createDynamicWorkshareLoop(ompLoc, loopInfo, allocaIP,
258+
!loop.nowait(), chunk);
259+
}
246260

247261
// Continue building IR after the loop.
248-
builder.restoreIP(loopInfo->getAfterIP());
262+
builder.restoreIP(afterIP);
249263
return success();
250264
}
251265

0 commit comments

Comments
 (0)