-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Flang][OpenMP] Add support for schedule clause for GPU #81618
base: main
Are you sure you want to change the base?
[Flang][OpenMP] Add support for schedule clause for GPU #81618
Conversation
Scope of changes: 1) Fixed handling of loop chunking in OpenMP runtime. 2) Pass chunk value from MLIR to OpenMP runtime. 3) Added explicit check that only static schedule is supported for target loops.
@llvm/pr-subscribers-mlir-llvm @llvm/pr-subscribers-flang-openmp Author: Dominik Adamski (DominikAdamski) Changes: Scope of changes:
Full diff: https://github.com/llvm/llvm-project/pull/81618.diff 5 Files Affected:
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2288969ecc95c4..8d74b12dbc4ba1 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -925,11 +925,15 @@ class OpenMPIRBuilder {
/// preheader of the loop.
/// \param LoopType Information about type of loop worksharing.
/// It corresponds to type of loop workshare OpenMP pragma.
+ /// \param ScheduleType Information about scheduling type.
+ /// \param ChunkSize Value of chunk size for static schedule.
///
/// \returns Point where to insert code after the workshare construct.
InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
- omp::WorksharingLoopType LoopType);
+ omp::WorksharingLoopType LoopType,
+ omp::OMPScheduleType ScheduleType,
+ Value *ChunkSize);
/// Modifies the canonical loop to be a statically-scheduled workshare loop.
///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 02b333e9ccd567..f9cbc39a24016d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2728,7 +2728,8 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
static void createTargetLoopWorkshareCall(
OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
- Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
+ Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn,
+ Value *ThreadChunkSize) {
Type *TripCountTy = TripCount->getType();
Module &M = OMPBuilder->M;
IRBuilder<> &Builder = OMPBuilder->Builder;
@@ -2751,9 +2752,21 @@ static void createTargetLoopWorkshareCall(
RealArgs.push_back(
Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
- RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
- if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
+ switch (LoopType) {
+ case WorksharingLoopType::DistributeForStaticLoop:
+ RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+ ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+ ThreadChunkSize, TripCountTy))
+ : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+ break;
+ case WorksharingLoopType::DistributeStaticLoop:
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+ break;
+ case WorksharingLoopType::ForStaticLoop:
+ ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
+ ThreadChunkSize, TripCountTy))
+ : RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+ break;
}
Builder.CreateCall(RTLFn, RealArgs);
@@ -2764,7 +2777,7 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
CanonicalLoopInfo *CLI, Value *Ident,
Function &OutlinedFn, Type *ParallelTaskPtr,
const SmallVector<Instruction *, 4> &ToBeDeleted,
- WorksharingLoopType LoopType) {
+ WorksharingLoopType LoopType, Value *ChunkSize) {
IRBuilder<> &Builder = OMPIRBuilder->Builder;
BasicBlock *Preheader = CLI->getPreheader();
Value *TripCount = CLI->getTripCount();
@@ -2811,17 +2824,18 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
LoopBodyArg, ParallelTaskPtr, TripCount,
- OutlinedFn);
+ OutlinedFn, ChunkSize);
for (auto &ToBeDeletedItem : ToBeDeleted)
ToBeDeletedItem->eraseFromParent();
CLI->invalidate();
}
-OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP,
- WorksharingLoopType LoopType) {
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
+ DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
+ WorksharingLoopType LoopType, OMPScheduleType EffectiveScheduleType,
+ Value *ChunkSize) {
+
uint32_t SrcLocStrSize;
Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
@@ -2833,6 +2847,16 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
// Instructions which need to be deleted at the end of code generation
SmallVector<Instruction *, 4> ToBeDeleted;
+ // TODO: Add support for dynamic scheduling
+ switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
+ case OMPScheduleType::BaseStatic:
+ case OMPScheduleType::BaseStaticChunked:
+ break;
+ default:
+ report_fatal_error(
+ "Unknown/unimplemented schedule kind for target workshare loop", false);
+ }
+
OI.OuterAllocaBB = AllocaIP.getBlock();
// Mark the body loop as region which needs to be extracted
@@ -2906,7 +2930,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
OI.PostOutlineCB = [=, ToBeDeletedVec =
std::move(ToBeDeleted)](Function &OutlinedFn) {
workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
- ToBeDeletedVec, LoopType);
+ ToBeDeletedVec, LoopType, ChunkSize);
};
addOutlineInfo(std::move(OI));
return CLI->getAfterIP();
@@ -2918,11 +2942,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
bool HasSimdModifier, bool HasMonotonicModifier,
bool HasNonmonotonicModifier, bool HasOrderedClause,
WorksharingLoopType LoopType) {
- if (Config.isTargetDevice())
- return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
HasNonmonotonicModifier, HasOrderedClause);
+ if (Config.isTargetDevice())
+ return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType,
+ EffectiveScheduleType, ChunkSize);
bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
OMPScheduleType::ModifierOrdered;
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index 220eb85b3483ec..a5f5d07262c8d9 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -25,6 +25,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
}
llvm.return
}
+
+ llvm.func @target_wsloop_schedule_static_chunked(%arg0: !llvm.ptr ){
+ %loop_ub = llvm.mlir.constant(9 : i32) : i32
+ %loop_lb = llvm.mlir.constant(0 : i32) : i32
+ %loop_step = llvm.mlir.constant(1 : i32) : i32
+ %chunk = llvm.mlir.constant(2 : i32) : i32
+ omp.wsloop schedule(static = %chunk : i32) for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+ %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
+ llvm.store %loop_cnt, %gep : i32, !llvm.ptr
+ omp.yield
+ }
+ llvm.return
+ }
}
// CHECK: define void @[[FUNC0:.*]](ptr %[[ARG0:.*]])
@@ -45,3 +58,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)
// CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])
+
+// CHECK: define void @[[FUNC_SCHEDULE_STATIC_WSLOOP:.*]](ptr %[[ARG1:.*]])
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), ptr @[[LOOP_BODY_SCHEDULE_STATIC_FN:.*]], ptr %[[SCHEDULE_LOOP_ARGS:.*]], i32 10, i32 %[[NUM_THREADS:.*]], i32 2)
+
+// CHECK: define internal void @[[LOOP_BODY_SCHEDULE_STATIC_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index bcb7c5ad50a185..836d4f7f4934b4 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -685,17 +685,22 @@ template <typename Ty> class StaticLoopChunker {
Ty KernelIteration = NumBlocks * BlockChunk;
// Start index in the chunked space.
- Ty IV = BId * BlockChunk + TId;
+ Ty IV = BId * BlockChunk + TId * ThreadChunk;
ASSERT(IV >= 0, "Bad index");
+ // Make sure the starting index is within the kernel iteration boundaries.
+ if (IV >= KernelIteration)
+ return;
+
// Cover the entire iteration space, assumptions in the caller might allow
// to simplify this loop to a conditional.
do {
Ty BlockChunkLeft =
BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
- Ty ThreadChunkLeft =
+ Ty EffectiveThreadChunk =
ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+ Ty ThreadChunkLeft = EffectiveThreadChunk;
while (ThreadChunkLeft--) {
@@ -711,8 +716,8 @@ template <typename Ty> class StaticLoopChunker {
++IV;
}
-
- IV += KernelIteration;
+ // Start the new kernel iteration before the first thread chunk
+ IV += (KernelIteration - EffectiveThreadChunk);
} while (IV < NumIters);
}
@@ -731,8 +736,8 @@ template <typename Ty> class StaticLoopChunker {
// from the `omp` getter and not the mapping directly.
Ty TId = omp_get_thread_num();
- // There are no blocks involved here.
- Ty BlockChunk = 0;
+ // There is only one block for the whole iteration space.
+ Ty BlockChunk = NumIters;
Ty NumBlocks = 1;
Ty BId = 0;
diff --git a/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90 b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
new file mode 100644
index 00000000000000..f0b444f6ddc66f
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-schedule-static-chunk.f90
@@ -0,0 +1,33 @@
+! Basic offloading test with a target region
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+program main
+ use omp_lib
+ integer :: x(100)
+ integer :: errors = 0
+ integer :: i
+
+ !$omp target parallel do schedule(static, 5) map(from: x)
+ do i = 1, 100
+ x(i) = i
+ end do
+ !$omp end target parallel do
+ do i = 1, 100
+ if ( x(i) .ne. i ) then
+ errors = errors + 1
+ end if
+ end do
+
+ print *,"number of errors: ", errors
+
+end program main
+
+! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK: number of errors: 0
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks Dominik, the OMPIRBuilder changes make sense to me, as well as the MLIR to LLVM IR and the Fortran offloading test. However, I'm not familiar enough with libomptarget to review that part, so I'll leave that to the experts.
@@ -685,17 +685,22 @@ template <typename Ty> class StaticLoopChunker {
Ty KernelIteration = NumBlocks * BlockChunk;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need *ThreadChunk here too?
Let's say we have 5 blocks, and each block does a chunk of 3.
Each block has 11 threads and a chunk size of 2.
What I'd expect to work on in one iteration of the do loop below is:
Iteration : 0 1 2 3 ... 20 21
Block/Thread: B0T0, B0T0, B0T1, B0T1, ..., B0T10, B0T10
Iteration : 66 67 68 69 ... 86 87
Block/Thread: B1T0, B1T0, B1T1, B1T1, ..., B1T10, B1T10
...
Iteration : 264 265 266 267 ... 284 285
Block/Thread: B4T0, B4T0, B4T1, B4T1, ..., B4T10, B4T10
So, 2 * 11 = 22 iterations for a block and 5 * 22 = 110 iterations for the kernel.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Current implementation matches clang chunking scheme.
|
||
IV += KernelIteration; | ||
// Start the new kernel iteration before the first thread chunk | ||
IV += (KernelIteration - EffectiveThreadChunk); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I get the rest but not this change. As argued above, I think KernelIter should be larger.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now that I look at this again, I see why you need to adjust this.
We might need a second loop, which is unfortunate but as long as default values will allow us to fold it, it's OK.
I think we might need to make KernelIteration larger, as described above.
However, as you noted, we certainly need to cover the block chunk gap (iteration 22 to 65 in the example above).
So, in this loop IV would be incremented by EffectiveThreadChunk.
This happens in an outer loop BlockChunk times, then we would move on and increment to the start of the next block chunk.
We really need the test that tracks what thread and block executed which iteration.
#83261 checks how clang handles chunking |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG, made one suggestions that might simplify the code.
Ty ThreadCnt = 0;
// Cover the thread space
while ((ThreadCnt < ThreadChunk) &&
       ((ThreadIV + ThreadCnt) < BlockChunk)) {
  // Index in the chunked space.
  Ty IV = BlockIV + ThreadIV + ThreadCnt;

  // Given the blocking it's hard to keep track of what to execute.
  if (IV >= NumIters)
    return;

  // Execute the loop body.
  LoopBody(IV, Arg);

  if (OneIterationPerThread)
    return;
  ++ThreadCnt;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We might want to make this a for loop with a min condition:
Ty TripCnt = min(ThreadChunk, BlockChunk - ThreadIV, NumIters - ThreadIV - BlockIV);
for (Ty Cnt = 0; Cnt < TripCnt; ++Cnt) {
Ty IV = BlockIV + ThreadIV + Cnt;
LoopBody(IV, Arg)
if (OneIterationPerThread)
return;
}
Scope of changes: