-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[OpenMP] New OpenMP device RTL functions #73225
[OpenMP] New OpenMP device RTL functions #73225
Conversation
@llvm/pr-subscribers-flang-openmp Author: Dominik Adamski (DominikAdamski) Changes: Add new implementation of workshare loop functions. Full diff: https://github.com/llvm/llvm-project/pull/73225.diff 2 Files Affected:
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 6a719d231142230..04c926004f72ef9 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -464,6 +464,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
+// New device RTL entry points for statically scheduled workshare loops.
+// TODO(review): add the SignExt attributes to the Int32 arguments of these
+// entries via __OMP_RTL_ATTRS — see __OMP_RTL_ATTRS(__kmpc_cancel, ...)
+// further below for the pattern.
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
__OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
__OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
__OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index 0dbfafc4d699e7e..da743884ccf7ced 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -45,6 +45,9 @@ struct DynamicScheduleTracker {
#pragma omp begin declare target device_type(nohost)
+extern int32_t __omp_rtl_assume_teams_oversubscription;
+extern int32_t __omp_rtl_assume_threads_oversubscription;
+
// TODO: This variable is a hack inherited from the old runtime.
static uint64_t SHARED(Cnt);
@@ -636,4 +639,253 @@ void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}
+namespace ompx {
+
+/// Helper class to hide the generic loop nest and provide the template argument
+/// throughout.
+template <typename Ty> class StaticLoopChunker {
+
+  /// Generic loop nest that handles block and/or thread distribution in the
+  /// absence of user specified chunk sizes. This implicitly picks a block chunk
+  /// size equal to the number of threads in the block and a thread chunk size
+  /// equal to one. In contrast to the chunked version we can get away with a
+  /// single loop in this case.
+  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
+                                        Ty NumBlocks, Ty BId, Ty NumThreads,
+                                        Ty TId, Ty NumIters,
+                                        bool OneIterationPerThread) {
+    Ty KernelIteration = NumBlocks * NumThreads;
+
+    // Start index in the normalized space.
+    Ty IV = BId * NumThreads + TId;
+    ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space, assumptions in the caller might allow
+    // to simplify this loop to a conditional.
+    if (IV < NumIters) {
+      do {
+
+        // Execute the loop body.
+        LoopBody(IV, Arg);
+
+        // Every thread executed one block and thread chunk now.
+        IV += KernelIteration;
+
+        if (OneIterationPerThread)
+          return;
+
+      } while (IV < NumIters);
+    }
+  }
+
+  /// Generic loop nest that handles block and/or thread distribution in the
+  /// presence of user specified chunk sizes (for at least one of them).
+  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
+                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
+                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
+                                        Ty NumIters,
+                                        bool OneIterationPerThread) {
+    Ty KernelIteration = NumBlocks * BlockChunk;
+
+    // Start index in the chunked space.
+    // NOTE(review): the start offset uses `TId`, but the leftover computation
+    // below subtracts `TId * ThreadChunk` — verify both are meant to use the
+    // same per-thread stride. TODO confirm.
+    Ty IV = BId * BlockChunk + TId;
+    ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space, assumptions in the caller might allow
+    // to simplify this loop to a conditional.
+    do {
+
+      Ty BlockChunkLeft =
+          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
+      Ty ThreadChunkLeft =
+          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+
+      while (ThreadChunkLeft--) {
+
+        // Given the blocking it's hard to keep track of what to execute.
+        if (IV >= NumIters)
+          return;
+
+        // Execute the loop body.
+        LoopBody(IV, Arg);
+
+        if (OneIterationPerThread)
+          return;
+
+        ++IV;
+      }
+
+      IV += KernelIteration;
+
+    } while (IV < NumIters);
+  }
+
+public:
+  /// Worksharing `for`-loop.
+  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate but we don't know if we are in a
+    // parallel at all or if the user might have used a `num_threads` clause
+    // on the parallel and reduced the number compared to the block size.
+    // Since nested parallels are possible too we need to get the thread id
+    // from the `omp` getter and not the mapping directly.
+    Ty TId = omp_get_thread_num();
+
+    // There are no blocks involved here.
+    Ty BlockChunk = 0;
+    Ty NumBlocks = 1;
+    Ty BId = 0;
+
+    // If the thread chunk is not specified we pick a default now.
+    if (ThreadChunk == 0)
+      ThreadChunk = 1;
+
+    // If we know we have more threads than iterations we can indicate that to
+    // avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_threads_oversubscription) {
+      ASSERT(NumThreads >= NumIters, "Broken assumption");
+      OneIterationPerThread = true;
+    }
+
+    // NOTE(review): BlockChunk is 0 here, so in the chunked nest below
+    // KernelIteration is 0 and IV only advances inside the inner loop; a
+    // thread whose ThreadChunkLeft computes to 0 would then never make
+    // progress. Verify the chunked `for` path cannot spin. TODO confirm.
+    if (ThreadChunk != 1)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+  }
+
+  /// Worksharing `distribute`-loop.
+  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+                         Ty NumIters, Ty BlockChunk) {
+    ASSERT(icv::Level == 0, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(BlockChunk >= 0, "Bad block count");
+
+    // There are no threads involved here.
+    Ty ThreadChunk = 0;
+    Ty NumThreads = 1;
+    Ty TId = 0;
+    ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");
+
+    // All teams need to participate.
+    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+    Ty BId = mapping::getBlockIdInKernel();
+
+    // If the block chunk is not specified we pick a default now.
+    if (BlockChunk == 0)
+      BlockChunk = NumThreads;
+
+    // If we know we have more blocks than iterations we can indicate that to
+    // avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_teams_oversubscription) {
+      ASSERT(NumBlocks >= NumIters, "Broken assumption");
+      OneIterationPerThread = true;
+    }
+
+    if (BlockChunk != NumThreads)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+
+    ASSERT(icv::Level == 0, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+  }
+
+  /// Worksharing `distribute parallel for`-loop.
+  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
+                            void *Arg, Ty NumIters, Ty NumThreads,
+                            Ty BlockChunk, Ty ThreadChunk) {
+    ASSERT(icv::Level == 1, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(BlockChunk >= 0, "Bad block count");
+    ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate but the user might have used a
+    // `num_threads` clause on the parallel and reduced the number compared to
+    // the block size.
+    Ty TId = mapping::getThreadIdInBlock();
+
+    // All teams need to participate.
+    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+    Ty BId = mapping::getBlockIdInKernel();
+
+    // If the block chunk is not specified we pick a default now.
+    if (BlockChunk == 0)
+      BlockChunk = NumThreads;
+
+    // If the thread chunk is not specified we pick a default now.
+    if (ThreadChunk == 0)
+      ThreadChunk = 1;
+
+    // If we know we have more threads (across all blocks) than iterations we
+    // can indicate that to avoid an outer loop.
+    bool OneIterationPerThread = false;
+    // Use logical && (not bitwise &): both globals are flags tested for
+    // truthiness elsewhere, and bitwise AND would yield 0 for two nonzero
+    // values with disjoint bit patterns (e.g. 2 & 1 == 0).
+    if (__omp_rtl_assume_teams_oversubscription &&
+        __omp_rtl_assume_threads_oversubscription) {
+      OneIterationPerThread = true;
+      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+    }
+
+    if (BlockChunk != NumThreads || ThreadChunk != 1)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+
+    ASSERT(icv::Level == 1, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+  }
+};
+
+} // namespace ompx
+
+// Instantiate the device runtime entry points for statically scheduled
+// workshare loops, one set per bit width and signedness (_4/_4u/_8/_8u).
+// NOTE(review): the `num_iters + 1` implies callers pass the maximum
+// iteration index rather than the trip count — confirm against the
+// compiler-side codegen that emits these calls.
+#define OMP_LOOP_ENTRY(BW, TY)                                                 \
+  __attribute__((flatten)) void __kmpc_distribute_for_static_loop##BW(         \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY num_threads, TY block_chunk, TY thread_chunk) {                       \
+    ompx::StaticLoopChunker<TY>::DistributeFor(                                \
+        loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk);  \
+  }                                                                            \
+  __attribute__((flatten)) void __kmpc_distribute_static_loop##BW(             \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY block_chunk) {                                                        \
+    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1,       \
+                                            block_chunk);                      \
+  }                                                                            \
+  __attribute__((flatten)) void __kmpc_for_static_loop##BW(                    \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY num_threads, TY thread_chunk) {                                       \
+    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1, num_threads, \
+                                     thread_chunk);                            \
+  }
+
+// C linkage so the compiler-generated runtime calls resolve these symbols.
+extern "C" {
+OMP_LOOP_ENTRY(_4, int32_t)
+OMP_LOOP_ENTRY(_4u, uint32_t)
+OMP_LOOP_ENTRY(_8, int64_t)
+OMP_LOOP_ENTRY(_8u, uint64_t)
+}
+
#pragma omp end declare target
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG, one nit below.
__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You need to add the SignExt Attributes to the Int32 arguments.
Look for __OMP_RTL_ATTRS(__kmpc_cancel,
below to see how,
✅ With the latest revision this PR passed the C/C++ code formatter. |
The workshare loop for target region uses the new OpenMP device runtime. The code generation scheme for the new device runtime is presented below: Input code: ``` workshare-loop { loop-body } ``` Output code: helper function which represents loop body: ``` function-loop-body(counter, loop-body-args) { loop-body } ``` workshare-loop is replaced by the proper device runtime call: ``` call __kmpc_new_worksharing_rtl(function-loop-body, loop-body-args, loop-tripcount, ...) ``` This PR uses the new device runtime functions which were added in PR: #73225
Add new implementation of workshare loop functions.