
[OpenMP] New OpenMP device RTL functions #73225

Merged

Conversation

DominikAdamski (Contributor)

Add new implementation of workshare loop functions.

@llvmbot llvmbot added flang:openmp clang:openmp OpenMP related changes to Clang openmp:libomptarget OpenMP offload runtime labels Nov 23, 2023
@llvmbot (Collaborator) commented Nov 23, 2023

@llvm/pr-subscribers-flang-openmp

Author: Dominik Adamski (DominikAdamski)

Changes

Add new implementation of workshare loop functions.


Full diff: https://github.com/llvm/llvm-project/pull/73225.diff

2 Files Affected:

  • (modified) llvm/include/llvm/Frontend/OpenMP/OMPKinds.def (+12)
  • (modified) openmp/libomptarget/DeviceRTL/src/Workshare.cpp (+252)
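For reference, the entry points added below expand (via the `OMP_LOOP_ENTRY` macro in `Workshare.cpp`) to the following C signatures for the 32-bit signed variants; the `_4u`, `_8`, and `_8u` variants differ only in the integer type:

```
// Expansions of OMP_LOOP_ENTRY(_4, int32_t) from the diff below.
void __kmpc_for_static_loop_4(IdentTy *loc, void (*fn)(int32_t, void *),
                              void *arg, int32_t num_iters,
                              int32_t num_threads, int32_t thread_chunk);

void __kmpc_distribute_static_loop_4(IdentTy *loc, void (*fn)(int32_t, void *),
                                     void *arg, int32_t num_iters,
                                     int32_t block_chunk);

void __kmpc_distribute_for_static_loop_4(IdentTy *loc,
                                         void (*fn)(int32_t, void *),
                                         void *arg, int32_t num_iters,
                                         int32_t num_threads,
                                         int32_t block_chunk,
                                         int32_t thread_chunk);
```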
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 6a719d231142230..04c926004f72ef9 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -464,6 +464,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
 __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
           VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
 __OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
 __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
 __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index 0dbfafc4d699e7e..da743884ccf7ced 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -45,6 +45,9 @@ struct DynamicScheduleTracker {
 
 #pragma omp begin declare target device_type(nohost)
 
+extern int32_t __omp_rtl_assume_teams_oversubscription;
+extern int32_t __omp_rtl_assume_threads_oversubscription;
+
 // TODO: This variable is a hack inherited from the old runtime.
 static uint64_t SHARED(Cnt);
 
@@ -636,4 +639,253 @@ void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
 void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
 }
 
+namespace ompx {
+
+/// Helper class to hide the generic loop nest and provide the template argument
+/// throughout.
+template <typename Ty> class StaticLoopChunker {
+
+  /// Generic loop nest that handles block and/or thread distribution in the
+  /// absence of user-specified chunk sizes. This implicitly picks a block chunk
+  /// size equal to the number of threads in the block and a thread chunk size
+  /// equal to one. In contrast to the chunked version, we can get away with a
+  /// single loop in this case.
+  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
+                                        Ty NumBlocks, Ty BId, Ty NumThreads,
+                                        Ty TId, Ty NumIters,
+                                        bool OneIterationPerThread) {
+    Ty KernelIteration = NumBlocks * NumThreads;
+
+    // Start index in the normalized space.
+    Ty IV = BId * NumThreads + TId;
+    ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space; assumptions in the caller might allow
+    // us to simplify this loop to a conditional.
+    if (IV < NumIters) {
+      do {
+
+        // Execute the loop body.
+        LoopBody(IV, Arg);
+
+        // Every thread has executed one block and thread chunk now.
+        IV += KernelIteration;
+
+        if (OneIterationPerThread)
+          return;
+
+      } while (IV < NumIters);
+    }
+  }
+
+  /// Generic loop nest that handles block and/or thread distribution in the
+  /// presence of user-specified chunk sizes (for at least one of them).
+  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
+                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
+                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
+                                        Ty NumIters,
+                                        bool OneIterationPerThread) {
+    Ty KernelIteration = NumBlocks * BlockChunk;
+
+    // Start index in the chunked space.
+    Ty IV = BId * BlockChunk + TId;
+    ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space; assumptions in the caller might allow
+    // us to simplify this loop to a conditional.
+    do {
+
+      Ty BlockChunkLeft =
+          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
+      Ty ThreadChunkLeft =
+          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+
+      while (ThreadChunkLeft--) {
+
+        // Given the blocking, it's hard to keep track of what to execute.
+        if (IV >= NumIters)
+          return;
+
+        // Execute the loop body.
+        LoopBody(IV, Arg);
+
+        if (OneIterationPerThread)
+          return;
+
+        ++IV;
+      }
+
+      IV += KernelIteration;
+
+    } while (IV < NumIters);
+  }
+
+public:
+  /// Worksharing `for`-loop.
+  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate, but we don't know if we are in a
+    // parallel region at all, or if the user might have used a `num_threads`
+    // clause on the parallel and reduced the number compared to the block
+    // size. Since nested parallels are possible too, we need to get the
+    // thread id from the `omp` getter and not from the mapping directly.
+    Ty TId = omp_get_thread_num();
+
+    // There are no blocks involved here.
+    Ty BlockChunk = 0;
+    Ty NumBlocks = 1;
+    Ty BId = 0;
+
+    // If the thread chunk is not specified we pick a default now.
+    if (ThreadChunk == 0)
+      ThreadChunk = 1;
+
+    // If we know we have more threads than iterations we can indicate that to
+    // avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_threads_oversubscription) {
+      ASSERT(NumThreads >= NumIters, "Broken assumption");
+      OneIterationPerThread = true;
+    }
+
+    if (ThreadChunk != 1)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+  }
+
+  /// Worksharing `distribute`-loop.
+  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+                         Ty NumIters, Ty BlockChunk) {
+    ASSERT(icv::Level == 0, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(BlockChunk >= 0, "Bad block count");
+
+    // There are no threads involved here.
+    Ty ThreadChunk = 0;
+    Ty NumThreads = 1;
+    Ty TId = 0;
+    ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");
+
+    // All teams need to participate.
+    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+    Ty BId = mapping::getBlockIdInKernel();
+
+    // If the block chunk is not specified we pick a default now.
+    if (BlockChunk == 0)
+      BlockChunk = NumThreads;
+
+    // If we know we have more blocks than iterations we can indicate that to
+    // avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_teams_oversubscription) {
+      ASSERT(NumBlocks >= NumIters, "Broken assumption");
+      OneIterationPerThread = true;
+    }
+
+    if (BlockChunk != NumThreads)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+
+    ASSERT(icv::Level == 0, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+  }
+
+  /// Worksharing `distribute parallel for`-loop.
+  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
+                            void *Arg, Ty NumIters, Ty NumThreads,
+                            Ty BlockChunk, Ty ThreadChunk) {
+    ASSERT(icv::Level == 1, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(BlockChunk >= 0, "Bad block count");
+    ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate, but the user might have used a
+    // `num_threads` clause on the parallel and reduced the number compared to
+    // the block size.
+    Ty TId = mapping::getThreadIdInBlock();
+
+    // All teams need to participate.
+    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+    Ty BId = mapping::getBlockIdInKernel();
+
+    // If the block chunk is not specified we pick a default now.
+    if (BlockChunk == 0)
+      BlockChunk = NumThreads;
+
+    // If the thread chunk is not specified we pick a default now.
+    if (ThreadChunk == 0)
+      ThreadChunk = 1;
+
+    // If we know we have more threads (across all blocks) than iterations we
+    // can indicate that to avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_teams_oversubscription &
+        __omp_rtl_assume_threads_oversubscription) {
+      OneIterationPerThread = true;
+      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+    }
+
+    if (BlockChunk != NumThreads || ThreadChunk != 1)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+
+    ASSERT(icv::Level == 1, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+  }
+};
+
+} // namespace ompx
+
+#define OMP_LOOP_ENTRY(BW, TY)                                                 \
+  __attribute__((flatten)) void __kmpc_distribute_for_static_loop##BW(         \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY num_threads, TY block_chunk, TY thread_chunk) {                       \
+    ompx::StaticLoopChunker<TY>::DistributeFor(                                \
+        loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk);  \
+  }                                                                            \
+  __attribute__((flatten)) void __kmpc_distribute_static_loop##BW(             \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY block_chunk) {                                                        \
+    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1,       \
+                                            block_chunk);                      \
+  }                                                                            \
+  __attribute__((flatten)) void __kmpc_for_static_loop##BW(                    \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY num_threads, TY thread_chunk) {                                       \
+    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1, num_threads, \
+                                     thread_chunk);                            \
+  }
+
+extern "C" {
+OMP_LOOP_ENTRY(_4, int32_t)
+OMP_LOOP_ENTRY(_4u, uint32_t)
+OMP_LOOP_ENTRY(_8, int64_t)
+OMP_LOOP_ENTRY(_8u, uint64_t)
+}
+
 #pragma omp end declare target

@jdoerfert (Member) left a comment:

LG, one nit below.

__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)

You need to add the SignExt attributes to the Int32 arguments.
Look for `__OMP_RTL_ATTRS(__kmpc_cancel,` below to see how.
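
For illustration only, such an entry might roughly take the following shape; the attribute sets here are placeholders, and the real ones should be copied from the existing `__kmpc_cancel` entry in OMPKinds.def rather than from this sketch:

```
// Hypothetical sketch; the attribute sets are placeholders. The three pointer
// arguments keep empty sets, while the four Int32 arguments get SExt.
__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4, AttributeSet(),
                AttributeSet(),
                ParamAttrs(AttributeSet(), AttributeSet(), AttributeSet(),
                           SExt, SExt, SExt, SExt))
```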


github-actions bot commented Nov 29, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

@DominikAdamski DominikAdamski merged commit d4d88b8 into llvm:main Nov 29, 2023
3 checks passed
@DominikAdamski DominikAdamski deleted the new_openmp_device_rtl_functions branch November 29, 2023 13:26
DominikAdamski added a commit that referenced this pull request Dec 6, 2023
The workshare loop for the target region uses the new OpenMP device runtime.
The code-generation scheme for the new device runtime is presented below:

Input code:
```
workshare-loop {
  loop-body
}
```

Output code:
A helper function represents the loop body:
```
function-loop-body(counter, loop-body-args) {
  loop-body
}
```
The workshare-loop is replaced by the corresponding device runtime call:
```
call __kmpc_new_worksharing_rtl(function-loop-body, loop-body-args,
                                                     loop-tripcount, ...)
```
This PR uses the new device runtime functions that were added in PR #73225.
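
As a concrete, hypothetical illustration of this scheme (the real lowering is emitted as LLVM IR and also involves kernel and parallel-region setup not shown here), a `distribute parallel for` over `n` iterations roughly becomes an outlined body plus a single runtime call. The names `BodyArgs`, `loop_body`, and `lowered_loop` are illustrative only:

```
#include <cstdint>

// Assumed declarations; in the real lowering these come from the device
// runtime and the compiler-emitted ident_t.
struct IdentTy;
extern "C" void __kmpc_distribute_for_static_loop_4u(
    IdentTy *loc, void (*fn)(uint32_t, void *), void *arg, uint32_t num_iters,
    uint32_t num_threads, uint32_t block_chunk, uint32_t thread_chunk);

// Captured values the loop body needs, packed into one argument structure.
struct BodyArgs {
  float *a;
  const float *b;
};

// function-loop-body: one normalized iteration of the original loop.
static void loop_body(uint32_t iv, void *raw) {
  auto *args = static_cast<BodyArgs *>(raw);
  args->a[iv] += args->b[iv];
}

// workshare-loop replaced by the device runtime call. Chunk sizes of 0 select
// the runtime defaults; the trip-count argument here assumes the
// `num_iters + 1` adjustment in the OMP_LOOP_ENTRY wrappers from the diff.
void lowered_loop(IdentTy *loc, BodyArgs *args, uint32_t n,
                  uint32_t num_threads) {
  __kmpc_distribute_for_static_loop_4u(loc, &loop_body, args,
                                       /*num_iters=*/n - 1, num_threads,
                                       /*block_chunk=*/0, /*thread_chunk=*/0);
}
```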