
[OpenMP] New OpenMP device RTL functions #73225

Merged

Conversation

DominikAdamski (Contributor)

Add new implementation of workshare loop functions.

@llvmbot llvmbot added flang:openmp clang:openmp OpenMP related changes to Clang openmp:libomptarget OpenMP offload runtime labels Nov 23, 2023
@llvmbot (Collaborator) commented Nov 23, 2023

@llvm/pr-subscribers-flang-openmp

Author: Dominik Adamski (DominikAdamski)

Changes

Add new implementation of workshare loop functions.


Full diff: https://github.com/llvm/llvm-project/pull/73225.diff

2 Files Affected:

  • (modified) llvm/include/llvm/Frontend/OpenMP/OMPKinds.def (+12)
  • (modified) openmp/libomptarget/DeviceRTL/src/Workshare.cpp (+252)
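For reference, the entry points added below expand (via the `OMP_LOOP_ENTRY` macro in `Workshare.cpp`) to the following C signatures for the 32-bit signed variants; the `_4u`, `_8`, and `_8u` variants differ only in the integer type:

```
// Expansions of OMP_LOOP_ENTRY(_4, int32_t) from the diff below.
void __kmpc_for_static_loop_4(IdentTy *loc, void (*fn)(int32_t, void *),
                              void *arg, int32_t num_iters,
                              int32_t num_threads, int32_t thread_chunk);

void __kmpc_distribute_static_loop_4(IdentTy *loc, void (*fn)(int32_t, void *),
                                     void *arg, int32_t num_iters,
                                     int32_t block_chunk);

void __kmpc_distribute_for_static_loop_4(IdentTy *loc,
                                         void (*fn)(int32_t, void *),
                                         void *arg, int32_t num_iters,
                                         int32_t num_threads,
                                         int32_t block_chunk,
                                         int32_t thread_chunk);
```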
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 6a719d231142230..04c926004f72ef9 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -464,6 +464,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
 __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
           VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
 __OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
 __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
 __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index 0dbfafc4d699e7e..da743884ccf7ced 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -45,6 +45,9 @@ struct DynamicScheduleTracker {
 
 #pragma omp begin declare target device_type(nohost)
 
+extern int32_t __omp_rtl_assume_teams_oversubscription;
+extern int32_t __omp_rtl_assume_threads_oversubscription;
+
 // TODO: This variable is a hack inherited from the old runtime.
 static uint64_t SHARED(Cnt);
 
@@ -636,4 +639,253 @@ void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
 void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
 }
 
+namespace ompx {
+
+/// Helper class to hide the generic loop nest and provide the template argument
+/// throughout.
+template <typename Ty> class StaticLoopChunker {
+
+  /// Generic loop nest that handles block and/or thread distribution in the
+  /// absence of user-specified chunk sizes. This implicitly picks a block chunk
+  /// size equal to the number of threads in the block and a thread chunk size
+  /// equal to one. In contrast to the chunked version, we can get away with a
+  /// single loop in this case.
+  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
+                                        Ty NumBlocks, Ty BId, Ty NumThreads,
+                                        Ty TId, Ty NumIters,
+                                        bool OneIterationPerThread) {
+    Ty KernelIteration = NumBlocks * NumThreads;
+
+    // Start index in the normalized space.
+    Ty IV = BId * NumThreads + TId;
+    ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space; assumptions in the caller might allow
+    // us to simplify this loop to a conditional.
+    if (IV < NumIters) {
+      do {
+
+        // Execute the loop body.
+        LoopBody(IV, Arg);
+
+        // Every thread has executed one block and thread chunk now.
+        IV += KernelIteration;
+
+        if (OneIterationPerThread)
+          return;
+
+      } while (IV < NumIters);
+    }
+  }
+
+  /// Generic loop nest that handles block and/or thread distribution in the
+  /// presence of user-specified chunk sizes (for at least one of them).
+  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
+                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
+                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
+                                        Ty NumIters,
+                                        bool OneIterationPerThread) {
+    Ty KernelIteration = NumBlocks * BlockChunk;
+
+    // Start index in the chunked space.
+    Ty IV = BId * BlockChunk + TId;
+    ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space; assumptions in the caller might allow
+    // us to simplify this loop to a conditional.
+    do {
+
+      Ty BlockChunkLeft =
+          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
+      Ty ThreadChunkLeft =
+          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+
+      while (ThreadChunkLeft--) {
+
+        // Given the blocking, it's hard to keep track of what to execute.
+        if (IV >= NumIters)
+          return;
+
+        // Execute the loop body.
+        LoopBody(IV, Arg);
+
+        if (OneIterationPerThread)
+          return;
+
+        ++IV;
+      }
+
+      IV += KernelIteration;
+
+    } while (IV < NumIters);
+  }
+
+public:
+  /// Worksharing `for`-loop.
+  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate, but we don't know if we are in a
+    // parallel region at all, or if the user might have used a `num_threads`
+    // clause on the parallel and reduced the number compared to the block
+    // size. Since nested parallels are possible too, we need to get the
+    // thread id from the `omp` getter and not from the mapping directly.
+    Ty TId = omp_get_thread_num();
+
+    // There are no blocks involved here.
+    Ty BlockChunk = 0;
+    Ty NumBlocks = 1;
+    Ty BId = 0;
+
+    // If the thread chunk is not specified we pick a default now.
+    if (ThreadChunk == 0)
+      ThreadChunk = 1;
+
+    // If we know we have more threads than iterations we can indicate that to
+    // avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_threads_oversubscription) {
+      ASSERT(NumThreads >= NumIters, "Broken assumption");
+      OneIterationPerThread = true;
+    }
+
+    if (ThreadChunk != 1)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+  }
+
+  /// Worksharing `distribute`-loop.
+  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+                         Ty NumIters, Ty BlockChunk) {
+    ASSERT(icv::Level == 0, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(BlockChunk >= 0, "Bad block count");
+
+    // There are no threads involved here.
+    Ty ThreadChunk = 0;
+    Ty NumThreads = 1;
+    Ty TId = 0;
+    ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");
+
+    // All teams need to participate.
+    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+    Ty BId = mapping::getBlockIdInKernel();
+
+    // If the block chunk is not specified we pick a default now.
+    if (BlockChunk == 0)
+      BlockChunk = NumThreads;
+
+    // If we know we have more blocks than iterations we can indicate that to
+    // avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_teams_oversubscription) {
+      ASSERT(NumBlocks >= NumIters, "Broken assumption");
+      OneIterationPerThread = true;
+    }
+
+    if (BlockChunk != NumThreads)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+
+    ASSERT(icv::Level == 0, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+  }
+
+  /// Worksharing `distribute parallel for`-loop.
+  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
+                            void *Arg, Ty NumIters, Ty NumThreads,
+                            Ty BlockChunk, Ty ThreadChunk) {
+    ASSERT(icv::Level == 1, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(BlockChunk >= 0, "Bad block count");
+    ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate, but the user might have used a
+    // `num_threads` clause on the parallel and reduced the number compared to
+    // the block size.
+    Ty TId = mapping::getThreadIdInBlock();
+
+    // All teams need to participate.
+    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+    Ty BId = mapping::getBlockIdInKernel();
+
+    // If the block chunk is not specified we pick a default now.
+    if (BlockChunk == 0)
+      BlockChunk = NumThreads;
+
+    // If the thread chunk is not specified we pick a default now.
+    if (ThreadChunk == 0)
+      ThreadChunk = 1;
+
+    // If we know we have more threads (across all blocks) than iterations we
+    // can indicate that to avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_teams_oversubscription &
+        __omp_rtl_assume_threads_oversubscription) {
+      OneIterationPerThread = true;
+      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+    }
+
+    if (BlockChunk != NumThreads || ThreadChunk != 1)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+
+    ASSERT(icv::Level == 1, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+  }
+};
+
+} // namespace ompx
+
+#define OMP_LOOP_ENTRY(BW, TY)                                                 \
+  __attribute__((flatten)) void __kmpc_distribute_for_static_loop##BW(         \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY num_threads, TY block_chunk, TY thread_chunk) {                       \
+    ompx::StaticLoopChunker<TY>::DistributeFor(                                \
+        loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk);  \
+  }                                                                            \
+  __attribute__((flatten)) void __kmpc_distribute_static_loop##BW(             \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY block_chunk) {                                                        \
+    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1,       \
+                                            block_chunk);                      \
+  }                                                                            \
+  __attribute__((flatten)) void __kmpc_for_static_loop##BW(                    \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,           \
+      TY num_threads, TY thread_chunk) {                                       \
+    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1, num_threads, \
+                                     thread_chunk);                            \
+  }
+
+extern "C" {
+OMP_LOOP_ENTRY(_4, int32_t)
+OMP_LOOP_ENTRY(_4u, uint32_t)
+OMP_LOOP_ENTRY(_8, int64_t)
+OMP_LOOP_ENTRY(_8u, uint64_t)
+}
+
 #pragma omp end declare target

@jdoerfert (Member) left a comment:

LG, one nit below.

__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)

You need to add the SignExt attributes to the Int32 arguments.
Look for `__OMP_RTL_ATTRS(__kmpc_cancel,` below to see how.
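
For illustration only, such an entry might roughly take the following shape; the attribute sets here are placeholders, and the real ones should be copied from the existing `__kmpc_cancel` entry in OMPKinds.def rather than from this sketch:

```
// Hypothetical sketch; the attribute sets are placeholders. The three pointer
// arguments keep empty sets, while the four Int32 arguments get SExt.
__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4, AttributeSet(),
                AttributeSet(),
                ParamAttrs(AttributeSet(), AttributeSet(), AttributeSet(),
                           SExt, SExt, SExt, SExt))
```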


github-actions bot commented Nov 29, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

@DominikAdamski DominikAdamski merged commit d4d88b8 into llvm:main Nov 29, 2023
3 checks passed
@DominikAdamski DominikAdamski deleted the new_openmp_device_rtl_functions branch November 29, 2023 13:26
DominikAdamski added a commit that referenced this pull request Dec 6, 2023
The workshare loop for the target region uses the new OpenMP device runtime.
The code-generation scheme for the new device runtime is presented below:

Input code:
```
workshare-loop {
  loop-body
}
```

Output code:
A helper function represents the loop body:
```
function-loop-body(counter, loop-body-args) {
  loop-body
}
```
The workshare-loop is replaced by the corresponding device runtime call:
```
call __kmpc_new_worksharing_rtl(function-loop-body, loop-body-args,
                                                     loop-tripcount, ...)
```
This PR uses the new device runtime functions that were added in PR #73225.
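
As a concrete, hypothetical illustration of this scheme (the real lowering is emitted as LLVM IR and also involves kernel and parallel-region setup not shown here), a `distribute parallel for` over `n` iterations roughly becomes an outlined body plus a single runtime call. The names `BodyArgs`, `loop_body`, and `lowered_loop` are illustrative only:

```
#include <cstdint>

// Assumed declarations; in the real lowering these come from the device
// runtime and the compiler-emitted ident_t.
struct IdentTy;
extern "C" void __kmpc_distribute_for_static_loop_4u(
    IdentTy *loc, void (*fn)(uint32_t, void *), void *arg, uint32_t num_iters,
    uint32_t num_threads, uint32_t block_chunk, uint32_t thread_chunk);

// Captured values the loop body needs, packed into one argument structure.
struct BodyArgs {
  float *a;
  const float *b;
};

// function-loop-body: one normalized iteration of the original loop.
static void loop_body(uint32_t iv, void *raw) {
  auto *args = static_cast<BodyArgs *>(raw);
  args->a[iv] += args->b[iv];
}

// workshare-loop replaced by the device runtime call. Chunk sizes of 0 select
// the runtime defaults; the trip-count argument here assumes the
// `num_iters + 1` adjustment in the OMP_LOOP_ENTRY wrappers from the diff.
void lowered_loop(IdentTy *loc, BodyArgs *args, uint32_t n,
                  uint32_t num_threads) {
  __kmpc_distribute_for_static_loop_4u(loc, &loop_body, args,
                                       /*num_iters=*/n - 1, num_threads,
                                       /*block_chunk=*/0, /*thread_chunk=*/0);
}
```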