diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 6a719d2311422..d22d2a8e948b0 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -464,6 +464,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
 __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
           VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
 __OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
 __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
 __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
@@ -650,6 +662,24 @@ __OMP_RTL_ATTRS(__kmpc_cancel, InaccessibleArgOnlyAttrs, SExt,
                 ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt))
 __OMP_RTL_ATTRS(__kmpc_cancel_barrier, BarrierAttrs, SExt,
                 ParamAttrs(ReadOnlyPtrAttrs, SExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           SExt, SExt, SExt, SExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           ZExt, ZExt, ZExt, ZExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           SExt, SExt))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           ZExt, ZExt))
+__OMP_RTL_ATTRS(__kmpc_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           SExt, SExt, SExt))
+__OMP_RTL_ATTRS(__kmpc_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
+                ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+                           ZExt, ZExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_error, AttributeSet(), AttributeSet(),
                 ParamAttrs(AttributeSet(), SExt))
 __OMP_RTL_ATTRS(__kmpc_flush, BarrierAttrs, AttributeSet(),
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index 0dbfafc4d699e..b587b85cc0078 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -45,6 +45,9 @@ struct DynamicScheduleTracker {
 
 #pragma omp begin declare target device_type(nohost)
 
+extern int32_t __omp_rtl_assume_teams_oversubscription;
+extern int32_t __omp_rtl_assume_threads_oversubscription;
+
 // TODO: This variable is a hack inherited from the old runtime.
 static uint64_t SHARED(Cnt);
 
@@ -636,4 +639,255 @@ void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
 void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
 }
 
+namespace ompx {
+
+/// Helper class to hide the generic loop nest and provide the template
+/// argument throughout.
+template <typename Ty> class StaticLoopChunker {
+
+  /// Generic loop nest that handles block and/or thread distribution in the
+  /// absence of user-specified chunk sizes. This implicitly picks a block chunk
+  /// size equal to the number of threads in the block and a thread chunk size
+  /// equal to one. In contrast to the chunked version we can get away with a
+  /// single loop in this case.
+  static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
+                                        Ty NumBlocks, Ty BId, Ty NumThreads,
+                                        Ty TId, Ty NumIters,
+                                        bool OneIterationPerThread) {
+    Ty KernelIteration = NumBlocks * NumThreads;
+
+    // Start index in the normalized space.
+    Ty IV = BId * NumThreads + TId;
+    ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space; assumptions in the caller might allow
+    // us to simplify this loop to a conditional.
+    if (IV < NumIters) {
+      do {
+
+        // Execute the loop body.
+        LoopBody(IV, Arg);
+
+        // Every thread executed one block and thread chunk now.
+        IV += KernelIteration;
+
+        if (OneIterationPerThread)
+          return;
+
+      } while (IV < NumIters);
+    }
+  }
+
+  /// Generic loop nest that handles block and/or thread distribution in the
+  /// presence of user-specified chunk sizes (for at least one of them).
+  static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg,
+                                        Ty BlockChunk, Ty NumBlocks, Ty BId,
+                                        Ty ThreadChunk, Ty NumThreads, Ty TId,
+                                        Ty NumIters,
+                                        bool OneIterationPerThread) {
+    Ty KernelIteration = NumBlocks * BlockChunk;
+
+    // Start index in the chunked space.
+    Ty IV = BId * BlockChunk + TId;
+    ASSERT(IV >= 0, "Bad index");
+
+    // Cover the entire iteration space; assumptions in the caller might allow
+    // us to simplify this loop to a conditional.
+    do {
+
+      Ty BlockChunkLeft =
+          BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
+      Ty ThreadChunkLeft =
+          ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;
+
+      while (ThreadChunkLeft--) {
+
+        // Given the blocking it's hard to keep track of what to execute.
+        if (IV >= NumIters)
+          return;
+
+        // Execute the loop body.
+        LoopBody(IV, Arg);
+
+        if (OneIterationPerThread)
+          return;
+
+        ++IV;
+      }
+
+      IV += KernelIteration;
+
+    } while (IV < NumIters);
+  }
+
+public:
+  /// Worksharing `for`-loop.
+  static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate but we don't know if we are in a
+    // parallel at all or if the user might have used a `num_threads` clause
+    // on the parallel and reduced the number compared to the block size.
+    // Since nested parallels are possible too we need to get the thread id
+    // from the `omp` getter and not the mapping directly.
+    Ty TId = omp_get_thread_num();
+
+    // There are no blocks involved here.
+    Ty BlockChunk = 0;
+    Ty NumBlocks = 1;
+    Ty BId = 0;
+
+    // If the thread chunk is not specified we pick a default now.
+    if (ThreadChunk == 0)
+      ThreadChunk = 1;
+
+    // If we know we have more threads than iterations we can indicate that to
+    // avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_threads_oversubscription) {
+      ASSERT(NumThreads >= NumIters, "Broken assumption");
+      OneIterationPerThread = true;
+    }
+
+    if (ThreadChunk != 1)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+  }
+
+  /// Worksharing `distribute`-loop.
+  static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
+                         Ty NumIters, Ty BlockChunk) {
+    ASSERT(icv::Level == 0, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(BlockChunk >= 0, "Bad block count");
+
+    // There are no threads involved here.
+    Ty ThreadChunk = 0;
+    Ty NumThreads = 1;
+    Ty TId = 0;
+    ASSERT(TId == mapping::getThreadIdInBlock(), "Bad thread id");
+
+    // All teams need to participate.
+    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+    Ty BId = mapping::getBlockIdInKernel();
+
+    // If the block chunk is not specified we pick a default now.
+    if (BlockChunk == 0)
+      BlockChunk = NumThreads;
+
+    // If we know we have more blocks than iterations we can indicate that to
+    // avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_teams_oversubscription) {
+      ASSERT(NumBlocks >= NumIters, "Broken assumption");
+      OneIterationPerThread = true;
+    }
+
+    if (BlockChunk != NumThreads)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+
+    ASSERT(icv::Level == 0, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 0, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+    ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
+  }
+
+  /// Worksharing `distribute parallel for`-loop.
+  static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
+                            void *Arg, Ty NumIters, Ty NumThreads,
+                            Ty BlockChunk, Ty ThreadChunk) {
+    ASSERT(icv::Level == 1, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+
+    ASSERT(NumIters >= 0, "Bad iteration count");
+    ASSERT(BlockChunk >= 0, "Bad block count");
+    ASSERT(ThreadChunk >= 0, "Bad thread count");
+
+    // All threads need to participate but the user might have used a
+    // `num_threads` clause on the parallel and reduced the number compared to
+    // the block size.
+    Ty TId = mapping::getThreadIdInBlock();
+
+    // All teams need to participate.
+    Ty NumBlocks = mapping::getNumberOfBlocksInKernel();
+    Ty BId = mapping::getBlockIdInKernel();
+
+    // If the block chunk is not specified we pick a default now.
+    if (BlockChunk == 0)
+      BlockChunk = NumThreads;
+
+    // If the thread chunk is not specified we pick a default now.
+    if (ThreadChunk == 0)
+      ThreadChunk = 1;
+
+    // If we know we have more threads (across all blocks) than iterations we
+    // can indicate that to avoid an outer loop.
+    bool OneIterationPerThread = false;
+    if (__omp_rtl_assume_teams_oversubscription &
+        __omp_rtl_assume_threads_oversubscription) {
+      OneIterationPerThread = true;
+      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+    }
+
+    if (BlockChunk != NumThreads || ThreadChunk != 1)
+      NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
+                                ThreadChunk, NumThreads, TId, NumIters,
+                                OneIterationPerThread);
+    else
+      NormalizedLoopNestNoChunk(LoopBody, Arg, NumBlocks, BId, NumThreads, TId,
+                                NumIters, OneIterationPerThread);
+
+    ASSERT(icv::Level == 1, "Bad distribute");
+    ASSERT(icv::ActiveLevel == 1, "Bad distribute");
+    ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
+  }
+};
+
+} // namespace ompx
+
+#define OMP_LOOP_ENTRY(BW, TY)                                                \
+  [[gnu::flatten, clang::always_inline]] void                                 \
+      __kmpc_distribute_for_static_loop##BW(                                  \
+          IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,      \
+          TY num_threads, TY block_chunk, TY thread_chunk) {                  \
+    ompx::StaticLoopChunker<TY>::DistributeFor(                               \
+        loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk); \
+  }                                                                           \
+  [[gnu::flatten, clang::always_inline]] void                                 \
+      __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
+                                        void *arg, TY num_iters,              \
+                                        TY block_chunk) {                     \
+    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters + 1,      \
+                                            block_chunk);                     \
+  }                                                                           \
+  [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW(     \
+      IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters,          \
+      TY num_threads, TY thread_chunk) {                                      \
+    ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters + 1,             \
+                                     num_threads, thread_chunk);              \
+  }
+
+extern "C" {
+OMP_LOOP_ENTRY(_4, int32_t)
+OMP_LOOP_ENTRY(_4u, uint32_t)
+OMP_LOOP_ENTRY(_8, int64_t)
+OMP_LOOP_ENTRY(_8u, uint64_t)
+}
+
 #pragma omp end declare target
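
Reviewer note, not part of the patch: the sketch below shows how one of the new entry points can be driven directly, to make the calling convention concrete. Real callers are emitted by the compiler when lowering `#pragma omp for`; the names `Payload`, `loop_body`, and `scale_array` are invented for this illustration, and obtaining the thread count via `omp_get_num_threads()` is an assumption about what a caller would pass. Note that the wrappers above add 1 to `num_iters` before dispatching, so the argument is the last normalized iteration index rather than the trip count.

// Illustration only (not part of the patch): a hand-written caller for the
// new worksharing entry point. In practice clang outlines the loop body and
// emits this call itself; `Payload`, `loop_body`, `scale_array` are made up.
#include <cstdint>

struct IdentTy; // opaque source-location descriptor used by the DeviceRTL

extern "C" {
void __kmpc_for_static_loop_4(IdentTy *loc, void (*fn)(int32_t, void *),
                              void *arg, int32_t num_iters,
                              int32_t num_threads, int32_t thread_chunk);
int omp_get_num_threads(void);
}

struct Payload { // captured variables the outlined body needs
  int *A;
  int Scale;
};

// Outlined loop body: receives the normalized iteration index and the payload.
static void loop_body(int32_t IV, void *Arg) {
  auto *P = static_cast<Payload *>(Arg);
  P->A[IV] *= P->Scale;
}

// Rough equivalent of `#pragma omp for schedule(static)` over TripCount
// iterations; every thread of the enclosing parallel region calls this.
void scale_array(IdentTy *Loc, int *A, int Scale, int32_t TripCount) {
  Payload P{A, Scale};
  // The wrapper adds 1 to `num_iters`, so the last iteration index is passed.
  // A thread chunk of 0 selects the default chunk of one iteration.
  __kmpc_for_static_loop_4(Loc, &loop_body, &P, TripCount - 1,
                           omp_get_num_threads(), /*thread_chunk=*/0);
}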
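
For the implicit schedule itself, here is a small host-only sketch (again not part of the patch; `print_iterations` is a made-up helper) that reproduces the index math of NormalizedLoopNestNoChunk: every (block, thread) pair starts at `BId * NumThreads + TId` and strides by `NumBlocks * NumThreads`, i.e. the "block chunk = number of threads, thread chunk = one" scheme described in the class comment.

// Host-only illustration of the no-chunk mapping used by
// NormalizedLoopNestNoChunk.
#include <cstdint>
#include <cstdio>

static void print_iterations(int64_t NumBlocks, int64_t NumThreads,
                             int64_t NumIters) {
  for (int64_t BId = 0; BId < NumBlocks; ++BId)
    for (int64_t TId = 0; TId < NumThreads; ++TId) {
      std::printf("block %lld thread %lld:", (long long)BId, (long long)TId);
      // Same recurrence as the device code: start at BId * NumThreads + TId,
      // advance by the kernel-wide stride NumBlocks * NumThreads.
      for (int64_t IV = BId * NumThreads + TId; IV < NumIters;
           IV += NumBlocks * NumThreads)
        std::printf(" %lld", (long long)IV);
      std::printf("\n");
    }
}

int main() {
  // 2 blocks of 4 threads over 11 iterations: block 0 covers 0-3 and 8-10,
  // block 1 covers 4-7.
  print_iterations(/*NumBlocks=*/2, /*NumThreads=*/4, /*NumIters=*/11);
}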