diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index 29a484aa0eb24..fc5a3b08cb9d0 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -44,45 +44,119 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
-static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
+                                          ShuffleReductFnTy shflFct) {
+  uint32_t size, remote_id, physical_lane_id;
+  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
+  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
+  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
+  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
+  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
+  do {
+    Liveness = mapping::activemask();
+    remote_id = utils::ffs(Liveness & lanemask_gt);
+    size = utils::popc(Liveness);
+    logical_lane_id /= 2;
+    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
+            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
+  } while (logical_lane_id % 2 == 0 && size > 1);
+  return (logical_lane_id == 0);
+}
+#endif
+
+static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
+                                            uint64_t reduce_size,
+                                            void *reduce_data,
                                             ShuffleReductFnTy shflFct,
-                                            InterWarpCopyFnTy cpyFct) {
+                                            InterWarpCopyFnTy cpyFct,
+                                            bool isSPMDExecutionMode, bool) {
+  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
+  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
+    BlockThreadId = 0;
   uint32_t NumThreads = omp_get_num_threads();
-  // Handle degenerated parallel regions, including all nested ones, first.
   if (NumThreads == 1)
     return 1;
-
-  /*
-   * 1. Reduce within a warp.
-   * 2. Warp master copies value to warp 0 via shared memory.
-   * 3. Warp 0 reduces to a single value.
-   * 4. The reduced value is available in the thread that returns 1.
-   */
-
-  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
-  uint32_t NumWarps =
+  /*
+   * This reduce function handles reduction within a team. It handles
+   * parallel regions in both L1 and L2 parallelism levels. It also
+   * supports Generic, SPMD, and NoOMP modes.
+   *
+   * 1. Reduce within a warp.
+   * 2. Warp master copies value to warp 0 via shared memory.
+   * 3. Warp 0 reduces to a single value.
+   * 4. The reduced value is available in the thread that returns 1.
+   */
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  uint32_t WarpsNeeded =
       (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+  uint32_t WarpId = mapping::getWarpIdInBlock();
 
+  // Volta execution model:
   // For the Generic execution mode a parallel region either has 1 thread and
   // beyond that, always a multiple of 32. For the SPMD execution mode we may
   // have any number of threads.
-  gpu_regular_warp_reduce(reduce_data, shflFct);
+  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+                              /*LaneId=*/mapping::getThreadIdInBlock() %
+                                  mapping::getWarpSize());
 
   // When we have more than [mapping::getWarpSize()] number of threads
   // a block reduction is performed here.
+  //
+  // Only L1 parallel region can enter this if condition.
   if (NumThreads > mapping::getWarpSize()) {
     // Gather all the reduced values from each warp
     // to the first warp.
-    cpyFct(reduce_data, NumWarps);
+    cpyFct(reduce_data, WarpsNeeded);
 
-    if (BlockThreadId < mapping::getWarpSize())
-      gpu_irregular_warp_reduce(reduce_data, shflFct, NumWarps, BlockThreadId);
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                BlockThreadId);
   }
-
-  // In Generic and in SPMD mode block thread Id 0 is what we want.
-  // It's either the main thread in SPMD mode or the "acting" main thread in the
-  // parallel region.
   return BlockThreadId == 0;
+#else
+  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
+  if (Liveness == lanes::All) // Full warp
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/utils::popc(Liveness),
+                              /*LaneId=*/mapping::getThreadIdInBlock() %
+                                  mapping::getWarpSize());
+  else { // Dispersed lanes. Only threads in L2
+         // parallel region may enter here; return
+         // early.
+    return gpu_irregular_simd_reduce(reduce_data, shflFct);
+  }
+
+  // When we have more than [mapping::getWarpSize()] number of threads
+  // a block reduction is performed here.
+  //
+  // Only L1 parallel region can enter this if condition.
+  if (NumThreads > mapping::getWarpSize()) {
+    uint32_t WarpsNeeded =
+        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                BlockThreadId);
+
+    return BlockThreadId == 0;
+  }
+
+  // Get the OMP thread Id. This is different from BlockThreadId in the case of
+  // an L2 parallel region.
+  return TId == 0;
+#endif // __CUDA_ARCH__ >= 700
 }
 
 uint32_t roundToWarpsize(uint32_t s) {
@@ -99,7 +173,9 @@ extern "C" {
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
-  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
+  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
+                                      shflFct, cpyFct, mapping::isSPMDMode(),
+                                      false);
 }
 
 /// Mostly like _v2 but with the builtin assumption that we have less than
diff --git a/openmp/libomptarget/test/offloading/generic_reduction.c b/openmp/libomptarget/test/offloading/generic_reduction.c
deleted file mode 100644
index 8b5ff0f067f97..0000000000000
--- a/openmp/libomptarget/test/offloading/generic_reduction.c
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-// RUN: %libomptarget-compileoptxx-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-__attribute__((optnone)) void optnone(void) {}
-
-int main() {
-  int sum = 0, nt;
-#pragma omp target teams map(tofrom : sum, nt) num_teams(1)
-  {
-    nt = 3 * omp_get_max_threads();
-    optnone();
-#pragma omp parallel reduction(+ : sum)
-    sum += 1;
-#pragma omp parallel reduction(+ : sum)
-    sum += 1;
-#pragma omp parallel reduction(+ : sum)
-    sum += 1;
-  }
-  // CHECK: nt: [[NT:.*]]
-  // CHECK: sum: [[NT]]
-  printf("nt: %i\n", nt);
-  printf("sum: %i\n", sum);
-}
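
Note (not part of the patch): below is a minimal standalone CUDA sketch of the block-reduction scheme the restored comment block describes: reduce within each warp via shuffles, have each warp master copy its partial result to warp 0 through shared memory, then let warp 0 reduce to the final value, which ends up in thread 0. The names (warpReduceSum, blockReduceSum, sumKernel) are hypothetical illustrations; the actual runtime dispatches through the target-agnostic shflFct/cpyFct callbacks rather than calling __shfl_down_sync directly. The sketch assumes a single block whose size is a multiple of warpSize.

// Hypothetical illustration of the 4-step reduction; not libomptarget code.
#include <cstdio>

// Step 1: tree reduction within one full warp using shuffles.
__device__ int warpReduceSum(int val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffffu, val, offset);
  return val;
}

// Steps 2-4: block-level reduction built on the warp reduction above.
__device__ int blockReduceSum(int val) {
  __shared__ int warpResults[32]; // One slot per possible warp in a block.
  unsigned lane = threadIdx.x % warpSize;
  unsigned warp = threadIdx.x / warpSize;

  val = warpReduceSum(val);

  // Step 2: each warp master copies its partial sum to shared memory.
  if (lane == 0)
    warpResults[warp] = val;
  __syncthreads();

  // Step 3: warp 0 reduces the per-warp partial sums.
  unsigned numWarps = blockDim.x / warpSize; // Assumes blockDim.x % 32 == 0.
  if (warp == 0) {
    val = (lane < numWarps) ? warpResults[lane] : 0;
    val = warpReduceSum(val);
  }

  // Step 4: the fully reduced value is available in thread 0 of the block.
  return val;
}

__global__ void sumKernel(const int *in, int *out) {
  int total = blockReduceSum(in[threadIdx.x]);
  if (threadIdx.x == 0)
    *out = total;
}

int main() {
  const int N = 256; // One block; a multiple of warpSize (32).
  int host[N];
  for (int i = 0; i < N; ++i)
    host[i] = 1;

  int *dIn = nullptr, *dOut = nullptr;
  cudaMalloc(&dIn, N * sizeof(int));
  cudaMalloc(&dOut, sizeof(int));
  cudaMemcpy(dIn, host, N * sizeof(int), cudaMemcpyHostToDevice);

  sumKernel<<<1, N>>>(dIn, dOut);

  int result = 0;
  cudaMemcpy(&result, dOut, sizeof(int), cudaMemcpyDeviceToHost);
  printf("sum: %d\n", result); // Expect 256 for the all-ones input.

  cudaFree(dIn);
  cudaFree(dOut);
  return 0;
}

Compiled with nvcc, this prints sum: 256. The restored runtime path does the same dance, but the irregular helpers additionally cover partial and dispersed warps, which is why the pre-Volta branch needs gpu_irregular_simd_reduce for non-contiguous active lanes.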