[OpenMP] Provide a specialized team reduction for the common case (#70766)

We default to < 1024 teams if the user did not specify otherwise. As such,
we can avoid the extra logic in the teams reduction that handles more than
num_of_records (default 1024) teams. This is a stopgap, but it still shaves
off 33% of the runtime in some simple reduction examples.
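
For reference, a minimal sketch of the kind of simple reduction example this
refers to (hypothetical, not taken from the commit or its benchmarks): with no
explicit num_teams clause the runtime defaults to fewer than 1024 teams, so
the teams reduction dispatches to the specialized _v3 entry point added below.

// Hypothetical example, not part of this commit: a simple target teams
// reduction. Without a num_teams clause the default team count stays below
// 1024, so __kmpc_nvptx_teams_reduce_nowait_v2 takes the new
// __kmpc_nvptx_teams_reduce_nowait_v3 fast path.
#include <cstdio>

int main() {
  const int N = 1 << 20;
  double Sum = 0.0;
#pragma omp target teams distribute parallel for reduction(+ : Sum)
  for (int I = 0; I < N; ++I)
    Sum += 1.0;
  std::printf("Sum = %f\n", Sum); // expect 1048576.0
  return 0;
}

Built with an offloading compiler (e.g. clang++ -fopenmp
-fopenmp-targets=nvptx64-nvidia-cuda), this exercises the DeviceRTL teams
reduction path touched by this change.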
jdoerfert committed Nov 2, 2023
1 parent 66152f4 commit eab828d
Showing 1 changed file with 98 additions and 0 deletions.
98 changes: 98 additions & 0 deletions openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -178,11 +178,109 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
                                      false);
}

/// Mostly like _v2 but with the builtin assumption that we have less than
/// num_of_records (by default 1024) teams.
int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
    IdentTy *Loc, int32_t TId, void *__restrict__ GlobalBuffer,
    uint32_t num_of_records, void *reduce_data, ShuffleReductFnTy shflFct,
    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
  // Terminate all threads in non-SPMD mode except for the main thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team main participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams greater than the current upper
  // limit. We always only allow a number of teams less or equal
  // to the number of slots in the buffer.
  bool IsMain = (ThreadId == 0);

  if (IsMain) {
    lgcpyFct(GlobalBuffer, TeamId, reduce_data);

    // Propagate the memory writes above to the world.
    fence::kernel(atomic::release);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // BUFFER_SIZE chunk.
    ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
                                 atomic::MemScopeTy::device);
  }

  // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
  // state machine.
  if (mapping::isSPMDMode())
    synchronize::threadsAligned(atomic::acq_rel);

  // Each thread will have a local struct containing the values to be
  // reduced:
  //    1. do reduction within each warp.
  //    2. do reduction across warps.
  //    3. write the final result to the main reduction variable
  //       by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  if (ChunkTeamCount != NumTeams - 1)
    return 0;

  // Last team processing.
  NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
  if (ThreadId >= NumThreads)
    return 0;

  // Ensure we see the global memory writes by other teams
  fence::kernel(atomic::aquire);

  // Load from buffer and reduce.
  glcpyFct(GlobalBuffer, ThreadId, reduce_data);
  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
    glredFct(GlobalBuffer, i, reduce_data);

  // Reduce across warps to the warp main.
  gpu_regular_warp_reduce(reduce_data, shflFct);

  uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
  uint32_t WarpsNeeded =
      (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  // Gather all the reduced values from each warp
  // to the first warp.
  cpyFct(reduce_data, WarpsNeeded);

  if (mapping::getWarpIdInBlock() == 0)
    gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);

  return IsMain;
}

int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
    ListGlobalFnTy glredFct) {
  // The first check is a compile time constant, the second one a runtime check.
  // If the first one succeeds we will use the specialized version.
  if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
       state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
       num_of_records == 1024) ||
      (omp_get_num_teams() <= num_of_records))
    return __kmpc_nvptx_teams_reduce_nowait_v3(
        Loc, TId, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
        lgcpyFct, lgredFct, glcpyFct, glredFct);

  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
