Skip to content

Commit

Permalink
[OPENMP][NVPTX]Fix parallel level counter in non-SPMD mode.
Browse files Browse the repository at this point in the history
Summary:
In non-SPMD mode we may end up with the divergent threads when trying to
increment/decrement parallel level counter. It may lead to incorrect
calculations of the parallel level and wrong results when threads are
divergent. We need to reconverge the threads before trying to modify the
parallel level counter.

Reviewers: grokos, jdoerfert

Subscribers: guansong, openmp-commits, caomhin, kkwli0

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D66802

llvm-svn: 370803
  • Loading branch information
alexey-bataev committed Sep 3, 2019
1 parent b187eef commit 4812941
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 16 deletions.
26 changes: 22 additions & 4 deletions openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,16 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
(int)newTaskDescr->ThreadId(), (int)nThreads);

isActive = true;
IncParallelLevel(threadsInTeam != 1);
// Reconverge the threads at the end of the parallel region to correctly
// handle parallel levels.
// In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
// warp. If only 1 thread is active, not need to reconverge the threads.
// If we have the whole warp, reconverge all the threads in the warp before
// actually trying to change the parallel level. Otherwise, parallel level
// can be changed incorrectly because of threads divergence.
bool IsActiveParallelRegion = threadsInTeam != 1;
IncParallelLevel(IsActiveParallelRegion,
IsActiveParallelRegion ? 0xFFFFFFFF : 1u);
}

return isActive;
Expand All @@ -329,7 +338,16 @@ EXTERN void __kmpc_kernel_end_parallel() {
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr());

DecParallelLevel(threadsInTeam != 1);
// Reconverge the threads at the end of the parallel region to correctly
// handle parallel levels.
// In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
// warp. If only 1 thread is active, not need to reconverge the threads.
// If we have the whole warp, reconverge all the threads in the warp before
// actually trying to change the parallel level. Otherwise, parallel level can
// be changed incorrectly because of threads divergence.
bool IsActiveParallelRegion = threadsInTeam != 1;
DecParallelLevel(IsActiveParallelRegion,
IsActiveParallelRegion ? 0xFFFFFFFF : 1u);
}

////////////////////////////////////////////////////////////////////////////////
Expand All @@ -339,7 +357,7 @@ EXTERN void __kmpc_kernel_end_parallel() {
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");

IncParallelLevel(/*ActiveParallel=*/false);
IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
Expand Down Expand Up @@ -378,7 +396,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");

DecParallelLevel(/*ActiveParallel=*/false);
DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
Expand Down
5 changes: 3 additions & 2 deletions openmp/libomptarget/deviceRTLs/nvptx/src/support.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//

#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -65,8 +66,8 @@ INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
INLINE int IsTeamMaster(int ompThreadId);

// Parallel level
INLINE void IncParallelLevel(bool ActiveParallel);
INLINE void DecParallelLevel(bool ActiveParallel);
INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);

////////////////////////////////////////////////////////////////////////////////
// Memory
Expand Down
18 changes: 8 additions & 10 deletions openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,30 +203,28 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
////////////////////////////////////////////////////////////////////////////////
// Parallel level

INLINE void IncParallelLevel(bool ActiveParallel) {
__kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
__kmpc_impl_syncwarp(Active);
INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
__kmpc_impl_syncwarp(Mask);
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] +=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__threadfence();
}
__kmpc_impl_syncwarp(Active);
__kmpc_impl_syncwarp(Mask);
}

INLINE void DecParallelLevel(bool ActiveParallel) {
__kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
__kmpc_impl_syncwarp(Active);
INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
__kmpc_impl_syncwarp(Mask);
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] -=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__threadfence();
}
__kmpc_impl_syncwarp(Active);
__kmpc_impl_syncwarp(Mask);
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
12 changes: 12 additions & 0 deletions openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
Original file line number Diff line number Diff line change
Expand Up @@ -135,5 +135,17 @@ int main(int argc, char *argv[]) {
}
}

// Check for paraller level in non-SPMD kernels.
level = 0;
#pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level)
for (int i=0; i<5032; i+=32) {
int ub = (i+32 > 5032) ? 5032 : i+32;
#pragma omp parallel for schedule(dynamic)
for (int j=i ; j < ub; j++) ;
level += omp_get_level();
}
// CHECK: Integral level = 0.
printf("Integral level = %d.\n", level);

return 0;
}

0 comments on commit 4812941

Please sign in to comment.