Skip to content

Commit

Permalink
[OPENMP][NVPTX]Improve code by using parallel level counter.
Browse files Browse the repository at this point in the history
Summary:
Previously, for various purposes we needed to get the active/common
parallel level, and with the full runtime we iterated over all the records
to calculate this level. Instead, we can use the warp-based parallel level
counters already used in no-runtime mode.

Reviewers: grokos, gtbercea, kkwli0

Subscribers: guansong, jfb, jdoerfert, caomhin, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D61395

llvm-svn: 359822
  • Loading branch information
alexey-bataev committed May 2, 2019
1 parent 88a0f13 commit 8ccb8f8
Show file tree
Hide file tree
Showing 10 changed files with 200 additions and 197 deletions.
55 changes: 5 additions & 50 deletions openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
Expand Up @@ -47,8 +47,7 @@ EXTERN void omp_set_num_threads(int num) {
EXTERN int omp_get_num_threads(void) {
bool isSPMDExecutionMode = isSPMDMode();
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
int rc =
GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
int rc = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
return rc;
}
Expand Down Expand Up @@ -83,7 +82,7 @@ EXTERN int omp_get_thread_limit(void) {
EXTERN int omp_get_thread_num() {
bool isSPMDExecutionMode = isSPMDMode();
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
int rc = GetOmpThreadId(tid, isSPMDExecutionMode, isRuntimeUninitialized());
int rc = GetOmpThreadId(tid, isSPMDExecutionMode);
PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
return rc;
}
Expand All @@ -95,18 +94,7 @@ EXTERN int omp_get_num_procs(void) {
}

EXTERN int omp_in_parallel(void) {
int rc = 0;
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
"Expected SPMD mode only with uninitialized runtime.");
rc = 1; // SPMD mode is always in parallel.
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(isSPMDMode());
if (currTaskDescr->InParallelRegion()) {
rc = 1;
}
}
int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
return rc;
}
Expand Down Expand Up @@ -155,46 +143,13 @@ EXTERN int omp_get_max_active_levels(void) {
}

EXTERN int omp_get_level(void) {
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
"Expected SPMD mode only with uninitialized runtime.");
// parallelLevel starts from 0, need to add 1 for correct level.
return parallelLevel[GetWarpId()] + 1;
}
int level = 0;
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(isSPMDMode());
ASSERT0(LT_FUSSY, currTaskDescr,
"do not expect fct to be called in a non-active thread");
do {
if (currTaskDescr->IsParallelConstruct()) {
level++;
}
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
} while (currTaskDescr);
int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
return level;
}

EXTERN int omp_get_active_level(void) {
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
"Expected SPMD mode only with uninitialized runtime.");
return 1;
}
int level = 0; // no active level parallelism
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(isSPMDMode());
ASSERT0(LT_FUSSY, currTaskDescr,
"do not expect fct to be called in a non-active thread");
do {
if (currTaskDescr->ThreadsInTeam() > 1) {
// has a parallel with more than one thread in team
level = 1;
break;
}
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
} while (currTaskDescr);
int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level)
return level;
}
Expand Down
51 changes: 23 additions & 28 deletions openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
Expand Up @@ -95,17 +95,16 @@ public:
INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
int32_t *plastiter, T *plower, T *pupper,
ST *pstride, ST chunk,
bool IsSPMDExecutionMode,
bool IsRuntimeUninitialized) {
bool IsSPMDExecutionMode) {
// When IsRuntimeUninitialized is true, we assume that the caller is
// in an L0 parallel region and that all worker threads participate.

int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode);

// Assume we are in teams region or that we use a single block
// per target region
ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(
tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
ST numberOfActiveOMPThreads =
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode);

// All warps that are in excess of the maximum requested, do
// not execute the loop
Expand Down Expand Up @@ -456,9 +455,7 @@ public:

// automatically selects thread or warp ID based on selected implementation
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
ASSERT0(LT_FUSSY,
gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
checkRuntimeUninitialized(loc)),
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc)),
"current thread is not needed here; error");
// retrieve schedule
kmp_sched_t schedule =
Expand Down Expand Up @@ -509,13 +506,12 @@ public:
*pupper = myUb;
*pstride = 1;

PRINT(
LD_LOOP,
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
"last %d\n",
(int)GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
(int)GetNumberOfWorkersInTeam(), (long long)*plower, (long long)*pupper,
(long long)*pstride, (int)*plast);
PRINT(LD_LOOP,
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
"last %d\n",
(int)GetNumberOfOmpThreads(tid, isSPMDMode()),
(int)GetNumberOfWorkersInTeam(), (long long)*plower,
(long long)*pupper, (long long)*pstride, (int)*plast);
return DISPATCH_NOTFINISHED;
}

Expand Down Expand Up @@ -629,7 +625,7 @@ EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
checkSPMDMode(loc));
}

EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
Expand All @@ -640,7 +636,7 @@ EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
checkSPMDMode(loc));
}

EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
Expand All @@ -651,7 +647,7 @@ EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
checkSPMDMode(loc));
}

EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
Expand All @@ -662,7 +658,7 @@ EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
checkSPMDMode(loc));
}

EXTERN
Expand All @@ -674,7 +670,7 @@ void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
/*IsSPMDExecutionMode=*/true);
}

EXTERN
Expand All @@ -686,7 +682,7 @@ void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
/*IsSPMDExecutionMode=*/true);
}

EXTERN
Expand All @@ -698,7 +694,7 @@ void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
/*IsSPMDExecutionMode=*/true);
}

EXTERN
Expand All @@ -710,7 +706,7 @@ void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
/*IsSPMDExecutionMode=*/true);
}

EXTERN
Expand All @@ -721,7 +717,7 @@ void __kmpc_for_static_init_4_simple_generic(
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
/*IsSPMDExecutionMode=*/false);
}

EXTERN
Expand All @@ -732,7 +728,7 @@ void __kmpc_for_static_init_4u_simple_generic(
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
/*IsSPMDExecutionMode=*/false);
}

EXTERN
Expand All @@ -743,7 +739,7 @@ void __kmpc_for_static_init_8_simple_generic(
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
/*IsSPMDExecutionMode=*/false);
}

EXTERN
Expand All @@ -754,7 +750,7 @@ void __kmpc_for_static_init_8u_simple_generic(
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
/*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
/*IsSPMDExecutionMode=*/false);
}

EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
Expand Down Expand Up @@ -787,8 +783,7 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,

omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
checkRuntimeUninitialized(loc));
uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc));
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
for (unsigned i = 0; i < varNum; i++) {
// Reset buffer.
Expand Down
30 changes: 16 additions & 14 deletions openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
Expand Up @@ -43,6 +43,8 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
ASSERT0(LT_FUSSY, RequiresOMPRuntime,
"Generic always requires initialized runtime.");
setExecutionParameters(Generic, RuntimeInitialized);
for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I)
parallelLevel[I] = 0;

int threadIdInBlock = GetThreadIdInBlock();
ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
Expand Down Expand Up @@ -91,32 +93,32 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
int16_t RequiresDataSharing) {
PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");

setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized
: RuntimeUninitialized);
int threadId = GetThreadIdInBlock();
if (threadId == 0) {
usedSlotIdx = smid() % MAX_SM;
parallelLevel[0] =
1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
} else if (GetLaneId() == 0) {
parallelLevel[GetWarpId()] =
1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
}
if (!RequiresOMPRuntime) {
// If OMP runtime is not required don't initialize OMP state.
setExecutionParameters(Spmd, RuntimeUninitialized);
if (GetThreadIdInBlock() == 0) {
usedSlotIdx = smid() % MAX_SM;
parallelLevel[0] = 0;
} else if (GetLaneId() == 0) {
parallelLevel[GetWarpId()] = 0;
}
// Runtime is not required - exit.
__SYNCTHREADS();
return;
}
setExecutionParameters(Spmd, RuntimeInitialized);

//
// Team Context Initialization.
//
// In SPMD mode there is no master thread so use any cuda thread for team
// context initialization.
int threadId = GetThreadIdInBlock();
if (threadId == 0) {
// Get a state object from the queue.
int slot = smid() % MAX_SM;
usedSlotIdx = slot;
omptarget_nvptx_threadPrivateContext =
omptarget_nvptx_device_State[slot].Dequeue();
omptarget_nvptx_device_State[usedSlotIdx].Dequeue();

omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
Expand Down Expand Up @@ -148,7 +150,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
"%d threads\n",
(int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam());

if (RequiresDataSharing && threadId % WARPSIZE == 0) {
if (RequiresDataSharing && GetLaneId() == 0) {
// Warp master initializes data sharing environment.
unsigned WID = threadId / WARPSIZE;
__kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(
Expand Down
2 changes: 2 additions & 0 deletions openmp/libomptarget/deviceRTLs/nvptx/src/option.h
Expand Up @@ -44,6 +44,8 @@
#define MAX_SM 16
#endif

#define OMP_ACTIVE_PARALLEL_LEVEL 128

////////////////////////////////////////////////////////////////////////////////
// algo options
////////////////////////////////////////////////////////////////////////////////
Expand Down

0 comments on commit 8ccb8f8

Please sign in to comment.