Skip to content

Commit

Permalink
[OpenMP][libomptarget] Refactor SPMD and runtime requirement checking
Browse files Browse the repository at this point in the history
Summary: Refactor the checking for SPMD mode and whether the runtime is initialized or not. This uses constant flags which enables the runtime to optimize out unused sections of code that depend on these flags.

Reviewers: ABataev, caomhin

Reviewed By: ABataev

Subscribers: guansong, jfb, openmp-commits

Differential Revision: https://reviews.llvm.org/D54960

llvm-svn: 347698
  • Loading branch information
doru1004 committed Nov 27, 2018
1 parent e535bab commit ad8632a
Show file tree
Hide file tree
Showing 9 changed files with 262 additions and 171 deletions.
4 changes: 2 additions & 2 deletions openmp/libomptarget/deviceRTLs/nvptx/src/cancel.cu
Expand Up @@ -13,14 +13,14 @@

#include "omptarget-nvptx.h"

EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent *loc, int32_t global_tid,
EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal) {
PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", cancelVal);
// disabled
return FALSE;
}

EXTERN int32_t __kmpc_cancel(kmp_Indent *loc, int32_t global_tid,
EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal) {
PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", cancelVal);
// disabled
Expand Down
10 changes: 6 additions & 4 deletions openmp/libomptarget/deviceRTLs/nvptx/src/critical.cu
Expand Up @@ -15,14 +15,16 @@

#include "omptarget-nvptx.h"

EXTERN void __kmpc_critical(kmp_Indent *loc, int32_t global_tid,
kmp_CriticalName *lck) {
EXTERN
void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *lck) {
PRINT0(LD_IO, "call to kmpc_critical()\n");
omp_set_lock((omp_lock_t *)lck);
}

EXTERN void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid,
kmp_CriticalName *lck) {
EXTERN
void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
kmp_CriticalName *lck) {
PRINT0(LD_IO, "call to kmpc_end_critical()\n");
omp_unset_lock((omp_lock_t *)lck);
}
158 changes: 93 additions & 65 deletions openmp/libomptarget/deviceRTLs/nvptx/src/interface.h

Large diffs are not rendered by default.

107 changes: 58 additions & 49 deletions openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
Expand Up @@ -332,11 +332,11 @@ EXTERN void __kmpc_kernel_end_parallel() {
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");

if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
omptarget_nvptx_simpleThreadPrivateContext->IncParLevel();
return;
Expand Down Expand Up @@ -370,12 +370,12 @@ EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
newTaskDescr);
}

EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");

if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
omptarget_nvptx_simpleThreadPrivateContext->DecParLevel();
return;
Expand All @@ -393,11 +393,11 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
currTaskDescr->RestoreLoopData();
}

EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {
EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_parallel_level\n");

if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
}
Expand All @@ -417,42 +417,42 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {
// cached by the compiler and used when calling the runtime. On nvptx
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc) {
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
return GetLogicalThreadIdInBlock();
}

////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
int32_t num_threads) {
PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
num_threads;
}

EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid,
EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid,
int32_t simd_limit) {
PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
}

// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.

EXTERN void __kmpc_push_num_teams(kmp_Indent *loc, int32_t tid,
EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
int32_t num_teams, int32_t thread_limit) {
PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams);
ASSERT0(LT_FUSSY, FALSE,
"should never have anything with new teams on device");
}

EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t tid,
EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid,
int proc_bind) {
PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind);
}
9 changes: 5 additions & 4 deletions openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
Expand Up @@ -31,7 +31,7 @@ int32_t __gpu_block_reduce() {
}

EXTERN
int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars,
int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
void *reduce_array_size, kmp_ReductFctPtr *reductFct,
kmp_CriticalName *lck) {
Expand All @@ -40,7 +40,8 @@ int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars,
int numthread;
if (currTaskDescr->IsParallelConstruct()) {
numthread =
GetNumberOfOmpThreads(threadId, isSPMDMode(), isRuntimeUninitialized());
GetNumberOfOmpThreads(threadId, checkSPMDMode(loc),
checkRuntimeUninitialized(loc));
} else {
numthread = GetNumberOfOmpTeams();
}
Expand All @@ -55,12 +56,12 @@ int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars,
}

EXTERN
int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
int32_t __kmpc_reduce_combined(kmp_Ident *loc) {
return threadIdx.x == 0 ? 2 : 0;
}

EXTERN
int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
int32_t __kmpc_reduce_simd(kmp_Ident *loc) {
return (threadIdx.x % 32 == 0) ? 1 : 0;
}

Expand Down
55 changes: 53 additions & 2 deletions openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
Expand Up @@ -32,6 +32,59 @@ INLINE bool isRuntimeInitialized() {
return (execution_param & RuntimeMask) == RuntimeInitialized;
}

////////////////////////////////////////////////////////////////////////////////
// Execution Modes based on location parameter fields
////////////////////////////////////////////////////////////////////////////////

INLINE bool checkSPMDMode(kmp_Ident *loc) {
if (!loc)
return isSPMDMode();

// If SPMD is true then we are not in the UNDEFINED state so
// we can return immediately.
if (loc->reserved_2 & KMP_IDENT_SPMD_MODE)
return true;

// If not in SPMD mode and runtime required is a valid
// combination of flags so we can return immediately.
if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE))
return false;

// We are in underfined state.
return isSPMDMode();
}

INLINE bool checkGenericMode(kmp_Ident *loc) {
return !checkSPMDMode(loc);
}

INLINE bool checkRuntimeUninitialized(kmp_Ident *loc) {
if (!loc)
return isRuntimeUninitialized();

// If runtime is required then we know we can't be
// in the undefined mode. We can return immediately.
if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE))
return false;

// If runtime is required then we need to check is in
// SPMD mode or not. If not in SPMD mode then we end
// up in the UNDEFINED state that marks the orphaned
// functions.
if (loc->reserved_2 & KMP_IDENT_SPMD_MODE)
return true;

// Check if we are in an UNDEFINED state. Undefined is denoted by
// non-SPMD + noRuntimeRequired which is a combination that
// cannot actually happen. Undefined states is used to mark orphaned
// functions.
return isRuntimeUninitialized();
}

INLINE bool checkRuntimeInitialized(kmp_Ident *loc) {
return !checkRuntimeUninitialized(loc);
}

////////////////////////////////////////////////////////////////////////////////
// support: get info from machine
////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -78,8 +131,6 @@ INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
// id is GetMasterThreadID()) calls this routine, we return 0 because
// it is a shadow for the first worker.
INLINE int GetLogicalThreadIdInBlock() {
// return GetThreadIdInBlock() % GetMasterThreadID();

// Implemented using control flow (predication) instead of with a modulo
// operation.
int tid = GetThreadIdInBlock();
Expand Down
30 changes: 15 additions & 15 deletions openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
Expand Up @@ -17,11 +17,11 @@
// KMP Ordered calls
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t tid) {
EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_ordered\n");
}

EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) {
EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) {
PRINT0(LD_IO, "call kmpc_end_ordered\n");
}

Expand All @@ -33,26 +33,26 @@ EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) {
// FIXME: what if not all threads (warps) participate to the barrier?
// We may need to implement it differently

EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) {
PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
__kmpc_barrier(loc_ref, tid);
PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
return 0;
}

EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
if (isRuntimeUninitialized()) {
ASSERT0(LT_FUSSY, isSPMDMode(),
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
if (checkRuntimeUninitialized(loc_ref)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc_ref),
"Expected SPMD mode with uninitialized runtime.");
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
tid, isSPMDMode(), /*isRuntimeUninitialized=*/false);
tid, checkSPMDMode(loc_ref), /*isRuntimeUninitialized=*/false);
if (numberOfActiveOMPThreads > 1) {
if (isSPMDMode()) {
if (checkSPMDMode(loc_ref)) {
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
// The #threads parameter must be rounded up to the WARPSIZE.
Expand All @@ -72,15 +72,15 @@ EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {

// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0
// parallel region and that all worker threads participate.
EXTERN void __kmpc_barrier_simple_spmd(kmp_Indent *loc_ref, int32_t tid) {
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
__syncthreads();
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
}

// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0
// parallel region and that all worker threads participate.
EXTERN void __kmpc_barrier_simple_generic(kmp_Indent *loc_ref, int32_t tid) {
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE;
// The #threads parameter must be rounded up to the WARPSIZE.
int threads =
Expand All @@ -106,12 +106,12 @@ INLINE int32_t IsMaster() {
return IsTeamMaster(ompThreadId);
}

EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid) {
EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_master\n");
return IsMaster();
}

EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid) {
EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_end_master\n");
ASSERT0(LT_FUSSY, IsMaster(), "expected only master here");
}
Expand All @@ -120,13 +120,13 @@ EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid) {
// KMP SINGLE
////////////////////////////////////////////////////////////////////////////////

EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid) {
EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_single\n");
// decide to implement single with master; master get the single
return IsMaster();
}

EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid) {
EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_end_single\n");
// decide to implement single with master: master get the single
ASSERT0(LT_FUSSY, IsMaster(), "expected only master here");
Expand All @@ -137,7 +137,7 @@ EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid) {
// Flush
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_flush(kmp_Indent *loc) {
EXTERN void __kmpc_flush(kmp_Ident *loc) {
PRINT0(LD_IO, "call kmpc_flush\n");
__threadfence_block();
}
Expand Down

0 comments on commit ad8632a

Please sign in to comment.