Skip to content

Commit

Permalink
[OPENMP][NVPTX]Improve thread limit counter, NFC.
Browse files Browse the repository at this point in the history
Summary:
Patch improves performance of the full runtime mode by moving
thread-limit counter to the shared memory. It also allows to save
global memory.

Reviewers: grokos, gtbercea, kkwli0

Subscribers: guansong, jdoerfert, caomhin, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D61526

llvm-svn: 359922
  • Loading branch information
alexey-bataev committed May 3, 2019
1 parent e5cbe78 commit a857e31
Show file tree
Hide file tree
Showing 5 changed files with 6 additions and 12 deletions.
5 changes: 1 addition & 4 deletions openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
Expand Up @@ -70,10 +70,7 @@ EXTERN int omp_get_max_threads(void) {
EXTERN int omp_get_thread_limit(void) {
if (isSPMDMode())
return GetNumberOfThreadsInBlock();
// per contention group.. meaning threads in current team
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
int rc = currTaskDescr->ThreadLimit();
int rc = threadLimit;
PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
return rc;
}
Expand Down
2 changes: 1 addition & 1 deletion openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
Expand Up @@ -32,7 +32,7 @@ __device__ __shared__ uint32_t usedMemIdx;
__device__ __shared__ uint32_t usedSlotIdx;

__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];

__device__ __shared__ uint16_t threadLimit;
// Pointer to this team's OpenMP state object
__device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
Expand Down
3 changes: 1 addition & 2 deletions openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
Expand Up @@ -74,7 +74,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
currTaskDescr->NThreads() = GetNumberOfWorkersInTeam();
currTaskDescr->ThreadLimit() = ThreadLimit;
threadLimit = ThreadLimit;
}

EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
Expand Down Expand Up @@ -139,7 +139,6 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
newTaskDescr->InitLevelOneTaskDescr(ThreadLimit,
currTeamDescr.LevelZeroTaskDescr());
newTaskDescr->ThreadLimit() = ThreadLimit;
// install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
Expand Down
3 changes: 1 addition & 2 deletions openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
Expand Up @@ -165,7 +165,6 @@ class omptarget_nvptx_TaskDescr {
INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
// methods for other fields
INLINE uint16_t &NThreads() { return items.nthreads; }
INLINE uint16_t &ThreadLimit() { return items.threadlimit; }
INLINE uint16_t &ThreadId() { return items.threadId; }
INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
Expand Down Expand Up @@ -213,7 +212,6 @@ class omptarget_nvptx_TaskDescr {
uint8_t flags; // 6 bit used (see flag above)
uint8_t unused;
uint16_t nthreads; // thread num for subsequent parallel regions
uint16_t threadlimit; // thread limit ICV
uint16_t threadId; // thread id
uint16_t threadsInTeam; // threads in current team
uint64_t runtimeChunkSize; // runtime chunk size
Expand Down Expand Up @@ -408,6 +406,7 @@ extern __device__ __shared__ uint32_t usedMemIdx;
extern __device__ __shared__ uint32_t usedSlotIdx;
extern __device__ __shared__ uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern __device__ __shared__ uint16_t threadLimit;
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

Expand Down
5 changes: 2 additions & 3 deletions openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
Expand Up @@ -249,9 +249,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
uint16_t &NumThreadsClause =
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);

uint16_t NumThreads =
determineNumberOfThreads(NumThreadsClause, currTaskDescr->NThreads(),
currTaskDescr->ThreadLimit());
uint16_t NumThreads = determineNumberOfThreads(
NumThreadsClause, currTaskDescr->NThreads(), threadLimit);

if (NumThreadsClause != 0) {
// Reset request to avoid propagating to successive #parallel
Expand Down

0 comments on commit a857e31

Please sign in to comment.