[libomptarget-nvptx] Fix number of threads in parallel
If there is no num_threads() clause we must consider the
nthreads-var ICV. Its value is set by omp_set_num_threads()
and can be queried using omp_get_max_threads().
The rewritten code now closely resembles the algorithm given
in the OpenMP standard.

Differential Revision: https://reviews.llvm.org/D51783

llvm-svn: 343380
hahnjo committed Sep 29, 2018
1 parent 54d31ef commit a743c04
Showing 3 changed files with 147 additions and 84 deletions.
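For context, here is a minimal host/device sketch of the behavior this commit implements; it is not part of the patch. It assumes an OpenMP compiler with NVPTX offloading support and uses only standard OpenMP API calls: without a num_threads() clause, a parallel region inside the target region should honor the nthreads-var ICV set by omp_set_num_threads(), and omp_get_max_threads() should report that value.

#include <omp.h>
#include <stdio.h>

int main(void) {
  int icv = -1, with_clause = -1, without_clause = -1;

#pragma omp target map(from: icv, with_clause, without_clause)
  {
    omp_set_num_threads(64);     // sets the nthreads-var ICV on the device
    icv = omp_get_max_threads(); // expected to report 64 with this patch

#pragma omp parallel num_threads(32) // clause overrides the ICV for this region
    {
#pragma omp master
      with_clause = omp_get_num_threads(); // expected 32
    }

#pragma omp parallel // no clause: the nthreads-var ICV applies
    {
#pragma omp master
      without_clause = omp_get_num_threads(); // expected 64
    }
  }

  printf("icv=%d with_clause=%d without_clause=%d\n", icv, with_clause,
         without_clause);
  return 0;
}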
4 changes: 2 additions & 2 deletions openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -61,8 +61,8 @@ EXTERN int omp_get_max_threads(void) {
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
   int rc = 1; // default is 1 thread avail
   if (!currTaskDescr->InParallelRegion()) {
-    // not currently in a parallel region... all are available
-    rc = GetNumberOfProcsInTeam();
+    // Not currently in a parallel region, return what was set.
+    rc = currTaskDescr->NThreads();
     ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
   }
   PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
125 changes: 43 additions & 82 deletions openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -193,25 +193,38 @@ EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
 // support for parallel that goes parallel (1 static level only)
 ////////////////////////////////////////////////////////////////////////////////

-// return number of cuda threads that participate to parallel
-// calculation has to consider simd implementation in nvptx
-// i.e. (num omp threads * num lanes)
-//
-// cudathreads =
-//    if(num_threads != 0) {
-//      if(thread_limit > 0) {
-//        min (num_threads*numLanes ; thread_limit*numLanes);
-//      } else {
-//        min (num_threads*numLanes; blockDim.x)
-//      }
-//    } else {
-//      if (thread_limit != 0) {
-//        min (thread_limit*numLanes; blockDim.x)
-//      } else { // no thread_limit, no num_threads, use all cuda threads
-//        blockDim.x;
-//      }
-//    }
-//
+static INLINE uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
+                                                uint16_t NThreadsICV,
+                                                uint16_t ThreadLimit) {
+  uint16_t ThreadsRequested = NThreadsICV;
+  if (NumThreadsClause != 0) {
+    ThreadsRequested = NumThreadsClause;
+  }
+
+  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
+  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
+    ThreadsAvailable = ThreadLimit;
+  }
+
+  uint16_t NumThreads = ThreadsAvailable;
+  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
+    NumThreads = ThreadsRequested;
+  }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  // On Volta and newer architectures we require that all lanes in
+  // a warp participate in the parallel region. Round down to a
+  // multiple of WARPSIZE since it is legal to do so in OpenMP.
+  if (NumThreads < WARPSIZE) {
+    NumThreads = 1;
+  } else {
+    NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
+  }
+#endif
+
+  return NumThreads;
+}
+
 // This routine is always called by the team master..
 EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                            int16_t IsOMPRuntimeInitialized) {
@@ -234,78 +247,26 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
     return;
   }

-  uint16_t CudaThreadsForParallel = 0;
-  uint16_t NumThreadsClause =
+  uint16_t &NumThreadsClause =
       omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);

-  // we cannot have more than block size
-  uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam();
-
-  // currTaskDescr->ThreadLimit(): If non-zero, this is the limit as
-  // specified by the thread_limit clause on the target directive.
-  // GetNumberOfWorkersInTeam(): This is the number of workers available
-  // in this kernel instance.
-  //
-  // E.g: If thread_limit is 33, the kernel is launched with 33+32=65
-  // threads. The last warp is the master warp so in this case
-  // GetNumberOfWorkersInTeam() returns 64.
-
-  // this is different from ThreadAvail of OpenMP because we may be
-  // using some of the CUDA threads as SIMD lanes
-  int NumLanes = 1;
-  if (NumThreadsClause != 0) {
-    // reset request to avoid propagating to successive #parallel
-    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
-        0;
-
-    // assume that thread_limit*numlanes is already <= CudaThreadsAvail
-    // because that is already checked on the host side (CUDA offloading rtl)
-    if (currTaskDescr->ThreadLimit() != 0)
-      CudaThreadsForParallel =
-          NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes
-              ? NumThreadsClause * NumLanes
-              : currTaskDescr->ThreadLimit() * NumLanes;
-    else {
-      CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail)
-                                   ? CudaThreadsAvail
-                                   : NumThreadsClause * NumLanes;
-    }
-  } else {
-    if (currTaskDescr->ThreadLimit() != 0) {
-      CudaThreadsForParallel =
-          (currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail)
-              ? CudaThreadsAvail
-              : currTaskDescr->ThreadLimit() * NumLanes;
-    } else
-      CudaThreadsForParallel = CudaThreadsAvail;
-  }
+  uint16_t NumThreads =
+      determineNumberOfThreads(NumThreadsClause, currTaskDescr->NThreads(),
+                               currTaskDescr->ThreadLimit());

-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  // On Volta and newer architectures we require that all lanes in
-  // a warp participate in the parallel region. Round down to a
-  // multiple of WARPSIZE since it is legal to do so in OpenMP.
-  // CudaThreadsAvail is the number of workers available in this
-  // kernel instance and is greater than or equal to
-  // currTaskDescr->ThreadLimit().
-  if (CudaThreadsForParallel < CudaThreadsAvail) {
-    CudaThreadsForParallel =
-        (CudaThreadsForParallel < WARPSIZE)
-            ? 1
-            : CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1);
+  if (NumThreadsClause != 0) {
+    // Reset request to avoid propagating to successive #parallel
+    NumThreadsClause = 0;
   }
-#endif

-  ASSERT(LT_FUSSY, CudaThreadsForParallel > 0,
-         "bad thread request of %d threads", CudaThreadsForParallel);
+  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
+         NumThreads);
   ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
           "only team master can create parallel");

-  // set number of threads on work descriptor
-  // this is different from the number of cuda threads required for the parallel
-  // region
+  // Set number of threads on work descriptor.
   omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
-  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr,
-                                             CudaThreadsForParallel / NumLanes);
+  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, NumThreads);
 }

 // All workers call this function. Deactivate those not needed.
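To make the new helper's behavior concrete, here is a small host-only mirror of determineNumberOfThreads() with a few worked inputs. This is an illustration, not part of the patch: WARPSIZE is taken as 32, the workers-per-team value of 992 is hypothetical, and the sm_70 warp rounding is modeled with a plain flag instead of __CUDA_ARCH__.

#include <stdint.h>
#include <stdio.h>

#define WARPSIZE 32

static uint16_t determine_num_threads(uint16_t NumThreadsClause,
                                      uint16_t NThreadsICV, uint16_t ThreadLimit,
                                      uint16_t ThreadsAvailable, int RoundToWarp) {
  // A num_threads() clause, if present, takes precedence over the ICV.
  uint16_t ThreadsRequested = NThreadsICV;
  if (NumThreadsClause != 0)
    ThreadsRequested = NumThreadsClause;

  // thread_limit() caps the number of workers in the team.
  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable)
    ThreadsAvailable = ThreadLimit;

  uint16_t NumThreads = ThreadsAvailable;
  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads)
    NumThreads = ThreadsRequested;

  // Models the __CUDA_ARCH__ >= 700 case: round down to a full warp.
  if (RoundToWarp)
    NumThreads = NumThreads < WARPSIZE
                     ? 1
                     : (uint16_t)(NumThreads & ~(WARPSIZE - 1));
  return NumThreads;
}

int main(void) {
  // num_threads(33) on sm_70 is rounded down to one full warp of 32.
  printf("%d\n", determine_num_threads(33, 0, 0, 992, 1)); // 32
  // No clause, omp_set_num_threads(64): the nthreads-var ICV wins.
  printf("%d\n", determine_num_threads(0, 64, 0, 992, 1)); // 64
  // thread_limit(48) caps the ICV request of 64, then rounding gives 32.
  printf("%d\n", determine_num_threads(0, 64, 48, 992, 1)); // 32
  return 0;
}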
102 changes: 102 additions & 0 deletions openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c
@@ -0,0 +1,102 @@
// RUN: %compile-run-and-check

#include <stdio.h>
#include <omp.h>

const int WarpSize = 32;
const int NumThreads1 = 1 * WarpSize;
const int NumThreads2 = 2 * WarpSize;
const int NumThreads3 = 3 * WarpSize;
const int MaxThreads = 1024;

int main(int argc, char *argv[]) {
  int check1[MaxThreads];
  int check2[MaxThreads];
  int check3[MaxThreads];
  int check4[MaxThreads];
  for (int i = 0; i < MaxThreads; i++) {
    check1[i] = check2[i] = check3[i] = check4[i] = 0;
  }

  int maxThreads1 = -1;
  int maxThreads2 = -1;
  int maxThreads3 = -1;

  #pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \
                     map(maxThreads1, maxThreads2, maxThreads3)
  {
    #pragma omp parallel num_threads(NumThreads1)
    {
      check1[omp_get_thread_num()] += omp_get_num_threads();
    }

    // API method to set number of threads in parallel regions without
    // num_threads() clause.
    omp_set_num_threads(NumThreads2);
    maxThreads1 = omp_get_max_threads();
    #pragma omp parallel
    {
      check2[omp_get_thread_num()] += omp_get_num_threads();
    }

    maxThreads2 = omp_get_max_threads();

    // num_threads() clause should override nthreads-var ICV.
    #pragma omp parallel num_threads(NumThreads3)
    {
      check3[omp_get_thread_num()] += omp_get_num_threads();
    }

    maxThreads3 = omp_get_max_threads();

    // Effect from omp_set_num_threads() should still be visible.
    #pragma omp parallel
    {
      check4[omp_get_thread_num()] += omp_get_num_threads();
    }
  }

  // CHECK: maxThreads1 = 64
  printf("maxThreads1 = %d\n", maxThreads1);
  // CHECK: maxThreads2 = 64
  printf("maxThreads2 = %d\n", maxThreads2);
  // CHECK: maxThreads3 = 64
  printf("maxThreads3 = %d\n", maxThreads3);

  // CHECK-NOT: invalid
  for (int i = 0; i < MaxThreads; i++) {
    if (i < NumThreads1) {
      if (check1[i] != NumThreads1) {
        printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]);
      }
    } else if (check1[i] != 0) {
      printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
    }

    if (i < NumThreads2) {
      if (check2[i] != NumThreads2) {
        printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]);
      }
    } else if (check2[i] != 0) {
      printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
    }

    if (i < NumThreads3) {
      if (check3[i] != NumThreads3) {
        printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]);
      }
    } else if (check3[i] != 0) {
      printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
    }

    if (i < NumThreads2) {
      if (check4[i] != NumThreads2) {
        printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]);
      }
    } else if (check4[i] != 0) {
      printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
    }
  }

  return 0;
}
