[OpenMP][CUDA] Cache the maximal number of threads per block (per kernel)

Instead of calling `cuFuncGetAttribute` with
`CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK` for every kernel invocation,
we can do it for the first one and cache the result as part of the
`KernelInfo` struct. The only functional change is that we now expect
`cuFuncGetAttribute` to succeed and otherwise propagate the error.
Ignoring any error seems like a slippery slope...

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D86038
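For reference, a minimal standalone sketch of the lazy-caching pattern the commit describes, under the assumption of a simplified `KernelInfo` struct and a helper `getMaxThreadsPerBlock` (both illustrative names, not the plugin's actual `KernelTy`/`runTargetTeamRegion` code): query the attribute on first use, keep the result, and propagate a driver error to the caller instead of ignoring it.

```cpp
// Sketch only: cache cuFuncGetAttribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
// per kernel so the driver is queried once, not on every launch.
#include <cuda.h>

struct KernelInfo {
  CUfunction Func;
  int MaxThreadsPerBlock = 0; // 0 means "not queried yet".
};

// Returns false if the driver call fails, so the caller can abort the launch.
static bool getMaxThreadsPerBlock(KernelInfo &KI, int &Limit) {
  if (KI.MaxThreadsPerBlock == 0) {
    CUresult Err =
        cuFuncGetAttribute(&KI.MaxThreadsPerBlock,
                           CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KI.Func);
    if (Err != CUDA_SUCCESS)
      return false; // Propagate the error instead of ignoring it.
  }
  Limit = KI.MaxThreadsPerBlock;
  return true;
}
```

In the patch itself the same check happens inside `runTargetTeamRegion`, where the cached value is then used to clamp `CudaThreadsPerBlock` before the launch.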
jdoerfert committed Aug 16, 2020
1 parent 95a25e4 commit aa27cfc
Showing 1 changed file with 20 additions and 14 deletions.
openmp/libomptarget/plugins/cuda/src/rtl.cpp (20 additions & 14 deletions)
@@ -75,6 +75,9 @@ struct KernelTy {
   // 1 - Generic mode (with master warp)
   int8_t ExecutionMode;

+  /// Maximal number of threads per block for this kernel.
+  int MaxThreadsPerBlock = 0;
+
   KernelTy(CUfunction _Func, int8_t _ExecutionMode)
       : Func(_Func), ExecutionMode(_ExecutionMode) {}
 };
@@ -843,10 +846,9 @@ class DeviceRTLTy {
     return OFFLOAD_SUCCESS;
   }

-  int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr,
-                          void **TgtArgs, ptrdiff_t *TgtOffsets,
-                          const int ArgNum, const int TeamNum,
-                          const int ThreadLimit,
+  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
+                          ptrdiff_t *TgtOffsets, const int ArgNum,
+                          const int TeamNum, const int ThreadLimit,
                           const unsigned int LoopTripCount,
                           __tgt_async_info *AsyncInfo) const {
     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
@@ -862,10 +864,9 @@ class DeviceRTLTy {
       Args[I] = &Ptrs[I];
     }

-    const KernelTy *KernelInfo =
-        reinterpret_cast<const KernelTy *>(TgtEntryPtr);
+    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

-    unsigned int CudaThreadsPerBlock;
+    int CudaThreadsPerBlock;
     if (ThreadLimit > 0) {
       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
       CudaThreadsPerBlock = ThreadLimit;
@@ -886,13 +887,18 @@ class DeviceRTLTy {
       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
     }

-    int KernelLimit;
-    Err = cuFuncGetAttribute(&KernelLimit,
-                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                             KernelInfo->Func);
-    if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {
-      DP("Threads per block capped at kernel limit %d\n", KernelLimit);
-      CudaThreadsPerBlock = KernelLimit;
+    if (!KernelInfo->MaxThreadsPerBlock) {
+      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
+                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                               KernelInfo->Func);
+      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
+        return OFFLOAD_FAIL;
+    }
+
+    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
+      DP("Threads per block capped at kernel limit %d\n",
+         KernelInfo->MaxThreadsPerBlock);
+      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
     }

     unsigned int CudaBlocksPerGrid;
