Skip to content

Commit

Permalink
Reduced cutoff for short list sorting kernel (#2878)
Browse files (browse the repository at this point in the history)
  • Loading branch information
peastman committed Oct 8, 2020
1 parent 94d7225 commit fce2608
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 11 deletions.
2 changes: 1 addition & 1 deletion platforms/cuda/src/CudaSort.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -58,7 +58,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
  int maxSharedMem;
  cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
  int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
- int maxShortList = min(8192, max(maxLocalBuffer, CudaContext::ThreadBlockSize*context.getNumThreadBlocks()));
+ int maxShortList = min(3000, max(maxLocalBuffer, CudaContext::ThreadBlockSize*context.getNumThreadBlocks()));
  isShortList = (length <= maxShortList);
  for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
  ;
Expand Down
18 changes: 8 additions & 10 deletions platforms/opencl/src/OpenCLSort.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -63,19 +63,17 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
  unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
  unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
  int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
- unsigned int maxShortList = min(8192, max(maxLocalBuffer, (int) OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks()));
- // The following line checks CL_KERNEL_WORK_GROUP_SIZE to make sure we don't create too large a workgroup.
- // Unfortunately, AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
- // maximum, so including the check hurts performance. For the moment I'm just leaving it commented out.
- // If the workgroup size turns out to be too large, we catch the exception and switch back to the standard
- // sorting kernels.
- //maxShortList = min(maxShortList, shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
- isShortList = (length <= maxShortList);
+ int maxShortList = max(maxLocalBuffer, (int) OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks());
  string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
- if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA")
+ if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
+ maxShortList = min(3000, maxShortList);
  useShortList2 = (dataLength <= OpenCLContext::ThreadBlockSize*context.getNumThreadBlocks());
- else
+ }
+ else {
+ maxShortList = min(1024, maxShortList);
  useShortList2 = false;
+ }
+ isShortList = (length <= maxShortList);
  for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
  ;
  positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
Expand Down

0 comments on commit fce2608

Please sign in to comment.