61 changes: 41 additions & 20 deletions source/adapters/cuda/enqueue.cpp
@@ -530,6 +530,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
}

std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);

// Early exit for zero size kernel
if (*pGlobalWorkSize == 0) {
return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
phEventWaitList, phEvent);
}

// Set the number of threads per block to the number of threads per warp
// by default, unless the user has provided a better number.
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
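
A minimal, self-contained sketch of how these defaults typically turn a global work size into block and grid dimensions via ceiling division; pickDefaultLaunchDims and the sample values are illustrative only, not the adapter's actual local-size selection.

// Illustrative only: given a global work size and a one-warp-wide default
// block, the per-dimension block count is the ceiling of global / local.
#include <cstddef>
#include <cstdio>

static void pickDefaultLaunchDims(const size_t *globalWorkSize, size_t workDim,
                                  size_t threadsPerBlock[3],
                                  size_t blocksPerGrid[3]) {
  threadsPerBlock[0] = 32; // one warp wide by default
  threadsPerBlock[1] = 1;
  threadsPerBlock[2] = 1;
  for (size_t d = 0; d < 3; ++d) {
    const size_t global = (d < workDim) ? globalWorkSize[d] : 1;
    blocksPerGrid[d] = (global + threadsPerBlock[d] - 1) / threadsPerBlock[d];
  }
}

int main() {
  const size_t global[3] = {1000, 1, 1};
  size_t tpb[3], bpg[3];
  pickDefaultLaunchDims(global, 1, tpb, bpg);
  std::printf("threads/block=%zu, blocks/grid=%zu\n", tpb[0], bpg[0]); // 32, 32
  return 0;
}

In practice the adapter may also clamp against device limits and honor a user-provided local size; the point here is only the rounding.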

uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();

for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
switch (launchPropList[i].id) {
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
@@ -540,12 +555,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(

launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
// Note that CUDA orders dimensions from right to left with respect to the SYCL dimensional order.
launch_attribute[i].value.clusterDim.x =
launchPropList[i].value.clusterDim[2];
launch_attribute[i].value.clusterDim.y =
launchPropList[i].value.clusterDim[1];
launch_attribute[i].value.clusterDim.z =
launchPropList[i].value.clusterDim[0];
if (workDim == 3) {
launch_attribute[i].value.clusterDim.x =
Comment on lines +558 to +559 (contributor author; see the dimension-reversal sketch after this case block):

Could use some help here: I was not able to figure out where this flipping of order happens. I see it being set in setKernelParams, but I could not work out how the order gets flipped.

launchPropList[i].value.clusterDim[2];
launch_attribute[i].value.clusterDim.y =
launchPropList[i].value.clusterDim[1];
launch_attribute[i].value.clusterDim.z =
launchPropList[i].value.clusterDim[0];
} else if (workDim == 2) {
launch_attribute[i].value.clusterDim.x =
launchPropList[i].value.clusterDim[1];
launch_attribute[i].value.clusterDim.y =
launchPropList[i].value.clusterDim[0];
launch_attribute[i].value.clusterDim.z =
launchPropList[i].value.clusterDim[2];
} else {
launch_attribute[i].value.clusterDim.x =
launchPropList[i].value.clusterDim[0];
launch_attribute[i].value.clusterDim.y =
launchPropList[i].value.clusterDim[1];
launch_attribute[i].value.clusterDim.z =
launchPropList[i].value.clusterDim[2];
}

UR_CHECK_ERROR(cuFuncSetAttribute(
CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));

break;
}
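
On the review question above about where the order flip happens: a minimal, standalone sketch of the index reversal these workDim branches encode. In SYCL the right-most dimension varies fastest, while in CUDA that role belongs to .x, so index [workDim - 1] maps to x, [workDim - 2] to y, and the left-most index to the slowest CUDA axis in use, with unused axes left at 1. Dim3, toCudaOrder, and the sample values are illustrative only; the adapter's setKernelParams may express the same reversal differently.

// Illustrative only: maps a SYCL-ordered size array of length workDim onto
// CUDA's x/y/z ordering, mirroring the clusterDim branches above.
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct Dim3 {
  unsigned x = 1, y = 1, z = 1;
};

static Dim3 toCudaOrder(const size_t *sizes, uint32_t workDim) {
  Dim3 out;
  if (workDim >= 1)
    out.x = static_cast<unsigned>(sizes[workDim - 1]); // fastest-moving
  if (workDim >= 2)
    out.y = static_cast<unsigned>(sizes[workDim - 2]);
  if (workDim == 3)
    out.z = static_cast<unsigned>(sizes[0]); // slowest-moving
  return out;
}

int main() {
  const size_t sizes3[3] = {2, 4, 8}; // SYCL order: slowest ... fastest
  const Dim3 d3 = toCudaOrder(sizes3, 3);
  std::printf("3D: x=%u y=%u z=%u\n", d3.x, d3.y, d3.z); // x=8 y=4 z=2

  const size_t sizes2[2] = {4, 8};
  const Dim3 d2 = toCudaOrder(sizes2, 2);
  std::printf("2D: x=%u y=%u z=%u\n", d2.x, d2.y, d2.z); // x=8 y=4 z=1
  return 0;
}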
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
@@ -560,20 +595,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
}
}

// Early exit for zero size kernel
if (*pGlobalWorkSize == 0) {
return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
phEventWaitList, phEvent);
}

// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};

uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();

// This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled
// using the standard UR_CHECK_ERROR
if (ur_result_t Ret =
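For the comment above about UR_RESULT_ERROR_ADAPTER_SPECIFIC: a small, self-contained sketch of the propagation pattern it describes, where the callee's ur_result_t is returned to the caller directly rather than being routed through an error-check macro. The enum values and someAdapterHelper are stand-ins, not the real Unified Runtime definitions.

// Illustrative only (C++17): forward a non-success result to the caller
// instead of asserting on it with a checking macro.
#include <cstdio>

enum ur_result_t { // trimmed stand-in for the real UR result enum
  UR_RESULT_SUCCESS = 0,
  UR_RESULT_ERROR_ADAPTER_SPECIFIC = 1
};

static ur_result_t someAdapterHelper(bool fail) {
  return fail ? UR_RESULT_ERROR_ADAPTER_SPECIFIC : UR_RESULT_SUCCESS;
}

static ur_result_t launchLikeFunction(bool fail) {
  // Bind the result and return it unchanged on failure so the caller can
  // interpret the adapter-specific error.
  if (ur_result_t Ret = someAdapterHelper(fail); Ret != UR_RESULT_SUCCESS)
    return Ret;
  return UR_RESULT_SUCCESS;
}

int main() {
  std::printf("%d\n", launchLikeFunction(true));  // prints 1
  std::printf("%d\n", launchLikeFunction(false)); // prints 0
  return 0;
}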
@@ -78,7 +78,7 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
if (compute_capability >= 9.0) {
ur_exp_launch_property_t cluster_dims_prop;
cluster_dims_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION;
cluster_dims_prop.value.clusterDim[0] = 1;
cluster_dims_prop.value.clusterDim[0] = 16;
cluster_dims_prop.value.clusterDim[1] = 1;
cluster_dims_prop.value.clusterDim[2] = 1;
