diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 906fd49d1d..1c074025a9 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -530,6 +530,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   }
 
   std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
+
+  // Early exit for zero size kernel
+  if (*pGlobalWorkSize == 0) {
+    return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
+                                          phEventWaitList, phEvent);
+  }
+
+  // Set the number of threads per block to the number of threads per warp
+  // by default unless user has provided a better number
+  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+
+  uint32_t LocalSize = hKernel->getLocalSize();
+  CUfunction CuFunc = hKernel->get();
+
   for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
     switch (launchPropList[i].id) {
     case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
@@ -540,12 +555,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
 
       launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
       // Note that cuda orders from right to left wrt SYCL dimensional order.
-      launch_attribute[i].value.clusterDim.x =
-          launchPropList[i].value.clusterDim[2];
-      launch_attribute[i].value.clusterDim.y =
-          launchPropList[i].value.clusterDim[1];
-      launch_attribute[i].value.clusterDim.z =
-          launchPropList[i].value.clusterDim[0];
+      if (workDim == 3) {
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[2];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[0];
+      } else if (workDim == 2) {
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[0];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[2];
+      } else {
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[0];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[2];
+      }
+
+      UR_CHECK_ERROR(cuFuncSetAttribute(
+          CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
+
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
@@ -560,20 +595,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
   }
 
-  // Early exit for zero size kernel
-  if (*pGlobalWorkSize == 0) {
-    return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
-                                          phEventWaitList, phEvent);
-  }
-
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
-  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
-
-  uint32_t LocalSize = hKernel->getLocalSize();
-  CUfunction CuFunc = hKernel->get();
-
   // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled
   // using the standard UR_CHECK_ERROR
   if (ur_result_t Ret =
diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp
index bc252392eb..4cc0dcfe22 100644
--- a/test/conformance/exp_launch_properties/launch_properties.cpp
+++ b/test/conformance/exp_launch_properties/launch_properties.cpp
@@ -78,7 +78,7 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
   if (compute_capability >= 9.0) {
     ur_exp_launch_property_t cluster_dims_prop;
     cluster_dims_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION;
-    cluster_dims_prop.value.clusterDim[0] = 1;
+    cluster_dims_prop.value.clusterDim[0] = 16;
     cluster_dims_prop.value.clusterDim[1] = 1;
     cluster_dims_prop.value.clusterDim[2] = 1;