From 8f7f0668a57a5ff55ce3906b4b3e8b0dea160b1f Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Mon, 17 Jun 2024 08:43:38 +0100 Subject: [PATCH 1/9] set attribute allowing cluster size greater than 8 --- source/adapters/cuda/enqueue.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 906fd49d1d..ebe1426bfa 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -629,6 +629,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.attrs = &launch_attribute[0]; launch_config.numAttrs = numPropsInLaunchPropList; + UR_CHECK_ERROR(cuFuncSetAttribute( + CuFunc, cudaFuncAttributeNonPortableClusterSizeAllowed, 1)); + UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, const_cast(ArgIndices.data()), nullptr)); From 93691449efdc57187b5e8a91de5927caf5308834 Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Mon, 17 Jun 2024 08:57:24 +0100 Subject: [PATCH 2/9] set property cudaFuncAttributeNonPortableClusterSizeAllowed only if cluster launch is used --- source/adapters/cuda/enqueue.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index ebe1426bfa..895fce4525 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -530,7 +530,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } std::vector launch_attribute(numPropsInLaunchPropList); + bool has_property_cluster_launch = false; + for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) { + has_property_cluster_launch = true; + switch (launchPropList[i].id) { case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: { launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE; @@ -629,8 +633,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.attrs = &launch_attribute[0]; launch_config.numAttrs = numPropsInLaunchPropList; - UR_CHECK_ERROR(cuFuncSetAttribute( - CuFunc, cudaFuncAttributeNonPortableClusterSizeAllowed, 1)); + if (has_property_cluster_launch) { + UR_CHECK_ERROR(cuFuncSetAttribute( + CuFunc, cudaFuncAttributeNonPortableClusterSizeAllowed, 1)); + } UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, const_cast(ArgIndices.data()), From a8c442d89ed037afcc4fc82936007fca2fdf2e51 Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Mon, 17 Jun 2024 11:02:25 +0100 Subject: [PATCH 3/9] set has_property_cluster_launch only if cluster property is used --- source/adapters/cuda/enqueue.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 895fce4525..1aa3d6354d 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -533,14 +533,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( bool has_property_cluster_launch = false; for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) { - has_property_cluster_launch = true; - switch (launchPropList[i].id) { case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: { launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE; break; } case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: { + has_property_cluster_launch = true; launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; // Note that cuda orders from right to left wrt SYCL dimensional order. From b91c582f2d1cd38112c844ca2140344d2e11b544 Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Mon, 17 Jun 2024 14:10:02 +0100 Subject: [PATCH 4/9] fix cluster dimensions being set in accordance to grid dimensions --- source/adapters/cuda/enqueue.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 1aa3d6354d..a3d2fbf75c 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -543,12 +543,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; // Note that cuda orders from right to left wrt SYCL dimensional order. - launch_attribute[i].value.clusterDim.x = - launchPropList[i].value.clusterDim[2]; - launch_attribute[i].value.clusterDim.y = - launchPropList[i].value.clusterDim[1]; - launch_attribute[i].value.clusterDim.z = - launchPropList[i].value.clusterDim[0]; + if (workDim == 3) { + launch_attribute[i].value.clusterDim.x = + launchPropList[i].value.clusterDim[2]; + launch_attribute[i].value.clusterDim.y = + launchPropList[i].value.clusterDim[1]; + launch_attribute[i].value.clusterDim.z = + launchPropList[i].value.clusterDim[0]; + } else if (WorkDim == 2) { + launch_attribute[i].value.clusterDim.x = + launchPropList[i].value.clusterDim[0]; + launch_attribute[i].value.clusterDim.y = + launchPropList[i].value.clusterDim[1]; + launch_attribute[i].value.clusterDim.z = + launchPropList[i].value.clusterDim[2]; + } break; } case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: { From 3f2ed1c0dd540678ad435ee14a0127d0a5d2e4b7 Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Mon, 17 Jun 2024 14:20:23 +0100 Subject: [PATCH 5/9] fix ordering of cluster dims for workDim 2 --- source/adapters/cuda/enqueue.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index a3d2fbf75c..16b3e5dbcc 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -551,6 +551,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_attribute[i].value.clusterDim.z = launchPropList[i].value.clusterDim[0]; } else if (WorkDim == 2) { + launch_attribute[i].value.clusterDim.x = + launchPropList[i].value.clusterDim[1]; + launch_attribute[i].value.clusterDim.y = + launchPropList[i].value.clusterDim[0]; + launch_attribute[i].value.clusterDim.z = + launchPropList[i].value.clusterDim[2]; + } else { launch_attribute[i].value.clusterDim.x = launchPropList[i].value.clusterDim[0]; launch_attribute[i].value.clusterDim.y = From be8af682ab095b80ef9d105b7939a6aeb4537a99 Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Mon, 17 Jun 2024 15:12:41 +0100 Subject: [PATCH 6/9] fix compilation errors --- source/adapters/cuda/enqueue.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 16b3e5dbcc..63dc496bac 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -550,7 +550,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launchPropList[i].value.clusterDim[1]; launch_attribute[i].value.clusterDim.z = launchPropList[i].value.clusterDim[0]; - } else if (WorkDim == 2) { + } else if (workDim == 2) { launch_attribute[i].value.clusterDim.x = launchPropList[i].value.clusterDim[1]; launch_attribute[i].value.clusterDim.y = @@ -650,7 +650,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( if (has_property_cluster_launch) { UR_CHECK_ERROR(cuFuncSetAttribute( - CuFunc, cudaFuncAttributeNonPortableClusterSizeAllowed, 1)); + CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1)); } UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, From 5d38fe1f01df1d29ca9030d5a50041b01a0fade7 Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Mon, 17 Jun 2024 17:18:41 +0100 Subject: [PATCH 7/9] review comments 1 --- source/adapters/cuda/enqueue.cpp | 37 ++++++++++++++------------------ 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 63dc496bac..53515fd7f3 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -530,7 +530,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } std::vector launch_attribute(numPropsInLaunchPropList); - bool has_property_cluster_launch = false; + + // Early exit for zero size kernel + if (*pGlobalWorkSize == 0) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); + } + + // Set the number of threads per block to the number of threads per warp + // by default unless user has provided a better number + size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + + uint32_t LocalSize = hKernel->getLocalSize(); + CUfunction CuFunc = hKernel->get(); for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) { switch (launchPropList[i].id) { @@ -539,7 +552,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( break; } case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: { - has_property_cluster_launch = true; + UR_CHECK_ERROR(cuFuncSetAttribute( + CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1)); launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; // Note that cuda orders from right to left wrt SYCL dimensional order. @@ -579,20 +593,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } } - // Early exit for zero size kernel - if (*pGlobalWorkSize == 0) { - return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, - phEventWaitList, phEvent); - } - - // Set the number of threads per block to the number of threads per warp - // by default unless user has provided a better number - size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; - size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - - uint32_t LocalSize = hKernel->getLocalSize(); - CUfunction CuFunc = hKernel->get(); - // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled // using the standard UR_CHECK_ERROR if (ur_result_t Ret = @@ -648,11 +648,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.attrs = &launch_attribute[0]; launch_config.numAttrs = numPropsInLaunchPropList; - if (has_property_cluster_launch) { - UR_CHECK_ERROR(cuFuncSetAttribute( - CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1)); - } - UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, const_cast(ArgIndices.data()), nullptr)); From 7c1c64edd8b61d96e200f6b470d34ffb8709bec6 Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Mon, 17 Jun 2024 17:20:07 +0100 Subject: [PATCH 8/9] review comments 1 --- source/adapters/cuda/enqueue.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 53515fd7f3..1c074025a9 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -552,8 +552,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( break; } case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: { - UR_CHECK_ERROR(cuFuncSetAttribute( - CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1)); launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; // Note that cuda orders from right to left wrt SYCL dimensional order. @@ -579,6 +577,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_attribute[i].value.clusterDim.z = launchPropList[i].value.clusterDim[2]; } + + UR_CHECK_ERROR(cuFuncSetAttribute( + CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1)); + break; } case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: { From 4a23bb9cbeaefcdd054dffe53e980ccc6653437d Mon Sep 17 00:00:00 2001 From: "atharva.dubey" Date: Fri, 21 Jun 2024 16:38:53 +0100 Subject: [PATCH 9/9] increase cluster size upon launch to check CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED flag being added --- test/conformance/exp_launch_properties/launch_properties.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp index bc252392eb..4cc0dcfe22 100644 --- a/test/conformance/exp_launch_properties/launch_properties.cpp +++ b/test/conformance/exp_launch_properties/launch_properties.cpp @@ -78,7 +78,7 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) { if (compute_capability >= 9.0) { ur_exp_launch_property_t cluster_dims_prop; cluster_dims_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION; - cluster_dims_prop.value.clusterDim[0] = 1; + cluster_dims_prop.value.clusterDim[0] = 16; cluster_dims_prop.value.clusterDim[1] = 1; cluster_dims_prop.value.clusterDim[2] = 1;