From 8f7f0668a57a5ff55ce3906b4b3e8b0dea160b1f Mon Sep 17 00:00:00 2001
From: "atharva.dubey" <atharva.dubey@codeplay.com>
Date: Mon, 17 Jun 2024 08:43:38 +0100
Subject: [PATCH 1/9] set attribute allowing cluster size greater than 8

---
 source/adapters/cuda/enqueue.cpp | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 906fd49d1d..ebe1426bfa 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -629,6 +629,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     launch_config.attrs = &launch_attribute[0];
     launch_config.numAttrs = numPropsInLaunchPropList;
 
+    UR_CHECK_ERROR(cuFuncSetAttribute(
+        CuFunc, cudaFuncAttributeNonPortableClusterSizeAllowed, 1));
+
     UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
                                     const_cast<void **>(ArgIndices.data()),
                                     nullptr));

From 93691449efdc57187b5e8a91de5927caf5308834 Mon Sep 17 00:00:00 2001
From: "atharva.dubey" <atharva.dubey@codeplay.com>
Date: Mon, 17 Jun 2024 08:57:24 +0100
Subject: [PATCH 2/9] set property
 cudaFuncAttributeNonPortableClusterSizeAllowed only if cluster launch is used

---
 source/adapters/cuda/enqueue.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index ebe1426bfa..895fce4525 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -530,7 +530,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   }
 
   std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
+  bool has_property_cluster_launch = false;
+
   for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
+    has_property_cluster_launch = true;
+
     switch (launchPropList[i].id) {
     case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
       launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
@@ -629,8 +633,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     launch_config.attrs = &launch_attribute[0];
     launch_config.numAttrs = numPropsInLaunchPropList;
 
-    UR_CHECK_ERROR(cuFuncSetAttribute(
-        CuFunc, cudaFuncAttributeNonPortableClusterSizeAllowed, 1));
+    if (has_property_cluster_launch) {
+      UR_CHECK_ERROR(cuFuncSetAttribute(
+          CuFunc, cudaFuncAttributeNonPortableClusterSizeAllowed, 1));
+    }
 
     UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
                                     const_cast<void **>(ArgIndices.data()),

From a8c442d89ed037afcc4fc82936007fca2fdf2e51 Mon Sep 17 00:00:00 2001
From: "atharva.dubey" <atharva.dubey@codeplay.com>
Date: Mon, 17 Jun 2024 11:02:25 +0100
Subject: [PATCH 3/9] set has_property_cluster_launch only if cluster property
 is used

---
 source/adapters/cuda/enqueue.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 895fce4525..1aa3d6354d 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -533,14 +533,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   bool has_property_cluster_launch = false;
 
   for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
-    has_property_cluster_launch = true;
-
     switch (launchPropList[i].id) {
     case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
       launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
+      has_property_cluster_launch = true;
 
       launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
       // Note that cuda orders from right to left wrt SYCL dimensional order.

From b91c582f2d1cd38112c844ca2140344d2e11b544 Mon Sep 17 00:00:00 2001
From: "atharva.dubey" <atharva.dubey@codeplay.com>
Date: Mon, 17 Jun 2024 14:10:02 +0100
Subject: [PATCH 4/9] fix cluster dimensions being set in accordance with grid
 dimensions

---
 source/adapters/cuda/enqueue.cpp | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 1aa3d6354d..a3d2fbf75c 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -543,12 +543,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
 
       launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
       // Note that cuda orders from right to left wrt SYCL dimensional order.
-      launch_attribute[i].value.clusterDim.x =
-          launchPropList[i].value.clusterDim[2];
-      launch_attribute[i].value.clusterDim.y =
-          launchPropList[i].value.clusterDim[1];
-      launch_attribute[i].value.clusterDim.z =
-          launchPropList[i].value.clusterDim[0];
+      if (workDim == 3) {
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[2];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[0];
+      } else if (WorkDim == 2) {
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[0];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[2];
+      }
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {

From 3f2ed1c0dd540678ad435ee14a0127d0a5d2e4b7 Mon Sep 17 00:00:00 2001
From: "atharva.dubey" <atharva.dubey@codeplay.com>
Date: Mon, 17 Jun 2024 14:20:23 +0100
Subject: [PATCH 5/9] fix ordering of cluster dims for workDim 2

---
 source/adapters/cuda/enqueue.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index a3d2fbf75c..16b3e5dbcc 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -551,6 +551,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
         launch_attribute[i].value.clusterDim.z =
             launchPropList[i].value.clusterDim[0];
       } else if (WorkDim == 2) {
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[0];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[2];
+      } else {
         launch_attribute[i].value.clusterDim.x =
             launchPropList[i].value.clusterDim[0];
         launch_attribute[i].value.clusterDim.y =

From be8af682ab095b80ef9d105b7939a6aeb4537a99 Mon Sep 17 00:00:00 2001
From: "atharva.dubey" <atharva.dubey@codeplay.com>
Date: Mon, 17 Jun 2024 15:12:41 +0100
Subject: [PATCH 6/9] fix compilation errors

---
 source/adapters/cuda/enqueue.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 16b3e5dbcc..63dc496bac 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -550,7 +550,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
             launchPropList[i].value.clusterDim[1];
         launch_attribute[i].value.clusterDim.z =
             launchPropList[i].value.clusterDim[0];
-      } else if (WorkDim == 2) {
+      } else if (workDim == 2) {
         launch_attribute[i].value.clusterDim.x =
             launchPropList[i].value.clusterDim[1];
         launch_attribute[i].value.clusterDim.y =
@@ -650,7 +650,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
 
     if (has_property_cluster_launch) {
       UR_CHECK_ERROR(cuFuncSetAttribute(
-          CuFunc, cudaFuncAttributeNonPortableClusterSizeAllowed, 1));
+          CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
     }
 
     UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,

From 5d38fe1f01df1d29ca9030d5a50041b01a0fade7 Mon Sep 17 00:00:00 2001
From: "atharva.dubey" <atharva.dubey@codeplay.com>
Date: Mon, 17 Jun 2024 17:18:41 +0100
Subject: [PATCH 7/9] review comments 1: drop has_property_cluster_launch flag,
 set cluster-size attribute in the cluster-dimension case

---
 source/adapters/cuda/enqueue.cpp | 37 ++++++++++++++------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 63dc496bac..53515fd7f3 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -530,7 +530,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   }
 
   std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
-  bool has_property_cluster_launch = false;
+
+  // Early exit for zero size kernel
+  if (*pGlobalWorkSize == 0) {
+    return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
+                                          phEventWaitList, phEvent);
+  }
+
+  // Set the number of threads per block to the number of threads per warp
+  // by default unless user has provided a better number
+  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+
+  uint32_t LocalSize = hKernel->getLocalSize();
+  CUfunction CuFunc = hKernel->get();
 
   for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
     switch (launchPropList[i].id) {
@@ -539,7 +552,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
-      has_property_cluster_launch = true;
+      UR_CHECK_ERROR(cuFuncSetAttribute(
+          CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
 
       launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
       // Note that cuda orders from right to left wrt SYCL dimensional order.
@@ -579,20 +593,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
   }
 
-  // Early exit for zero size kernel
-  if (*pGlobalWorkSize == 0) {
-    return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
-                                          phEventWaitList, phEvent);
-  }
-
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
-  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
-
-  uint32_t LocalSize = hKernel->getLocalSize();
-  CUfunction CuFunc = hKernel->get();
-
   // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled
   // using the standard UR_CHECK_ERROR
   if (ur_result_t Ret =
@@ -648,11 +648,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     launch_config.attrs = &launch_attribute[0];
     launch_config.numAttrs = numPropsInLaunchPropList;
 
-    if (has_property_cluster_launch) {
-      UR_CHECK_ERROR(cuFuncSetAttribute(
-          CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
-    }
-
     UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
                                     const_cast<void **>(ArgIndices.data()),
                                     nullptr));

From 7c1c64edd8b61d96e200f6b470d34ffb8709bec6 Mon Sep 17 00:00:00 2001
From: "atharva.dubey" <atharva.dubey@codeplay.com>
Date: Mon, 17 Jun 2024 17:20:07 +0100
Subject: [PATCH 8/9] review comments 1: set cluster-size attribute after the
 cluster dims are filled in

---
 source/adapters/cuda/enqueue.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 53515fd7f3..1c074025a9 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -552,8 +552,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
-      UR_CHECK_ERROR(cuFuncSetAttribute(
-          CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
 
       launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
       // Note that cuda orders from right to left wrt SYCL dimensional order.
@@ -579,6 +577,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
         launch_attribute[i].value.clusterDim.z =
             launchPropList[i].value.clusterDim[2];
       }
+
+      UR_CHECK_ERROR(cuFuncSetAttribute(
+          CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
+
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {

From 4a23bb9cbeaefcdd054dffe53e980ccc6653437d Mon Sep 17 00:00:00 2001
From: "atharva.dubey" <atharva.dubey@codeplay.com>
Date: Fri, 21 Jun 2024 16:38:53 +0100
Subject: [PATCH 9/9] increase cluster size upon launch to check
 CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED flag being added

---
 test/conformance/exp_launch_properties/launch_properties.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp
index bc252392eb..4cc0dcfe22 100644
--- a/test/conformance/exp_launch_properties/launch_properties.cpp
+++ b/test/conformance/exp_launch_properties/launch_properties.cpp
@@ -78,7 +78,7 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
         if (compute_capability >= 9.0) {
             ur_exp_launch_property_t cluster_dims_prop;
             cluster_dims_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION;
-            cluster_dims_prop.value.clusterDim[0] = 1;
+            cluster_dims_prop.value.clusterDim[0] = 16;
             cluster_dims_prop.value.clusterDim[1] = 1;
             cluster_dims_prop.value.clusterDim[2] = 1;