From 732ef14e2f166a709f32f9e8c0b77e91e548295d Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Wed, 26 Jun 2024 14:01:47 +0100
Subject: [PATCH 1/9] Add cluster_launch device aspect.

this aspect is true only for sm_90 cuda gpus.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 include/ur_api.h                                  |  1 +
 include/ur_print.hpp                              | 15 +++++++++++++++
 scripts/core/exp-launch-properties.yml            | 10 ++++++++++
 source/adapters/cuda/device.cpp                   |  5 +++++
 .../exp_launch_properties/launch_properties.cpp   |  7 ++++++-
 tools/urinfo/urinfo.hpp                           |  2 ++
 6 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/include/ur_api.h b/include/ur_api.h
index 84fe704d5f..1dc0cd361e 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -1601,6 +1601,7 @@ typedef enum ur_device_info_t {
                                                                      ///< command-buffers.
     UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001,       ///< [::ur_bool_t] Returns true if the device supports updating the kernel
                                                                      ///< commands in a command-buffer.
+    UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP = 0x1111,                      ///< [::ur_bool_t] return true if enqueue Cluster Launch is supported
     UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000,             ///< [::ur_bool_t] returns true if the device supports the creation of
                                                                      ///< bindless images
     UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001,  ///< [::ur_bool_t] returns true if the device supports the creation of
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index 5919e57019..57e0da2fa9 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -2530,6 +2530,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) {
     case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP:
         os << "UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP";
         break;
+    case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP:
+        os << "UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP";
+        break;
     case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP:
         os << "UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP";
         break;
@@ -4029,6 +4032,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info
 
         os << ")";
     } break;
+    case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP: {
+        const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+        if (sizeof(ur_bool_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        os << *tptr;
+
+        os << ")";
+    } break;
     case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: {
         const ur_bool_t *tptr = (const ur_bool_t *)ptr;
         if (sizeof(ur_bool_t) > size) {
diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml
index 05f90aa7d2..aef4b2844a 100644
--- a/scripts/core/exp-launch-properties.yml
+++ b/scripts/core/exp-launch-properties.yml
@@ -128,4 +128,14 @@ returns:
     - $X_RESULT_ERROR_INVALID_VALUE
     - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
     - $X_RESULT_ERROR_OUT_OF_RESOURCES
+--- #--------------------------------------------------------------------------
+type: enum
+extend: true
+typed_etors: true
+desc: "Extension enums to $x_device_info_t to support arch specific launch properties."
+name: $x_device_info_t
+etors:
+    - name: CLUSTER_LAUNCH_EXP
+      value: "0x1111"
+      desc: "[$x_bool_t] return true if enqueue Cluster Launch is supported"
 
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index bd15a62504..b076d11930 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -1089,6 +1089,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP:
   case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP:
     return ReturnValue(true);
+  case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP: {
+    int Value = getAttribute(hDevice,
+                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 9;
+    return ReturnValue(static_cast<bool>(Value));
+  }
 
   default:
     break;
diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp
index bc252392eb..f0d2bd4c54 100644
--- a/test/conformance/exp_launch_properties/launch_properties.cpp
+++ b/test/conformance/exp_launch_properties/launch_properties.cpp
@@ -75,7 +75,12 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
             props.push_back(coop_prop);
         }
 
-        if (compute_capability >= 9.0) {
+        ur_bool_t cluster_launch_supported = false;
+        ASSERT_SUCCESS(
+            urDeviceGetInfo(device, UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP,
+                            sizeof(ur_bool_t), &cluster_launch_supported, nullptr));
+
+        if (cluster_launch_supported) {
             ur_exp_launch_property_t cluster_dims_prop;
             cluster_dims_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION;
             cluster_dims_prop.value.clusterDim[0] = 1;
diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp
index 2b1f8c89a7..5f3c8874fd 100644
--- a/tools/urinfo/urinfo.hpp
+++ b/tools/urinfo/urinfo.hpp
@@ -335,6 +335,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice,
     printDeviceInfo<ur_bool_t>(
         hDevice, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP);
     std::cout << prefix;
+    printDeviceInfo<ur_bool_t>(hDevice, UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP);
+    std::cout << prefix;
     printDeviceInfo<ur_bool_t>(hDevice,
                                UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP);
     std::cout << prefix;

From a1924481a87042e4b26273cd3860d2126d8833f9 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Wed, 26 Jun 2024 18:46:23 +0100
Subject: [PATCH 2/9] Fix format.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../conformance/exp_launch_properties/launch_properties.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp
index f0d2bd4c54..9095c37f8a 100644
--- a/test/conformance/exp_launch_properties/launch_properties.cpp
+++ b/test/conformance/exp_launch_properties/launch_properties.cpp
@@ -76,9 +76,9 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
         }
 
         ur_bool_t cluster_launch_supported = false;
-        ASSERT_SUCCESS(
-            urDeviceGetInfo(device, UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP,
-                            sizeof(ur_bool_t), &cluster_launch_supported, nullptr));
+        ASSERT_SUCCESS(urDeviceGetInfo(
+            device, UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP, sizeof(ur_bool_t),
+            &cluster_launch_supported, nullptr));
 
         if (cluster_launch_supported) {
             ur_exp_launch_property_t cluster_dims_prop;

From c2d1f28a5a406d6a09f2387d97818b82d9a0ad30 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Tue, 25 Jun 2024 16:30:45 -0700
Subject: [PATCH 3/9] [L0] Fix event usage after delete during release

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/event.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index 5e4cd3a2bd..5881610f68 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -1022,6 +1022,7 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
 
   // Save pointer to the queue before deleting/resetting event.
   auto Queue = Legacy(Event->UrQueue);
+  auto URQueue = Event->UrQueue;
 
   // If the event was a timestamp recording, we try to evict its entry in the
   // queue.
@@ -1053,8 +1054,8 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
   // created so that we can avoid ur_queue_handle_t is released before the
   // associated ur_event_handle_t is released. Here we have to decrement it so
   // ur_queue_handle_t can be released successfully.
-  if (Event->UrQueue) {
-    UR_CALL(urQueueReleaseInternal(Event->UrQueue));
+  if (URQueue) {
+    UR_CALL(urQueueReleaseInternal(URQueue));
   }
 
   return UR_RESULT_SUCCESS;

From 9d1bd64f336245c82c8ee3256ab9b013d390db0f Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney@codeplay.com>
Date: Tue, 25 Jun 2024 10:53:31 +0100
Subject: [PATCH 4/9] Get scopedcontext device from UR context

An event created with interop doesn't have an associated queue. So get
an active device through the context.
---
 source/adapters/cuda/event.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp
index bbe92ddef8..9889031f1b 100644
--- a/source/adapters/cuda/event.cpp
+++ b/source/adapters/cuda/event.cpp
@@ -219,7 +219,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t,
 UR_APIEXPORT ur_result_t UR_APICALL
 urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) {
   try {
-    ScopedContext Active(phEventWaitList[0]->getQueue()->getDevice());
+    // Interop events don't have an associated queue, so get device through
+    // context
+    ScopedContext Active(phEventWaitList[0]->getContext()->getDevices()[0]);
 
     auto WaitFunc = [](ur_event_handle_t Event) -> ur_result_t {
       UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT);

From d93c13c4ebfdfa9c46838f4dde926bd13249dc1f Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney@codeplay.com>
Date: Tue, 25 Jun 2024 11:27:25 +0100
Subject: [PATCH 5/9] Remove duplicate entry points

get_program is the same as getProgram.
---
 source/adapters/cuda/kernel.cpp | 4 ++--
 source/adapters/cuda/kernel.hpp | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp
index 07fdf348a6..d43bd046dc 100644
--- a/source/adapters/cuda/kernel.cpp
+++ b/source/adapters/cuda/kernel.cpp
@@ -93,7 +93,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
   case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
     size_t GroupSize[3] = {0, 0, 0};
     const auto &ReqdWGSizeMDMap =
-        hKernel->get_program()->KernelReqdWorkGroupSizeMD;
+        hKernel->getProgram()->KernelReqdWorkGroupSizeMD;
     const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->getName());
     if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) {
       const auto ReqdWGSize = ReqdWGSizeMD->second;
@@ -222,7 +222,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel,
   case UR_KERNEL_INFO_CONTEXT:
     return ReturnValue(hKernel->getContext());
   case UR_KERNEL_INFO_PROGRAM:
-    return ReturnValue(hKernel->get_program());
+    return ReturnValue(hKernel->getProgram());
   case UR_KERNEL_INFO_ATTRIBUTES:
     return ReturnValue("");
   case UR_KERNEL_INFO_NUM_REGS: {
diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index b7a7358b27..808b1abe9e 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -178,8 +178,6 @@ struct ur_kernel_handle_t_ {
     urContextRelease(Context);
   }
 
-  ur_program_handle_t get_program() const noexcept { return Program; }
-
   uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
 
   uint32_t decrementReferenceCount() noexcept { return --RefCount; }
@@ -187,6 +185,7 @@ struct ur_kernel_handle_t_ {
   uint32_t getReferenceCount() const noexcept { return RefCount; }
 
   native_type get() const noexcept { return Function; };
+
   ur_program_handle_t getProgram() const noexcept { return Program; };
 
   native_type get_with_offset_parameter() const noexcept {

From 34bdb7c46ec05e73390c3334f67d9fb91d1c0dfc Mon Sep 17 00:00:00 2001
From: Callum Fare <callum@codeplay.com>
Date: Mon, 24 Jun 2024 16:26:15 +0100
Subject: [PATCH 6/9] Make device param in urMemGetNativeHandle optional

---
 include/ur_api.h                                 | 7 +++++--
 scripts/core/memory.yml                          | 5 ++++-
 source/adapters/cuda/memory.cpp                  | 1 +
 source/adapters/hip/memory.cpp                   | 1 +
 source/adapters/null/ur_nullddi.cpp              | 3 ++-
 source/loader/layers/tracing/ur_trcddi.cpp       | 3 ++-
 source/loader/layers/validation/ur_valddi.cpp    | 7 ++-----
 source/loader/ur_ldrddi.cpp                      | 7 +++++--
 source/loader/ur_libapi.cpp                      | 7 +++++--
 source/ur_api.cpp                                | 7 +++++--
 test/conformance/memory/urMemGetNativeHandle.cpp | 6 ------
 11 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/include/ur_api.h b/include/ur_api.h
index 1dc0cd361e..889ccbdf28 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -2825,6 +2825,8 @@ urMemBufferPartition(
 ///     - The application may call this function from simultaneous threads for
 ///       the same context.
 ///     - The implementation of this function should be thread-safe.
+///     - The implementation may require a valid device handle to return the
+///       native mem handle
 ///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
@@ -2833,7 +2835,7 @@ urMemBufferPartition(
 ///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hMem`
-///         + `NULL == hDevice`
+///         + If `hDevice == NULL` and the implementation requires a valid device.
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `NULL == phNativeMem`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
@@ -2841,7 +2843,8 @@ urMemBufferPartition(
 UR_APIEXPORT ur_result_t UR_APICALL
 urMemGetNativeHandle(
     ur_mem_handle_t hMem,           ///< [in] handle of the mem.
-    ur_device_handle_t hDevice,     ///< [in] handle of the device that the native handle will be resident on.
+    ur_device_handle_t hDevice,     ///< [in][optional] handle of the device that the native handle will be
+                                    ///< resident on.
     ur_native_handle_t *phNativeMem ///< [out] a pointer to the native handle of the mem.
 );
 
diff --git a/scripts/core/memory.yml b/scripts/core/memory.yml
index c4009bc56e..21d8294519 100644
--- a/scripts/core/memory.yml
+++ b/scripts/core/memory.yml
@@ -436,6 +436,7 @@ details:
     - "Use interoperability platform extensions to convert native handle to native type."
     - "The application may call this function from simultaneous threads for the same context."
     - "The implementation of this function should be thread-safe."
+    - "The implementation may require a valid device handle to return the native mem handle"
 params:
     - type: $x_mem_handle_t
       name: hMem
@@ -444,7 +445,7 @@ params:
     - type: $x_device_handle_t
       name: hDevice
       desc: |
-            [in] handle of the device that the native handle will be resident on.
+            [in][optional] handle of the device that the native handle will be resident on.
     - type: $x_native_handle_t*
       name: phNativeMem
       desc: |
@@ -452,6 +453,8 @@ params:
 returns:
     - $X_RESULT_ERROR_UNSUPPORTED_FEATURE:
         - "If the adapter has no underlying equivalent handle."
+    - $X_RESULT_ERROR_INVALID_NULL_HANDLE:
+        - "If `hDevice == NULL` and the implementation requires a valid device."
 --- #--------------------------------------------------------------------------
 type: struct
 desc: "Native memory object creation properties"
diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp
index 7bddac56b0..04b488a0f4 100644
--- a/source/adapters/cuda/memory.cpp
+++ b/source/adapters/cuda/memory.cpp
@@ -130,6 +130,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) {
 UR_APIEXPORT ur_result_t UR_APICALL
 urMemGetNativeHandle(ur_mem_handle_t hMem, ur_device_handle_t Device,
                      ur_native_handle_t *phNativeMem) {
+  UR_ASSERT(Device != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   try {
     *phNativeMem = reinterpret_cast<ur_native_handle_t>(
         std::get<BufferMem>(hMem->Mem).getPtr(Device));
diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp
index 64543af855..00f7e6a4b2 100644
--- a/source/adapters/hip/memory.cpp
+++ b/source/adapters/hip/memory.cpp
@@ -301,6 +301,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory,
 UR_APIEXPORT ur_result_t UR_APICALL
 urMemGetNativeHandle(ur_mem_handle_t hMem, ur_device_handle_t Device,
                      ur_native_handle_t *phNativeMem) {
+  UR_ASSERT(Device != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
 #if defined(__HIP_PLATFORM_NVIDIA__)
   if (sizeof(BufferMem::native_type) > sizeof(ur_native_handle_t)) {
     // Check that all the upper bits that cannot be represented by
diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp
index e726061ee4..4a686a9d59 100644
--- a/source/adapters/null/ur_nullddi.cpp
+++ b/source/adapters/null/ur_nullddi.cpp
@@ -937,7 +937,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition(
 __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle(
     ur_mem_handle_t hMem, ///< [in] handle of the mem.
     ur_device_handle_t
-        hDevice, ///< [in] handle of the device that the native handle will be resident on.
+        hDevice, ///< [in][optional] handle of the device that the native handle will be
+                 ///< resident on.
     ur_native_handle_t
         *phNativeMem ///< [out] a pointer to the native handle of the mem.
     ) try {
diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp
index bc140f17c2..cab7688e4f 100644
--- a/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/source/loader/layers/tracing/ur_trcddi.cpp
@@ -1209,7 +1209,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition(
 __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle(
     ur_mem_handle_t hMem, ///< [in] handle of the mem.
     ur_device_handle_t
-        hDevice, ///< [in] handle of the device that the native handle will be resident on.
+        hDevice, ///< [in][optional] handle of the device that the native handle will be
+                 ///< resident on.
     ur_native_handle_t
         *phNativeMem ///< [out] a pointer to the native handle of the mem.
 ) {
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index ce1374f0c6..bd6e20e7ad 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++ b/source/loader/layers/validation/ur_valddi.cpp
@@ -1290,7 +1290,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition(
 __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle(
     ur_mem_handle_t hMem, ///< [in] handle of the mem.
     ur_device_handle_t
-        hDevice, ///< [in] handle of the device that the native handle will be resident on.
+        hDevice, ///< [in][optional] handle of the device that the native handle will be
+                 ///< resident on.
     ur_native_handle_t
         *phNativeMem ///< [out] a pointer to the native handle of the mem.
 ) {
@@ -1305,10 +1306,6 @@ __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle(
             return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
         }
 
-        if (NULL == hDevice) {
-            return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
-        }
-
         if (NULL == phNativeMem) {
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp
index fd18dd4361..f89edbd227 100644
--- a/source/loader/ur_ldrddi.cpp
+++ b/source/loader/ur_ldrddi.cpp
@@ -1259,7 +1259,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition(
 __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle(
     ur_mem_handle_t hMem, ///< [in] handle of the mem.
     ur_device_handle_t
-        hDevice, ///< [in] handle of the device that the native handle will be resident on.
+        hDevice, ///< [in][optional] handle of the device that the native handle will be
+                 ///< resident on.
     ur_native_handle_t
         *phNativeMem ///< [out] a pointer to the native handle of the mem.
 ) {
@@ -1276,7 +1277,9 @@ __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle(
     hMem = reinterpret_cast<ur_mem_object_t *>(hMem)->handle;
 
     // convert loader handle to platform handle
-    hDevice = reinterpret_cast<ur_device_object_t *>(hDevice)->handle;
+    hDevice = (hDevice)
+                  ? reinterpret_cast<ur_device_object_t *>(hDevice)->handle
+                  : nullptr;
 
     // forward to device-platform
     result = pfnGetNativeHandle(hMem, hDevice, phNativeMem);
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 1c5e288a03..9d98d18d10 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -1717,6 +1717,8 @@ ur_result_t UR_APICALL urMemBufferPartition(
 ///     - The application may call this function from simultaneous threads for
 ///       the same context.
 ///     - The implementation of this function should be thread-safe.
+///     - The implementation may require a valid device handle to return the
+///       native mem handle
 ///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
@@ -1725,7 +1727,7 @@ ur_result_t UR_APICALL urMemBufferPartition(
 ///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hMem`
-///         + `NULL == hDevice`
+///         + If `hDevice == NULL` and the implementation requires a valid device.
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `NULL == phNativeMem`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
@@ -1733,7 +1735,8 @@ ur_result_t UR_APICALL urMemBufferPartition(
 ur_result_t UR_APICALL urMemGetNativeHandle(
     ur_mem_handle_t hMem, ///< [in] handle of the mem.
     ur_device_handle_t
-        hDevice, ///< [in] handle of the device that the native handle will be resident on.
+        hDevice, ///< [in][optional] handle of the device that the native handle will be
+                 ///< resident on.
     ur_native_handle_t
         *phNativeMem ///< [out] a pointer to the native handle of the mem.
     ) try {
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index f33de6539b..5cc020778c 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -1479,6 +1479,8 @@ ur_result_t UR_APICALL urMemBufferPartition(
 ///     - The application may call this function from simultaneous threads for
 ///       the same context.
 ///     - The implementation of this function should be thread-safe.
+///     - The implementation may require a valid device handle to return the
+///       native mem handle
 ///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
@@ -1487,7 +1489,7 @@ ur_result_t UR_APICALL urMemBufferPartition(
 ///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hMem`
-///         + `NULL == hDevice`
+///         + If `hDevice == NULL` and the implementation requires a valid device.
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `NULL == phNativeMem`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
@@ -1495,7 +1497,8 @@ ur_result_t UR_APICALL urMemBufferPartition(
 ur_result_t UR_APICALL urMemGetNativeHandle(
     ur_mem_handle_t hMem, ///< [in] handle of the mem.
     ur_device_handle_t
-        hDevice, ///< [in] handle of the device that the native handle will be resident on.
+        hDevice, ///< [in][optional] handle of the device that the native handle will be
+                 ///< resident on.
     ur_native_handle_t
         *phNativeMem ///< [out] a pointer to the native handle of the mem.
 ) {
diff --git a/test/conformance/memory/urMemGetNativeHandle.cpp b/test/conformance/memory/urMemGetNativeHandle.cpp
index 55f8910f72..8023e1b1c3 100644
--- a/test/conformance/memory/urMemGetNativeHandle.cpp
+++ b/test/conformance/memory/urMemGetNativeHandle.cpp
@@ -20,12 +20,6 @@ TEST_P(urMemGetNativeHandleTest, InvalidNullHandleMem) {
                      urMemGetNativeHandle(nullptr, device, &phNativeMem));
 }
 
-TEST_P(urMemGetNativeHandleTest, InvalidNullHandleDevice) {
-    ur_native_handle_t phNativeMem;
-    ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE,
-                     urMemGetNativeHandle(buffer, nullptr, &phNativeMem));
-}
-
 TEST_P(urMemGetNativeHandleTest, InvalidNullPointerNativeMem) {
     ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER,
                      urMemGetNativeHandle(buffer, device, nullptr));

From 2bd1c89168aff82762c0e6d065380f0251dc9bcb Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Wed, 19 Jun 2024 10:49:02 +0100
Subject: [PATCH 7/9] [HIP] Improve urEnqueueUSMAdvise return code for
 non-shared mem

We can only call hipMemAdvise on managed pointers allocated via
hipMallocManaged, so can only support the UR entry point on pointers
allocated via urUSMSharedAlloc.

When faced with other pointers, we're supposed to ignore them and result
"success", but instead this PR continues the tradition of returning
UR_RESULT_ERROR_ADAPTER_SPECIFIC with a warning message providing the
user more information. This should all be fixed up later when there's a
better warning mechanism.
---
 source/adapters/hip/enqueue.cpp               | 28 +++++++++++++++----
 .../enqueue/enqueue_adapter_hip.match         |  7 ++++-
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
index 7f6da7a864..99f23a30a4 100644
--- a/source/adapters/hip/enqueue.cpp
+++ b/source/adapters/hip/enqueue.cpp
@@ -1424,8 +1424,6 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
   UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
 #endif
 
-  ur_result_t Result = UR_RESULT_SUCCESS;
-
   try {
     ScopedContext Active(Device);
     std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
@@ -1480,6 +1478,20 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
       // UR_USM_MEM_ADVICE_SET/MEM_ADVICE_CLEAR_READ_MOSTLY.
     }
 
+    // hipMemAdvise only supports managed memory allocated via
+    // hipMallocManaged. We can't support this API with any other types of
+    // pointer. We should ignore them and result UR_RESULT_SUCCESS but instead
+    // we report a warning.
+    // FIXME: Fix this up when there's a better warning mechanism.
+    if (auto ptrAttribs = getPointerAttributes(pMem);
+        !ptrAttribs || !ptrAttribs->isManaged) {
+      releaseEvent();
+      setErrorMessage("mem_advise is ignored as the pointer argument is not "
+                      "a shared USM pointer",
+                      UR_RESULT_SUCCESS);
+      return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
+    }
+
     const auto DeviceID = Device->get();
     if (advice & UR_USM_ADVICE_FLAG_DEFAULT) {
       UR_CHECK_ERROR(
@@ -1493,7 +1505,11 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
           hipMemAdvise(pMem, size, hipMemAdviseUnsetCoarseGrain, DeviceID));
 #endif
     } else {
-      Result = setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID);
+      ur_result_t Result =
+          setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID);
+      assert((Result == UR_RESULT_SUCCESS ||
+              Result == UR_RESULT_ERROR_INVALID_ENUMERATION) &&
+             "Unexpected return code");
       // UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid
       // but currently unmapped advice arguments as not supported by this
       // platform. Therefore, warn the user instead of throwing and aborting
@@ -1509,12 +1525,12 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
 
     releaseEvent();
   } catch (ur_result_t err) {
-    Result = err;
+    return err;
   } catch (...) {
-    Result = UR_RESULT_ERROR_UNKNOWN;
+    return UR_RESULT_ERROR_UNKNOWN;
   }
 
-  return Result;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D(
diff --git a/test/conformance/enqueue/enqueue_adapter_hip.match b/test/conformance/enqueue/enqueue_adapter_hip.match
index 443628e36e..f602837b14 100644
--- a/test/conformance/enqueue/enqueue_adapter_hip.match
+++ b/test/conformance/enqueue/enqueue_adapter_hip.match
@@ -7,10 +7,15 @@ urEnqueueKernelLaunchUSMLinkedList.Success/AMD_HIP_BACKEND___{{.*}}___UsePoolEna
 {{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___copy_3d_2d
 {{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___write_row_2D
 {{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___write_3d_2d
+
+# HIP doesn't ignore unsupported USM advice or prefetching. Instead of
+# returning UR_RESULT_SUCCESS as per the spec, it instead returns
+# UR_RESULT_ERROR_ADAPTER_SPECIFIC to issue a warning. These tests will fail
+# until this is rectified.
 urEnqueueUSMAdviseWithParamTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_ADVICE_FLAG_DEFAULT
 urEnqueueUSMAdviseTest.MultipleParamsSuccess/AMD_HIP_BACKEND___{{.*}}_
-urEnqueueUSMAdviseTest.NonCoherentDeviceMemorySuccessOrWarning/AMD_HIP_BACKEND___{{.*}}_
 urEnqueueUSMPrefetchWithParamTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT
 urEnqueueUSMPrefetchWithParamTest.CheckWaitEvent/AMD_HIP_BACKEND___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT
+
 urEnqueueTimestampRecordingExpTest.Success/AMD_HIP_BACKEND___{{.*}}
 urEnqueueTimestampRecordingExpTest.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}

From fd5c5f588d91999c87da398d67a0df5ad6b1bff6 Mon Sep 17 00:00:00 2001
From: "atharva.dubey" <atharva.dubey@codeplay.com>
Date: Mon, 17 Jun 2024 08:43:38 +0100
Subject: [PATCH 8/9] set attribute allowing cluster size greater than 8

set property cudaFuncAttributeNonPortableClusterSizeAllowed only if cluster launch is used

set has_property_cluster_launch only if cluster property is used

fix cluster dimensions being set in accordance to grid dimensions

fix ordering of cluster dims for workDim 2

fix compilation errors

review comments 1

review comments 1

increase cluster size upon launch to check CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED flag being added
---
 source/adapters/cuda/enqueue.cpp              | 61 +++++++++++++------
 .../launch_properties.cpp                     |  2 +-
 2 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 906fd49d1d..1c074025a9 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -530,6 +530,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   }
 
   std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
+
+  // Early exit for zero size kernel
+  if (*pGlobalWorkSize == 0) {
+    return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
+                                          phEventWaitList, phEvent);
+  }
+
+  // Set the number of threads per block to the number of threads per warp
+  // by default unless user has provided a better number
+  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+
+  uint32_t LocalSize = hKernel->getLocalSize();
+  CUfunction CuFunc = hKernel->get();
+
   for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
     switch (launchPropList[i].id) {
     case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
@@ -540,12 +555,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
 
       launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
       // Note that cuda orders from right to left wrt SYCL dimensional order.
-      launch_attribute[i].value.clusterDim.x =
-          launchPropList[i].value.clusterDim[2];
-      launch_attribute[i].value.clusterDim.y =
-          launchPropList[i].value.clusterDim[1];
-      launch_attribute[i].value.clusterDim.z =
-          launchPropList[i].value.clusterDim[0];
+      if (workDim == 3) {
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[2];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[0];
+      } else if (workDim == 2) {
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[0];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[2];
+      } else {
+        launch_attribute[i].value.clusterDim.x =
+            launchPropList[i].value.clusterDim[0];
+        launch_attribute[i].value.clusterDim.y =
+            launchPropList[i].value.clusterDim[1];
+        launch_attribute[i].value.clusterDim.z =
+            launchPropList[i].value.clusterDim[2];
+      }
+
+      UR_CHECK_ERROR(cuFuncSetAttribute(
+          CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
+
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
@@ -560,20 +595,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
   }
 
-  // Early exit for zero size kernel
-  if (*pGlobalWorkSize == 0) {
-    return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
-                                          phEventWaitList, phEvent);
-  }
-
-  // Set the number of threads per block to the number of threads per warp
-  // by default unless user has provided a better number
-  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
-  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
-
-  uint32_t LocalSize = hKernel->getLocalSize();
-  CUfunction CuFunc = hKernel->get();
-
   // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled
   // using the standard UR_CHECK_ERROR
   if (ur_result_t Ret =
diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp
index 9095c37f8a..a54a44ecaf 100644
--- a/test/conformance/exp_launch_properties/launch_properties.cpp
+++ b/test/conformance/exp_launch_properties/launch_properties.cpp
@@ -83,7 +83,7 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
         if (cluster_launch_supported) {
             ur_exp_launch_property_t cluster_dims_prop;
             cluster_dims_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION;
-            cluster_dims_prop.value.clusterDim[0] = 1;
+            cluster_dims_prop.value.clusterDim[0] = 16;
             cluster_dims_prop.value.clusterDim[1] = 1;
             cluster_dims_prop.value.clusterDim[2] = 1;
 

From ae835155354cdb83bceae31ba1d7c7788f6c27e4 Mon Sep 17 00:00:00 2001
From: Ben Tracy <ben.tracy@codeplay.com>
Date: Mon, 17 Jun 2024 14:49:53 +0100
Subject: [PATCH 9/9] [CMDBUF] Fix coverity issue in command buffers

- Fix incorrect conditions for copy engine usage that were reported on coverity.
---
 source/adapters/level_zero/command_buffer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp
index d88e00d8e4..f7553cfa9a 100644
--- a/source/adapters/level_zero/command_buffer.cpp
+++ b/source/adapters/level_zero/command_buffer.cpp
@@ -885,7 +885,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp(
   UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
                                  CommandBuffer->Device));
 
-  bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost);
+  bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost);
 
   PreferCopyEngine |= UseCopyEngineForD2DCopy;
 
@@ -917,7 +917,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(
   UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
                                  CommandBuffer->Device));
 
-  bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost);
+  bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost);
 
   PreferCopyEngine |= UseCopyEngineForD2DCopy;