diff --git a/include/ur_api.h b/include/ur_api.h index 84fe704d5f..889ccbdf28 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -1601,6 +1601,7 @@ typedef enum ur_device_info_t { ///< command-buffers. UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel ///< commands in a command-buffer. + UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP = 0x1111, ///< [::ur_bool_t] return true if enqueue Cluster Launch is supported UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of ///< bindless images UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001, ///< [::ur_bool_t] returns true if the device supports the creation of @@ -2824,6 +2825,8 @@ urMemBufferPartition( /// - The application may call this function from simultaneous threads for /// the same context. /// - The implementation of this function should be thread-safe. +/// - The implementation may require a valid device handle to return the +/// native mem handle /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -2832,7 +2835,7 @@ urMemBufferPartition( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hMem` -/// + `NULL == hDevice` +/// + If `hDevice == NULL` and the implementation requires a valid device. /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phNativeMem` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -2840,7 +2843,8 @@ urMemBufferPartition( UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( ur_mem_handle_t hMem, ///< [in] handle of the mem. - ur_device_handle_t hDevice, ///< [in] handle of the device that the native handle will be resident on. + ur_device_handle_t hDevice, ///< [in][optional] handle of the device that the native handle will be + ///< resident on. ur_native_handle_t *phNativeMem ///< [out] a pointer to the native handle of the mem. ); diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 5919e57019..57e0da2fa9 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -2530,6 +2530,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: os << "UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP"; break; + case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP: + os << "UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP"; + break; case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: os << "UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP"; break; @@ -4029,6 +4032,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; + case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: { const ur_bool_t *tptr = (const ur_bool_t *)ptr; if (sizeof(ur_bool_t) > size) { diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml index 05f90aa7d2..aef4b2844a 100644 --- a/scripts/core/exp-launch-properties.yml +++ b/scripts/core/exp-launch-properties.yml @@ -128,4 +128,14 @@ returns: - $X_RESULT_ERROR_INVALID_VALUE - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES +--- #-------------------------------------------------------------------------- +type: enum +extend: true +typed_etors: true +desc: "Extension enums to $x_device_info_t to support arch specific launch properties." +name: $x_device_info_t +etors: + - name: CLUSTER_LAUNCH_EXP + value: "0x1111" + desc: "[$x_bool_t] return true if enqueue Cluster Launch is supported" diff --git a/scripts/core/memory.yml b/scripts/core/memory.yml index c4009bc56e..21d8294519 100644 --- a/scripts/core/memory.yml +++ b/scripts/core/memory.yml @@ -436,6 +436,7 @@ details: - "Use interoperability platform extensions to convert native handle to native type." - "The application may call this function from simultaneous threads for the same context." - "The implementation of this function should be thread-safe." + - "The implementation may require a valid device handle to return the native mem handle" params: - type: $x_mem_handle_t name: hMem @@ -444,7 +445,7 @@ params: - type: $x_device_handle_t name: hDevice desc: | - [in] handle of the device that the native handle will be resident on. + [in][optional] handle of the device that the native handle will be resident on. - type: $x_native_handle_t* name: phNativeMem desc: | @@ -452,6 +453,8 @@ params: returns: - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: - "If the adapter has no underlying equivalent handle." + - $X_RESULT_ERROR_INVALID_NULL_HANDLE: + - "If `hDevice == NULL` and the implementation requires a valid device." --- #-------------------------------------------------------------------------- type: struct desc: "Native memory object creation properties" diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index bd15a62504..b076d11930 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -1089,6 +1089,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: return ReturnValue(true); + case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP: { + int Value = getAttribute(hDevice, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 9; + return ReturnValue(static_cast(Value)); + } default: break; diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 906fd49d1d..1c074025a9 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -530,6 +530,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } std::vector launch_attribute(numPropsInLaunchPropList); + + // Early exit for zero size kernel + if (*pGlobalWorkSize == 0) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); + } + + // Set the number of threads per block to the number of threads per warp + // by default unless user has provided a better number + size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + + uint32_t LocalSize = hKernel->getLocalSize(); + CUfunction CuFunc = hKernel->get(); + for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) { switch (launchPropList[i].id) { case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: { @@ -540,12 +555,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; // Note that cuda orders from right to left wrt SYCL dimensional order. - launch_attribute[i].value.clusterDim.x = - launchPropList[i].value.clusterDim[2]; - launch_attribute[i].value.clusterDim.y = - launchPropList[i].value.clusterDim[1]; - launch_attribute[i].value.clusterDim.z = - launchPropList[i].value.clusterDim[0]; + if (workDim == 3) { + launch_attribute[i].value.clusterDim.x = + launchPropList[i].value.clusterDim[2]; + launch_attribute[i].value.clusterDim.y = + launchPropList[i].value.clusterDim[1]; + launch_attribute[i].value.clusterDim.z = + launchPropList[i].value.clusterDim[0]; + } else if (workDim == 2) { + launch_attribute[i].value.clusterDim.x = + launchPropList[i].value.clusterDim[1]; + launch_attribute[i].value.clusterDim.y = + launchPropList[i].value.clusterDim[0]; + launch_attribute[i].value.clusterDim.z = + launchPropList[i].value.clusterDim[2]; + } else { + launch_attribute[i].value.clusterDim.x = + launchPropList[i].value.clusterDim[0]; + launch_attribute[i].value.clusterDim.y = + launchPropList[i].value.clusterDim[1]; + launch_attribute[i].value.clusterDim.z = + launchPropList[i].value.clusterDim[2]; + } + + UR_CHECK_ERROR(cuFuncSetAttribute( + CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1)); + break; } case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: { @@ -560,20 +595,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } } - // Early exit for zero size kernel - if (*pGlobalWorkSize == 0) { - return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, - phEventWaitList, phEvent); - } - - // Set the number of threads per block to the number of threads per warp - // by default unless user has provided a better number - size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; - size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - - uint32_t LocalSize = hKernel->getLocalSize(); - CUfunction CuFunc = hKernel->get(); - // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled // using the standard UR_CHECK_ERROR if (ur_result_t Ret = diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index bbe92ddef8..9889031f1b 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -219,7 +219,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, UR_APIEXPORT ur_result_t UR_APICALL urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { try { - ScopedContext Active(phEventWaitList[0]->getQueue()->getDevice()); + // Interop events don't have an associated queue, so get device through + // context + ScopedContext Active(phEventWaitList[0]->getContext()->getDevices()[0]); auto WaitFunc = [](ur_event_handle_t Event) -> ur_result_t { UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT); diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index 07fdf348a6..d43bd046dc 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -93,7 +93,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { size_t GroupSize[3] = {0, 0, 0}; const auto &ReqdWGSizeMDMap = - hKernel->get_program()->KernelReqdWorkGroupSizeMD; + hKernel->getProgram()->KernelReqdWorkGroupSizeMD; const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->getName()); if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) { const auto ReqdWGSize = ReqdWGSizeMD->second; @@ -222,7 +222,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, case UR_KERNEL_INFO_CONTEXT: return ReturnValue(hKernel->getContext()); case UR_KERNEL_INFO_PROGRAM: - return ReturnValue(hKernel->get_program()); + return ReturnValue(hKernel->getProgram()); case UR_KERNEL_INFO_ATTRIBUTES: return ReturnValue(""); case UR_KERNEL_INFO_NUM_REGS: { diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index b7a7358b27..808b1abe9e 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -178,8 +178,6 @@ struct ur_kernel_handle_t_ { urContextRelease(Context); } - ur_program_handle_t get_program() const noexcept { return Program; } - uint32_t incrementReferenceCount() noexcept { return ++RefCount; } uint32_t decrementReferenceCount() noexcept { return --RefCount; } @@ -187,6 +185,7 @@ struct ur_kernel_handle_t_ { uint32_t getReferenceCount() const noexcept { return RefCount; } native_type get() const noexcept { return Function; }; + ur_program_handle_t getProgram() const noexcept { return Program; }; native_type get_with_offset_parameter() const noexcept { diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 7bddac56b0..04b488a0f4 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -130,6 +130,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle(ur_mem_handle_t hMem, ur_device_handle_t Device, ur_native_handle_t *phNativeMem) { + UR_ASSERT(Device != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); try { *phNativeMem = reinterpret_cast( std::get(hMem->Mem).getPtr(Device)); diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 7f6da7a864..99f23a30a4 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -1424,8 +1424,6 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE); #endif - ur_result_t Result = UR_RESULT_SUCCESS; - try { ScopedContext Active(Device); std::unique_ptr EventPtr{nullptr}; @@ -1480,6 +1478,20 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, // UR_USM_MEM_ADVICE_SET/MEM_ADVICE_CLEAR_READ_MOSTLY. } + // hipMemAdvise only supports managed memory allocated via + // hipMallocManaged. We can't support this API with any other types of + // pointer. We should ignore them and result UR_RESULT_SUCCESS but instead + // we report a warning. + // FIXME: Fix this up when there's a better warning mechanism. + if (auto ptrAttribs = getPointerAttributes(pMem); + !ptrAttribs || !ptrAttribs->isManaged) { + releaseEvent(); + setErrorMessage("mem_advise is ignored as the pointer argument is not " + "a shared USM pointer", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + const auto DeviceID = Device->get(); if (advice & UR_USM_ADVICE_FLAG_DEFAULT) { UR_CHECK_ERROR( @@ -1493,7 +1505,11 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, hipMemAdvise(pMem, size, hipMemAdviseUnsetCoarseGrain, DeviceID)); #endif } else { - Result = setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID); + ur_result_t Result = + setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID); + assert((Result == UR_RESULT_SUCCESS || + Result == UR_RESULT_ERROR_INVALID_ENUMERATION) && + "Unexpected return code"); // UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid // but currently unmapped advice arguments as not supported by this // platform. Therefore, warn the user instead of throwing and aborting @@ -1509,12 +1525,12 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, releaseEvent(); } catch (ur_result_t err) { - Result = err; + return err; } catch (...) { - Result = UR_RESULT_ERROR_UNKNOWN; + return UR_RESULT_ERROR_UNKNOWN; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index 64543af855..00f7e6a4b2 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -301,6 +301,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle(ur_mem_handle_t hMem, ur_device_handle_t Device, ur_native_handle_t *phNativeMem) { + UR_ASSERT(Device != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); #if defined(__HIP_PLATFORM_NVIDIA__) if (sizeof(BufferMem::native_type) > sizeof(ur_native_handle_t)) { // Check that all the upper bits that cannot be represented by diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index d88e00d8e4..f7553cfa9a 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -885,7 +885,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, CommandBuffer->Device)); - bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost); + bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); PreferCopyEngine |= UseCopyEngineForD2DCopy; @@ -917,7 +917,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, CommandBuffer->Device)); - bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost); + bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); PreferCopyEngine |= UseCopyEngineForD2DCopy; diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 5e4cd3a2bd..5881610f68 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -1022,6 +1022,7 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { // Save pointer to the queue before deleting/resetting event. auto Queue = Legacy(Event->UrQueue); + auto URQueue = Event->UrQueue; // If the event was a timestamp recording, we try to evict its entry in the // queue. @@ -1053,8 +1054,8 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { // created so that we can avoid ur_queue_handle_t is released before the // associated ur_event_handle_t is released. Here we have to decrement it so // ur_queue_handle_t can be released successfully. - if (Event->UrQueue) { - UR_CALL(urQueueReleaseInternal(Event->UrQueue)); + if (URQueue) { + UR_CALL(urQueueReleaseInternal(URQueue)); } return UR_RESULT_SUCCESS; diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index e726061ee4..4a686a9d59 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -937,7 +937,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition( __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle( ur_mem_handle_t hMem, ///< [in] handle of the mem. ur_device_handle_t - hDevice, ///< [in] handle of the device that the native handle will be resident on. + hDevice, ///< [in][optional] handle of the device that the native handle will be + ///< resident on. ur_native_handle_t *phNativeMem ///< [out] a pointer to the native handle of the mem. ) try { diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index bc140f17c2..cab7688e4f 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -1209,7 +1209,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition( __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle( ur_mem_handle_t hMem, ///< [in] handle of the mem. ur_device_handle_t - hDevice, ///< [in] handle of the device that the native handle will be resident on. + hDevice, ///< [in][optional] handle of the device that the native handle will be + ///< resident on. ur_native_handle_t *phNativeMem ///< [out] a pointer to the native handle of the mem. ) { diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index ce1374f0c6..bd6e20e7ad 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -1290,7 +1290,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition( __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle( ur_mem_handle_t hMem, ///< [in] handle of the mem. ur_device_handle_t - hDevice, ///< [in] handle of the device that the native handle will be resident on. + hDevice, ///< [in][optional] handle of the device that the native handle will be + ///< resident on. ur_native_handle_t *phNativeMem ///< [out] a pointer to the native handle of the mem. ) { @@ -1305,10 +1306,6 @@ __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - if (NULL == hDevice) { - return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - } - if (NULL == phNativeMem) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index fd18dd4361..f89edbd227 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -1259,7 +1259,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition( __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle( ur_mem_handle_t hMem, ///< [in] handle of the mem. ur_device_handle_t - hDevice, ///< [in] handle of the device that the native handle will be resident on. + hDevice, ///< [in][optional] handle of the device that the native handle will be + ///< resident on. ur_native_handle_t *phNativeMem ///< [out] a pointer to the native handle of the mem. ) { @@ -1276,7 +1277,9 @@ __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle( hMem = reinterpret_cast(hMem)->handle; // convert loader handle to platform handle - hDevice = reinterpret_cast(hDevice)->handle; + hDevice = (hDevice) + ? reinterpret_cast(hDevice)->handle + : nullptr; // forward to device-platform result = pfnGetNativeHandle(hMem, hDevice, phNativeMem); diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 1c5e288a03..9d98d18d10 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -1717,6 +1717,8 @@ ur_result_t UR_APICALL urMemBufferPartition( /// - The application may call this function from simultaneous threads for /// the same context. /// - The implementation of this function should be thread-safe. +/// - The implementation may require a valid device handle to return the +/// native mem handle /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -1725,7 +1727,7 @@ ur_result_t UR_APICALL urMemBufferPartition( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hMem` -/// + `NULL == hDevice` +/// + If `hDevice == NULL` and the implementation requires a valid device. /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phNativeMem` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -1733,7 +1735,8 @@ ur_result_t UR_APICALL urMemBufferPartition( ur_result_t UR_APICALL urMemGetNativeHandle( ur_mem_handle_t hMem, ///< [in] handle of the mem. ur_device_handle_t - hDevice, ///< [in] handle of the device that the native handle will be resident on. + hDevice, ///< [in][optional] handle of the device that the native handle will be + ///< resident on. ur_native_handle_t *phNativeMem ///< [out] a pointer to the native handle of the mem. ) try { diff --git a/source/ur_api.cpp b/source/ur_api.cpp index f33de6539b..5cc020778c 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -1479,6 +1479,8 @@ ur_result_t UR_APICALL urMemBufferPartition( /// - The application may call this function from simultaneous threads for /// the same context. /// - The implementation of this function should be thread-safe. +/// - The implementation may require a valid device handle to return the +/// native mem handle /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -1487,7 +1489,7 @@ ur_result_t UR_APICALL urMemBufferPartition( /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hMem` -/// + `NULL == hDevice` +/// + If `hDevice == NULL` and the implementation requires a valid device. /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phNativeMem` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -1495,7 +1497,8 @@ ur_result_t UR_APICALL urMemBufferPartition( ur_result_t UR_APICALL urMemGetNativeHandle( ur_mem_handle_t hMem, ///< [in] handle of the mem. ur_device_handle_t - hDevice, ///< [in] handle of the device that the native handle will be resident on. + hDevice, ///< [in][optional] handle of the device that the native handle will be + ///< resident on. ur_native_handle_t *phNativeMem ///< [out] a pointer to the native handle of the mem. ) { diff --git a/test/conformance/enqueue/enqueue_adapter_hip.match b/test/conformance/enqueue/enqueue_adapter_hip.match index 443628e36e..f602837b14 100644 --- a/test/conformance/enqueue/enqueue_adapter_hip.match +++ b/test/conformance/enqueue/enqueue_adapter_hip.match @@ -7,10 +7,15 @@ urEnqueueKernelLaunchUSMLinkedList.Success/AMD_HIP_BACKEND___{{.*}}___UsePoolEna {{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___copy_3d_2d {{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___write_row_2D {{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___write_3d_2d + +# HIP doesn't ignore unsupported USM advice or prefetching. Instead of +# returning UR_RESULT_SUCCESS as per the spec, it instead returns +# UR_RESULT_ERROR_ADAPTER_SPECIFIC to issue a warning. These tests will fail +# until this is rectified. urEnqueueUSMAdviseWithParamTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_ADVICE_FLAG_DEFAULT urEnqueueUSMAdviseTest.MultipleParamsSuccess/AMD_HIP_BACKEND___{{.*}}_ -urEnqueueUSMAdviseTest.NonCoherentDeviceMemorySuccessOrWarning/AMD_HIP_BACKEND___{{.*}}_ urEnqueueUSMPrefetchWithParamTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT urEnqueueUSMPrefetchWithParamTest.CheckWaitEvent/AMD_HIP_BACKEND___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT + urEnqueueTimestampRecordingExpTest.Success/AMD_HIP_BACKEND___{{.*}} urEnqueueTimestampRecordingExpTest.SuccessBlocking/AMD_HIP_BACKEND___{{.*}} diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp index bc252392eb..a54a44ecaf 100644 --- a/test/conformance/exp_launch_properties/launch_properties.cpp +++ b/test/conformance/exp_launch_properties/launch_properties.cpp @@ -75,10 +75,15 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) { props.push_back(coop_prop); } - if (compute_capability >= 9.0) { + ur_bool_t cluster_launch_supported = false; + ASSERT_SUCCESS(urDeviceGetInfo( + device, UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP, sizeof(ur_bool_t), + &cluster_launch_supported, nullptr)); + + if (cluster_launch_supported) { ur_exp_launch_property_t cluster_dims_prop; cluster_dims_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION; - cluster_dims_prop.value.clusterDim[0] = 1; + cluster_dims_prop.value.clusterDim[0] = 16; cluster_dims_prop.value.clusterDim[1] = 1; cluster_dims_prop.value.clusterDim[2] = 1; diff --git a/test/conformance/memory/urMemGetNativeHandle.cpp b/test/conformance/memory/urMemGetNativeHandle.cpp index 55f8910f72..8023e1b1c3 100644 --- a/test/conformance/memory/urMemGetNativeHandle.cpp +++ b/test/conformance/memory/urMemGetNativeHandle.cpp @@ -20,12 +20,6 @@ TEST_P(urMemGetNativeHandleTest, InvalidNullHandleMem) { urMemGetNativeHandle(nullptr, device, &phNativeMem)); } -TEST_P(urMemGetNativeHandleTest, InvalidNullHandleDevice) { - ur_native_handle_t phNativeMem; - ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, - urMemGetNativeHandle(buffer, nullptr, &phNativeMem)); -} - TEST_P(urMemGetNativeHandleTest, InvalidNullPointerNativeMem) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urMemGetNativeHandle(buffer, device, nullptr)); diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index 2b1f8c89a7..5f3c8874fd 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -335,6 +335,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, printDeviceInfo( hDevice, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP); std::cout << prefix; + printDeviceInfo(hDevice, UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP); + std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP); std::cout << prefix;