From 2ecb27bc8bad1148dfe86a5803f82959609fdf6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Mon, 17 Jun 2024 17:09:26 +0100 Subject: [PATCH] Use zeCommandListImmediateAppendCommandListsExp --- source/adapters/level_zero/command_buffer.cpp | 1496 +++++++++-------- source/adapters/level_zero/command_buffer.hpp | 57 +- 2 files changed, 861 insertions(+), 692 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 642173d675..6c2e7f7caa 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -19,14 +19,24 @@ #define DEBUG_LOG(VAR) logger::debug(#VAR " {}", VAR); namespace { -/// Checks the version of the level-zero driver. -/// @param Context Execution context -/// @param VersionMajor Major verion number to compare to. -/// @param VersionMinor Minor verion number to compare to. -/// @param VersionBuild Build verion number to compare to. -/// @return true is the version of the driver is higher than or equal to the -/// compared version -bool IsDriverVersionNewerOrSimilar(ur_context_handle_t Context, + +// Gets a C pointer from a vector. If the vector is empty returns nullptr +// instead. This is different from the behaviour of the data() member function +// of the vector class which might not return nullptr when the vector is empty. +template T *getPointerFromVector(std::vector &V) { + return V.size() == 0 ? nullptr : V.data(); +} + +/** + * Checks the version of the level-zero driver. + * @param[in] Context Execution context + * @param[in] VersionMajor Major version number to compare to. + * @param[in] VersionMinor Minor version number to compare to. + * @param[in] VersionBuild Build version number to compare to. + * @return true if the version of the driver is higher than or equal to the + * compared version. 
+ */ +bool isDriverVersionNewerOrSimilar(ur_context_handle_t Context, uint32_t VersionMajor, uint32_t VersionMinor, uint32_t VersionBuild) { ZeStruct ZeDriverProperties; @@ -42,138 +52,71 @@ bool IsDriverVersionNewerOrSimilar(ur_context_handle_t Context, (DriverVersionBuild >= VersionBuild)); } -// Default to using compute engine for fill operation, but allow to -// override this with an environment variable. -bool PreferCopyEngineForFill = [] { - const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_FILL"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); - return (UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0)); -}(); - -}; // namespace - -ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( - ur_context_handle_t Context, ur_device_handle_t Device, - ze_command_list_handle_t CommandList, - ze_command_list_handle_t CommandListResetEvents, - ze_command_list_handle_t CopyCommandList, - ZeStruct ZeDesc, - ZeStruct ZeCopyDesc, - const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList) - : Context(Context), Device(Device), ZeComputeCommandList(CommandList), - ZeCommandListResetEvents(CommandListResetEvents), - ZeCommandListDesc(ZeDesc), ZeCopyCommandList(CopyCommandList), - ZeCopyCommandListDesc(ZeCopyDesc), ZeFencesMap(), ZeActiveFence(nullptr), - QueueProperties(), SyncPoints(), NextSyncPoint(0), - IsUpdatable(Desc ? Desc->isUpdatable : false), - IsProfilingEnabled(Desc ? 
Desc->enableProfiling : false), - IsInOrderCmdList(IsInOrderCmdList) { - urContextRetain(Context); - urDeviceRetain(Device); -} - -// The ur_exp_command_buffer_handle_t_ destructor release all the memory objects -// allocated for command_buffer managment -ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { - // Release the memory allocated to the Context stored in the command_buffer - urContextRelease(Context); - - // Release the device - urDeviceRelease(Device); - - // Release the memory allocated to the CommandList stored in the - // command_buffer - if (ZeComputeCommandList) { - ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeComputeCommandList)); - } - if (UseCopyEngine() && ZeCopyCommandList) { - ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCopyCommandList)); - } - - // Release the memory allocated to the CommandListResetEvents stored in the - // command_buffer - if (ZeCommandListResetEvents) { - ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListResetEvents)); - } - - // Release additional signal and wait events used by command_buffer - if (SignalEvent) { - CleanupCompletedEvent(SignalEvent, false); - urEventReleaseInternal(SignalEvent); - } - if (WaitEvent) { - CleanupCompletedEvent(WaitEvent, false); - urEventReleaseInternal(WaitEvent); - } - if (AllResetEvent) { - CleanupCompletedEvent(AllResetEvent, false); - urEventReleaseInternal(AllResetEvent); +/** + * Default to using compute engine for fill operation, but allow to override + * this with an environment variable. Disable the copy engine if the pattern + * size is larger than the maximum supported. + * @param[in] CommandBuffer The CommandBuffer where the fill command will be + * appended. + * @param[in] PatternSize The pattern size for the fill command. + * @param[out] PreferCopyEngine Whether copy engine usage should be enabled or + * disabled for fill commands. 
+ * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +preferCopyEngineForFill(ur_exp_command_buffer_handle_t CommandBuffer, + size_t PatternSize, bool &PreferCopyEngine) { + assert(PatternSize > 0); + + PreferCopyEngine = false; + if (!CommandBuffer->UseCopyEngine()) { + return UR_RESULT_SUCCESS; } - // Release events added to the command_buffer - for (auto &Sync : SyncPoints) { - auto &Event = Sync.second; - CleanupCompletedEvent(Event, false); - urEventReleaseInternal(Event); - } + // If the copy engine is available, and it supports this pattern size, the + // command should be enqueued in the copy command list, otherwise enqueue it + // in the compute command list. + PreferCopyEngine = + PatternSize <= + CommandBuffer->Device + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::MainCopy] + .ZeProperties.maxMemoryFillPatternSize; - // Release fences allocated to command-buffer - for (auto &ZeFencePair : ZeFencesMap) { - auto &ZeFence = ZeFencePair.second; - ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence)); + if (!PreferCopyEngine) { + // Pattern size must fit the compute queue capabilities. + UR_ASSERT( + PatternSize <= + CommandBuffer->Device + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties.maxMemoryFillPatternSize, + UR_RESULT_ERROR_INVALID_VALUE); } - auto ReleaseIndirectMem = [](ur_kernel_handle_t Kernel) { - if (IndirectAccessTrackingEnabled) { - // urKernelRelease is called by CleanupCompletedEvent(Event) as soon as - // kernel execution has finished. This is the place where we need to - // release memory allocations. If kernel is not in use (not submitted by - // some other thread) then release referenced memory allocations. As a - // result, memory can be deallocated and context can be removed from - // container in the platform. That's why we need to lock a mutex here. 
- ur_platform_handle_t Platform = Kernel->Program->Context->getPlatform(); - std::scoped_lock ContextsLock(Platform->ContextsMutex); - - if (--Kernel->SubmissionsCount == 0) { - // Kernel is not submitted for execution, release referenced memory - // allocations. - for (auto &MemAlloc : Kernel->MemAllocs) { - // std::pair *, Hash - USMFreeHelper(MemAlloc->second.Context, MemAlloc->first, - MemAlloc->second.OwnNativeHandle); - } - Kernel->MemAllocs.clear(); - } - } - }; - - for (auto &AssociatedKernel : KernelsList) { - ReleaseIndirectMem(AssociatedKernel); - urKernelRelease(AssociatedKernel); - } -} + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_FILL"); + const char *PiRet = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); -ur_exp_command_buffer_command_handle_t_:: - ur_exp_command_buffer_command_handle_t_( - ur_exp_command_buffer_handle_t CommandBuffer, uint64_t CommandId, - uint32_t WorkDim, bool UserDefinedLocalSize, - ur_kernel_handle_t Kernel = nullptr) - : CommandBuffer(CommandBuffer), CommandId(CommandId), WorkDim(WorkDim), - UserDefinedLocalSize(UserDefinedLocalSize), Kernel(Kernel) { - urCommandBufferRetainExp(CommandBuffer); - if (Kernel) - urKernelRetain(Kernel); -} + PreferCopyEngine = + PreferCopyEngine && + (UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0)); -ur_exp_command_buffer_command_handle_t_:: - ~ur_exp_command_buffer_command_handle_t_() { - urCommandBufferReleaseExp(CommandBuffer); - if (Kernel) - urKernelRelease(Kernel); + return UR_RESULT_SUCCESS; } -/// Helper function for calculating work dimensions for kernels +/** + * Calculates a work group size for the kernel based on the GlobalWorkSize or + * the LocalWorkSize if provided. + * @param[in][optional] Kernel The Kernel. Used when LocalWorkSize is not + * provided. + * @param[in][optional] Device The device associated with the kernel. Used when + * LocalWorkSize is not provided. 
+ * @param[out] ZeThreadGroupDimensions Number of work groups in each dimension. + * @param[out] WG The work group size for each dimension. + * @param[in] WorkDim The number of dimensions in the kernel. + * @param[in] GlobalWorkSize The global work size. + * @param[in][optional] LocalWorkSize The local work size. + * @return UR_RESULT_SUCCESS or an error code on failure. + */ ur_result_t calculateKernelWorkDimensions( ur_kernel_handle_t Kernel, ur_device_handle_t Device, ze_group_count_t &ZeThreadGroupDimensions, uint32_t (&WG)[3], @@ -185,7 +128,8 @@ ur_result_t calculateKernelWorkDimensions( // suggested group size. UR_ASSERT(LocalWorkSize || Kernel, UR_RESULT_ERROR_INVALID_VALUE); - // New variable needed because GlobalWorkSize parameter might not be of size 3 + // New variable needed because GlobalWorkSize parameter might not be of size + // 3 size_t GlobalWorkSize3D[3]{1, 1, 1}; std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D); @@ -283,19 +227,19 @@ ur_result_t calculateKernelWorkDimensions( return UR_RESULT_SUCCESS; } -/// Helper function for finding the Level Zero events associated with the -/// commands in a command-buffer, each event is pointed to by a sync-point in -/// the wait list. -/// -/// @param[in] CommandBuffer to lookup the L0 events from. -/// @param[in] NumSyncPointsInWaitList Length of \p SyncPointWaitList. -/// @param[in] SyncPointWaitList List of sync points in \p CommandBuffer -/// to find the L0 events for. -/// @param[out] ZeEventList Return parameter for the L0 events associated with -/// each sync-point in \p SyncPointWaitList. -/// -/// @return UR_RESULT_SUCCESS or an error code on failure -static ur_result_t getEventsFromSyncPoints( +/** + * Helper function for finding the Level Zero events associated with the + * commands in a command-buffer, each event is pointed to by a sync-point in the + * wait list. + * @param[in] CommandBuffer to lookup the L0 events from. 
+ * @param[in] NumSyncPointsInWaitList Length of \p SyncPointWaitList. + * @param[in] SyncPointWaitList List of sync points in \p CommandBuffer to find + * the L0 events for. + * @param[out] ZeEventList Return parameter for the L0 events associated with + * each sync-point in \p SyncPointWaitList. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t getEventsFromSyncPoints( const ur_exp_command_buffer_handle_t &CommandBuffer, size_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, @@ -315,63 +259,88 @@ static ur_result_t getEventsFromSyncPoints( return UR_RESULT_SUCCESS; } +/** + * If needed, creates a sync point for a given command and returns the L0 + * events associated with the sync point. + * This operations is skipped if the command buffer is in order. + * @param[in] CommandType The type of the command. + * @param[in] CommandBuffer The CommandBuffer where the command is appended. + * @param[in] NumSyncPointsInWaitList Number of sync points that are + * dependencies for the command. + * @param[in] SyncPointWaitList List of sync point that are dependencies for the + * command. + * @param[in] HostVisible Whether the event associated with the sync point + * should be host visible. + * @param[out][optional] RetSyncPoint The new sync point. + * @param[out] ZeEventList A list of L0 events that are dependencies for this + * sync point. + * @param[out] ZeLaunchEvent The L0 event associated with this sync point. 
+ * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t createSyncPointAndGetZeEvents( + ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer, + uint32_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + bool HostVisible, ur_exp_command_buffer_sync_point_t *RetSyncPoint, + std::vector &ZeEventList, + ze_event_handle_t &ZeLaunchEvent) { + + ZeLaunchEvent = nullptr; + + if (CommandBuffer->IsInOrderCmdList) { + return UR_RESULT_SUCCESS; + } + + UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList)); + ur_event_handle_t LaunchEvent; + UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, HostVisible, + &LaunchEvent, false, !CommandBuffer->IsProfilingEnabled)); + LaunchEvent->CommandType = CommandType; + ZeLaunchEvent = LaunchEvent->ZeEvent; + + // Get sync point and register the event with it. + ur_exp_command_buffer_sync_point_t SyncPoint = + CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); + + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } + + return UR_RESULT_SUCCESS; +} + // Shared by all memory read/write/copy PI interfaces. // Helper function for common code when enqueuing memory operations to a command // buffer. 
-static ur_result_t enqueueCommandBufferMemCopyHelper( +ur_result_t enqueueCommandBufferMemCopyHelper( ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer, void *Dst, const void *Src, size_t Size, bool PreferCopyEngine, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *RetSyncPoint) { - if (CommandBuffer->IsInOrderCmdList) { - ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (CommandBuffer->ZeComputeCommandList, Dst, Src, Size, nullptr, 0, - nullptr)); - logger::debug("calling zeCommandListAppendMemoryCopy()"); - } else { - std::vector ZeEventList; - ur_event_handle_t LaunchEvent; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, false, - &LaunchEvent, false, - !CommandBuffer->IsProfilingEnabled)); - LaunchEvent->CommandType = CommandType; - - // Get sync point and register the event with it. - ur_exp_command_buffer_sync_point_t SyncPoint = - CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); - if (RetSyncPoint) { - *RetSyncPoint = SyncPoint; - } + std::vector ZeEventList; + ze_event_handle_t ZeLaunchEvent = nullptr; + UR_CALL(createSyncPointAndGetZeEvents( + CommandType, CommandBuffer, NumSyncPointsInWaitList, SyncPointWaitList, + false, RetSyncPoint, ZeEventList, ZeLaunchEvent)); - ze_command_list_handle_t ZeCommandList = - CommandBuffer->ZeComputeCommandList; - // If the copy engine available, the command is enqueued in the - // ZeCopyCommandList. - if (PreferCopyEngine && CommandBuffer->UseCopyEngine()) { - ZeCommandList = CommandBuffer->ZeCopyCommandList; - // We indicate that the ZeCopyCommandList contains commands to be - // submitted. 
- CommandBuffer->MCopyCommandListEmpty = false; - } - ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (ZeCommandList, Dst, Src, Size, LaunchEvent->ZeEvent, - ZeEventList.size(), ZeEventList.data())); + ze_command_list_handle_t ZeCommandList = + CommandBuffer->chooseCommandList(PreferCopyEngine); + + logger::debug("calling zeCommandListAppendMemoryCopy()"); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (ZeCommandList, Dst, Src, Size, ZeLaunchEvent, ZeEventList.size(), + getPointerFromVector(ZeEventList))); - logger::debug("calling zeCommandListAppendMemoryCopy() with" - " ZeEvent {}", - ur_cast(LaunchEvent->ZeEvent)); - } return UR_RESULT_SUCCESS; } // Helper function for common code when enqueuing rectangular memory operations // to a command buffer. -static ur_result_t enqueueCommandBufferMemCopyRectHelper( +ur_result_t enqueueCommandBufferMemCopyRectHelper( ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer, void *Dst, const void *Src, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, @@ -411,154 +380,234 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ, Width, Height, Depth}; - if (CommandBuffer->IsInOrderCmdList) { - ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, - (CommandBuffer->ZeComputeCommandList, Dst, &ZeDstRegion, - DstPitch, DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, - SrcSlicePitch, nullptr, 0, nullptr)); - - logger::debug("calling zeCommandListAppendMemoryCopyRegion()"); - } else { - std::vector ZeEventList; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); - - ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, false, - &LaunchEvent, false, - !CommandBuffer->IsProfilingEnabled)); - LaunchEvent->CommandType = CommandType; - - // Get sync point and register the event with it. 
- ur_exp_command_buffer_sync_point_t SyncPoint = - CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); - if (RetSyncPoint) { - *RetSyncPoint = SyncPoint; - } - - ze_command_list_handle_t ZeCommandList = - CommandBuffer->ZeComputeCommandList; - // If the copy engine available, the command is enqueued in the - // ZeCopyCommandList. - if (PreferCopyEngine && CommandBuffer->UseCopyEngine()) { - ZeCommandList = CommandBuffer->ZeCopyCommandList; - // We indicate that the ZeCopyCommandList contains commands to be - // submitted. - CommandBuffer->MCopyCommandListEmpty = false; - } + std::vector ZeEventList; + ze_event_handle_t ZeLaunchEvent = nullptr; + UR_CALL(createSyncPointAndGetZeEvents( + CommandType, CommandBuffer, NumSyncPointsInWaitList, SyncPointWaitList, + false, RetSyncPoint, ZeEventList, ZeLaunchEvent)); - ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, - (ZeCommandList, Dst, &ZeDstRegion, DstPitch, DstSlicePitch, Src, - &ZeSrcRegion, SrcPitch, SrcSlicePitch, LaunchEvent->ZeEvent, - ZeEventList.size(), ZeEventList.data())); + ze_command_list_handle_t ZeCommandList = + CommandBuffer->chooseCommandList(PreferCopyEngine); - logger::debug("calling zeCommandListAppendMemoryCopyRegion() with" - " ZeEvent {}", - ur_cast(LaunchEvent->ZeEvent)); - } + logger::debug("calling zeCommandListAppendMemoryCopyRegion()"); + ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, + (ZeCommandList, Dst, &ZeDstRegion, DstPitch, DstSlicePitch, Src, + &ZeSrcRegion, SrcPitch, SrcSlicePitch, ZeLaunchEvent, + ZeEventList.size(), getPointerFromVector(ZeEventList))); return UR_RESULT_SUCCESS; } -// Helper function for enqueuing memory fills -static ur_result_t enqueueCommandBufferFillHelper( +// Helper function for enqueuing memory fills. 
+ur_result_t enqueueCommandBufferFillHelper( ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer, void *Ptr, const void *Pattern, size_t PatternSize, size_t Size, - bool PreferCopyEngine, uint32_t NumSyncPointsInWaitList, + uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *RetSyncPoint) { // Pattern size must be a power of two. UR_ASSERT((PatternSize > 0) && ((PatternSize & (PatternSize - 1)) == 0), UR_RESULT_ERROR_INVALID_VALUE); - ze_command_list_handle_t ZeCommandList; - // If the copy engine available and patternsize is valid, the command is - // enqueued in the ZeCopyCommandList, otherwise enqueue it in the compute - // command list. + std::vector ZeEventList; + ze_event_handle_t ZeLaunchEvent = nullptr; + UR_CALL(createSyncPointAndGetZeEvents( + CommandType, CommandBuffer, NumSyncPointsInWaitList, SyncPointWaitList, + true, RetSyncPoint, ZeEventList, ZeLaunchEvent)); - if (PreferCopyEngine && CommandBuffer->UseCopyEngine() && - PatternSize <= - CommandBuffer->Device - ->QueueGroup[ur_device_handle_t_::queue_group_info_t::MainCopy] - .ZeProperties.maxMemoryFillPatternSize) { - - ZeCommandList = CommandBuffer->ZeCopyCommandList; - // We indicate that the ZeCopyCommandList contains commands to be - // submitted. - CommandBuffer->MCopyCommandListEmpty = false; - } else { - // Pattern size must fit the compute queue capabilities. 
- UR_ASSERT( - PatternSize <= - CommandBuffer->Device - ->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeProperties.maxMemoryFillPatternSize, - UR_RESULT_ERROR_INVALID_VALUE); - ZeCommandList = CommandBuffer->ZeComputeCommandList; + bool PreferCopyEngine; + UR_CALL( + preferCopyEngineForFill(CommandBuffer, PatternSize, PreferCopyEngine)); + + ze_command_list_handle_t ZeCommandList = + CommandBuffer->chooseCommandList(PreferCopyEngine); + + logger::debug("calling zeCommandListAppendMemoryFill()"); + ZE2UR_CALL(zeCommandListAppendMemoryFill, + (ZeCommandList, Ptr, Pattern, PatternSize, Size, ZeLaunchEvent, + ZeEventList.size(), getPointerFromVector(ZeEventList))); + + return UR_RESULT_SUCCESS; +} +} // namespace + +ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( + ur_context_handle_t Context, ur_device_handle_t Device, + ze_command_list_handle_t CommandList, + ze_command_list_handle_t CommandListResetEvents, + ze_command_list_handle_t CopyCommandList, ur_event_handle_t SignalEvent, + ur_event_handle_t WaitEvent, ur_event_handle_t AllResetEvent, + const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList) + : Context(Context), Device(Device), ZeComputeCommandList(CommandList), + ZeCommandListResetEvents(CommandListResetEvents), + ZeCopyCommandList(CopyCommandList), SignalEvent(SignalEvent), + WaitEvent(WaitEvent), AllResetEvent(AllResetEvent), ZeFencesMap(), + ZeActiveFence(nullptr), SyncPoints(), NextSyncPoint(0), + IsUpdatable(Desc ? Desc->isUpdatable : false), + IsProfilingEnabled(Desc ? Desc->enableProfiling : false), + IsInOrderCmdList(IsInOrderCmdList) { + urContextRetain(Context); + urDeviceRetain(Device); +} + +// The ur_exp_command_buffer_handle_t_ destructor releases all the memory +// objects allocated for command_buffer management. 
+ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { + // Release the memory allocated to the Context stored in the command_buffer + urContextRelease(Context); + + // Release the device + urDeviceRelease(Device); + + // Release the memory allocated to the CommandList stored in the + // command_buffer + if (ZeComputeCommandList) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeComputeCommandList)); + } + if (UseCopyEngine() && ZeCopyCommandList) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCopyCommandList)); } - if (CommandBuffer->IsInOrderCmdList) { - ZE2UR_CALL(zeCommandListAppendMemoryFill, - (CommandBuffer->ZeComputeCommandList, Ptr, Pattern, PatternSize, - Size, nullptr, 0, nullptr)); + // Release the memory allocated to the CommandListResetEvents stored in the + // command_buffer + if (ZeCommandListResetEvents) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListResetEvents)); + } - logger::debug("calling zeCommandListAppendMemoryFill()"); - } else { - std::vector ZeEventList; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); - - ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, true, - &LaunchEvent, false, - !CommandBuffer->IsProfilingEnabled)); - LaunchEvent->CommandType = CommandType; - - // Get sync point and register the event with it. 
- ur_exp_command_buffer_sync_point_t SyncPoint = - CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); - if (RetSyncPoint) { - *RetSyncPoint = SyncPoint; + // Release additional signal and wait events used by command_buffer + if (SignalEvent) { + CleanupCompletedEvent(SignalEvent, false); + urEventReleaseInternal(SignalEvent); + } + if (WaitEvent) { + CleanupCompletedEvent(WaitEvent, false); + urEventReleaseInternal(WaitEvent); + } + if (AllResetEvent) { + CleanupCompletedEvent(AllResetEvent, false); + urEventReleaseInternal(AllResetEvent); + } + + // Release events added to the command_buffer + for (auto &Sync : SyncPoints) { + auto &Event = Sync.second; + CleanupCompletedEvent(Event, false); + urEventReleaseInternal(Event); + } + + // Release fences allocated to command-buffer + for (auto &ZeFencePair : ZeFencesMap) { + auto &ZeFence = ZeFencePair.second; + ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence)); + } + + auto ReleaseIndirectMem = [](ur_kernel_handle_t Kernel) { + if (IndirectAccessTrackingEnabled) { + // urKernelRelease is called by CleanupCompletedEvent(Event) as soon as + // kernel execution has finished. This is the place where we need to + // release memory allocations. If kernel is not in use (not submitted by + // some other thread) then release referenced memory allocations. As a + // result, memory can be deallocated and context can be removed from + // container in the platform. That's why we need to lock a mutex here. + ur_platform_handle_t Platform = Kernel->Program->Context->getPlatform(); + std::scoped_lock ContextsLock(Platform->ContextsMutex); + + if (--Kernel->SubmissionsCount == 0) { + // Kernel is not submitted for execution, release referenced memory + // allocations. 
+ for (auto &MemAlloc : Kernel->MemAllocs) { + // std::pair *, Hash + USMFreeHelper(MemAlloc->second.Context, MemAlloc->first, + MemAlloc->second.OwnNativeHandle); + } + Kernel->MemAllocs.clear(); + } } + }; - ZE2UR_CALL(zeCommandListAppendMemoryFill, - (ZeCommandList, Ptr, Pattern, PatternSize, Size, - LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + for (auto &AssociatedKernel : KernelsList) { + ReleaseIndirectMem(AssociatedKernel); + urKernelRelease(AssociatedKernel); + } +} + +ur_exp_command_buffer_command_handle_t_:: + ur_exp_command_buffer_command_handle_t_( + ur_exp_command_buffer_handle_t CommandBuffer, uint64_t CommandId, + uint32_t WorkDim, bool UserDefinedLocalSize, + ur_kernel_handle_t Kernel = nullptr) + : CommandBuffer(CommandBuffer), CommandId(CommandId), WorkDim(WorkDim), + UserDefinedLocalSize(UserDefinedLocalSize), Kernel(Kernel) { + urCommandBufferRetainExp(CommandBuffer); + if (Kernel) + urKernelRetain(Kernel); +} - logger::debug("calling zeCommandListAppendMemoryFill() with" - " ZeEvent {}", - ur_cast(LaunchEvent->ZeEvent)); +ur_exp_command_buffer_command_handle_t_:: + ~ur_exp_command_buffer_command_handle_t_() { + urCommandBufferReleaseExp(CommandBuffer); + if (Kernel) + urKernelRelease(Kernel); +} + +void ur_exp_command_buffer_handle_t_::RegisterSyncPoint( + ur_exp_command_buffer_sync_point_t SyncPoint, ur_event_handle_t Event) { + SyncPoints[SyncPoint] = Event; + NextSyncPoint++; + ZeEventsList.push_back(Event->ZeEvent); +} + +ze_command_list_handle_t +ur_exp_command_buffer_handle_t_::chooseCommandList(bool PreferCopyEngine) { + if (PreferCopyEngine && this->UseCopyEngine() && !this->IsInOrderCmdList) { + // We indicate that ZeCopyCommandList contains commands to be submitted. 
+ this->MCopyCommandListEmpty = false; + return this->ZeCopyCommandList; } + return this->ZeComputeCommandList; +} +ur_result_t ur_exp_command_buffer_handle_t_::getFenceForQueue( + ze_command_queue_handle_t &ZeCommandQueue, ze_fence_handle_t &ZeFence) { + // If we already have created a fence for this queue, first reset then reuse + // it, otherwise create a new fence. + auto ZeWorkloadFenceForQueue = this->ZeFencesMap.find(ZeCommandQueue); + if (ZeWorkloadFenceForQueue == this->ZeFencesMap.end()) { + ZeStruct ZeFenceDesc; + ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + this->ZeFencesMap.insert({{ZeCommandQueue, ZeFence}}); + } else { + ZeFence = ZeWorkloadFenceForQueue->second; + ZE2UR_CALL(zeFenceReset, (ZeFence)); + } + this->ZeActiveFence = ZeFence; return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, - const ur_exp_command_buffer_desc_t *CommandBufferDesc, - ur_exp_command_buffer_handle_t *CommandBuffer) { - // In-order command-lists are not available in old driver version. - bool CompatibleDriver = IsDriverVersionNewerOrSimilar(Context, 1, 3, 28454); - const bool IsInOrder = - CompatibleDriver - ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) - : false; +namespace { - uint32_t QueueGroupOrdinal = - Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] - .ZeOrdinal; +/** + * Creates a L0 command list + * @param[in] Context The Context associated with the command-list + * @param[in] Device The Device associated with the command-list + * @param[in] IsInOrder Whether the command-list should be in-order. + * @param[in] IsUpdatable Whether the command-list should be mutable. + * @param[in] IsCopy Whether to use copy-engine for the the new command-list. + * @param[out] CommandList The L0 command-list created by this function. 
+ * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t createMainCommandList(ur_context_handle_t Context, + ur_device_handle_t Device, bool IsInOrder, + bool IsUpdatable, bool IsCopy, + ze_command_list_handle_t &CommandList) { + + auto Type = IsCopy ? ur_device_handle_t_::queue_group_info_t::type::MainCopy + : ur_device_handle_t_::queue_group_info_t::type::Compute; + uint32_t QueueGroupOrdinal = Device->QueueGroup[Type].ZeOrdinal; ZeStruct ZeCommandListDesc; ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; - ze_command_list_handle_t ZeCommandListResetEvents; - // Create a command-list for reseting the events associated to enqueued cmd. - ZE2UR_CALL(zeCommandListCreate, - (Context->ZeContext, Device->ZeDevice, &ZeCommandListDesc, - &ZeCommandListResetEvents)); - // For non-linear graph, dependencies between commands are explicitly enforced // by sync points when enqueuing. Consequently, relax the command ordering in // the command list can enable the backend to further optimize the workload @@ -568,47 +617,91 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, DEBUG_LOG(ZeCommandListDesc.flags); ZeStruct ZeMutableCommandListDesc; - if (CommandBufferDesc && CommandBufferDesc->isUpdatable) { + if (IsUpdatable) { ZeMutableCommandListDesc.flags = 0; ZeCommandListDesc.pNext = &ZeMutableCommandListDesc; } - ze_command_list_handle_t ZeComputeCommandList; - // TODO We could optimize this by pooling both Level Zero command-lists and UR - // command-buffers, then reusing them. ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, - &ZeCommandListDesc, &ZeComputeCommandList)); + &ZeCommandListDesc, &CommandList)); + + return UR_RESULT_SUCCESS; +} + +/** + * Checks whether the command buffer can be constructed using in order + * command-lists. + * @param[in] Context The Context associated with the command buffer. + * @param[in] CommandBufferDesc The description of the command buffer. 
+ * @return Returns true if in order command-lists can be enabled. + */ +bool canBeInOrder(ur_context_handle_t Context, + const ur_exp_command_buffer_desc_t *CommandBufferDesc) { + // In-order command-lists are not available in old driver version. + bool CompatibleDriver = isDriverVersionNewerOrSimilar(Context, 1, 3, 28454); + return CompatibleDriver + ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) + : false; +} +} // namespace + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, + const ur_exp_command_buffer_desc_t *CommandBufferDesc, + ur_exp_command_buffer_handle_t *CommandBuffer) { + + bool IsInOrder = canBeInOrder(Context, CommandBufferDesc); + bool EnableProfiling = + CommandBufferDesc && CommandBufferDesc->enableProfiling; + bool IsUpdatable = CommandBufferDesc && CommandBufferDesc->isUpdatable; - // Create a list for copy commands. - // Note that to simplify the implementation, the current implementation only - // uses the main copy engine and does not use the link engine even if - // available. 
+ if (IsUpdatable) { + UR_ASSERT(Context->getPlatform()->ZeMutableCmdListExt.Supported, + UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + } + + ur_event_handle_t SignalEvent; + ur_event_handle_t WaitEvent; + ur_event_handle_t AllResetEvent; + + UR_CALL(EventCreate(Context, nullptr, false, false, &SignalEvent, false, + !EnableProfiling)); + UR_CALL(EventCreate(Context, nullptr, false, false, &WaitEvent, false, + !EnableProfiling)); + UR_CALL(EventCreate(Context, nullptr, false, false, &AllResetEvent, false, + !EnableProfiling)); + std::vector PrecondEvents = {WaitEvent->ZeEvent, + AllResetEvent->ZeEvent}; + + ze_command_list_handle_t ZeComputeCommandList = nullptr; + UR_CALL(createMainCommandList(Context, Device, IsInOrder, IsUpdatable, false, + ZeComputeCommandList)); + ZE2UR_CALL(zeCommandListAppendBarrier, + (ZeComputeCommandList, nullptr, PrecondEvents.size(), + PrecondEvents.data())); + + ze_command_list_handle_t ZeCommandListResetEvents = nullptr; + UR_CALL(createMainCommandList(Context, Device, false, false, false, + ZeCommandListResetEvents)); + ZE2UR_CALL(zeCommandListAppendEventReset, + (ZeCommandListResetEvents, SignalEvent->ZeEvent)); + + // Create a list for copy commands. Note that to simplify the implementation, + // the current implementation only uses the main copy engine and does not use + // the link engine even if available. ze_command_list_handle_t ZeCopyCommandList = nullptr; - ZeStruct ZeCopyCommandListDesc; if (Device->hasMainCopyEngine()) { - uint32_t QueueGroupOrdinalCopy = - Device - ->QueueGroup - [ur_device_handle_t_::queue_group_info_t::type::MainCopy] - .ZeOrdinal; - - ZeCopyCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinalCopy; - // Dependencies between commands are explicitly enforced by sync points when - // enqueuing. 
Consequently, relax the command ordering in the command list - // can enable the backend to further optimize the workload - ZeCopyCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING; - - // TODO We could optimize this by pooling both Level Zero command-lists and - // UR command-buffers, then reusing them. - ZE2UR_CALL(zeCommandListCreate, - (Context->ZeContext, Device->ZeDevice, &ZeCopyCommandListDesc, - &ZeCopyCommandList)); + UR_CALL(createMainCommandList(Context, Device, false, false, true, + ZeCopyCommandList)); + ZE2UR_CALL(zeCommandListAppendBarrier, + (ZeCopyCommandList, nullptr, PrecondEvents.size(), + PrecondEvents.data())); } try { *CommandBuffer = new ur_exp_command_buffer_handle_t_( Context, Device, ZeComputeCommandList, ZeCommandListResetEvents, - ZeCopyCommandList, ZeCommandListDesc, ZeCopyCommandListDesc, + ZeCopyCommandList, SignalEvent, WaitEvent, AllResetEvent, CommandBufferDesc, IsInOrder); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; @@ -616,38 +709,6 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, return UR_RESULT_ERROR_UNKNOWN; } - // Create signal & wait events to be used in the command-list for sync - // on command-buffer enqueue. 
- auto RetCommandBuffer = *CommandBuffer; - UR_CALL(EventCreate(Context, nullptr, false, false, - &RetCommandBuffer->SignalEvent, false, - !RetCommandBuffer->IsProfilingEnabled)); - UR_CALL(EventCreate(Context, nullptr, false, false, - &RetCommandBuffer->WaitEvent, false, - !RetCommandBuffer->IsProfilingEnabled)); - UR_CALL(EventCreate(Context, nullptr, false, false, - &RetCommandBuffer->AllResetEvent, false, - !RetCommandBuffer->IsProfilingEnabled)); - - // Add prefix commands - ZE2UR_CALL( - zeCommandListAppendEventReset, - (ZeCommandListResetEvents, RetCommandBuffer->SignalEvent->ZeEvent)); - std::vector PrecondEvents = { - RetCommandBuffer->WaitEvent->ZeEvent, - RetCommandBuffer->AllResetEvent->ZeEvent}; - ZE2UR_CALL(zeCommandListAppendBarrier, - (ZeComputeCommandList, nullptr, PrecondEvents.size(), - PrecondEvents.data())); - - if (Device->hasMainCopyEngine()) { - // The copy command-list must be executed once the preconditions have been - // met. We therefore begin this command-list with a barrier on the - // preconditions. - ZE2UR_CALL(zeCommandListAppendBarrier, - (ZeCopyCommandList, nullptr, PrecondEvents.size(), - PrecondEvents.data())); - } return UR_RESULT_SUCCESS; } @@ -666,58 +727,154 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { - UR_ASSERT(CommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); - // It is not allowed to append to command list from multiple threads. - std::scoped_lock Guard(CommandBuffer->Mutex); - - // Create a list of events for our signal event to wait on - // This loop also resets the L0 events we use for command-buffer internal - // sync-points to the non-signaled state. - // This is required for multiple submissions. 
- const size_t NumEvents = CommandBuffer->SyncPoints.size(); - for (size_t i = 0; i < NumEvents; i++) { - auto ZeEvent = CommandBuffer->SyncPoints[i]->ZeEvent; - CommandBuffer->ZeEventsList.push_back(ZeEvent); - ZE2UR_CALL(zeCommandListAppendEventReset, - (CommandBuffer->ZeCommandListResetEvents, ZeEvent)); - } - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CommandBuffer->ZeCommandListResetEvents, - CommandBuffer->AllResetEvent->ZeEvent)); - - if (CommandBuffer->IsInOrderCmdList) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CommandBuffer->ZeComputeCommandList, - CommandBuffer->SignalEvent->ZeEvent)); - } else { - // Create a list of events for our signal event to wait on - const size_t NumEvents = CommandBuffer->SyncPoints.size(); - std::vector WaitEventList{NumEvents}; - for (size_t i = 0; i < NumEvents; i++) { - WaitEventList[i] = CommandBuffer->SyncPoints[i]->ZeEvent; - } +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { + UR_ASSERT(CommandBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + // It is not allowed to append to command list from multiple threads. + std::scoped_lock Guard(CommandBuffer->Mutex); + + if (CommandBuffer->IsInOrderCmdList) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandBuffer->ZeComputeCommandList, + CommandBuffer->SignalEvent->ZeEvent)); + } else { + // Reset the L0 events we use for command-buffer sync-points to the + // non-signaled state. This is required for multiple submissions. + for (auto &Event : CommandBuffer->ZeEventsList) { + ZE2UR_CALL(zeCommandListAppendEventReset, + (CommandBuffer->ZeCommandListResetEvents, Event)); + } + + // Wait for all the user added commands to complete, and signal the + // command-buffer signal-event when they are done. 
+ ZE2UR_CALL(zeCommandListAppendBarrier, + (CommandBuffer->ZeComputeCommandList, + CommandBuffer->SignalEvent->ZeEvent, + CommandBuffer->ZeEventsList.size(), + CommandBuffer->ZeEventsList.data())); + } + + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandBuffer->ZeCommandListResetEvents, + CommandBuffer->AllResetEvent->ZeEvent)); + + // Close the command lists and have them ready for dispatch. + ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeComputeCommandList)); + ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandListResetEvents)); + + if (CommandBuffer->UseCopyEngine()) { + ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCopyCommandList)); + } + + CommandBuffer->IsFinalized = true; + + return UR_RESULT_SUCCESS; +} + +namespace { + +/** + * Sets the global offset for a kernel command that will be appended to the + * command buffer. + * @param[in] CommandBuffer The CommandBuffer where the command will be + * appended. + * @param[in] Kernel The handle to the kernel that will be appended. + * @param[in] GlobalWorkOffset The global offset value. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t setKernelGlobalOffset(ur_exp_command_buffer_handle_t CommandBuffer, + ur_kernel_handle_t Kernel, + const size_t *GlobalWorkOffset) { + + if (!CommandBuffer->Context->getPlatform() + ->ZeDriverGlobalOffsetExtensionFound) { + logger::debug("No global offset extension found on this driver"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + ZE2UR_CALL(zeKernelSetGlobalOffsetExp, + (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1], + GlobalWorkOffset[2])); + + return UR_RESULT_SUCCESS; +} + +/** + * Sets the kernel arguments for a kernel command that will be appended to the + * command buffer. + * @param[in] CommandBuffer The CommandBuffer where the command will be + * appended. + * @param[in] Kernel The handle to the kernel that will be appended. 
+ * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +setKernelPendingArguments(ur_exp_command_buffer_handle_t CommandBuffer, + ur_kernel_handle_t Kernel) { + + // If there are any pending arguments set them now. + for (auto &Arg : Kernel->PendingArguments) { + // The ArgValue may be a NULL pointer in which case a NULL value is used for + // the kernel argument declared as a pointer to global or constant memory. + char **ZeHandlePtr = nullptr; + if (Arg.Value) { + UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, + CommandBuffer->Device)); + } + ZE2UR_CALL(zeKernelSetArgumentValue, + (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); + } + Kernel->PendingArguments.clear(); + + return UR_RESULT_SUCCESS; +} + +/** + * Creates a new command handle to use in future updates to the command buffer. + * @param[in] CommandBuffer The CommandBuffer associated with the new command. + * @param[in] Kernel The Kernel associated with the new command. + * @param[in] WorkDim Dimensions of the kernel associated with the new command. + * @param[in] LocalWorkSize LocalWorkSize of the kernel associated with the new + * command. + * @param[out] Command The handle to the new command. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, + ur_kernel_handle_t Kernel, uint32_t WorkDim, + const size_t *LocalWorkSize, + ur_exp_command_buffer_command_handle_t &Command) { + + assert(CommandBuffer->IsUpdatable); - // Wait for all the user added commands to complete, and signal the - // command-buffer signal-event when they are done. - ZE2UR_CALL(zeCommandListAppendBarrier, (CommandBuffer->ZeComputeCommandList, - CommandBuffer->SignalEvent->ZeEvent, - NumEvents, WaitEventList.data())); - } + // If command-buffer is updatable then get command id which is going to be + // used if command is updated in the future. 
This + // zeCommandListGetNextCommandIdExp can be called only if the command is + // updatable. + uint64_t CommandId = 0; + ZeStruct ZeMutableCommandDesc; + ZeMutableCommandDesc.flags = ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS | + ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT | + ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE | + ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET; - // Close the command lists and have them ready for dispatch. - ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeComputeCommandList)); - ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandListResetEvents)); + auto Platform = CommandBuffer->Context->getPlatform(); + ZE2UR_CALL( + Platform->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp, + (CommandBuffer->ZeComputeCommandList, &ZeMutableCommandDesc, &CommandId)); + DEBUG_LOG(CommandId); - if (CommandBuffer->UseCopyEngine()) { - ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCopyCommandList)); + try { + Command = new ur_exp_command_buffer_command_handle_t_( + CommandBuffer, CommandId, WorkDim, LocalWorkSize != nullptr, Kernel); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; } - CommandBuffer->IsFinalized = true; - return UR_RESULT_SUCCESS; } +} // namespace UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, @@ -727,41 +884,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *RetSyncPoint, ur_exp_command_buffer_command_handle_t *Command) { - UR_ASSERT(CommandBuffer && Kernel && Kernel->Program, - UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(Kernel->Program, UR_RESULT_ERROR_INVALID_NULL_POINTER); + // Lock automatically releases when this goes out of scope. 
std::scoped_lock Lock( Kernel->Mutex, Kernel->Program->Mutex, CommandBuffer->Mutex); if (GlobalWorkOffset != NULL) { - if (!CommandBuffer->Context->getPlatform() - ->ZeDriverGlobalOffsetExtensionFound) { - logger::debug("No global offset extension found on this driver"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - - ZE2UR_CALL(zeKernelSetGlobalOffsetExp, - (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1], - GlobalWorkOffset[2])); + UR_CALL(setKernelGlobalOffset(CommandBuffer, Kernel, GlobalWorkOffset)); } // If there are any pending arguments set them now. - for (auto &Arg : Kernel->PendingArguments) { - // The ArgValue may be a NULL pointer in which case a NULL value is used for - // the kernel argument declared as a pointer to global or constant memory. - char **ZeHandlePtr = nullptr; - if (Arg.Value) { - UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, - CommandBuffer->Device)); - } - ZE2UR_CALL(zeKernelSetArgumentValue, - (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); + if (!Kernel->PendingArguments.empty()) { + UR_CALL(setKernelPendingArguments(CommandBuffer, Kernel)); } - Kernel->PendingArguments.clear(); ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; uint32_t WG[3]; - UR_CALL(calculateKernelWorkDimensions(Kernel, CommandBuffer->Device, ZeThreadGroupDimensions, WG, WorkDim, GlobalWorkSize, LocalWorkSize)); @@ -769,75 +908,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2])); CommandBuffer->KernelsList.push_back(Kernel); + // Increment the reference count of the Kernel and indicate that the Kernel // is in use. Once the event has been signaled, the code in // CleanupCompletedEvent(Event) will do a urKernelRelease to update the // reference count on the kernel, using the kernel saved in CommandData. 
UR_CALL(urKernelRetain(Kernel)); - // If command-buffer is updatable then get command id which is going to be - // used if command is updated in the future. This - // zeCommandListGetNextCommandIdExp can be called only if command is - // updatable. - uint64_t CommandId = 0; - if (CommandBuffer->IsUpdatable) { - ZeStruct ZeMutableCommandDesc; - ZeMutableCommandDesc.flags = ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS | - ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT | - ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE | - ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET; - - auto Plt = CommandBuffer->Context->getPlatform(); - UR_ASSERT(Plt->ZeMutableCmdListExt.Supported, - UR_RESULT_ERROR_UNSUPPORTED_FEATURE); - ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp, - (CommandBuffer->ZeComputeCommandList, &ZeMutableCommandDesc, - &CommandId)); - DEBUG_LOG(CommandId); - } - try { - if (Command) - *Command = new ur_exp_command_buffer_command_handle_t_( - CommandBuffer, CommandId, WorkDim, LocalWorkSize != nullptr, Kernel); - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; + if (Command && CommandBuffer->IsUpdatable) { + UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, LocalWorkSize, + *Command)); } - if (CommandBuffer->IsInOrderCmdList) { - ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (CommandBuffer->ZeComputeCommandList, Kernel->ZeKernel, - &ZeThreadGroupDimensions, nullptr, 0, nullptr)); - - logger::debug("calling zeCommandListAppendLaunchKernel()"); - } else { - std::vector ZeEventList; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); - ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, false, - &LaunchEvent, false, - !CommandBuffer->IsProfilingEnabled)); - LaunchEvent->CommandType = UR_COMMAND_KERNEL_LAUNCH; - - // Get sync point and register the event with it. 
- ur_exp_command_buffer_sync_point_t SyncPoint = - CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); - if (RetSyncPoint) { - *RetSyncPoint = SyncPoint; - } - - ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (CommandBuffer->ZeComputeCommandList, Kernel->ZeKernel, - &ZeThreadGroupDimensions, LaunchEvent->ZeEvent, - ZeEventList.size(), ZeEventList.data())); + std::vector ZeEventList; + ze_event_handle_t ZeLaunchEvent = nullptr; + UR_CALL(createSyncPointAndGetZeEvents( + UR_COMMAND_KERNEL_LAUNCH, CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, false, RetSyncPoint, ZeEventList, ZeLaunchEvent)); - logger::debug("calling zeCommandListAppendLaunchKernel() with" - " ZeEvent {}", - ur_cast(LaunchEvent->ZeEvent)); - } + logger::debug("calling zeCommandListAppendLaunchKernel()"); + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (CommandBuffer->ZeComputeCommandList, Kernel->ZeKernel, + &ZeThreadGroupDimensions, ZeLaunchEvent, ZeEventList.size(), + getPointerFromVector(ZeEventList))); return UR_RESULT_SUCCESS; } @@ -850,7 +943,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( bool PreferCopyEngine = !IsDevicePointer(CommandBuffer->Context, Src) || !IsDevicePointer(CommandBuffer->Context, Dst); - PreferCopyEngine |= UseCopyEngineForD2DCopy; return enqueueCommandBufferMemCopyHelper( @@ -879,7 +971,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( CommandBuffer->Device)); bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost); - PreferCopyEngine |= UseCopyEngineForD2DCopy; return enqueueCommandBufferMemCopyHelper( @@ -911,7 +1002,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( CommandBuffer->Device)); bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost); - PreferCopyEngine |= UseCopyEngineForD2DCopy; return enqueueCommandBufferMemCopyRectHelper( @@ -1026,8 +1116,10 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urCommandBufferAppendUSMPrefetchExp( (CommandBuffer->ZeComputeCommandList, Mem, Size)); } else { std::vector ZeEventList; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); + ze_event_handle_t ZeLaunchEvent = nullptr; + UR_CALL(createSyncPointAndGetZeEvents( + UR_COMMAND_USM_PREFETCH, CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, true, RetSyncPoint, ZeEventList, ZeLaunchEvent)); if (NumSyncPointsInWaitList) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, @@ -1035,20 +1127,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( ZeEventList.data())); } - ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, true, - &LaunchEvent, false, - !CommandBuffer->IsProfilingEnabled)); - LaunchEvent->CommandType = UR_COMMAND_USM_PREFETCH; - - // Get sync point and register the event with it. - ur_exp_command_buffer_sync_point_t SyncPoint = - CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); - if (RetSyncPoint) { - *RetSyncPoint = SyncPoint; - } - // Add the prefetch command to the command buffer. // Note that L0 does not handle migration flags. ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, @@ -1057,7 +1135,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( // Level Zero does not have a completion "event" with the prefetch API, // so manually add command to signal our event. 
ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CommandBuffer->ZeComputeCommandList, LaunchEvent->ZeEvent)); + (CommandBuffer->ZeComputeCommandList, ZeLaunchEvent)); } return UR_RESULT_SUCCESS; @@ -1101,8 +1179,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( CommandBuffer->Device->ZeDevice, Mem, Size, ZeAdvice)); } else { std::vector ZeEventList; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); + ze_event_handle_t ZeLaunchEvent = nullptr; + UR_CALL(createSyncPointAndGetZeEvents( + UR_COMMAND_USM_ADVISE, CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, true, RetSyncPoint, ZeEventList, ZeLaunchEvent)); if (NumSyncPointsInWaitList) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, @@ -1110,20 +1190,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( ZeEventList.data())); } - ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, true, - &LaunchEvent, false, - !CommandBuffer->IsProfilingEnabled)); - LaunchEvent->CommandType = UR_COMMAND_USM_ADVISE; - - // Get sync point and register the event with it. - ur_exp_command_buffer_sync_point_t SyncPoint = - CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); - if (RetSyncPoint) { - *RetSyncPoint = SyncPoint; - } - ZE2UR_CALL(zeCommandListAppendMemAdvise, (CommandBuffer->ZeComputeCommandList, CommandBuffer->Device->ZeDevice, Mem, Size, ZeAdvice)); @@ -1131,7 +1197,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( // Level Zero does not have a completion "event" with the advise API, // so manually add command to signal our event. 
ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CommandBuffer->ZeComputeCommandList, LaunchEvent->ZeEvent)); + (CommandBuffer->ZeComputeCommandList, ZeLaunchEvent)); } return UR_RESULT_SUCCESS; @@ -1155,8 +1221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, ZeHandleDst + Offset, Pattern, // It will be interpreted as an 8-bit value, PatternSize, // which is indicated with this pattern_size==1 - Size, PreferCopyEngineForFill, NumSyncPointsInWaitList, SyncPointWaitList, - SyncPoint); + Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( @@ -1170,36 +1235,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, Ptr, Pattern, // It will be interpreted as an 8-bit value, PatternSize, // which is indicated with this pattern_size==1 - Size, PreferCopyEngineForFill, NumSyncPointsInWaitList, SyncPointWaitList, - SyncPoint); + Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( - ur_exp_command_buffer_handle_t CommandBuffer, ur_queue_handle_t UrQueue, - uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, - ur_event_handle_t *Event) { - auto Queue = Legacy(UrQueue); - std::scoped_lock lock(Queue->Mutex); - // Use compute engine rather than copy engine - const auto UseCopyEngine = false; +namespace { + +/** + * Gets an L0 command queue that supports the chosen engine. + * @param[in] Queue The UR queue used to submit the command buffer. + * @param[in] UseCopyEngine Which engine to use. true for the copy engine and + * false for the compute engine. + * @param[out] ZeCommandQueue The L0 command queue. 
+ * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t getZeCommandQueue(ur_queue_handle_legacy_t Queue, + bool UseCopyEngine, + ze_command_queue_handle_t &ZeCommandQueue) { auto &QGroup = Queue->getQueueGroup(UseCopyEngine); uint32_t QueueGroupOrdinal; - auto &ZeCommandQueue = QGroup.getZeQueue(&QueueGroupOrdinal); - - // If we already have created a fence for this queue, first reset then reuse - // it, otherwise create a new fence. - ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence; - auto ZeWorkloadFenceForQueue = - CommandBuffer->ZeFencesMap.find(ZeCommandQueue); - if (ZeWorkloadFenceForQueue == CommandBuffer->ZeFencesMap.end()) { - ZeStruct ZeFenceDesc; - ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - CommandBuffer->ZeFencesMap.insert({{ZeCommandQueue, ZeFence}}); - } else { - ZeFence = ZeWorkloadFenceForQueue->second; - ZE2UR_CALL(zeFenceReset, (ZeFence)); - } + ZeCommandQueue = QGroup.getZeQueue(&QueueGroupOrdinal); + return UR_RESULT_SUCCESS; +} +/** + * Waits for the all the dependencies of the command buffer + * @param[in] CommandBuffer The command buffer. + * @param[in] Queue The UR queue used to submit the command buffer. + * @param[in] NumEventsInWaitList The number of events to wait for. + * @param[in] EventWaitList List of events to wait for. 
+ * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer, + ur_queue_handle_legacy_t Queue, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) { + const bool UseCopyEngine = false; bool MustSignalWaitEvent = true; if (NumEventsInWaitList) { _ur_ze_event_list_t TmpWaitList; @@ -1234,6 +1304,78 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( if (MustSignalWaitEvent) { ZE2UR_CALL(zeEventHostSignal, (CommandBuffer->WaitEvent->ZeEvent)); } + return UR_RESULT_SUCCESS; +} + +/** + * Creates a host visible event and appends a barrier to signal it when the + * command buffer finishes executing. + * @param[in] CommandBuffer The command buffer. + * @param[in] Queue The UR queue used to submit the command buffer. + * @param[in] SignalCommandList The command-list to append the barrier to. + * @param[out] Event The host visible event which will be returned to the user. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t createUserEvent(ur_exp_command_buffer_handle_t CommandBuffer, + ur_queue_handle_legacy_t Queue, + ur_command_list_ptr_t SignalCommandList, + ur_event_handle_t &Event) { + // Execution event for this enqueue of the UR command-buffer + ur_event_handle_t RetEvent{}; + + UR_CALL(createEventAndAssociateQueue(Queue, &RetEvent, + UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, + SignalCommandList, false, false, true)); + + if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) && + (!CommandBuffer->IsInOrderCmdList) && + (CommandBuffer->IsProfilingEnabled)) { + // Multiple submissions of a command buffer implies that we need to save + // the event timestamps before resubmiting the command buffer. 
We + // therefore copy the these timestamps in a dedicated USM memory section + // before completing the command buffer execution, and then attach this + // memory to the event returned to users to allow to allow the profiling + // engine to recover these timestamps. + command_buffer_profiling_t *Profiling = new command_buffer_profiling_t(); + + Profiling->NumEvents = CommandBuffer->ZeEventsList.size(); + Profiling->Timestamps = + new ze_kernel_timestamp_result_t[Profiling->NumEvents]; + + ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps, + (SignalCommandList->first, CommandBuffer->ZeEventsList.size(), + CommandBuffer->ZeEventsList.data(), + (void *)Profiling->Timestamps, 0, RetEvent->ZeEvent, 1, + &(CommandBuffer->SignalEvent->ZeEvent))); + + RetEvent->CommandData = static_cast(Profiling); + } else { + ZE2UR_CALL(zeCommandListAppendBarrier, + (SignalCommandList->first, RetEvent->ZeEvent, 1, + &(CommandBuffer->SignalEvent->ZeEvent))); + } + + Event = RetEvent; + + return UR_RESULT_SUCCESS; +} +} // namespace + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( + ur_exp_command_buffer_handle_t CommandBuffer, ur_queue_handle_t UrQueue, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_event_handle_t *Event) { + auto Queue = Legacy(UrQueue); + std::scoped_lock Lock(Queue->Mutex); + + ze_command_queue_handle_t ZeCommandQueue; + getZeCommandQueue(Queue, false, ZeCommandQueue); + + ze_fence_handle_t ZeFence; + CommandBuffer->getFenceForQueue(ZeCommandQueue, ZeFence); + + UR_CALL(waitForDependencies(CommandBuffer, Queue, NumEventsInWaitList, + EventWaitList)); // Submit reset events command-list. This command-list is of a batch // command-list type, regardless of the UR Queue type. We therefore need to @@ -1254,18 +1396,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // The Copy command-list is submitted to the main copy queue if it is not // empty. 
if (!CommandBuffer->MCopyCommandListEmpty) { - auto &QGroupCopy = Queue->getQueueGroup(true); - uint32_t QueueGroupOrdinal; - auto &ZeCopyCommandQueue = QGroupCopy.getZeQueue(&QueueGroupOrdinal); + ze_command_queue_handle_t ZeCopyCommandQueue; + getZeCommandQueue(Queue, true, ZeCopyCommandQueue); ZE2UR_CALL( zeCommandQueueExecuteCommandLists, (ZeCopyCommandQueue, 1, &CommandBuffer->ZeCopyCommandList, nullptr)); } - // Execution event for this enqueue of the UR command-buffer - ur_event_handle_t RetEvent{}; - - // Create a command-list to signal RetEvent on completion + // Create a command-list to signal the Event on completion ur_command_list_ptr_t SignalCommandList{}; UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, false, NumEventsInWaitList, @@ -1281,44 +1419,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( (SignalCommandList->first, CommandBuffer->AllResetEvent->ZeEvent)); if (Event) { - UR_CALL(createEventAndAssociateQueue( - Queue, &RetEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, - SignalCommandList, false, false, true)); - - if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) && - (!CommandBuffer->IsInOrderCmdList) && - (CommandBuffer->IsProfilingEnabled)) { - // Multiple submissions of a command buffer implies that we need to save - // the event timestamps before resubmiting the command buffer. We - // therefore copy the these timestamps in a dedicated USM memory section - // before completing the command buffer execution, and then attach this - // memory to the event returned to users to allow to allow the profiling - // engine to recover these timestamps. 
- command_buffer_profiling_t *Profiling = new command_buffer_profiling_t(); - - Profiling->NumEvents = CommandBuffer->ZeEventsList.size(); - Profiling->Timestamps = - new ze_kernel_timestamp_result_t[Profiling->NumEvents]; - - ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps, - (SignalCommandList->first, CommandBuffer->ZeEventsList.size(), - CommandBuffer->ZeEventsList.data(), - (void *)Profiling->Timestamps, 0, RetEvent->ZeEvent, 1, - &(CommandBuffer->SignalEvent->ZeEvent))); - - RetEvent->CommandData = static_cast(Profiling); - } else { - ZE2UR_CALL(zeCommandListAppendBarrier, - (SignalCommandList->first, RetEvent->ZeEvent, 1, - &(CommandBuffer->SignalEvent->ZeEvent))); - } + UR_CALL(createUserEvent(CommandBuffer, Queue, SignalCommandList, *Event)); } - Queue->executeCommandList(SignalCommandList, false, false); - - if (Event) { - *Event = RetEvent; - } + UR_CALL(Queue->executeCommandList(SignalCommandList, false, false)); return UR_RESULT_SUCCESS; } @@ -1338,22 +1442,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( +namespace { + +/** + * Validates contents of the update command description. + * @param[in] Command The command which is being updated. + * @param[in] CommandDesc The update command description. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t validateCommandDesc( ur_exp_command_buffer_command_handle_t Command, const ur_exp_command_buffer_update_kernel_launch_desc_t *CommandDesc) { - UR_ASSERT(Command, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(Command->Kernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(CommandDesc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(CommandDesc->newWorkDim <= 3, - UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - // Lock command, kernel and command buffer for update. 
- std::scoped_lock Guard( - Command->Mutex, Command->CommandBuffer->Mutex, Command->Kernel->Mutex); - UR_ASSERT(Command->CommandBuffer->IsUpdatable, - UR_RESULT_ERROR_INVALID_OPERATION); - UR_ASSERT(Command->CommandBuffer->IsFinalized, - UR_RESULT_ERROR_INVALID_OPERATION); + auto CommandBuffer = Command->CommandBuffer; + auto SupportedFeatures = + Command->CommandBuffer->Device->ZeDeviceMutableCmdListsProperties + ->mutableCommandFlags; + logger::debug("Mutable features supported by device {}", SupportedFeatures); uint32_t Dim = CommandDesc->newWorkDim; if (Dim != 0) { @@ -1378,25 +1483,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( } } - auto CommandBuffer = Command->CommandBuffer; - const void *NextDesc = nullptr; - auto SupportedFeatures = - Command->CommandBuffer->Device->ZeDeviceMutableCmdListsProperties - ->mutableCommandFlags; - logger::debug("Mutable features supported by device {}", SupportedFeatures); - - // We need the created descriptors to live till the point when - // zexCommandListUpdateMutableCommandsExp is called at the end of the - // function. - std::vector>> - ArgDescs; - std::vector>> - OffsetDescs; - std::vector>> - GroupSizeDescs; - std::vector>> - GroupCountDescs; - // Check if new global offset is provided. size_t *NewGlobalWorkOffset = CommandDesc->pNewGlobalWorkOffset; UR_ASSERT(!NewGlobalWorkOffset || @@ -1408,6 +1494,62 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( logger::error("No global offset extension found on this driver"); return UR_RESULT_ERROR_INVALID_VALUE; } + } + + // Check if new group size is provided. + size_t *NewLocalWorkSize = CommandDesc->pNewLocalWorkSize; + UR_ASSERT(!NewLocalWorkSize || + (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE), + UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + + // Check if new global size is provided and we need to update group count. 
+ size_t *NewGlobalWorkSize = CommandDesc->pNewGlobalWorkSize; + UR_ASSERT(!NewGlobalWorkSize || + (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT), + UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + UR_ASSERT(!(NewGlobalWorkSize && !NewLocalWorkSize) || + (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE), + UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + + UR_ASSERT( + (!CommandDesc->numNewMemObjArgs && !CommandDesc->numNewPointerArgs && + !CommandDesc->numNewValueArgs) || + (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS), + UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + + return UR_RESULT_SUCCESS; +} + +/** + * Update the kernel command with the new values. + * @param[in] Command The command which is being updated. + * @param[in] CommandDesc The update command description. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t updateKernelCommand( + ur_exp_command_buffer_command_handle_t Command, + const ur_exp_command_buffer_update_kernel_launch_desc_t *CommandDesc) { + + // We need the created descriptors to live till the point when + // zeCommandListUpdateMutableCommandsExp is called at the end of the + // function. + std::vector>, + std::unique_ptr>, + std::unique_ptr>, + std::unique_ptr>>> + Descs; + + const auto CommandBuffer = Command->CommandBuffer; + const void *NextDesc = nullptr; + + uint32_t Dim = CommandDesc->newWorkDim; + size_t *NewGlobalWorkOffset = CommandDesc->pNewGlobalWorkOffset; + size_t *NewLocalWorkSize = CommandDesc->pNewLocalWorkSize; + size_t *NewGlobalWorkSize = CommandDesc->pNewGlobalWorkSize; + + // Check if a new global offset is provided. + if (NewGlobalWorkOffset && Dim > 0) { auto MutableGroupOffestDesc = std::make_unique>(); MutableGroupOffestDesc->commandId = Command->CommandId; @@ -1420,15 +1562,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( DEBUG_LOG(MutableGroupOffestDesc->offsetY); MutableGroupOffestDesc->offsetZ = Dim == 3 ? 
NewGlobalWorkOffset[2] : 0; DEBUG_LOG(MutableGroupOffestDesc->offsetZ); + NextDesc = MutableGroupOffestDesc.get(); - OffsetDescs.push_back(std::move(MutableGroupOffestDesc)); + Descs.push_back(std::move(MutableGroupOffestDesc)); } - // Check if new group size is provided. - size_t *NewLocalWorkSize = CommandDesc->pNewLocalWorkSize; - UR_ASSERT(!NewLocalWorkSize || - (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE), - UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + // Check if a new group size is provided. if (NewLocalWorkSize && Dim > 0) { auto MutableGroupSizeDesc = std::make_unique>(); @@ -1442,29 +1581,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( DEBUG_LOG(MutableGroupSizeDesc->groupSizeY); MutableGroupSizeDesc->groupSizeZ = Dim == 3 ? NewLocalWorkSize[2] : 1; DEBUG_LOG(MutableGroupSizeDesc->groupSizeZ); + NextDesc = MutableGroupSizeDesc.get(); - GroupSizeDescs.push_back(std::move(MutableGroupSizeDesc)); + Descs.push_back(std::move(MutableGroupSizeDesc)); } - // Check if new global size is provided and we need to update group count. - size_t *NewGlobalWorkSize = CommandDesc->pNewGlobalWorkSize; - UR_ASSERT(!NewGlobalWorkSize || - (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT), - UR_RESULT_ERROR_UNSUPPORTED_FEATURE); - UR_ASSERT(!(NewGlobalWorkSize && !NewLocalWorkSize) || - (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE), - UR_RESULT_ERROR_UNSUPPORTED_FEATURE); - + // Check if a new global size is provided and if we need to update the group + // count. ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; if (NewGlobalWorkSize && Dim > 0) { - uint32_t WG[3]; - // If new global work size is provided but new local work size is not - // provided then we still need to update local work size based on size - // suggested by the driver for the kernel. 
+ // If a new global work size is provided but a new local work size is not + // then we still need to update local work size based on the size suggested + // by the driver for the kernel. bool UpdateWGSize = NewLocalWorkSize == nullptr; + + uint32_t WG[3]; UR_CALL(calculateKernelWorkDimensions( Command->Kernel, CommandBuffer->Device, ZeThreadGroupDimensions, WG, Dim, NewGlobalWorkSize, NewLocalWorkSize)); + auto MutableGroupCountDesc = std::make_unique>(); MutableGroupCountDesc->commandId = Command->CommandId; @@ -1475,8 +1610,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( DEBUG_LOG(MutableGroupCountDesc->pGroupCount->groupCountX); DEBUG_LOG(MutableGroupCountDesc->pGroupCount->groupCountY); DEBUG_LOG(MutableGroupCountDesc->pGroupCount->groupCountZ); + NextDesc = MutableGroupCountDesc.get(); - GroupCountDescs.push_back(std::move(MutableGroupCountDesc)); + Descs.push_back(std::move(MutableGroupCountDesc)); if (UpdateWGSize) { auto MutableGroupSizeDesc = @@ -1493,16 +1629,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( DEBUG_LOG(MutableGroupSizeDesc->groupSizeZ); NextDesc = MutableGroupSizeDesc.get(); - GroupSizeDescs.push_back(std::move(MutableGroupSizeDesc)); + Descs.push_back(std::move(MutableGroupSizeDesc)); } } - UR_ASSERT( - (!CommandDesc->numNewMemObjArgs && !CommandDesc->numNewPointerArgs && - !CommandDesc->numNewValueArgs) || - (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS), - UR_RESULT_ERROR_UNSUPPORTED_FEATURE); - // Check if new memory object arguments are provided. 
for (uint32_t NewMemObjArgNum = CommandDesc->numNewMemObjArgs; NewMemObjArgNum-- > 0;) { @@ -1526,6 +1656,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( return UR_RESULT_ERROR_INVALID_ARGUMENT; } } + ur_mem_handle_t NewMemObjArg = NewMemObjArgDesc.hNewMemObjArg; // The NewMemObjArg may be a NULL pointer in which case a NULL value is used // for the kernel argument declared as a pointer to global or constant @@ -1535,6 +1666,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( UR_CALL(NewMemObjArg->getZeHandlePtr(ZeHandlePtr, UrAccessMode, CommandBuffer->Device)); } + auto ZeMutableArgDesc = std::make_unique>(); ZeMutableArgDesc->commandId = Command->CommandId; @@ -1549,7 +1681,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( DEBUG_LOG(ZeMutableArgDesc->pArgValue); NextDesc = ZeMutableArgDesc.get(); - ArgDescs.push_back(std::move(ZeMutableArgDesc)); + Descs.push_back(std::move(ZeMutableArgDesc)); } // Check if there are new pointer arguments. @@ -1557,6 +1689,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( NewPointerArgNum-- > 0;) { ur_exp_command_buffer_update_pointer_arg_desc_t NewPointerArgDesc = CommandDesc->pNewPointerArgList[NewPointerArgNum]; + auto ZeMutableArgDesc = std::make_unique>(); ZeMutableArgDesc->commandId = Command->CommandId; @@ -1571,7 +1704,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( DEBUG_LOG(ZeMutableArgDesc->pArgValue); NextDesc = ZeMutableArgDesc.get(); - ArgDescs.push_back(std::move(ZeMutableArgDesc)); + Descs.push_back(std::move(ZeMutableArgDesc)); } // Check if there are new value arguments. 
@@ -1579,6 +1712,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( NewValueArgNum-- > 0;) { ur_exp_command_buffer_update_value_arg_desc_t NewValueArgDesc = CommandDesc->pNewValueArgList[NewValueArgNum]; + auto ZeMutableArgDesc = std::make_unique>(); ZeMutableArgDesc->commandId = Command->CommandId; @@ -1603,25 +1737,51 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( } ZeMutableArgDesc->pArgValue = ArgValuePtr; DEBUG_LOG(ZeMutableArgDesc->pArgValue); + NextDesc = ZeMutableArgDesc.get(); - ArgDescs.push_back(std::move(ZeMutableArgDesc)); + Descs.push_back(std::move(ZeMutableArgDesc)); } ZeStruct MutableCommandDesc; MutableCommandDesc.pNext = NextDesc; MutableCommandDesc.flags = 0; + auto Platform = CommandBuffer->Context->getPlatform(); + ZE2UR_CALL( + Platform->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp, + (CommandBuffer->ZeComputeCommandList, &MutableCommandDesc)); + + return UR_RESULT_SUCCESS; +} +} // namespace + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( + ur_exp_command_buffer_command_handle_t Command, + const ur_exp_command_buffer_update_kernel_launch_desc_t *CommandDesc) { + UR_ASSERT(Command->Kernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(CommandDesc->newWorkDim <= 3, + UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + // Lock command, kernel and command buffer for update. + std::scoped_lock Guard( + Command->Mutex, Command->CommandBuffer->Mutex, Command->Kernel->Mutex); + + UR_ASSERT(Command->CommandBuffer->IsUpdatable, + UR_RESULT_ERROR_INVALID_OPERATION); + UR_ASSERT(Command->CommandBuffer->IsFinalized, + UR_RESULT_ERROR_INVALID_OPERATION); + + UR_CALL(validateCommandDesc(Command, CommandDesc)); + // We must synchronize mutable command list execution before mutating. 
- if (ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence) { + if (ze_fence_handle_t &ZeFence = Command->CommandBuffer->ZeActiveFence) { ZE2UR_CALL(zeFenceHostSynchronize, (ZeFence, UINT64_MAX)); } - auto Plt = CommandBuffer->Context->getPlatform(); - UR_ASSERT(Plt->ZeMutableCmdListExt.Supported, - UR_RESULT_ERROR_UNSUPPORTED_FEATURE); - ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp, - (CommandBuffer->ZeComputeCommandList, &MutableCommandDesc)); - ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeComputeCommandList)); + UR_CALL(updateKernelCommand(Command, CommandDesc)); + + ZE2UR_CALL(zeCommandListClose, + (Command->CommandBuffer->ZeComputeCommandList)); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index 48f1c68330..7a2676017c 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -29,18 +29,14 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_context_handle_t Context, ur_device_handle_t Device, ze_command_list_handle_t CommandList, ze_command_list_handle_t CommandListResetEvents, - ze_command_list_handle_t CopyCommandList, - ZeStruct ZeDesc, - ZeStruct ZeCopyDesc, + ze_command_list_handle_t CopyCommandList, ur_event_handle_t SignalEvent, + ur_event_handle_t WaitEvent, ur_event_handle_t AllResetEvent, const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList); ~ur_exp_command_buffer_handle_t_(); void RegisterSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, - ur_event_handle_t Event) { - SyncPoints[SyncPoint] = Event; - NextSyncPoint++; - } + ur_event_handle_t Event); ur_exp_command_buffer_sync_point_t GetNextSyncPoint() const { return NextSyncPoint; @@ -49,6 +45,25 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { // Indicates if a copy engine is available for use bool UseCopyEngine() const { return ZeCopyCommandList != nullptr; } + 
/** + * Obtains a fence for a specific L0 queue. If there is already an available + * fence for this queue, it will be reused. + * @param[in] ZeCommandQueue The L0 queue associated with the fence. + * @param[out] ZeFence The fence. + * @return UR_RESULT_SUCCESS or an error code on failure + */ + ur_result_t getFenceForQueue(ze_command_queue_handle_t &ZeCommandQueue, + ze_fence_handle_t &ZeFence); + + /** + * Chooses which command list to use when appending a command to this command + * buffer. + * @param[in] PreferCopyEngine If true, will try to choose a copy engine + * command-list. Will choose a compute command-list otherwise. + * @return The chosen command list. + */ + ze_command_list_handle_t chooseCommandList(bool PreferCopyEngine); + // UR context associated with this command-buffer ur_context_handle_t Context; // Device associated with this command buffer @@ -57,12 +72,17 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ze_command_list_handle_t ZeComputeCommandList; // Level Zero command list handle ze_command_list_handle_t ZeCommandListResetEvents; - // Level Zero command list descriptor - ZeStruct ZeCommandListDesc; // Level Zero Copy command list handle ze_command_list_handle_t ZeCopyCommandList; - // Level Zero Copy command list descriptor - ZeStruct ZeCopyCommandListDesc; + // Event which signals that the most recent execution of the command-buffer + // has finished + ur_event_handle_t SignalEvent = nullptr; + // Event which a command-buffer waits on until the wait-list dependencies + // passed to a command-buffer enqueue have been satisfied. + ur_event_handle_t WaitEvent = nullptr; + // Event which a command-buffer waits on until the main command-list events
+ ur_event_handle_t AllResetEvent = nullptr; // This flag is must be set to false if at least one copy command has been // added to `ZeCopyCommandList` bool MCopyCommandListEmpty = true; @@ -73,26 +93,15 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { // Must be an element in ZeFencesMap, so is not required to be destroyed // itself. ze_fence_handle_t ZeActiveFence; - // Queue properties from command-buffer descriptor - // TODO: Do we need these? - ur_queue_properties_t QueueProperties; // Map of sync_points to ur_events std::unordered_map SyncPoints; // Next sync_point value (may need to consider ways to reuse values if 32-bits // is not enough) ur_exp_command_buffer_sync_point_t NextSyncPoint; - // List of Level Zero events associated to submitted commands. + // List of Level Zero events associated with submitted commands. std::vector ZeEventsList; - // Event which will signals the most recent execution of the command-buffer - // has finished - ur_event_handle_t SignalEvent = nullptr; - // Event which a command-buffer waits on until the wait-list dependencies - // passed to a command-buffer enqueue have been satisfied. - ur_event_handle_t WaitEvent = nullptr; - // Event which a command-buffer waits on until the main command-list event - // have been reset. - ur_event_handle_t AllResetEvent = nullptr; + // Indicates if command-buffer commands can be updated after it is closed. bool IsUpdatable = false; // Indicates if command buffer was finalized.