From 501781a35cd927c6cb756f32b28b00115473608e Mon Sep 17 00:00:00 2001
From: Igor Chorazewicz
Date: Mon, 6 May 2024 20:36:25 +0200
Subject: [PATCH] [L0] move cmd list initialization to queue ctor

- remove per thread cache
- create all immediate command lists and a predefined number of regular
  lists upfront
- add abstraction for cmd list cache in the context
---
 source/adapters/level_zero/command_buffer.cpp |   4 +-
 source/adapters/level_zero/context.cpp        | 190 ++++-----
 source/adapters/level_zero/context.hpp        |  78 ++--
 source/adapters/level_zero/device.cpp         |   2 -
 source/adapters/level_zero/device.hpp         |   5 +-
 source/adapters/level_zero/event.cpp          |  55 ++-
 source/adapters/level_zero/queue.cpp          | 371 ++++++++----------
 source/adapters/level_zero/queue.hpp          |  69 +---
 8 files changed, 357 insertions(+), 417 deletions(-)

diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp
index 67415a0de0..985c9ac92b 100644
--- a/source/adapters/level_zero/command_buffer.cpp
+++ b/source/adapters/level_zero/command_buffer.cpp
@@ -1021,7 +1021,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
   std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);
   // Use compute engine rather than copy engine
   const auto UseCopyEngine = false;
-  auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
+
+  // TODO(cache): use getAvailableCommandList here
+  auto &QGroup = Queue->ComputeQueueGroup;
 
   uint32_t QueueGroupOrdinal;
   auto &ZeCommandQueue = QGroup.getZeQueue(&QueueGroupOrdinal);
diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index 19696142f5..89d444a0a8 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -18,6 +18,79 @@
 #include "queue.hpp"
 #include "ur_level_zero.hpp"
 
+bool immediate_command_list_descriptor_t::operator==(
+    const immediate_command_list_descriptor_t &rhs) const {
+  return Device == rhs.Device && QueueDesc.ordinal == rhs.QueueDesc.ordinal &&
+         QueueDesc.flags == rhs.QueueDesc.flags &&
+         QueueDesc.mode == rhs.QueueDesc.mode &&
+         QueueDesc.priority == rhs.QueueDesc.priority;
+}
+
+bool regular_command_list_descriptor_t::operator==(
+    const regular_command_list_descriptor_t &rhs) const {
+  return Device == rhs.Device && Ordinal == rhs.Ordinal &&
+         IsInOrder == rhs.IsInOrder;
+}
+
+inline size_t command_list_descriptor_hash_t::operator()(
+    const command_list_descriptor_t &desc) const {
+  if (auto ImmCmdDesc =
+          std::get_if<immediate_command_list_descriptor_t>(&desc)) {
+    return combine_hashes(0, ImmCmdDesc->Device, ImmCmdDesc->QueueDesc.ordinal,
+                          ImmCmdDesc->QueueDesc.flags,
+                          ImmCmdDesc->QueueDesc.mode,
+                          ImmCmdDesc->QueueDesc.priority);
+  } else {
+    auto RegCmdDesc = std::get<regular_command_list_descriptor_t>(desc);
+    return combine_hashes(0, RegCmdDesc.Device, RegCmdDesc.IsInOrder,
+                          RegCmdDesc.Ordinal);
+  }
+}
+
+command_list_cache::~command_list_cache() {
+  for (auto &Kv : ZeCommandListCache) {
+    while (Kv.second.size() > 0) {
+      auto ZeCommandList = Kv.second.top();
+      if (ZeCommandList) {
+        ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
+      }
+      Kv.second.pop();
+    }
+  }
+}
+
+std::optional<ze_command_list_handle_t>
+command_list_cache::getCommandList(const command_list_descriptor_t &desc) {
+  std::scoped_lock<ur_mutex> Lock(ZeCommandListCacheMutex);
+  auto it = ZeCommandListCache.find(desc);
+  if (it == ZeCommandListCache.end())
+    return std::nullopt;
+
+  assert(!it->second.empty());
+
+  auto CommandListHandle = it->second.top();
+  it->second.pop();
+
+  if (it->second.empty())
+    ZeCommandListCache.erase(it);
+
+  return std::make_optional(CommandListHandle);
+}
+
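+// The cache maps a command list descriptor (the device plus the creation
+// parameters that make two lists interchangeable) to a stack of idle Level
+// Zero handles. getCommandList() above pops from the matching stack and
+// erases the entry once its stack drains; addCommandList() below is the
+// reverse operation and creates the entry on first use. Both take
+// ZeCommandListCacheMutex, so one cache can serve every queue in the context.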
+void command_list_cache::addCommandList(const command_list_descriptor_t &desc,
+                                        ze_command_list_handle_t cmdList) {
+  // TODO: add a limit?
+  std::scoped_lock<ur_mutex> Lock(ZeCommandListCacheMutex);
+  auto [it, _] = ZeCommandListCache.try_emplace(desc);
+  it->second.push(cmdList);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urContextCreate(
     uint32_t DeviceCount, ///< [in] the number of devices given in phDevices
     const ur_device_handle_t
@@ -428,29 +494,6 @@ ur_result_t ur_context_handle_t_::finalize() {
     if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
       return ze2urResult(ZeResult);
 
-  std::scoped_lock<ur_mutex> Lock(ZeCommandListCacheMutex);
-  for (auto &List : ZeComputeCommandListCache) {
-    for (auto &Item : List.second) {
-      ze_command_list_handle_t ZeCommandList = Item.first;
-      if (ZeCommandList) {
-        auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
-        // Gracefully handle the case that L0 was already unloaded.
-        if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
-          return ze2urResult(ZeResult);
-      }
-    }
-  }
-  for (auto &List : ZeCopyCommandListCache) {
-    for (auto &Item : List.second) {
-      ze_command_list_handle_t ZeCommandList = Item.first;
-      if (ZeCommandList) {
-        auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
-        // Gracefully handle the case that L0 was already unloaded.
-        if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
-          return ze2urResult(ZeResult);
-      }
-    }
-  }
   return UR_RESULT_SUCCESS;
 }
 
@@ -623,7 +666,7 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) {
 
 // Get value of the threshold for number of active command lists allowed before
 // we start heuristically cleaning them up.
-static const size_t CmdListsCleanupThreshold = [] {
+size_t ur_context_handle_t_::CmdListsCleanupThreshold = [] {
   const char *UrRet = std::getenv("UR_L0_COMMANDLISTS_CLEANUP_THRESHOLD");
   const char *PiRet =
       std::getenv("SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD");
@@ -643,6 +686,7 @@ static const size_t CmdListsCleanupThreshold = [] {
 }();
 
 // Retrieve an available command list to be used in a PI call.
+// TODO(cache): remove ForcedCmdQueue
 ur_result_t ur_context_handle_t_::getAvailableCommandList(
     ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList,
     bool UseCopyEngine, uint32_t NumEventsInWaitList,
@@ -650,7 +694,11 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
     ze_command_queue_handle_t *ForcedCmdQueue) {
   // Immediate commandlists have been pre-allocated and are always available.
   if (Queue->UsingImmCmdLists) {
-    CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList();
+    CommandList =
+        (UseCopyEngine ? Queue->CopyQueueGroup : Queue->ComputeQueueGroup)
+            .getImmCmdList();
+
+    // TODO(cache): remove this from the hot path
     if (CommandList->second.EventList.size() >=
         Queue->getImmdCmmdListsEventCleanupThreshold()) {
       std::vector<ur_event_handle_t> EventListToCleanup;
@@ -662,6 +710,7 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
       return Res;
     return UR_RESULT_SUCCESS;
   } else {
+    // TODO(cache): remove this from the hot path
     // Cleanup regular command-lists if there are too many.
     // It handles the case that the queue is not synced to the host
     // for a long time and we want to reclaim the command-lists for
@@ -700,86 +749,16 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
     // have been enqueued into the command-list when they were created.
   }
 
-  // Create/Reuse the command list, because in Level Zero commands are added to
-  // the command lists, and later are then added to the command queue.
-  // Each command list is paired with an associated fence to track when the
-  // command list is available for reuse.
-  ur_result_t ur_result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
-
-  // Initally, we need to check if a command list has already been created
-  // on this device that is available for use. If so, then reuse that
-  // Level-Zero Command List and Fence for this PI call.
-  {
-    // Make sure to acquire the lock before checking the size, or there
-    // will be a race condition.
-    std::scoped_lock<ur_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
-    // Under mutex since operator[] does insertion on the first usage for
-    // every unique ZeDevice.
-    auto &ZeCommandListCache =
-        UseCopyEngine
-            ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
-            : Queue->Context
-                  ->ZeComputeCommandListCache[Queue->Device->ZeDevice];
-
-    for (auto ZeCommandListIt = ZeCommandListCache.begin();
-         ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) {
-      // If this is an InOrder Queue, then only allow lists which are in order.
-      if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue() &&
-          !(ZeCommandListIt->second.InOrderList)) {
-        continue;
-      }
-      auto &ZeCommandList = ZeCommandListIt->first;
-      auto it = Queue->CommandListMap.find(ZeCommandList);
-      if (it != Queue->CommandListMap.end()) {
-        if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue)
-          continue;
-        CommandList = it;
-        if (CommandList->second.ZeFence != nullptr)
-          CommandList->second.ZeFenceInUse = true;
-      } else {
-        // If there is a command list available on this context, but it
-        // wasn't yet used in this queue then create a new entry in this
-        // queue's map to hold the fence and other associated command
-        // list information.
-        auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
-        uint32_t QueueGroupOrdinal;
-        auto &ZeCommandQueue = ForcedCmdQueue
-                                   ? *ForcedCmdQueue
-                                   : QGroup.getZeQueue(&QueueGroupOrdinal);
-        if (ForcedCmdQueue)
-          QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);
-
-        ze_fence_handle_t ZeFence;
-        ZeStruct<ze_fence_desc_t> ZeFenceDesc;
-        ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
-        ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
-        ZeQueueDesc.ordinal = QueueGroupOrdinal;
-
-        CommandList =
-            Queue->CommandListMap
-                .emplace(ZeCommandList,
-                         ur_command_list_info_t(ZeFence, true, false,
-                                                ZeCommandQueue, ZeQueueDesc,
-                                                Queue->useCompletionBatching()))
-                .first;
-      }
-      ZeCommandListCache.erase(ZeCommandListIt);
-      if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList))
-        return Res;
-      if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine))
-        return Res;
-      return UR_RESULT_SUCCESS;
-    }
-  }
-
-  // If there are no available command lists in the cache, then we check for
-  // command lists that have already signalled, but have not been added to the
-  // available list yet. Each command list has a fence associated which tracks
+  // Each command list has a fence associated which tracks
   // if a command list has completed dispatch of its commands and is ready for
   // reuse. If a command list is found to have been signalled, then the
   // command list & fence are reset and we return.
   for (auto it = Queue->CommandListMap.begin();
        it != Queue->CommandListMap.end(); ++it) {
+    if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue) {
+      continue;
+    }
+
     // Make sure this is the command list type needed.
     if (UseCopyEngine != it->second.isCopy(Queue))
       continue;
 
@@ -790,8 +769,9 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
       continue;
     }
 
-    ze_result_t ZeResult =
-        ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
+    ze_result_t ZeResult = ZE_RESULT_SUCCESS;
+    if (it->second.ZeFenceInUse)
+      ZeResult = ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
     if (ZeResult == ZE_RESULT_SUCCESS) {
       std::vector<ur_event_handle_t> EventListToCleanup;
       Queue->resetCommandList(it, false, EventListToCleanup);
@@ -807,9 +787,11 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
 
   // If there are no available command lists nor signalled command lists,
   // then we must create another command list.
-  ur_result = Queue->createCommandList(UseCopyEngine, CommandList);
+  UR_CALL(Queue->initializeSingleRegularCommandList(
+      (UseCopyEngine ? Queue->CopyQueueGroup : Queue->ComputeQueueGroup),
+      CommandList, ForcedCmdQueue));
   CommandList->second.ZeFenceInUse = true;
-  return ur_result;
+  return UR_RESULT_SUCCESS;
 }
 
 bool ur_context_handle_t_::isValidDevice(ur_device_handle_t Device) const {
diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp
index 4184411de7..41572145eb 100644
--- a/source/adapters/level_zero/context.hpp
+++ b/source/adapters/level_zero/context.hpp
@@ -11,6 +11,7 @@
 #include <list>
 #include <map>
+#include <optional>
 #include <stack>
 #include <string>
 #include <unordered_map>
@@ -27,9 +28,41 @@
 
 #include <umf_helpers.hpp>
 
-struct l0_command_list_cache_info {
-  ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
-  bool InOrderList = false;
+struct immediate_command_list_descriptor_t {
+  ze_device_handle_t Device;
+  ZeStruct<ze_command_queue_desc_t> QueueDesc;
+  bool operator==(const immediate_command_list_descriptor_t &rhs) const;
+};
+
+struct regular_command_list_descriptor_t {
+  ze_device_handle_t Device;
+  bool IsInOrder;
+  uint32_t Ordinal;
+  bool operator==(const regular_command_list_descriptor_t &rhs) const;
+};
+
+using command_list_descriptor_t =
+    std::variant<immediate_command_list_descriptor_t,
+                 regular_command_list_descriptor_t>;
+
+struct command_list_descriptor_hash_t {
+  inline size_t operator()(const command_list_descriptor_t &desc) const;
+};
+
+struct command_list_cache {
+  ~command_list_cache();
+
+  std::optional<ze_command_list_handle_t>
+  getCommandList(const command_list_descriptor_t &desc);
+  void addCommandList(const command_list_descriptor_t &desc,
+                      ze_command_list_handle_t cmdList);
+
+private:
+  std::unordered_map<command_list_descriptor_t,
+                     std::stack<ze_command_list_handle_t>,
+                     command_list_descriptor_hash_t>
+      ZeCommandListCache;
+  ur_mutex ZeCommandListCacheMutex;
 };
 
 struct ur_context_handle_t_ : _ur_object {
@@ -71,10 +104,6 @@ struct ur_context_handle_t_ : _ur_object {
   // called from simultaneous threads.
   ur_mutex ImmediateCommandListMutex;
 
-  // Mutex Lock for the Command List Cache. This lock is used to control both
-  // compute and copy command list caches.
-  ur_mutex ZeCommandListCacheMutex;
-
   // If context contains one device or sub-devices of the same device, we want
   // to save this device.
   // This field is only set at ur_context_handle_t creation time, and cannot
@@ -82,22 +111,11 @@ struct ur_context_handle_t_ : _ur_object {
   // ur_context_handle_t.
   ur_device_handle_t SingleRootDevice = nullptr;
 
-  // Cache of all currently available/completed command/copy lists.
-  // Note that command-list can only be re-used on the same device.
-  //
-  // TODO: explore if we should use root-device for creating command-lists
-  // as spec says that in that case any sub-device can re-use it: "The
-  // application must only use the command list for the device, or its
-  // sub-devices, which was provided during creation."
-  //
-  std::unordered_map<ze_device_handle_t,
-                     std::list<std::pair<ze_command_list_handle_t,
-                                         l0_command_list_cache_info>>>
-      ZeComputeCommandListCache;
-  std::unordered_map<ze_device_handle_t,
-                     std::list<std::pair<ze_command_list_handle_t,
-                                         l0_command_list_cache_info>>>
-      ZeCopyCommandListCache;
+  // Get reference to the command list cache for a given list type
+  // TODO: get rid of UseCopyEngine - just use ordinal?
+  auto &getCommandListCache(bool UseCopyEngine) {
+    return UseCopyEngine ? CopyListCache : ComputeListCache;
+  }
 
   // Store USM pool for USM shared and device allocations. There is 1 memory
   // pool per each pair of (context, device) per each memory type.
@@ -302,7 +320,21 @@ struct ur_context_handle_t_ : _ur_object {
   // For that the Device or its root devices need to be in the context.
   bool isValidDevice(ur_device_handle_t Device) const;
 
+  // TODO(cache): move this out of Context
+  static size_t CmdListsCleanupThreshold;
+
 private:
+  // Cache of all currently available/completed command/copy lists.
+  // Note that command-list can only be re-used on the same device.
+  //
+  // TODO: explore if we should use root-device for creating command-lists
+  // as spec says that in that case any sub-device can re-use it: "The
+  // application must only use the command list for the device, or its
+  // sub-devices, which was provided during creation."
+  //
+  command_list_cache ComputeListCache;
+  command_list_cache CopyListCache;
+
   // Get the cache of events for a provided scope and profiling mode.
   auto getEventCache(bool HostVisible, bool WithProfiling,
                      ur_device_handle_t Device) {
diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 59c1ce81bf..af9f972f00 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -1062,8 +1062,6 @@ ur_device_handle_t_::useImmediateCommandLists() {
     return NotUsed;
   case 1:
     return PerQueue;
-  case 2:
-    return PerThreadPerQueue;
   default:
     return NotUsed;
   }
diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp
index 3cdfcbce7e..63d24c3e27 100644
--- a/source/adapters/level_zero/device.hpp
+++ b/source/adapters/level_zero/device.hpp
@@ -135,10 +135,7 @@ struct ur_device_handle_t_ : _ur_object {
     // Immediate commandlists are not used.
     NotUsed = 0,
     // One set of compute and copy immediate commandlists per queue.
-    PerQueue,
-    // One set of compute and copy immediate commandlists per host thread that
-    // accesses the queue.
-    PerThreadPerQueue
+    PerQueue
   };
   // Read env settings to select immediate commandlist mode.
   ImmCmdlistMode useImmediateCommandLists();
diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index 9821333547..a434e3c119 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -280,45 +280,40 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
     std::vector<ur_command_list_ptr_t> CmdLists;
 
     // There must be at least one L0 queue.
-    auto &ComputeGroup = Queue->ComputeQueueGroupsByTID.get();
-    auto &CopyGroup = Queue->CopyQueueGroupsByTID.get();
+    auto &ComputeGroup = Queue->ComputeQueueGroup;
+    auto &CopyGroup = Queue->CopyQueueGroup;
     UR_ASSERT(!ComputeGroup.ZeQueues.empty() || !CopyGroup.ZeQueues.empty(),
               UR_RESULT_ERROR_INVALID_QUEUE);
 
-    size_t NumQueues = 0;
-    for (auto &QueueMap :
-         {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID})
-      for (auto &QueueGroup : QueueMap)
-        NumQueues += QueueGroup.second.ZeQueues.size();
+    size_t NumQueues = ComputeGroup.ZeQueues.size() + CopyGroup.ZeQueues.size();
 
     OkToBatch = true;
    // Get an available command list tied to each command queue. We need
    // these so a queue-wide barrier can be inserted into each command
    // queue.
     CmdLists.reserve(NumQueues);
-    for (auto &QueueMap :
-         {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID})
-      for (auto &QueueGroup : QueueMap) {
-        bool UseCopyEngine =
-            QueueGroup.second.Type != ur_queue_handle_t_::queue_type::Compute;
-        if (Queue->UsingImmCmdLists) {
-          // If immediate command lists are being used, each will act as their
-          // own queue, so we must insert a barrier into each.
-          for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists)
-            if (ImmCmdList != Queue->CommandListMap.end())
-              CmdLists.push_back(ImmCmdList);
-        } else {
-          for (auto ZeQueue : QueueGroup.second.ZeQueues) {
-            if (ZeQueue) {
-              ur_command_list_ptr_t CmdList;
-              UR_CALL(Queue->Context->getAvailableCommandList(
-                  Queue, CmdList, UseCopyEngine, NumEventsInWaitList,
-                  EventWaitList, OkToBatch, &ZeQueue));
-              CmdLists.push_back(CmdList);
-            }
+    for (auto QueueGroup : {&Queue->ComputeQueueGroup, &Queue->CopyQueueGroup}) {
+      bool UseCopyEngine =
+          QueueGroup->Type != ur_queue_handle_t_::queue_type::Compute;
+      if (Queue->UsingImmCmdLists) {
+        // If immediate command lists are being used, each will act as their
+        // own queue, so we must insert a barrier into each.
+        for (auto &ImmCmdList : QueueGroup->ImmCmdLists)
+          if (ImmCmdList != Queue->CommandListMap.end())
+            CmdLists.push_back(ImmCmdList);
+      } else {
+        for (auto ZeQueue : QueueGroup->ZeQueues) {
+          if (ZeQueue) {
+            ur_command_list_ptr_t CmdList;
+            // TODO(cache): just iterate over command lists, not queues
+            UR_CALL(Queue->Context->getAvailableCommandList(
+                Queue, CmdList, UseCopyEngine, NumEventsInWaitList,
+                EventWaitList, OkToBatch, &ZeQueue));
+            CmdLists.push_back(CmdList);
           }
         }
       }
+    }
 
     // If no activity has occurred on the queue then there will be no cmdlists.
     // We need one for generating an Event, so create one.
@@ -1314,7 +1309,8 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
       // new command list is different from the last used command list then
       // signal new event from the last immediate command list. We are going
       // to insert a barrier in the new command list waiting for that event.
-      auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine);
+      auto QueueGroup = UseCopyEngine ? CurQueue->CopyQueueGroup
+                                      : CurQueue->ComputeQueueGroup;
       uint32_t QueueGroupOrdinal, QueueIndex;
       auto NextIndex =
           QueueGroup.getQueueIndex(&QueueGroupOrdinal, &QueueIndex,
@@ -1363,7 +1359,8 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
       // the same UR Queue.
       if (CurQueue->Device->useDriverInOrderLists() &&
          CurQueue->isInOrderQueue() && CurQueue->UsingImmCmdLists) {
-        auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine);
+        auto QueueGroup = UseCopyEngine ? CurQueue->CopyQueueGroup
+                                        : CurQueue->ComputeQueueGroup;
         uint32_t QueueGroupOrdinal, QueueIndex;
         auto NextIndex =
             QueueGroup.getQueueIndex(&QueueGroupOrdinal, &QueueIndex,
                                      /*QueryOnly */ true);
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index ad48962375..bf931ed9ae 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -408,37 +408,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(
         Queue->hasOpenCommandList(IsCopy{false}))
       return ReturnValue(false);
 
-    for (const auto &QueueMap :
-         {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) {
-      for (const auto &QueueGroup : QueueMap) {
-        if (Queue->UsingImmCmdLists) {
-          // Immediate command lists are not associated with any Level Zero
-          // queue, that's why we have to check status of events in each
-          // immediate command list. Start checking from the end and exit early
-          // if some event is not completed.
-          for (const auto &ImmCmdList : QueueGroup.second.ImmCmdLists) {
-            if (ImmCmdList == Queue->CommandListMap.end())
-              continue;
+    for (const auto QueueGroup :
+         {&Queue->ComputeQueueGroup, &Queue->CopyQueueGroup}) {
+      if (Queue->UsingImmCmdLists) {
+        // Immediate command lists are not associated with any Level Zero
+        // queue, that's why we have to check status of events in each
+        // immediate command list. Start checking from the end and exit early
+        // if some event is not completed.
+        for (const auto &ImmCmdList : QueueGroup->ImmCmdLists) {
+          if (ImmCmdList == Queue->CommandListMap.end())
+            continue;
 
-            const auto &EventList = ImmCmdList->second.EventList;
-            for (auto It = EventList.crbegin(); It != EventList.crend(); It++) {
-              ze_result_t ZeResult =
-                  ZE_CALL_NOCHECK(zeEventQueryStatus, ((*It)->ZeEvent));
-              if (ZeResult == ZE_RESULT_NOT_READY) {
-                return ReturnValue(false);
-              } else if (ZeResult != ZE_RESULT_SUCCESS) {
-                return ze2urResult(ZeResult);
-              }
-            }
-          }
-        } else {
-          for (const auto &ZeQueue : QueueGroup.second.ZeQueues) {
-            if (!ZeQueue)
-              continue;
-            // Provide 0 as the timeout parameter to immediately get the status
-            // of the Level Zero queue.
-            ze_result_t ZeResult = ZE_CALL_NOCHECK(zeCommandQueueSynchronize,
-                                                   (ZeQueue, /* timeout */ 0));
+          const auto &EventList = ImmCmdList->second.EventList;
+          for (auto It = EventList.crbegin(); It != EventList.crend(); It++) {
+            ze_result_t ZeResult =
+                ZE_CALL_NOCHECK(zeEventQueryStatus, ((*It)->ZeEvent));
             if (ZeResult == ZE_RESULT_NOT_READY) {
               return ReturnValue(false);
             } else if (ZeResult != ZE_RESULT_SUCCESS) {
@@ -446,6 +430,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(
             }
           }
         }
+      } else {
+        for (const auto &ZeQueue : QueueGroup->ZeQueues) {
+          if (!ZeQueue)
+            continue;
+          // Provide 0 as the timeout parameter to immediately get the status
+          // of the Level Zero queue.
+          ze_result_t ZeResult = ZE_CALL_NOCHECK(zeCommandQueueSynchronize,
+                                                 (ZeQueue, /* timeout */ 0));
+          if (ZeResult == ZE_RESULT_NOT_READY) {
+            return ReturnValue(false);
+          } else if (ZeResult != ZE_RESULT_SUCCESS) {
+            return ze2urResult(ZeResult);
+          }
+        }
       }
     }
     return ReturnValue(true);
@@ -460,17 +458,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(
   return UR_RESULT_SUCCESS;
 }
 
-// Controls if we should choose doing eager initialization
-// to make it happen on warmup paths and have the reportable
-// paths be less likely affected.
-//
-static bool doEagerInit = [] {
-  const char *UrRet = std::getenv("UR_L0_EAGER_INIT");
-  const char *PiRet = std::getenv("SYCL_EAGER_INIT");
-  const char *EagerInit = UrRet ? UrRet : (PiRet ? PiRet : nullptr);
-  return EagerInit ? std::atoi(EagerInit) != 0 : false;
-}();
-
 UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
     ur_context_handle_t Context, ///< [in] handle of the context object
     ur_device_handle_t Device,   ///< [in] handle of the device object
@@ -537,46 +524,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
     return UR_RESULT_ERROR_UNKNOWN;
   }
 
-  // Do eager initialization of Level Zero handles on request.
-  if (doEagerInit) {
-    ur_queue_handle_t Q = *Queue;
-    // Creates said number of command-lists.
-    auto warmupQueueGroup = [Q](bool UseCopyEngine,
-                                uint32_t RepeatCount) -> ur_result_t {
-      ur_command_list_ptr_t CommandList;
-      while (RepeatCount--) {
-        if (Q->UsingImmCmdLists) {
-          CommandList = Q->getQueueGroup(UseCopyEngine).getImmCmdList();
-        } else {
-          // Heuristically create some number of regular command-list to reuse.
-          for (int I = 0; I < 10; ++I) {
-            UR_CALL(Q->createCommandList(UseCopyEngine, CommandList));
-            // Immediately return them to the cache of available command-lists.
-            std::vector<ur_event_handle_t> EventsUnused;
-            UR_CALL(Q->resetCommandList(CommandList, true /* MakeAvailable */,
-                                        EventsUnused));
-          }
-        }
-      }
-      return UR_RESULT_SUCCESS;
-    };
-    // Create as many command-lists as there are queues in the group.
-    // With this the underlying round-robin logic would initialize all
-    // native queues, and create command-lists and their fences.
-    // At this point only the thread creating the queue will have associated
-    // command-lists. Other threads have not accessed the queue yet. So we can
-    // only warmup the initial thread's command-lists.
-    const auto &QueueGroup = Q->ComputeQueueGroupsByTID.get();
-    UR_CALL(warmupQueueGroup(false, QueueGroup.UpperIndex -
-                                        QueueGroup.LowerIndex + 1));
-    if (Q->useCopyEngine()) {
-      const auto &QueueGroup = Q->CopyQueueGroupsByTID.get();
-      UR_CALL(warmupQueueGroup(true, QueueGroup.UpperIndex -
-                                         QueueGroup.LowerIndex + 1));
-    }
-    // TODO: warmup event pools. Both host-visible and device-only.
-  }
-
   return UR_RESULT_SUCCESS;
 }
 
@@ -629,6 +576,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(
       // Destroy completions batches if they are being used. This needs
       // to happen prior to resetCommandList so that all events are
      // checked.
+      // TODO(cache): split regular vs immediate cmd lists path
       it->second.completions.reset();
       Queue->resetCommandList(it, true, EventListToCleanup);
     }
@@ -642,25 +590,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(
       if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
         return ze2urResult(ZeResult);
     }
+    // TODO(cache): move this to a separate function
     if (Queue->UsingImmCmdLists && Queue->OwnZeCommandQueue) {
-      std::scoped_lock<ur_mutex> Lock(
-          Queue->Context->ZeCommandListCacheMutex);
       const ur_command_list_info_t &MapEntry = it->second;
       if (MapEntry.CanReuse) {
-        // Add commandlist to the cache for future use.
-        // It will be deleted when the context is destroyed.
-        auto &ZeCommandListCache =
-            MapEntry.isCopy(Queue)
-                ? Queue->Context
-                      ->ZeCopyCommandListCache[Queue->Device->ZeDevice]
-                : Queue->Context
-                      ->ZeComputeCommandListCache[Queue->Device->ZeDevice];
-        struct l0_command_list_cache_info ListInfo;
-        ListInfo.ZeQueueDesc = it->second.ZeQueueDesc;
-        ListInfo.InOrderList = it->second.IsInOrderList;
-        ZeCommandListCache.push_back({it->first, ListInfo});
+        immediate_command_list_descriptor_t Desc{Queue->Device->ZeDevice,
+                                                 it->second.ZeQueueDesc};
+        Queue->Context->getCommandListCache(MapEntry.isCopy(Queue))
+            .addCommandList(Desc, it->first);
       } else {
-        // A non-reusable comamnd list that came from a make_queue call is
+        // A non-reusable command list that came from a make_queue call is
+        // destroyed since it cannot be recycled.
+        ze_command_list_handle_t ZeCommandList = it->first;
+        if (ZeCommandList) {
+          ZE2UR_CALL(zeCommandListDestroy, (ZeCommandList));
+        }
+      }
+    } else if (Queue->OwnZeCommandQueue) {
+      // Regular command lists
+      const ur_command_list_info_t &MapEntry = it->second;
+      if (MapEntry.CanReuse) {
+        regular_command_list_descriptor_t Desc{
+            Queue->Device->ZeDevice, it->second.IsInOrderList,
+            it->second.ZeQueueDesc.ordinal};
+        Queue->Context->getCommandListCache(MapEntry.isCopy(Queue))
+            .addCommandList(Desc, it->first);
+      } else {
+        // A non-reusable command list that came from a make_queue call is
         // destroyed since it cannot be recycled.
         ze_command_list_handle_t ZeCommandList = it->first;
         if (ZeCommandList) {
@@ -699,8 +655,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(
 
   int32_t NativeHandleDesc{};
 
-  // Get handle to this thread's queue group.
-  auto &QueueGroup = Queue->getQueueGroup(false /*compute*/);
+  auto &QueueGroup = Queue->ComputeQueueGroup;
 
   if (Queue->UsingImmCmdLists) {
     auto ZeCmdList = ur_cast<ze_command_list_handle_t *>(NativeQueue);
@@ -712,7 +667,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(
     auto ZeQueue = ur_cast<ze_command_queue_handle_t *>(NativeQueue);
 
     // Extract a Level Zero compute queue handle from the given PI queue
-    auto &QueueGroup = Queue->getQueueGroup(false /*compute*/);
     uint32_t QueueGroupOrdinalUnused;
     *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused);
     // TODO: How to pass this up in the urQueueGetNativeHandle interface?
@@ -804,7 +758,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle(
   } catch (...) {
     return UR_RESULT_ERROR_UNKNOWN;
   }
-    auto &InitialGroup = (*RetQueue)->ComputeQueueGroupsByTID.begin()->second;
+    auto &InitialGroup = (*RetQueue)->ComputeQueueGroup;
     InitialGroup.setImmCmdList(*RetQueue,
                                ur_cast<ze_command_list_handle_t>(NativeQueue));
   } else {
@@ -848,12 +802,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(
     UR_CALL(UrQueue->executeAllOpenCommandLists());
 
     // Make a copy of queues to sync and release the lock.
-    for (auto &QueueMap :
-         {UrQueue->ComputeQueueGroupsByTID, UrQueue->CopyQueueGroupsByTID})
-      for (auto &QueueGroup : QueueMap)
-        std::copy(QueueGroup.second.ZeQueues.begin(),
-                  QueueGroup.second.ZeQueues.end(),
-                  std::back_inserter(ZeQueues));
+    for (auto QueueGroup :
+         {&UrQueue->ComputeQueueGroup, &UrQueue->CopyQueueGroup})
+      std::copy(QueueGroup->ZeQueues.begin(), QueueGroup->ZeQueues.end(),
+                std::back_inserter(ZeQueues));
 
     // Remember the last command's event.
     auto LastCommandEvent = UrQueue->LastCommandEvent;
@@ -1067,7 +1019,9 @@ ur_queue_handle_t_::ur_queue_handle_t_(
     std::vector<ze_command_queue_handle_t> &CopyQueues,
     ur_context_handle_t Context, ur_device_handle_t Device,
     bool OwnZeCommandQueue, ur_queue_flags_t Properties, int ForceComputeIndex)
-    : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue},
+    : ComputeQueueGroup(this, queue_type::Compute),
+      CopyQueueGroup(this, queue_type::MainCopy), Context{Context},
+      Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue},
       Properties(Properties) {
   // Set the type of commandlists the queue will use when user-selected
   // submission mode. Otherwise use env var setting and if unset, use default.
@@ -1088,8 +1042,6 @@ ur_queue_handle_t_::ur_queue_handle_t_(
   // First, see if the queue's device allows for round-robin or it is
   // fixed to one particular compute CCS (it is so for sub-sub-devices).
   auto &ComputeQueueGroupInfo = Device->QueueGroup[queue_type::Compute];
-  ur_queue_group_t ComputeQueueGroup{reinterpret_cast<ur_queue_handle_t>(this),
-                                     queue_type::Compute};
   ComputeQueueGroup.ZeQueues = ComputeQueues;
   // Create space to hold immediate commandlists corresponding to the
   // ZeQueues
@@ -1132,11 +1084,7 @@ ur_queue_handle_t_::ur_queue_handle_t_(
           ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
   }
 
-  ComputeQueueGroupsByTID.set(ComputeQueueGroup);
-
   // Copy group initialization.
-  ur_queue_group_t CopyQueueGroup{reinterpret_cast<ur_queue_handle_t>(this),
-                                  queue_type::MainCopy};
   const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device);
   if (Range.first < 0 || Range.second < 0) {
     // We are asked not to use copy engines, just do nothing.
@@ -1159,7 +1107,12 @@ ur_queue_handle_t_::ur_queue_handle_t_(
       }
     }
   }
-  CopyQueueGroupsByTID.set(CopyQueueGroup);
+
+  if (UsingImmCmdLists) {
+    initializeImmediateCommandLists();
+  } else {
+    initializeRegularCommandLists();
+  }
 
   // Initialize compute/copy command batches.
   ComputeCommandBatch.OpenCommandList = CommandListMap.end();
@@ -1183,6 +1136,69 @@ ur_queue_handle_t_::ur_queue_handle_t_(
       Device->Platform->ZeDriverEventPoolCountingEventsExtensionFound;
 }
 
+// TODO(cache): remove ForcedCmdQueue
+ur_result_t ur_queue_handle_t_::initializeSingleRegularCommandList(
+    ur_queue_group_t &QueueGroup, ur_command_list_ptr_t &CommandList,
+    ze_command_queue_handle_t *ForcedCmdQueue) {
+  bool IsInOrder = Device->useDriverInOrderLists() && isInOrderQueue();
+
+  uint32_t Ordinal;
+  auto ZeQueue = QueueGroup.getZeQueue(&Ordinal);
+  regular_command_list_descriptor_t Desc{Device->ZeDevice, IsInOrder, Ordinal};
+  auto ZeCommandListOpt =
+      Context->getCommandListCache(QueueGroup.isCopy()).getCommandList(Desc);
+
+  if (ForcedCmdQueue) {
+    ZeQueue = *ForcedCmdQueue;
+  }
+
+  if (!ZeCommandListOpt.has_value()) {
+    UR_CALL(createCommandList(QueueGroup.isCopy(), CommandList, &ZeQueue));
+  } else {
+    ze_fence_handle_t ZeFence;
+    ZeStruct<ze_fence_desc_t> ZeFenceDesc;
+    ZE2UR_CALL(zeFenceCreate, (ZeQueue, &ZeFenceDesc, &ZeFence));
+    ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
+    ZeQueueDesc.ordinal = Ordinal;
+
+    CommandList = CommandListMap
+                      .emplace(ZeCommandListOpt.value(),
+                               ur_command_list_info_t(ZeFence, true, false,
+                                                      ZeQueue, ZeQueueDesc,
+                                                      useCompletionBatching()))
+                      .first;
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t ur_queue_handle_t_::initializeRegularCommandLists() {
+  // TODO: make this an env variable? Should this be a hard limit per queue?
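+  // Pre-populate the queue with regular command lists up front, pulling them
+  // from the context-level cache when possible, so that
+  // getAvailableCommandList() rarely has to create one on the hot path. The
+  // constant below mirrors the heuristic of the removed UR_L0_EAGER_INIT
+  // warm-up, which also created 10 lists per queue.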
+  static constexpr size_t InitialNumRegularLists = 10;
+  for (auto QueueGroup : {&ComputeQueueGroup, &CopyQueueGroup}) {
+    for (size_t I = 0; I < QueueGroup->ZeQueues.size() * InitialNumRegularLists;
+         I++) {
+      ur_command_list_ptr_t CommandList;
+      UR_CALL(initializeSingleRegularCommandList(*QueueGroup, CommandList));
+    }
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t ur_queue_handle_t_::initializeImmediateCommandLists() {
+  for (auto QueueGroup : {&ComputeQueueGroup, &CopyQueueGroup}) {
+    for (size_t I = 0; I < QueueGroup->ZeQueues.size(); I++) {
+      QueueGroup->getImmCmdList();
+    }
+  }
+  return UR_RESULT_SUCCESS;
+}
+
 void ur_queue_handle_t_::adjustBatchSizeForFullBatch(bool IsCopy) {
   auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch;
   auto &ZeCommandListBatchConfig =
@@ -1582,16 +1593,14 @@ ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) {
   }
 
   if (UrQueue->OwnZeCommandQueue) {
-    for (auto &QueueMap :
-         {UrQueue->ComputeQueueGroupsByTID, UrQueue->CopyQueueGroupsByTID})
-      for (auto &QueueGroup : QueueMap)
-        for (auto &ZeQueue : QueueGroup.second.ZeQueues)
-          if (ZeQueue) {
-            auto ZeResult = ZE_CALL_NOCHECK(zeCommandQueueDestroy, (ZeQueue));
-            // Gracefully handle the case that L0 was already unloaded.
-            if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
-              return ze2urResult(ZeResult);
-          }
+    for (auto QueueGroup : {&Queue->ComputeQueueGroup, &Queue->CopyQueueGroup})
+      for (auto &ZeQueue : QueueGroup->ZeQueues)
+        if (ZeQueue) {
+          auto ZeResult = ZE_CALL_NOCHECK(zeCommandQueueDestroy, (ZeQueue));
+          // Gracefully handle the case that L0 was already unloaded.
+          if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
+            return ze2urResult(ZeResult);
+        }
   }
 
   Queue->clearEndTimeRecordings();
@@ -1710,39 +1719,35 @@ ur_result_t ur_queue_handle_t_::synchronize() {
 
       // clean up all events known to have been completed as well,
       // so they can be reused later
-      for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) {
-        for (auto &QueueGroup : QueueMap) {
-          if (UsingImmCmdLists) {
-            for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists) {
-              if (ImmCmdList == this->CommandListMap.end())
-                continue;
-              // Cleanup all events from the synced command list.
-              CleanupEventListFromResetCmdList(ImmCmdList->second.EventList,
-                                               true);
-              ImmCmdList->second.EventList.clear();
-            }
+      for (auto QueueGroup : {&ComputeQueueGroup, &CopyQueueGroup}) {
+        if (UsingImmCmdLists) {
+          for (auto &ImmCmdList : QueueGroup->ImmCmdLists) {
+            if (ImmCmdList == this->CommandListMap.end())
+              continue;
+            // Cleanup all events from the synced command list.
+            CleanupEventListFromResetCmdList(ImmCmdList->second.EventList,
+                                             true);
+            ImmCmdList->second.EventList.clear();
           }
         }
       }
     } else {
       // Otherwise sync all L0 queues/immediate command-lists.
-    for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) {
-      for (auto &QueueGroup : QueueMap) {
-        if (UsingImmCmdLists) {
-          for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists)
-            UR_CALL(syncImmCmdList(this, ImmCmdList));
-        } else {
-          for (auto &ZeQueue : QueueGroup.second.ZeQueues)
-            if (ZeQueue) {
-              if (UrL0QueueSyncNonBlocking) {
-                this->Mutex.unlock();
-                ZE2UR_CALL(zeHostSynchronize, (ZeQueue));
-                this->Mutex.lock();
-              } else {
-                ZE2UR_CALL(zeHostSynchronize, (ZeQueue));
-              }
-            }
-        }
-      }
-    }
+    for (auto QueueGroup : {&ComputeQueueGroup, &CopyQueueGroup}) {
+      if (UsingImmCmdLists) {
+        for (auto &ImmCmdList : QueueGroup->ImmCmdLists)
+          UR_CALL(syncImmCmdList(this, ImmCmdList));
+      } else {
+        for (auto &ZeQueue : QueueGroup->ZeQueues)
+          if (ZeQueue) {
+            if (UrL0QueueSyncNonBlocking) {
+              this->Mutex.unlock();
+              ZE2UR_CALL(zeHostSynchronize, (ZeQueue));
+              this->Mutex.lock();
+            } else {
+              ZE2UR_CALL(zeHostSynchronize, (ZeQueue));
+            }
+          }
+      }
+    }
   }
 
@@ -2067,16 +2072,14 @@ ur_result_t ur_queue_handle_t_::resetCommandList(
 
   // Standard commandlists move in and out of the cache as they are recycled.
   // Immediate commandlists are always available.
-  if (CommandList->second.ZeFence != nullptr && MakeAvailable) {
-    std::scoped_lock<ur_mutex> Lock(this->Context->ZeCommandListCacheMutex);
-    auto &ZeCommandListCache =
-        UseCopyEngine
-            ? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice]
-            : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice];
-    struct l0_command_list_cache_info ListInfo;
-    ListInfo.ZeQueueDesc = CommandList->second.ZeQueueDesc;
-    ListInfo.InOrderList = CommandList->second.IsInOrderList;
-    ZeCommandListCache.push_back({CommandList->first, ListInfo});
+  if (CommandList->second.ZeFence != nullptr && MakeAvailable &&
+      CommandListMap.size() > ur_context_handle_t_::CmdListsCleanupThreshold) {
+    regular_command_list_descriptor_t Desc{
+        Device->ZeDevice, CommandList->second.IsInOrderList,
+        CommandList->second.ZeQueueDesc.ordinal};
+    Context->getCommandListCache(UseCopyEngine)
+        .addCommandList(Desc, CommandList->first);
+    CommandListMap.erase(CommandList);
   }
 
   return UR_RESULT_SUCCESS;
@@ -2123,12 +2126,6 @@ ur_queue_handle_t_::eventOpenCommandList(ur_event_handle_t Event) {
   return CommandListMap.end();
 }
 
-ur_queue_handle_t_::ur_queue_group_t &
-ur_queue_handle_t_::getQueueGroup(bool UseCopyEngine) {
-  auto &Map = (UseCopyEngine ? CopyQueueGroupsByTID : ComputeQueueGroupsByTID);
-  return Map.get();
-}
-
 // Return the index of the next queue to use based on a
 // round robin strategy and the queue group ordinal.
 uint32_t ur_queue_handle_t_::ur_queue_group_t::getQueueIndex(
@@ -2241,7 +2238,7 @@ ur_result_t ur_queue_handle_t_::createCommandList(
 
   ze_command_list_handle_t ZeCommandList;
   uint32_t QueueGroupOrdinal;
-  auto &QGroup = getQueueGroup(UseCopyEngine);
+  auto &QGroup = UseCopyEngine ? CopyQueueGroup : ComputeQueueGroup;
   auto &ZeCommandQueue =
       ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal);
   if (ForcedCmdQueue)
@@ -2344,15 +2341,13 @@ static const bool UseCopyEngineForInOrderQueue = [] {
 }();
 
 bool ur_queue_handle_t_::useCopyEngine(bool PreferCopyEngine) const {
-  auto InitialCopyGroup = CopyQueueGroupsByTID.begin()->second;
-  return PreferCopyEngine && InitialCopyGroup.ZeQueues.size() > 0 &&
+  return PreferCopyEngine && CopyQueueGroup.ZeQueues.size() > 0 &&
          (!isInOrderQueue() || UseCopyEngineForInOrderQueue);
 }
 
 // This function will return one of possibly multiple available
 // immediate commandlists associated with this Queue.
 ur_command_list_ptr_t &ur_queue_handle_t_::ur_queue_group_t::getImmCmdList() {
-
   uint32_t QueueIndex, QueueOrdinal;
   auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex);
 
@@ -2383,47 +2378,29 @@ ur_command_list_ptr_t &ur_queue_handle_t_::ur_queue_group_t::getImmCmdList() {
 
   // Check if context's command list cache has an immediate command list with
   // matching index.
-  ze_command_list_handle_t ZeCommandList = nullptr;
-  {
-    // Acquire lock to avoid race conditions.
-    std::scoped_lock<ur_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
-    // Under mutex since operator[] does insertion on the first usage for every
-    // unique ZeDevice.
-    auto &ZeCommandListCache =
-        isCopy()
-            ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
-            : Queue->Context
-                  ->ZeComputeCommandListCache[Queue->Device->ZeDevice];
-    for (auto ZeCommandListIt = ZeCommandListCache.begin();
-         ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) {
-      const auto &Desc = (*ZeCommandListIt).second.ZeQueueDesc;
-      if (Desc.index == ZeCommandQueueDesc.index &&
-          Desc.flags == ZeCommandQueueDesc.flags &&
-          Desc.mode == ZeCommandQueueDesc.mode &&
-          Desc.priority == ZeCommandQueueDesc.priority) {
-        ZeCommandList = (*ZeCommandListIt).first;
-        ZeCommandListCache.erase(ZeCommandListIt);
-        break;
-      }
-    }
-  }
+  auto ZeCommandListOpt =
+      Queue->Context->getCommandListCache(isCopy()).getCommandList(
+          immediate_command_list_descriptor_t{Queue->Device->ZeDevice,
+                                              ZeCommandQueueDesc});
 
   // If cache didn't contain a command list, create one.
-  if (!ZeCommandList) {
+  if (!ZeCommandListOpt) {
     logger::debug("[getZeQueue]: create queue ordinal = {}, index = {} "
                   "(round robin in [{}, {}]) priority = {}",
                   ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index,
                   LowerIndex, UpperIndex, Priority);
 
+    ze_command_list_handle_t ZeCommandList;
     ZE_CALL_NOCHECK(zeCommandListCreateImmediate,
                     (Queue->Context->ZeContext, Queue->Device->ZeDevice,
                      &ZeCommandQueueDesc, &ZeCommandList));
+    ZeCommandListOpt = ZeCommandList;
   }
 
   ImmCmdLists[Index] =
       Queue->CommandListMap
          .insert(std::pair<ze_command_list_handle_t, ur_command_list_info_t>{
-              ZeCommandList,
+              ZeCommandListOpt.value(),
               ur_command_list_info_t(nullptr, true, false, nullptr,
                                      ZeCommandQueueDesc,
                                      Queue->useCompletionBatching())})
diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp
index 799e90e9d9..7ac66e5fed 100644
--- a/source/adapters/level_zero/queue.hpp
+++ b/source/adapters/level_zero/queue.hpp
@@ -281,59 +281,8 @@ struct ur_queue_handle_t_ : _ur_object {
     uint32_t NextIndex{0};
   };
 
-  // Helper class to facilitate per-thread queue groups
-  // We maintain a hashtable of queue groups if requested to do them per-thread.
-  // Otherwise it is just single entry used for all threads.
-  struct pi_queue_group_by_tid_t
-      : public std::unordered_map<std::thread::id, ur_queue_group_t> {
-    bool PerThread = false;
-
-    // Returns thread id if doing per-thread, or a generic id that represents
-    // all the threads.
-    std::thread::id tid() const {
-      return PerThread ? std::this_thread::get_id() : std::thread::id();
-    }
-
-    // Make the specified queue group be the master
-    void set(const ur_queue_group_t &QueueGroup) {
-      const auto &Device = QueueGroup.Queue->Device;
-      PerThread =
-          Device->ImmCommandListUsed == ur_device_handle_t_::PerThreadPerQueue;
-      assert(empty());
-      insert({tid(), QueueGroup});
-    }
-
-    // Get a queue group to use for this thread
-    ur_queue_group_t &get() {
-      assert(!empty());
-      auto It = find(tid());
-      if (It != end()) {
-        return It->second;
-      }
-      // Add new queue group for this thread initialized from a master entry.
-      auto QueueGroup = begin()->second;
-      // Create space for queues and immediate commandlists, which are created
-      // on demand.
-      QueueGroup.ZeQueues = std::vector<ze_command_queue_handle_t>(
-          QueueGroup.ZeQueues.size(), nullptr);
-      QueueGroup.ImmCmdLists = std::vector<ur_command_list_ptr_t>(
-          QueueGroup.ZeQueues.size(), QueueGroup.Queue->CommandListMap.end());
-
-      std::tie(It, std::ignore) = insert({tid(), QueueGroup});
-      return It->second;
-    }
-  };
-
-  // A map of compute groups containing compute queue handles, one per thread.
-  // When a queue is accessed from multiple host threads, a separate queue group
-  // is created for each thread. The key used for mapping is the thread ID.
-  pi_queue_group_by_tid_t ComputeQueueGroupsByTID;
-
-  // A group containing copy queue handles. The main copy engine, if available,
-  // comes first followed by link copy engines, if available.
-  // When a queue is accessed from multiple host threads, a separate queue group
-  // is created for each thread. The key used for mapping is the thread ID.
-  pi_queue_group_by_tid_t CopyQueueGroupsByTID;
+  ur_queue_group_t ComputeQueueGroup;
+  ur_queue_group_t CopyQueueGroup;
 
   // Keeps the PI context to which this queue belongs.
   // This field is only set at ur_queue_handle_t creation time, and cannot
@@ -375,6 +324,7 @@ struct ur_queue_handle_t_ : _ur_object {
   bool CounterBasedEventsEnabled = false;
 
   // Map of all command lists used in this queue.
+  // TODO(cache): replace this with a vector
   ur_command_list_map_t CommandListMap;
 
   // Helper data structure to hold all variables related to batching
@@ -502,6 +452,15 @@ struct ur_queue_handle_t_ : _ur_object {
   // Clear the end time recording timestamps entries.
   void clearEndTimeRecordings();
 
+  // get a regular command list from the cache or create a new one
+  ur_result_t initializeSingleRegularCommandList(
+      ur_queue_group_t &QueueGroup, ur_command_list_ptr_t &CommandList,
+      ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
+
+  // initializes the queue with the command lists
+  ur_result_t initializeRegularCommandLists();
+  ur_result_t initializeImmediateCommandLists();
+
   // adjust the queue's batch size, knowing that the current command list
   // is being closed with a full batch.
   // For copy commands, IsCopy is set to 'true'.
@@ -646,10 +605,6 @@ struct ur_queue_handle_t_ : _ur_object {
   // Gets the open command containing the event, or CommandListMap.end()
   ur_command_list_ptr_t eventOpenCommandList(ur_event_handle_t Event);
 
-  // Return the queue group to use based on standard/immediate commandlist mode,
-  // and if immediate mode, the thread-specific group.
-  ur_queue_group_t &getQueueGroup(bool UseCopyEngine);
-
   // Helper function to create a new command-list to this queue and associated
   // fence tracking its completion. This command list & fence are added to the
   // map of command lists in this queue with ZeFenceInUse = false.
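-- 

A standalone sketch of the caching scheme this patch introduces (illustrative
only, not adapter code: plain int handles stand in for ze_command_list_handle_t,
the ur_mutex locking and the combine_hashes helper are replaced with trivial
equivalents, and all names are hypothetical). It demonstrates the descriptor
keying and the pop-and-prune behaviour of getCommandList()/addCommandList():

    // cache_sketch.cpp - illustrative model of command_list_cache.
    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <optional>
    #include <stack>
    #include <unordered_map>
    #include <variant>

    // Simplified descriptors: `int Device` stands in for ze_device_handle_t.
    struct immediate_desc_t {
      int Device;
      uint32_t Ordinal;
      bool operator==(const immediate_desc_t &RHS) const {
        return Device == RHS.Device && Ordinal == RHS.Ordinal;
      }
    };

    struct regular_desc_t {
      int Device;
      bool IsInOrder;
      uint32_t Ordinal;
      bool operator==(const regular_desc_t &RHS) const {
        return Device == RHS.Device && IsInOrder == RHS.IsInOrder &&
               Ordinal == RHS.Ordinal;
      }
    };

    using desc_t = std::variant<immediate_desc_t, regular_desc_t>;

    struct desc_hash_t {
      std::size_t operator()(const desc_t &Desc) const {
        // unordered_map contract: keys that compare equal must hash equally,
        // so the hash reads only fields that operator== also compares.
        if (auto *Imm = std::get_if<immediate_desc_t>(&Desc))
          return std::hash<int>{}(Imm->Device) ^
                 (std::size_t(Imm->Ordinal) << 1);
        auto Reg = std::get<regular_desc_t>(Desc);
        return std::hash<int>{}(Reg.Device) ^
               (std::size_t(Reg.IsInOrder) << 1) ^
               (std::size_t(Reg.Ordinal) << 2);
      }
    };

    struct cache_t {
      // Pop an idle handle for this descriptor, pruning drained stacks so
      // the map only ever holds non-empty entries (as getCommandList does).
      std::optional<int> get(const desc_t &Desc) {
        auto It = Map.find(Desc);
        if (It == Map.end())
          return std::nullopt;
        int Handle = It->second.top();
        It->second.pop();
        if (It->second.empty())
          Map.erase(It);
        return Handle;
      }
      // Return a handle to the cache (as addCommandList does).
      void put(const desc_t &Desc, int Handle) { Map[Desc].push(Handle); }

    private:
      std::unordered_map<desc_t, std::stack<int>, desc_hash_t> Map;
    };

    int main() {
      cache_t Cache;
      regular_desc_t Desc{/*Device=*/0, /*IsInOrder=*/true, /*Ordinal=*/1};
      Cache.put(Desc, 42);           // queue teardown returns an idle list
      auto Handle = Cache.get(Desc); // the next queue reuses it
      std::cout << (Handle ? *Handle : -1) << "\n";     // prints 42
      std::cout << Cache.get(Desc).has_value() << "\n"; // prints 0
    }

The descriptor is a std::variant so a single map can key both immediate and
regular lists; the patch keeps two command_list_cache instances per context
(compute and copy), which this sketch collapses into one. Note the pairing of
hash and equality: any field the hash reads must also take part in operator==,
otherwise equal keys could hash differently and unordered_map lookups become
undefined.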