diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt index b277380783500..4a2890e5ca741 100644 --- a/offload/CMakeLists.txt +++ b/offload/CMakeLists.txt @@ -150,9 +150,9 @@ if(DEFINED LIBOMPTARGET_BUILD_CUDA_PLUGIN OR message(WARNING "Option removed, use 'LIBOMPTARGET_PLUGINS_TO_BUILD' instead") endif() -set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host) +set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host level_zero) set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING - "Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".") + "Semicolon-separated list of plugins to use: cuda, amdgpu, level_zero, host or \"all\".") if(LIBOMPTARGET_PLUGINS_TO_BUILD STREQUAL "all") set(LIBOMPTARGET_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_PLUGIN_TARGETS}) @@ -176,6 +176,18 @@ if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "cuda") endif() endif() +if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64)$" AND + CMAKE_SYSTEM_NAME MATCHES "Linux|Windows")) + if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD) + message(STATUS "Not building Level Zero plugin: it is only supported on " + "Linux/Windows x86_64 or ppc64le hosts") + list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero") + endif() +endif() +if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD AND + NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND) + list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero") +endif() message(STATUS "Building the offload library with support for " "the \"${LIBOMPTARGET_PLUGINS_TO_BUILD}\" plugins") diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake index 2a8bdebf2c1dd..dc5ea50c958a0 100644 --- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake +++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake @@ -89,4 +89,16 @@ if(LIBOMPTARGET_AMDGPU_ARCH) endif() endif() +################################################################################ +# Looking for Level0 +################################################################################ +find_path(LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIR NAMES level_zero/ze_api.h) + +if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIR) + set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND FALSE) +else() + set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND TRUE) + find_library(LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY NAMES ze_loader) +endif() + set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB}) diff --git a/offload/include/OpenMP/InteropAPI.h b/offload/include/OpenMP/InteropAPI.h index 53ac4be2e2e98..2553bfa930784 100644 --- a/offload/include/OpenMP/InteropAPI.h +++ b/offload/include/OpenMP/InteropAPI.h @@ -160,17 +160,12 @@ struct InteropTableEntry { Interops.push_back(obj); } - template void clear(ClearFuncTy f) { - for (auto &Obj : Interops) { - f(Obj); - } - } - /// vector interface int size() const { return Interops.size(); } iterator begin() { return Interops.begin(); } iterator end() { return Interops.end(); } iterator erase(iterator it) { return Interops.erase(it); } + void clear() { Interops.clear(); } }; struct InteropTblTy diff --git a/offload/include/PerThreadTable.h b/offload/include/PerThreadTable.h index 45b196171b4c8..0241370953c67 100644 --- a/offload/include/PerThreadTable.h +++ b/offload/include/PerThreadTable.h @@ -16,6 +16,60 @@ #include #include #include +#include + +template struct PerThread { + struct PerThreadData { + std::unique_ptr ThEntry; + }; + + std::mutex Mtx; + std::list> ThreadDataList; + + // define 
default constructors, disable copy and move constructors + PerThread() = default; + PerThread(const PerThread &) = delete; + PerThread(PerThread &&) = delete; + PerThread &operator=(const PerThread &) = delete; + PerThread &operator=(PerThread &&) = delete; + ~PerThread() { + std::lock_guard Lock(Mtx); + ThreadDataList.clear(); + } + +private: + PerThreadData &getThreadData() { + static thread_local std::shared_ptr ThData = nullptr; + if (!ThData) { + ThData = std::make_shared(); + std::lock_guard Lock(Mtx); + ThreadDataList.push_back(ThData); + } + return *ThData; + } + +protected: + ObjectType &getThreadEntry() { + auto &ThData = getThreadData(); + if (ThData.ThEntry) + return *ThData.ThEntry; + ThData.ThEntry = std::make_unique(); + return *ThData.ThEntry; + } + +public: + ObjectType &get() { return getThreadEntry(); } + + template void clear(F f) { + std::lock_guard Lock(Mtx); + for (auto ThData : ThreadDataList) { + if (!ThData->ThEntry) + continue; + f(*ThData->ThEntry); + } + ThreadDataList.clear(); + } +}; // Using an STL container (such as std::vector) indexed by thread ID has // too many race conditions issues so we store each thread entry into a @@ -23,10 +77,32 @@ // T is the container type used to store the objects, e.g., std::vector, // std::set, etc. by each thread. O is the type of the stored objects e.g., // omp_interop_val_t *, ... - template struct PerThreadTable { using iterator = typename ContainerType::iterator; + template > + struct has_iterator : std::false_type {}; + template + struct has_iterator> : std::true_type {}; + + template > + struct has_clear : std::false_type {}; + template + struct has_clear().clear())>> + : std::true_type {}; + + template > + struct has_clearAll : std::false_type {}; + template + struct has_clearAll().clearAll(1))>> + : std::true_type {}; + + template > + struct is_associative : std::false_type {}; + template + struct is_associative> + : std::true_type {}; + struct PerThreadData { size_t NElements = 0; std::unique_ptr ThEntry; @@ -71,6 +147,11 @@ template struct PerThreadTable { return ThData.NElements; } + void setNElements(size_t Size) { + auto &NElements = getThreadNElements(); + NElements = Size; + } + public: void add(ObjectType obj) { auto &Entry = getThreadEntry(); @@ -104,11 +185,81 @@ template struct PerThreadTable { for (auto ThData : ThreadDataList) { if (!ThData->ThEntry || ThData->NElements == 0) continue; - ThData->ThEntry->clear(f); + if constexpr (has_clearAll::value) { + ThData->ThEntry->clearAll(f); + } else if constexpr (has_iterator::value && + has_clear::value) { + for (auto &Obj : *ThData->ThEntry) { + if constexpr (is_associative::value) { + f(Obj.second); + } else { + f(Obj); + } + } + ThData->ThEntry->clear(); + } else { + static_assert(true, "Container type not supported"); + } ThData->NElements = 0; } ThreadDataList.clear(); } }; +template > struct ContainerValueType { + using type = typename T::value_type; +}; +template +struct ContainerValueType> { + using type = typename T::mapped_type; +}; + +template +struct PerThreadContainer + : public PerThreadTable::type> { + + // helpers + template > struct indexType { + using type = typename T::size_type; + }; + template struct indexType> { + using type = typename T::key_type; + }; + template > + struct has_resize : std::false_type {}; + template + struct has_resize().resize(1))>> + : std::true_type {}; + + template > + struct has_reserve : std::false_type {}; + template + struct has_reserve().reserve(1))>> + : std::true_type {}; + + using IndexType = typename 
indexType::type; + using ObjectType = typename ContainerValueType::type; + + // Get the object for the given index in the current thread + ObjectType &get(IndexType Index) { + auto &Entry = this->getThreadEntry(); + + // specialized code for vector-like containers + if constexpr (has_resize::value) { + if (Index >= Entry.size()) { + if constexpr (has_reserve::value && reserveSize > 0) { + if (Entry.capacity() < reserveSize) + Entry.reserve(reserveSize); + } + // If the index is out of bounds, try resize the container + Entry.resize(Index + 1); + } + } + ObjectType &Ret = Entry[Index]; + this->setNElements(Entry.size()); + return Ret; + } +}; + #endif diff --git a/offload/liboffload/API/Platform.td b/offload/liboffload/API/Platform.td index 906f899076a80..9e297efc1db6e 100644 --- a/offload/liboffload/API/Platform.td +++ b/offload/liboffload/API/Platform.td @@ -27,6 +27,7 @@ def ol_platform_backend_t : Enum { Etor<"UNKNOWN", "The backend is not recognized">, Etor<"CUDA", "The backend is CUDA">, Etor<"AMDGPU", "The backend is AMDGPU">, + Etor<"LEVEL_ZERO", "The backend is Level Zero">, Etor<"HOST", "The backend is the host">, ]; } diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 051882da7c6c7..495ebab4b8ae3 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -239,6 +239,8 @@ constexpr ol_platform_backend_t pluginNameToBackend(StringRef Name) { return OL_PLATFORM_BACKEND_AMDGPU; } else if (Name == "cuda") { return OL_PLATFORM_BACKEND_CUDA; + } else if (Name == "level_zero") { + return OL_PLATFORM_BACKEND_LEVEL_ZERO; } else { return OL_PLATFORM_BACKEND_UNKNOWN; } diff --git a/offload/plugins-nextgen/common/include/DLWrap.h b/offload/plugins-nextgen/common/include/DLWrap.h index 8934e7e701021..95ce86e123cd3 100644 --- a/offload/plugins-nextgen/common/include/DLWrap.h +++ b/offload/plugins-nextgen/common/include/DLWrap.h @@ -282,5 +282,21 @@ template constexpr void verboseAssert() { return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \ x9, x10); \ } +#define DLWRAP_INSTANTIATE_12(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1, \ + typename T::template arg<2>::type x2, \ + typename T::template arg<3>::type x3, \ + typename T::template arg<4>::type x4, \ + typename T::template arg<5>::type x5, \ + typename T::template arg<6>::type x6, \ + typename T::template arg<7>::type x7, \ + typename T::template arg<8>::type x8, \ + typename T::template arg<9>::type x9, \ + typename T::template arg<10>::type x10, \ + typename T::template arg<11>::type x11) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \ + x9, x10, x11); \ + } #endif // OMPTARGET_SHARED_DLWRAP_H diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt new file mode 100644 index 0000000000000..719e46b03edaf --- /dev/null +++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt @@ -0,0 +1,54 @@ +# Create the library and add the default arguments. 
+add_target_library(omptarget.rtl.level_zero LEVEL_ZERO) + +set(LEVEL_ZERO_SRC_FILES + src/L0Context.cpp + src/L0Device.cpp + src/L0Kernel.cpp + src/L0Memory.cpp + src/L0Program.cpp + src/L0Plugin.cpp + src/L0Program.cpp + src/L0Options.cpp +) + +target_sources(omptarget.rtl.level_zero PRIVATE + ${LEVEL_ZERO_SRC_FILES} +) + +target_include_directories(omptarget.rtl.level_zero PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +target_include_directories(omptarget.rtl.level_zero PRIVATE + ${LIBOMPTARGET_INCLUDE_DIR} + ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIR} + ${LIBOMPTARGET_LLVM_INCLUDE_DIRS} + ${LIBOMPTARGET_OMP_HEADER_DIR} +) + +cmake_path(GET LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY FILENAME LEVEL_ZERO_LIBRARY_NAME) +if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS) + message(STATUS "Building Level Zero NG plugin linked against level_zero library") + if(UNIX) + target_link_libraries(omptarget.rtl.level_zero PRIVATE + ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY}) + elseif(WIN32) + # Full path to the Level Zero library is recognized as a linker option, so we + # separate directory and file name + cmake_path(GET LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY PARENT_PATH LEVEL_ZERO_LIBRARY_PATH) + target_link_libraries(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_NAME} + ${LIBOMP_LIB_FILE}) + target_link_directories(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_PATH}) + else() + message(FATAL_ERROR "Missing platform support") + endif() +else() + message(STATUS "Building Level Zero NG plugin for dlopened level_zero") + if(WIN32) + cmake_path(REPLACE_EXTENSION LEVEL_ZERO_LIBRARY_NAME dll) + endif() + target_compile_definitions(omptarget.rtl.level_zero PRIVATE + LEVEL_ZERO_LIBRARY="${LEVEL_ZERO_LIBRARY_NAME}") + target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp) +endif() diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h new file mode 100644 index 0000000000000..a087a082639e4 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h @@ -0,0 +1,53 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Async Queue wrapper for Level Zero +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H + +#include + +#include "L0Memory.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +/// Abstract queue that supports asynchronous command submission +struct AsyncQueueTy { + /// List of events attached to submitted commands + llvm::SmallVector WaitEvents; + /// Pending staging buffer to host copies + llvm::SmallVector> H2MList; + /// Pending USM memory copy commands that must wait for kernel completion + llvm::SmallVector> USM2MList; + /// Kernel event not signaled + ze_event_handle_t KernelEvent = nullptr; + /// Is this queue being used currently + bool InUse = false; + /// Clear data + void reset() { + WaitEvents.clear(); + H2MList.clear(); + USM2MList.clear(); + KernelEvent = nullptr; + } +}; + +typedef ObjPool AsyncQueuePoolTy; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h new file mode 100644 index 0000000000000..29d01bb7b2a2a --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Context.h @@ -0,0 +1,141 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Level Zero Context abstraction +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H + +#include "L0Memory.h" +#include "PerThreadTable.h" + +namespace llvm::omp::target::plugin { + +class LevelZeroPluginTy; + +class L0ContextTLSTy { + StagingBufferTy StagingBuffer; + +public: + auto &getStagingBuffer() { return StagingBuffer; } + const auto &getStagingBuffer() const { return StagingBuffer; } + + void clear() { StagingBuffer.clear(); } +}; + +struct L0ContextTLSTableTy + : public PerThreadContainer< + std::unordered_map> { + void clear() { + PerThreadTable::clear([](L0ContextTLSTy &Entry) { Entry.clear(); }); + } +}; + +/// Driver and context-specific resources. We assume a single context per +/// driver. +class L0ContextTy { + /// The plugin that created this context + LevelZeroPluginTy &Plugin; + + /// Level Zero Driver handle + ze_driver_handle_t zeDriver = nullptr; + + /// Common Level Zero context + ze_context_handle_t zeContext = nullptr; + + /// API version supported by the Level Zero driver + ze_api_version_t APIVersion = ZE_API_VERSION_CURRENT; + + /// Imported external pointers. Track this only for user-directed + /// imports/releases. 
+ llvm::DenseMap ImportedPtrs; + + /// Common event pool + EventPoolTy EventPool; + + /// Host Memory allocator for this driver + MemAllocatorTy HostMemAllocator; + +public: + /// Named constants for checking the imported external pointer regions. + static constexpr int32_t ImportNotExist = -1; + static constexpr int32_t ImportUnknown = 0; + static constexpr int32_t ImportExist = 1; + + /// Create context, initialize event pool and extension functions + L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver, + int32_t DriverId); + + L0ContextTy(const L0ContextTy &) = delete; + L0ContextTy(L0ContextTy &&) = delete; + L0ContextTy &operator=(const L0ContextTy &) = delete; + L0ContextTy &operator=(const L0ContextTy &&) = delete; + + /// Release resources + ~L0ContextTy() { + EventPool.deinit(); + HostMemAllocator.deinit(); + if (zeContext) + CALL_ZE_RET_VOID(zeContextDestroy, zeContext); + } + + auto &getPlugin() const { return Plugin; } + + StagingBufferTy &getStagingBuffer(); + + /// Add imported external pointer region. + void addImported(void *Ptr, size_t Size) { + (void)ImportedPtrs.try_emplace((uintptr_t)Ptr, Size); + } + + /// Remove imported external pointer region + void removeImported(void *Ptr) { (void)ImportedPtrs.erase((uintptr_t)Ptr); } + + /// Check if imported regions contain the specified region. + int32_t checkImported(void *Ptr, size_t Size) const { + uintptr_t LB = (uintptr_t)Ptr; + uintptr_t UB = LB + Size; + // We do not expect a large number of user-directed imports, so use simple + // logic. + for (auto &I : ImportedPtrs) { + uintptr_t ILB = I.first; + uintptr_t IUB = ILB + I.second; + if (LB >= ILB && UB <= IUB) + return ImportExist; + if ((LB >= ILB && LB < IUB) || (UB > ILB && UB <= IUB)) + return ImportUnknown; + } + return ImportNotExist; + } + + ze_driver_handle_t getZeDriver() const { return zeDriver; } + + /// Return context associated with the driver + ze_context_handle_t getZeContext() const { return zeContext; } + + /// Return driver API version + ze_api_version_t getDriverAPIVersion() const { return APIVersion; } + + /// Return the event pool of this driver + auto &getEventPool() { return EventPool; } + const auto &getEventPool() const { return EventPool; } + + bool supportsLargeMem() const { + // Large memory support is available since API version 1.1 + return getDriverAPIVersion() >= ZE_API_VERSION_1_1; + } + + const MemAllocatorTy &getHostMemAllocator() const { return HostMemAllocator; } + MemAllocatorTy &getHostMemAllocator() { return HostMemAllocator; } +}; + +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h new file mode 100644 index 0000000000000..47dc25b85ce92 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h @@ -0,0 +1,67 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// External and other auxilary definitions +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H + +#include "PluginInterface.h" +#include "Shared/Requirements.h" +#include "omptarget.h" + +enum class AllocOptionTy : int32_t { + ALLOC_OPT_NONE = 0, + ALLOC_OPT_REDUCTION_SCRATCH = 1, + ALLOC_OPT_REDUCTION_COUNTER = 2, + ALLOC_OPT_HOST_MEM = 3, + ALLOC_OPT_SLM = 4, +}; + +#ifndef EXTRACT_BITS +// MSB=63, LSB=0 +#define EXTRACT_BITS(I64, HIGH, LOW) \ + (((uint64_t)I64) >> (LOW)) & (((uint64_t)1 << ((HIGH) - (LOW) + 1)) - 1) +#endif + +namespace llvm::omp::target::plugin { + +/// Default alignmnet for allocation +constexpr size_t L0DefaultAlignment = 0; +/// Default staging buffer size for host to device copy (16KB) +constexpr size_t L0StagingBufferSize = (1 << 14); +/// Default staging buffer count +constexpr size_t L0StagingBufferCount = 64; +/// USM allocation threshold where preallocation does not pay off (128MB) +constexpr size_t L0UsmPreAllocThreshold = (128 << 20); +/// Host USM allocation threshold where preallocation does not pay off (8MB) +constexpr size_t L0HostUsmPreAllocThreshold = (8 << 20); + +using namespace error; +/// Generic L0 handle type +using ZeHandleTy = void *; + +template +static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) { + + if (Code == OFFLOAD_SUCCESS) + return Plugin::success(); + const char *Desc = "Unknown error"; + return createStringError(inconvertibleErrorCode(), + ErrFmt, Args..., Desc); +} + +#define L0_UNIMPLEMENTED_ERR \ + return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet", \ + __func__); + +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h new file mode 100644 index 0000000000000..e6ebff0305a14 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Device.h @@ -0,0 +1,681 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// GenericDevice instatiation for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H + +#include "llvm/ADT/SmallVector.h" + +#include "PerThreadTable.h" + +#include "AsyncQueue.h" +#include "L0Context.h" +#include "L0Program.h" +#include "PluginInterface.h" +#include "TLS.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +using OmpInteropTy = omp_interop_val_t *; +class LevelZeroPluginTy; + +// clang-format off +enum class PCIIdTy : int32_t { + None = 0x0000, + SKL = 0x1900, + KBL = 0x5900, + CFL = 0x3E00, + CFL_2 = 0x9B00, + ICX = 0x8A00, + TGL = 0xFF20, + TGL_2 = 0x9A00, + DG1 = 0x4900, + RKL = 0x4C00, + ADLS = 0x4600, + RTL = 0xA700, + MTL = 0x7D00, + PVC = 0x0B00, + DG2_ATS_M = 0x4F00, + DG2_ATS_M_2 = 0x5600, + LNL = 0x6400, + BMG = 0xE200, +}; + +/// Device type enumeration common to compiler and runtime +enum class DeviceArchTy : uint64_t { + DeviceArch_None = 0, + DeviceArch_Gen = 0x0001, // Gen 9, Gen 11 or Xe + DeviceArch_XeLPG = 0x0002, + DeviceArch_XeHPC = 0x0004, + DeviceArch_XeHPG = 0x0008, + DeviceArch_Xe2LP = 0x0010, + DeviceArch_Xe2HP = 0x0020, + DeviceArch_x86_64 = 0x0100 +}; +// clang-format on + +struct L0DeviceIdTy { + ze_device_handle_t zeId; + int32_t RootId; + int32_t SubId; + int32_t CCSId; + + L0DeviceIdTy(ze_device_handle_t Device, int32_t RootId, int32_t SubId = -1, + int32_t CCSId = -1) + : zeId(Device), RootId(RootId), SubId(SubId), CCSId(CCSId) {} +}; + +class L0DeviceTLSTy { + /// Command list for each device + ze_command_list_handle_t CmdList = nullptr; + + /// Main copy command list for each device + ze_command_list_handle_t CopyCmdList = nullptr; + + /// Link copy command list for each device + ze_command_list_handle_t LinkCopyCmdList = nullptr; + + /// Command queue for each device + ze_command_queue_handle_t CmdQueue = nullptr; + + /// Main copy command queue for each device + ze_command_queue_handle_t CopyCmdQueue = nullptr; + + /// Link copy command queues for each device + ze_command_queue_handle_t LinkCopyCmdQueue = nullptr; + + /// Immediate command list for each device + ze_command_list_handle_t ImmCmdList = nullptr; + + /// Immediate copy command list for each device + ze_command_list_handle_t ImmCopyCmdList = nullptr; + +public: + L0DeviceTLSTy() = default; + ~L0DeviceTLSTy() { + // assert all fields are nullptr on destruction + assert(CmdList == nullptr && "CmdList is not nullptr on destruction"); + assert(CopyCmdList == nullptr && + "CopyCmdList is not nullptr on destruction"); + assert(LinkCopyCmdList == nullptr && + "LinkCopyCmdList is not nullptr on destruction"); + assert(CmdQueue == nullptr && "CmdQueue is not nullptr on destruction"); + assert(CopyCmdQueue == nullptr && + "CopyCmdQueue is not nullptr on destruction"); + assert(LinkCopyCmdQueue == nullptr && + "LinkCopyCmdQueue is not nullptr on destruction"); + assert(ImmCmdList == nullptr && "ImmCmdList is not nullptr on destruction"); + assert(ImmCopyCmdList == nullptr && + "ImmCopyCmdList is not nullptr on destruction"); + } + + L0DeviceTLSTy(const L0DeviceTLSTy &) = delete; + L0DeviceTLSTy(L0DeviceTLSTy &&Other) { + CmdList = std::exchange(Other.CmdList, nullptr); + CopyCmdList = std::exchange(Other.CopyCmdList, 
nullptr); + LinkCopyCmdList = std::exchange(Other.LinkCopyCmdList, nullptr); + CmdQueue = std::exchange(Other.CmdQueue, nullptr); + CopyCmdQueue = std::exchange(Other.CopyCmdQueue, nullptr); + LinkCopyCmdQueue = std::exchange(Other.LinkCopyCmdQueue, nullptr); + ImmCmdList = std::exchange(Other.ImmCmdList, nullptr); + ImmCopyCmdList = std::exchange(Other.ImmCopyCmdList, nullptr); + } + + void clear() { + // destroy all lists and queues + if (CmdList) + CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CmdList); + if (CopyCmdList) + CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CopyCmdList); + if (LinkCopyCmdList) + CALL_ZE_EXIT_FAIL(zeCommandListDestroy, LinkCopyCmdList); + if (ImmCmdList) + CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCmdList); + if (ImmCopyCmdList) + CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCopyCmdList); + if (CmdQueue) + CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CmdQueue); + if (CopyCmdQueue) + CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CopyCmdQueue); + if (LinkCopyCmdQueue) + CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, LinkCopyCmdQueue); + + CmdList = nullptr; + CopyCmdList = nullptr; + LinkCopyCmdList = nullptr; + CmdQueue = nullptr; + CopyCmdQueue = nullptr; + LinkCopyCmdQueue = nullptr; + ImmCmdList = nullptr; + ImmCopyCmdList = nullptr; + } + + L0DeviceTLSTy &operator=(const L0DeviceTLSTy &) = delete; + L0DeviceTLSTy &operator=(L0DeviceTLSTy &&) = delete; + + auto getCmdList() const { return CmdList; } + void setCmdList(ze_command_list_handle_t _CmdList) { CmdList = _CmdList; } + + auto getCopyCmdList() const { return CopyCmdList; } + void setCopyCmdList(ze_command_list_handle_t _CopyCmdList) { + CopyCmdList = _CopyCmdList; + } + + auto getLinkCopyCmdList() const { return LinkCopyCmdList; } + void setLinkCopyCmdList(ze_command_list_handle_t _LinkCopyCmdList) { + LinkCopyCmdList = _LinkCopyCmdList; + } + + auto getImmCmdList() const { return ImmCmdList; } + void setImmCmdList(ze_command_list_handle_t _ImmCmdList) { + ImmCmdList = _ImmCmdList; + } + + auto getImmCopyCmdList() const { return ImmCopyCmdList; } + void setImmCopyCmdList(ze_command_list_handle_t _ImmCopyCmdList) { + ImmCopyCmdList = _ImmCopyCmdList; + } + + auto getCmdQueue() const { return CmdQueue; } + void setCmdQueue(ze_command_queue_handle_t _CmdQueue) { + CmdQueue = _CmdQueue; + } + + auto getCopyCmdQueue() const { return CopyCmdQueue; } + void setCopyCmdQueue(ze_command_queue_handle_t _CopyCmdQueue) { + CopyCmdQueue = _CopyCmdQueue; + } + + auto getLinkCopyCmdQueue() const { return LinkCopyCmdQueue; } + void setLinkCopyCmdQueue(ze_command_queue_handle_t _LinkCopyCmdQueue) { + LinkCopyCmdQueue = _LinkCopyCmdQueue; + } +}; + +struct L0DeviceTLSTableTy + : public PerThreadContainer, 8> { + void clear() { + PerThreadTable::clear([](L0DeviceTLSTy &Entry) { Entry.clear(); }); + } +}; + +class L0DeviceTy final : public GenericDeviceTy { + // Level Zero Context for this Device + L0ContextTy &l0Context; + + // Level Zero handle for this Device + ze_device_handle_t zeDevice; + // Device Properties + ze_device_properties_t DeviceProperties{}; + ze_device_compute_properties_t ComputeProperties{}; + ze_device_memory_properties_t MemoryProperties{}; + ze_device_cache_properties_t CacheProperties{}; + + /// Devices' default target allocation kind for internal allocation + int32_t AllocKind = TARGET_ALLOC_DEVICE; + + DeviceArchTy DeviceArch = DeviceArchTy::DeviceArch_None; + + std::string DeviceName; + + /// Common indirect access flags for this device + ze_kernel_indirect_access_flags_t IndirectAccessFlags = 0; + + /// Device UUID for 
toplevel devices only + std::string DeviceUuid; + + /// L0 Device ID as string + std::string zeId; + + /// Command queue group ordinals for each device + std::pair ComputeOrdinal{UINT32_MAX, 0}; + /// Command queue group ordinals for copying + std::pair CopyOrdinal{UINT32_MAX, 0}; + /// Command queue group ordinals and number of queues for link copy engines + std::pair LinkCopyOrdinal{UINT32_MAX, 0}; + + /// Command queue index for each device + uint32_t ComputeIndex = 0; + + bool IsAsyncEnabled = false; + + // lock for this device + std::mutex Mutex; + + /// Contains all modules (possibly from multiple device images) to handle + /// dynamic link across multiple images + llvm::SmallVector GlobalModules; + + /// L0 programs created for this device + std::list Programs; + + /// MemAllocator for this device + MemAllocatorTy MemAllocator; + + /// The current size of the global device memory pool (managed by us). + uint64_t HeapSize = 1L << 23L /*8MB=*/; + + int32_t synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue = true); + int32_t submitData(void *TgtPtr, const void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + int32_t retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + + bool shouldSetupDeviceMemoryPool() const override { return false; } + DeviceArchTy computeArch() const; + + /// Get default compute group ordinal. Returns Ordinal-NumQueues pair + std::pair findComputeOrdinal(); + + /// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair + std::pair findCopyOrdinal(bool LinkCopy = false); + + Error internalInit(); + +public: + L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices, + ze_device_handle_t zeDevice, L0ContextTy &DriverInfo, + const std::string_view zeId, int32_t ComputeIndex) + : GenericDeviceTy(Plugin, DeviceId, NumDevices, SPIRVGridValues), + l0Context(DriverInfo), zeDevice(zeDevice), zeId(zeId), + ComputeIndex(ComputeIndex) { + DeviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + DeviceProperties.pNext = nullptr; + ComputeProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES; + ComputeProperties.pNext = nullptr; + MemoryProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES; + MemoryProperties.pNext = nullptr; + CacheProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES; + CacheProperties.pNext = nullptr; + + auto Err = internalInit(); + if (Err) + FATAL_MESSAGE(DeviceId, "Couldn't initialize device: %s\n", + toString(std::move(Err)).c_str()); + } + + static L0DeviceTy &makeL0Device(GenericDeviceTy &Device) { + return static_cast(Device); + } + + auto &getPlugin() { return (LevelZeroPluginTy &)Plugin; } + L0DeviceTLSTy &getTLS(); + + Error setContext() override { return Plugin::success(); } + Error initImpl(GenericPluginTy &Plugin) override; + Error deinitImpl() override { + Programs.clear(); + return Plugin::success(); + } + + auto getZeDevice() const { return zeDevice; } + + const L0ContextTy &getL0Context() const { return l0Context; } + L0ContextTy &getL0Context() { return l0Context; } + + const std::string_view getName() const { return DeviceName; } + const char *getNameCStr() const { return DeviceName.c_str(); } + + const std::string_view getZeId() const { return zeId; } + const char *getZeIdCStr() const { return zeId.c_str(); } + + std::mutex &getMutex() { return Mutex; } + + auto getComputeIndex() const { return ComputeIndex; } + auto getIndirectFlags() const { return IndirectAccessFlags; } + + auto getNumGlobalModules() const { return 
GlobalModules.size(); } + void addGlobalModule(ze_module_handle_t Module) { + GlobalModules.push_back(Module); + } + auto getGlobalModulesArray() { return GlobalModules.data(); } + + L0ProgramTy *getProgramFromImage(MemoryBufferRef Image) { + for (auto &PGM : Programs) + if (PGM.getMemoryBuffer() == Image) + return &PGM; + return nullptr; + } + + int32_t buildAllKernels() { + for (auto &PGM : Programs) { + int32_t RC = PGM.loadModuleKernels(); + if (RC != OFFLOAD_SUCCESS) + return RC; + } + return OFFLOAD_SUCCESS; + } + + // add a new program to the device. Return a reference to the new program + auto &addProgram(int32_t ImageId, std::unique_ptr &&Image) { + Programs.emplace_back(ImageId, *this, std::move(Image)); + return Programs.back(); + } + + const auto &getLastProgram() const { return Programs.back(); } + auto &getLastProgram() { return Programs.back(); } + // Device properties getters + auto getVendorId() const { return DeviceProperties.vendorId; } + bool isGPU() const { return DeviceProperties.type == ZE_DEVICE_TYPE_GPU; } + + auto getPCIId() const { return DeviceProperties.deviceId; } + auto getNumThreadsPerEU() const { return DeviceProperties.numThreadsPerEU; } + auto getSIMDWidth() const { return DeviceProperties.physicalEUSimdWidth; } + auto getNumEUsPerSubslice() const { + return DeviceProperties.numEUsPerSubslice; + } + auto getNumSubslicesPerSlice() const { + return DeviceProperties.numSubslicesPerSlice; + } + auto getNumSlices() const { return DeviceProperties.numSlices; } + auto getNumSubslices() const { + return DeviceProperties.numSubslicesPerSlice * DeviceProperties.numSlices; + } + uint32_t getNumEUs() const { + return DeviceProperties.numEUsPerSubslice * getNumSubslices(); + } + auto getTotalThreads() const { + return DeviceProperties.numThreadsPerEU * getNumEUs(); + } + auto getNumThreadsPerSubslice() const { + return getNumEUsPerSubslice() * getNumThreadsPerEU(); + } + auto getClockRate() const { return DeviceProperties.coreClockRate; } + + auto getMaxSharedLocalMemory() const { + return ComputeProperties.maxSharedLocalMemory; + } + auto getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; } + auto getGlobalMemorySize() const { return MemoryProperties.totalSize; } + auto getCacheSize() const { return CacheProperties.cacheSize; } + auto getMaxMemAllocSize() const { return DeviceProperties.maxMemAllocSize; } + + int32_t getAllocKind() const { return AllocKind; } + DeviceArchTy getDeviceArch() const { return DeviceArch; } + bool isDeviceArch(DeviceArchTy Arch) const { return DeviceArch == Arch; } + + static bool isDiscrete(uint32_t PCIId) { + switch (static_cast(PCIId & 0xFF00)) { + case PCIIdTy::BMG: + return true; + default: + return false; + } + } + + static bool isDiscrete(ze_device_handle_t Device) { + ze_device_properties_t PR{}; + PR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + PR.pNext = nullptr; + CALL_ZE_RET(false, zeDeviceGetProperties, Device, &PR); + return isDiscrete(PR.deviceId); + } + + bool isDiscreteDevice() { return isDiscrete(getPCIId()); } + bool isDeviceIPorNewer(uint32_t Version) const; + + const std::string_view getUuid() const { return DeviceUuid; } + + uint32_t getComputeEngine() const { return ComputeOrdinal.first; } + uint32_t getNumComputeQueues() const { return ComputeOrdinal.second; } + + bool hasMainCopyEngine() const { return CopyOrdinal.first != UINT32_MAX; } + uint32_t getMainCopyEngine() const { return CopyOrdinal.first; } + + uint32_t getLinkCopyEngine() const { return LinkCopyOrdinal.first; } + uint32_t 
getNumLinkCopyQueues() const { return LinkCopyOrdinal.second; } + bool hasLinkCopyEngine() const { return getNumLinkCopyQueues() > 0; } + + bool deviceRequiresImmCmdList() const { + return isDeviceIPorNewer(0x05004000); + } + bool asyncEnabled() const { return IsAsyncEnabled; } + bool useImmForCompute() const { return true; } + bool useImmForCopy() const { return true; } + bool useImmForInterop() const { return true; } + + void reportDeviceInfo() const; + + // Command queues related functions + /// Create a command list with given ordinal and flags + ze_command_list_handle_t createCmdList(ze_context_handle_t Context, + ze_device_handle_t Device, + uint32_t Ordinal, + ze_command_list_flags_t Flags, + const std::string_view DeviceIdStr); + + /// Create a command list with default flags + ze_command_list_handle_t createCmdList(ze_context_handle_t Context, + ze_device_handle_t Device, + uint32_t Ordinal, + const std::string_view DeviceIdStr); + + ze_command_list_handle_t getCmdList(); + + /// Create a command queue with given ordinal and flags + ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context, + ze_device_handle_t Device, + uint32_t Ordinal, uint32_t Index, + ze_command_queue_flags_t Flags, + const std::string_view DeviceIdStr); + + /// Create a command queue with default flags + ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context, + ze_device_handle_t Device, + uint32_t Ordinal, uint32_t Index, + const std::string_view DeviceIdStr, + bool InOrder = false); + + /// Create a new command queue for the given OpenMP device ID + ze_command_queue_handle_t createCommandQueue(bool InOrder = false); + + /// Create an immediate command list + ze_command_list_handle_t createImmCmdList(uint32_t Ordinal, uint32_t Index, + bool InOrder = false); + + /// Create an immediate command list for computing + ze_command_list_handle_t createImmCmdList(bool InOrder = false) { + return createImmCmdList(getComputeEngine(), getComputeIndex(), InOrder); + } + + /// Create an immediate command list for copying + ze_command_list_handle_t createImmCopyCmdList(); + ze_command_queue_handle_t getCmdQueue(); + ze_command_list_handle_t getCopyCmdList(); + ze_command_queue_handle_t getCopyCmdQueue(); + ze_command_list_handle_t getLinkCopyCmdList(); + ze_command_queue_handle_t getLinkCopyCmdQueue(); + ze_command_list_handle_t getImmCmdList(); + ze_command_list_handle_t getImmCopyCmdList(); + + /// Enqueue copy command + int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size, + __tgt_async_info *AsyncInfo = nullptr, + bool UseCopyEngine = true); + + /// Enqueue asynchronous copy command + int32_t enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size, + __tgt_async_info *AsyncInfo, bool CopyTo = true); + + /// Enqueue fill command + int32_t enqueueMemFill(void *Ptr, const void *Pattern, size_t PatternSize, + size_t Size); + + /// Driver related functions + + /// Reurn the driver handle for this device + ze_driver_handle_t getZeDriver() const { return l0Context.getZeDriver(); } + + /// Return context for this device + ze_context_handle_t getZeContext() const { return l0Context.getZeContext(); } + + /// Return driver API version for this device + ze_api_version_t getDriverAPIVersion() const { + return l0Context.getDriverAPIVersion(); + } + + /// Return an event from the driver associated to this device + ze_event_handle_t getEvent() { return l0Context.getEventPool().getEvent(); } + + /// Release event to the pool associated to this device + void releaseEvent(ze_event_handle_t Event) 
{ + l0Context.getEventPool().releaseEvent(Event, *this); + } + + StagingBufferTy &getStagingBuffer() { return l0Context.getStagingBuffer(); } + + bool supportsLargeMem() const { return l0Context.supportsLargeMem(); } + + // Allocation related routines + + /// Data alloc + Expected + dataAlloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset, + bool UserAlloc, bool DevMalloc = false, + uint32_t MemAdvice = UINT32_MAX, + AllocOptionTy AllocOpt = AllocOptionTy::ALLOC_OPT_NONE); + + /// Data delete + Error dataDelete(void *Ptr); + + /// Return the memory allocation type for the specified memory location. + uint32_t getMemAllocType(const void *Ptr) const; + + const MemAllocatorTy &getDeviceMemAllocator() const { return MemAllocator; } + MemAllocatorTy &getDeviceMemAllocator() { return MemAllocator; } + + MemAllocatorTy &getMemAllocator(int32_t Kind) { + if (Kind == TARGET_ALLOC_HOST) + return l0Context.getHostMemAllocator(); + return getDeviceMemAllocator(); + } + + MemAllocatorTy &getMemAllocator(const void *Ptr) { + bool IsHostMem = (ZE_MEMORY_TYPE_HOST == getMemAllocType(Ptr)); + if (IsHostMem) + return l0Context.getHostMemAllocator(); + return getDeviceMemAllocator(); + } + + int32_t makeMemoryResident(void *Mem, size_t Size); + + // Generic device interface implementation + Expected + loadBinaryImpl(std::unique_ptr &&TgtImage, + int32_t ImageId) override; + Error unloadBinaryImpl(DeviceImageTy *Image) override; + Expected allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind) override; + Error free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) override; + + Expected dataLockImpl(void *HstPtr, int64_t Size) override { + return Plugin::error(error::ErrorCode::UNKNOWN, + "dataLockImpl not supported"); + } + Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); } + + Expected isPinnedPtrImpl(void *, void *&, void *&, + size_t &) const override { + // Don't need to do anything, this is handled by the driver. + return false; + } + + Error dataFence(__tgt_async_info *Async) override; + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error synchronizeImpl(__tgt_async_info &AsyncInfo, + bool ReleaseQueue) override; + Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override; + Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error initDeviceInfoImpl(__tgt_device_info *Info) override; + Expected + hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override; + + Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, + AsyncInfoWrapperTy &AsyncInfo) override{ + L0_UNIMPLEMENTED_ERR} + + /* Event routines are used to ensure ordering between dataTransfers. Instead + * of adding extra events in the queues, we make sure they're ordered by + * using the events from the data submission APIs so we don't need to support + * these routines. 
+ * They still need to report succes to indicate the event are handled + * somewhere waitEvent and syncEvent should remain unimplemented + */ + Expected isEventCompleteImpl(void *EventPtr, + AsyncInfoWrapperTy &) override { + return true; + } + + Error createEventImpl(void **EventPtrStorage) override { + return Plugin::success(); + } + Error destroyEventImpl(void *EventPtr) override { return Plugin::success(); } + Error recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::success(); + } + + Error waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n", + __func__); + } + + Error syncEventImpl(void *EventPtr) override { + return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n", + __func__); + } + + Expected obtainInfoImpl() override; + + Error getDeviceStackSize(uint64_t &V) override { + V = 0; + return Plugin::success(); + } + Expected constructKernel(const char *Name) override; + + Error setDeviceStackSize(uint64_t V) override { return Plugin::success(); } + Error getDeviceHeapSize(uint64_t &V) override { + V = HeapSize; + return Plugin::success(); + } + Error setDeviceHeapSize(uint64_t V) override { + HeapSize = V; + return Plugin::success(); + } + + Expected + createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override; + Error releaseInterop(omp_interop_val_t *Interop) override; + + interop_spec_t selectInteropPreference(int32_t InteropType, + int32_t NumPrefers, + interop_spec_t *Prefers) override; +}; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Interop.h b/offload/plugins-nextgen/level_zero/include/L0Interop.h new file mode 100644 index 0000000000000..69a1a5f274068 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Interop.h @@ -0,0 +1,28 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Interop support for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H + +namespace llvm::omp::target::plugin::L0Interop { + +/// Level Zero interop property +struct Property { + // Use this when command queue needs to be accessed as + // the targetsync field in interop will be changed if preferred type is sycl. 
+ ze_command_queue_handle_t CommandQueue; + ze_command_list_handle_t ImmCmdList; +}; + +} // namespace llvm::omp::target::plugin::L0Interop + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h new file mode 100644 index 0000000000000..c5a3528dd2974 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h @@ -0,0 +1,158 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// GenericKernel implementation for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H + +#include "L0Defs.h" +#include "L0Trace.h" +#include "PluginInterface.h" + +namespace llvm::omp::target::plugin { + +class L0DeviceTy; +class L0ProgramTy; + +/// Loop descriptor +struct TgtLoopDescTy { + int64_t Lb = 0; // The lower bound of the i-th loop + int64_t Ub = 0; // The upper bound of the i-th loop + int64_t Stride = 0; // The stride of the i-th loop +}; + +struct TgtNDRangeDescTy { + int32_t NumLoops = 0; // Number of loops/dimensions + int32_t DistributeDim = 0; // Dimensions lower than this one + // must end up in one WG + TgtLoopDescTy Levels[3]; // Up to 3 loops +}; + +/// Kernel properties. +struct KernelPropertiesTy { + uint32_t Width = 0; + uint32_t SIMDWidth = 0; + uint32_t MaxThreadGroupSize = 0; + + /// Cached input parameters used in the previous launch + TgtNDRangeDescTy LoopDesc; + int32_t NumTeams = -1; + int32_t ThreadLimit = -1; + + /// Cached parameters used in the previous launch + ze_kernel_indirect_access_flags_t IndirectAccessFlags = UINT32_MAX; + uint32_t GroupSizes[3] = {0, 0, 0}; + ze_group_count_t GroupCounts{0, 0, 0}; + bool AllowCooperative = false; + + std::mutex Mtx; + + static constexpr TgtNDRangeDescTy LoopDescInit = {}; + + /// Check if we can reuse group parameters. + bool reuseGroupParams(const TgtNDRangeDescTy *LoopDescPtr, + const int32_t NumTeamsIn, const int32_t ThreadLimitIn, + uint32_t *GroupSizesOut, + ze_group_count_t &GroupCountsOut, + bool &AllowCooperativeOut) const { + if (!LoopDescPtr && memcmp(&LoopDescInit, &LoopDesc, sizeof(LoopDesc))) + return false; + if (LoopDescPtr && memcmp(LoopDescPtr, &LoopDesc, sizeof(LoopDesc))) + return false; + if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit) + return false; + // Found matching input parameters. + std::copy_n(GroupSizes, 3, GroupSizesOut); + GroupCountsOut = GroupCounts; + AllowCooperativeOut = AllowCooperative; + return true; + } + + /// Update cached group parameters. + void cacheGroupParams(const TgtNDRangeDescTy *LoopDescPtr, + const int32_t NumTeamsIn, const int32_t ThreadLimitIn, + const uint32_t *GroupSizesIn, + const ze_group_count_t &GroupCountsIn, + const bool &AllowCooperativeIn) { + LoopDesc = LoopDescPtr ? 
*LoopDescPtr : LoopDescInit; + NumTeams = NumTeamsIn; + ThreadLimit = ThreadLimitIn; + std::copy_n(GroupSizesIn, 3, GroupSizes); + GroupCounts = GroupCountsIn; + AllowCooperative = AllowCooperativeIn; + } +}; + +class L0KernelTy : public GenericKernelTy { + // L0 Kernel Handle + ze_kernel_handle_t zeKernel; + // Kernel Properties + KernelPropertiesTy Properties; + auto &getProperties() { return Properties; } + + int32_t runTargetTeamRegion(L0DeviceTy &Device, KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, + __tgt_async_info *AsyncInfo) const; + + void decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams, + uint32_t ThreadLimit, + TgtNDRangeDescTy *LoopLevels, + uint32_t *GroupSizes, + ze_group_count_t &GroupCounts, + bool HalfNumThreads, + bool IsTeamsNDRange) const; + + int32_t decideLoopKernelGroupArguments( + L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels, + uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads, + bool &AllowCooperative) const; + + Error buildKernel(L0ProgramTy &Program); + +public: + /// Create a L0 kernel with a name and an execution mode. + L0KernelTy(const char *Name) : GenericKernelTy(Name), zeKernel(nullptr) {} + ~L0KernelTy() { + if (zeKernel) + CALL_ZE_RET_VOID(zeKernelDestroy, zeKernel); + } + L0KernelTy(const L0KernelTy &) = delete; + L0KernelTy(L0KernelTy &&) = delete; + L0KernelTy &operator=(const L0KernelTy &) = delete; + L0KernelTy &operator=(const L0KernelTy &&) = delete; + + const auto &getProperties() const { return Properties; } + + /// Initialize the L0 kernel. + Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override; + /// Launch the L0 kernel function. + Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], + uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, + AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + + Expected maxGroupSize(GenericDeviceTy &GenericDevice, + uint64_t DynamicMemSize) const override{ + L0_UNIMPLEMENTED_ERR} + + ze_kernel_handle_t getZeKernel() const { + return zeKernel; + } + + int32_t getGroupsShape(L0DeviceTy &Device, int32_t NumTeams, + int32_t ThreadLimit, uint32_t *GroupSizes, + ze_group_count_t &GroupCounts, void *LoopDesc, + bool &AllowCooperative) const; +}; + +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h new file mode 100644 index 0000000000000..9b02aa8568f96 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h @@ -0,0 +1,579 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Memory related support for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H + +#include +#include +#include +#include +#include +#include + +#include "L0Defs.h" +#include "L0Trace.h" + +namespace llvm::omp::target::plugin { + +class L0DeviceTy; + +#define ALLOC_KIND_TO_STR(Kind) \ + (Kind == TARGET_ALLOC_HOST \ + ? "host memory" \ + : (Kind == TARGET_ALLOC_SHARED \ + ? "shared memory" \ + : (Kind == TARGET_ALLOC_DEVICE ? "device memory" \ + : "unknown memory"))) + +// forward declarations +struct L0OptionsTy; +class L0DeviceTy; +class L0ContextTy; + +struct DynamicMemHeapTy { + /// Base address memory is allocated from + uintptr_t AllocBase = 0; + /// Minimal size served by the current heap + size_t BlockSize = 0; + /// Max size served by the current heap + size_t MaxSize = 0; + /// Available memory blocks + uint32_t NumBlocks = 0; + /// Number of block descriptors + uint32_t NumBlockDesc = 0; + /// Number of block counters + uint32_t NumBlockCounter = 0; + /// List of memory block descriptors + uint64_t *BlockDesc = nullptr; + /// List of memory block counters + uint32_t *BlockCounter = nullptr; +}; + +struct DynamicMemPoolTy { + /// Location of device memory blocks + void *PoolBase = nullptr; + /// Heap size common to all heaps + size_t HeapSize = 0; + /// Number of heaps available + uint32_t NumHeaps = 0; + /// Heap descriptors (using fixed-size array to simplify memory allocation) + DynamicMemHeapTy HeapDesc[8]; +}; + +/// Memory allocation information used in memory allocation/deallocation. +struct MemAllocInfoTy { + /// Base address allocated from compute runtime + void *Base = nullptr; + /// Allocation size known to users/libomptarget + size_t Size = 0; + /// TARGET_ALLOC kind + int32_t Kind = TARGET_ALLOC_DEFAULT; + /// Allocation from pool? + bool InPool = false; + /// Is implicit argument + bool ImplicitArg = false; + + MemAllocInfoTy() = default; + + MemAllocInfoTy(void *_Base, size_t _Size, int32_t _Kind, bool _InPool, + bool _ImplicitArg) + : Base(_Base), Size(_Size), Kind(_Kind), InPool(_InPool), + ImplicitArg(_ImplicitArg) {} +}; + +/// Responsible for all activities involving memory allocation/deallocation. +/// It contains memory pool management, memory allocation bookkeeping. +class MemAllocatorTy { + + /// Simple memory allocation statistics. Maintains numbers for pool allocation + /// and GPU RT allocation. + struct MemStatTy { + size_t Requested[2] = {0, 0}; // Requested bytes + size_t Allocated[2] = {0, 0}; // Allocated bytes + size_t Freed[2] = {0, 0}; // Freed bytes + size_t InUse[2] = {0, 0}; // Current memory in use + size_t PeakUse[2] = {0, 0}; // Peak bytes used + size_t NumAllocs[2] = {0, 0}; // Number of allocations + MemStatTy() = default; + }; + + /// Memory pool which enables reuse of already allocated blocks + /// -- Pool maintains a list of buckets each of which can allocate fixed-size + /// memory. + /// -- Each bucket maintains a list of memory blocks allocated by GPU RT. + /// -- Each memory block can allocate multiple fixed-size memory requested by + /// offload RT or user. + /// -- Memory allocation falls back to GPU RT allocation when the pool size + /// (total memory used by pool) reaches a threshold. 
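  // Editorial sketch (an assumption drawn from the description above, not code
  // from this patch): the bucket scheme routes a request to the first bucket
  // whose chunk size can hold it; chunk sizes are powers of two starting at
  // AllocMin (64B), and requests beyond AllocMax, or arriving once PoolSize
  // has reached PoolSizeMax, fall back to a direct GPU RT allocation. The
  // helper below is hypothetical and simply mirrors the getBucketId() logic
  // defined further down in MemPoolTy.
  static uint32_t exampleBucketIdSketch(size_t Size, size_t AllocMin = 1 << 6) {
    uint32_t Id = 0;
    for (size_t SZ = AllocMin; SZ < Size; ++Id)
      SZ <<= 1; // each successive bucket serves chunks twice as large
    return Id;  // exampleBucketIdSketch(64) == 0, (65) == 1, (1024) == 4
  }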
+ class MemPoolTy { + + /// Memory block maintained in each bucket + struct BlockTy { + /// Base address of this block + uintptr_t Base = 0; + /// Size of the block + size_t Size = 0; + /// Supported allocation size by this block + size_t ChunkSize = 0; + /// Total number of slots + uint32_t NumSlots = 0; + /// Number of slots in use + uint32_t NumUsedSlots = 0; + /// Cached available slot returned by the last dealloc() call + uint32_t FreeSlot = UINT32_MAX; + /// Marker for the currently used slots + std::vector UsedSlots; + + BlockTy(void *_Base, size_t _Size, size_t _ChunkSize) { + Base = reinterpret_cast(_Base); + Size = _Size; + ChunkSize = _ChunkSize; + NumSlots = Size / ChunkSize; + NumUsedSlots = 0; + UsedSlots.resize(NumSlots, false); + } + + /// Check if the current block is fully used + bool isFull() const { return NumUsedSlots == NumSlots; } + + /// Check if the given address belongs to the current block + bool contains(void *Mem) const { + auto M = reinterpret_cast(Mem); + return M >= Base && M < Base + Size; + } + + /// Allocate a single chunk from the block + void *alloc(); + + /// Deallocate the given memory + void dealloc(void *Mem); + }; // BlockTy + + /// Allocation kind for the current pool + int32_t AllocKind = TARGET_ALLOC_DEFAULT; + /// Access to the allocator + MemAllocatorTy *Allocator = nullptr; + /// Minimum supported memory allocation size from pool + size_t AllocMin = 1 << 6; // 64B + /// Maximum supported memory allocation size from pool + size_t AllocMax = 0; + /// Allocation size when the pool needs to allocate a block + size_t AllocUnit = 1 << 16; // 64KB + /// Capacity of each block in the buckets which decides number of + /// allocatable chunks from the block. Each block in the bucket can serve + /// at least BlockCapacity chunks. + /// If ChunkSize * BlockCapacity <= AllocUnit + /// BlockSize = AllocUnit + /// Otherwise, + /// BlockSize = ChunkSize * BlockCapacity + /// This simply means how much memory is over-allocated. + uint32_t BlockCapacity = 0; + /// Total memory allocated from GPU RT for this pool + size_t PoolSize = 0; + /// Maximum allowed pool size. Allocation falls back to GPU RT allocation if + /// when PoolSize reaches PoolSizeMax. + size_t PoolSizeMax = 0; + /// Small allocation size allowed in the pool even if pool size is over the + /// pool size limit + size_t SmallAllocMax = 1024; + /// Small allocation pool size + size_t SmallPoolSize = 0; + /// Small allocation pool size max (4MB) + size_t SmallPoolSizeMax = (4 << 20); + /// List of buckets + std::vector> Buckets; + /// List of bucket parameters + std::vector> BucketParams; + /// Map from allocated pointer to corresponding block. + llvm::DenseMap PtrToBlock; + /// Simple stats counting miss/hit in each bucket. + std::vector> BucketStats; + /// Need to zero-initialize after L0 allocation + bool ZeroInit = false; + + /// Get bucket ID from the specified allocation size. + uint32_t getBucketId(size_t Size) { + uint32_t Count = 0; + for (size_t SZ = AllocMin; SZ < Size; Count++) + SZ <<= 1; + return Count; + } + + public: + MemPoolTy() = default; + + /// Construct pool with allocation kind, allocator, and user options. 
+ MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator, + const L0OptionsTy &Option); + // Used for reduction pool + MemPoolTy(MemAllocatorTy *_Allocator, const L0OptionsTy &Option); + // Used for small memory pool with fixed parameters + MemPoolTy(MemAllocatorTy *_Allocator); + + MemPoolTy(const MemPoolTy &) = delete; + MemPoolTy(MemPoolTy &&) = delete; + MemPoolTy &operator=(const MemPoolTy &) = delete; + MemPoolTy &operator=(const MemPoolTy &&) = delete; + + void printUsage(); + /// Release resources used in the pool. + ~MemPoolTy(); + + /// Allocate the requested size of memory from this pool. + /// AllocSize is the chunk size internally used for the returned memory. + void *alloc(size_t Size, size_t &AllocSize); + /// Deallocate the specified memory and returns block size deallocated. + size_t dealloc(void *Ptr); + }; // MemPoolTy + + /// Allocation information maintained in the plugin + class MemAllocInfoMapTy { + /// Map from allocated pointer to allocation information + std::map Map; + /// Map from target alloc kind to number of implicit arguments + std::map NumImplicitArgs; + + public: + /// Add allocation information to the map + void add(void *Ptr, void *Base, size_t Size, int32_t Kind, + bool InPool = false, bool ImplicitArg = false); + + /// Remove allocation information for the given memory location + bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr); + + /// Finds allocation information for the given memory location + const MemAllocInfoTy *find(void *Ptr) const { + auto AllocInfo = Map.find(Ptr); + if (AllocInfo == Map.end()) + return nullptr; + else + return &AllocInfo->second; + } + + /// Check if the map contains the given pointer and offset + bool contains(const void *Ptr, size_t Size) const { + if (Map.size() == 0) + return false; + auto I = Map.upper_bound(const_cast(Ptr)); + if (I == Map.begin()) + return false; + --I; + bool Ret = (uintptr_t)I->first <= (uintptr_t)Ptr && + (uintptr_t)Ptr + (uintptr_t)Size <= + (uintptr_t)I->first + (uintptr_t)I->second.Size; + return Ret; + } + + /// Returns the number of implicit arguments for the specified allocation + /// kind. 
+    size_t getNumImplicitArgs(int32_t Kind) { return NumImplicitArgs[Kind]; }
+  }; // MemAllocInfoMapTy
+
+  /// L0 context to use
+  const L0ContextTy *L0Context = nullptr;
+  /// L0 device to use
+  L0DeviceTy *Device = nullptr;
+  /// Whether the device supports large memory allocation
+  bool SupportsLargeMem = false;
+  /// Cached max alloc size supported by device
+  uint64_t MaxAllocSize = INT64_MAX;
+  /// Map from allocation kind to memory statistics
+  std::unordered_map Stats;
+  /// Map from allocation kind to memory pool
+  std::unordered_map Pools;
+  /// Memory pool dedicated to reduction scratch space
+  std::unique_ptr ReductionPool;
+  /// Memory pool dedicated to reduction counters
+  std::unique_ptr CounterPool;
+  /// Allocation information map
+  MemAllocInfoMapTy AllocInfo;
+  /// RTL-owned memory that needs to be freed automatically
+  std::vector MemOwned;
+  /// Lock protection
+  std::mutex Mtx;
+  /// Allocator only supports host memory
+  bool IsHostMem = false;
+  // Internal deallocation function to be called when already
+  // holding the Mtx lock
+  Error dealloc_locked(void *Ptr);
+
+public:
+  MemAllocatorTy() = default;
+
+  MemAllocatorTy(const MemAllocatorTy &) = delete;
+  MemAllocatorTy(MemAllocatorTy &&) = delete;
+  MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
+  MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
+
+  /// Release resources and report statistics if requested
+  ~MemAllocatorTy() {
+    if (L0Context)
+      deinit(); // Release resources
+  }
+  void deinit();
+
+  /// Allocator only supports host memory
+  bool supportsHostMem() { return IsHostMem; }
+
+  void initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
+  void initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
+  void updateMaxAllocSize(L0DeviceTy &L0Device);
+
+  /// Allocate memory from L0 GPU RT. We use over-allocation workaround
+  /// to support target pointer with offset, and positive "ActiveSize" is
+  /// specified in such cases for correct debug logging.
+ void *allocL0(size_t Size, size_t Align, int32_t Kind, size_t ActiveSize = 0); + + /// Allocate memory with the specified information from a memory pool + Expected alloc(size_t Size, size_t Align, int32_t Kind, + intptr_t Offset, bool UserAlloc, bool DevMalloc, + uint32_t MemAdvice, AllocOptionTy AllocOpt); + + /// Deallocate memory + Error dealloc(void *Ptr) { + std::lock_guard Lock(Mtx); + return dealloc_locked(Ptr); + } + + /// Check if the given memory location and offset belongs to any allocated + /// memory + bool contains(const void *Ptr, size_t Size) { + std::lock_guard Lock(Mtx); + return AllocInfo.contains(Ptr, Size); + } + + /// Get allocation information for the specified memory location + const MemAllocInfoTy *getAllocInfo(void *Ptr) { + std::lock_guard Lock(Mtx); + return AllocInfo.find(Ptr); + } + + /// Get kernel indirect access flags using implicit argument info + ze_kernel_indirect_access_flags_t getIndirectFlags() { + std::lock_guard Lock(Mtx); + ze_kernel_indirect_access_flags_t Ret = 0; + if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_DEVICE) > 0) + Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE; + if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_HOST) > 0) + Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST; + if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_SHARED) > 0) + Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED; + return Ret; + } + + /// Log memory allocation/deallocation + void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) { + if (Stats.count(Kind) == 0) + return; // Stat is disabled + + auto &ST = Stats[Kind]; + int32_t I = Pool ? 1 : 0; + if (ReqSize > 0) { + ST.Requested[I] += ReqSize; + ST.Allocated[I] += Size; + ST.InUse[I] += Size; + ST.NumAllocs[I]++; + } else { + ST.Freed[I] += Size; + ST.InUse[I] -= Size; + } + ST.PeakUse[I] = (std::max)(ST.PeakUse[I], ST.InUse[I]); + } + + /// Perform copy operation + int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size); + + /// Perform memory fill operation + int32_t enqueueMemSet(void *Dst, int8_t Value, size_t Size); + +}; /// MemAllocatorTy + +// simple generic wrapper to reuse objects +// objects must have zero argument accessible constructor +template class ObjPool { + // Protection + std::unique_ptr Mtx; + // List of Objects + std::list Objects; + +public: + ObjPool() { Mtx.reset(new std::mutex); } + + ObjPool(const ObjPool &) = delete; + ObjPool(ObjPool &) = delete; + ObjPool &operator=(const ObjPool &) = delete; + ObjPool &operator=(const ObjPool &&) = delete; + + ObjTy *get() { + if (!Objects.empty()) { + std::lock_guard Lock(*Mtx); + if (!Objects.empty()) { + const auto Ret = Objects.back(); + Objects.pop_back(); + return Ret; + } + } + return new ObjTy(); + } + + void release(ObjTy *obj) { + std::lock_guard Lock(*Mtx); + Objects.push_back(obj); + } + + ~ObjPool() { + for (auto object : Objects) + delete object; + } +}; + +/// Common event pool used in the plugin. This event pool assumes all events +/// from the pool are host-visible and use the same event pool flag. +class EventPoolTy { + /// Size of L0 event pool created on demand + size_t PoolSize = 64; + + /// Context of the events + ze_context_handle_t Context = nullptr; + + /// Additional event pool flags common to this pull + uint32_t Flags = 0; + + /// Protection + std::unique_ptr Mtx; + + /// List of created L0 event pools + std::list Pools; + + /// List of free L0 events + std::list Events; + +#ifdef OMPT_SUPPORT + /// Event to OMPT record map. 
The timestamp information is recorded to the + /// OMPT record before the event is recycled. + std::unordered_map EventToRecord; +#endif // OMPT_SUPPORT + +public: + /// Initialize context, flags, and mutex + void init(ze_context_handle_t ContextIn, uint32_t FlagsIn) { + Context = ContextIn; + Flags = FlagsIn; + Mtx.reset(new std::mutex); + } + + /// Destroys L0 resources + void deinit() { + for (auto E : Events) + CALL_ZE_RET_VOID(zeEventDestroy, E); + for (auto P : Pools) + CALL_ZE_RET_VOID(zeEventPoolDestroy, P); + } + + /// Get a free event from the pool + ze_event_handle_t getEvent(); + + /// Return an event to the pool + void releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device); +}; + +/// Staging buffer +/// A single staging buffer is not enough when batching is enabled since there +/// can be multiple pending copy operations. +class StagingBufferTy { + /// Context for L0 calls + ze_context_handle_t Context = nullptr; + /// Max allowed size for staging buffer + size_t Size = L0StagingBufferSize; + /// Number of buffers allocated together + size_t Count = L0StagingBufferCount; + /// Buffers increasing by Count if a new buffer is required + llvm::SmallVector Buffers; + /// Next buffer location in the buffers + size_t Offset = 0; + + void *addBuffers() { + ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, + nullptr, 0}; + void *Ret = nullptr; + size_t AllocSize = Size * Count; + CALL_ZE_RET_NULL(zeMemAllocHost, Context, &AllocDesc, AllocSize, + L0DefaultAlignment, &Ret); + Buffers.push_back(Ret); + return Ret; + } + +public: + StagingBufferTy() = default; + StagingBufferTy(const StagingBufferTy &) = delete; + StagingBufferTy(StagingBufferTy &&) = delete; + StagingBufferTy &operator=(const StagingBufferTy &) = delete; + StagingBufferTy &operator=(const StagingBufferTy &&) = delete; + + ~StagingBufferTy() { + if (initialized()) + clear(); + } + + void clear() { + ze_result_t Rc; + (void)Rc; // GCC build compiler thinks Rc is unused for some reason. + for (auto Ptr : Buffers) + CALL_ZE(Rc, zeMemFree, Context, Ptr); + Context = nullptr; + } + + bool initialized() const { return Context != nullptr; } + + void init(ze_context_handle_t ContextIn, size_t SizeIn, size_t CountIn) { + Context = ContextIn; + Size = SizeIn; + Count = CountIn; + } + + void reset() { Offset = 0; } + + /// Always return the first buffer + void *get() { + if (Size == 0 || Count == 0) + return nullptr; + return Buffers.empty() ? addBuffers() : Buffers.front(); + } + + /// Return the next available buffer + void *getNext() { + void *Ret = nullptr; + if (Size == 0 || Count == 0) + return Ret; + + size_t AllocSize = Size * Count; + bool NeedToGrow = Buffers.empty() || Offset >= Buffers.size() * AllocSize; + if (NeedToGrow) + Ret = addBuffers(); + else + Ret = (void *)((uintptr_t)Buffers.back() + (Offset % AllocSize)); + + if (!Ret) + return nullptr; + + Offset += Size; + return Ret; + } + + /// Return either a fixed buffer or next buffer + void *get(bool Next) { return Next ? 
getNext() : get(); } +}; + +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h new file mode 100644 index 0000000000000..459eef312f076 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Options.h @@ -0,0 +1,161 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Level Zero RTL Options support +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H + +#include + +#include "Shared/EnvironmentVar.h" + +#include "L0Defs.h" + +namespace llvm::omp::target::plugin { +/// Command submission mode +enum class CommandModeTy { Sync = 0, Async, AsyncOrdered }; + +/// Specialization constants used for a module compilation. +class SpecConstantsTy { + std::vector ConstantIds; + std::vector ConstantValues; + BumpPtrAllocator &Allocator; + +public: + SpecConstantsTy(BumpPtrAllocator &Allocator) : Allocator(Allocator) {} + SpecConstantsTy(const SpecConstantsTy &) = delete; + SpecConstantsTy(SpecConstantsTy &&) = delete; + SpecConstantsTy &operator=(const SpecConstantsTy &) = delete; + SpecConstantsTy &operator=(const SpecConstantsTy &&) = delete; + SpecConstantsTy(const SpecConstantsTy &&Other) + : ConstantIds(std::move(Other.ConstantIds)), + ConstantValues(std::move(Other.ConstantValues)), + Allocator(Other.Allocator) {} + ~SpecConstantsTy() {} + + template void addConstant(uint32_t Id, T Val) { + T *ValuePtr = + reinterpret_cast(Allocator.Allocate(sizeof(T), alignof(T))); + *ValuePtr = Val; + + ConstantIds.push_back(Id); + ConstantValues.push_back(reinterpret_cast(ValuePtr)); + } + + ze_module_constants_t getModuleConstants() const { + ze_module_constants_t Tmp{static_cast(ConstantValues.size()), + ConstantIds.data(), + // Unfortunately we have to const_cast it. + // L0 data type should probably be fixed. + const_cast(ConstantValues.data())}; + return Tmp; + } +}; + +/// L0 Plugin flags +struct L0OptionFlagsTy { + uint64_t UseMemoryPool : 1; + uint64_t Reserved : 63; + L0OptionFlagsTy() : UseMemoryPool(1), Reserved(0) {} +}; + +struct L0OptionsTy { + /// Binary flags + L0OptionFlagsTy Flags; + + /// Staging buffer size + size_t StagingBufferSize = L0StagingBufferSize; + + /// Staging buffer count + size_t StagingBufferCount = L0StagingBufferCount; + + // TODO: This should probably be an array indexed by AllocKind + /// Memory pool parameters + /// MemPoolInfo[MemType] = {AllocMax(MB), Capacity, PoolSize(MB)} + std::map> MemPoolInfo = { + {TARGET_ALLOC_DEVICE, {1, 4, 256}}, + {TARGET_ALLOC_HOST, {1, 4, 256}}, + {TARGET_ALLOC_SHARED, {8, 4, 256}}}; + + /// Parameters for memory pools dedicated to reduction scratch space + std::array ReductionPoolInfo{256, 8, 8192}; + + /// Oversubscription rate for normal kernels + uint32_t SubscriptionRate = 4; + + /// Loop kernels with known ND-range may be known to have + /// few iterations and they may not exploit the offload device + /// to the fullest extent. 
+  /// Let's assume a device has N total HW threads available,
+  /// and the kernel requires M hardware threads with LWS set to L.
+  /// If (M < N * ThinThreadsThreshold), then we will try
+  /// to iteratively divide L by 2 to increase the number of HW
+  /// threads used for executing the kernel. Effectively, we will
+  /// end up with L less than the kernel's SIMD width, so the HW
+  /// threads will not use all their SIMD lanes. This (presumably) should
+  /// allow more parallelism, because the stalls in the SIMD lanes
+  /// will be distributed across more HW threads, and the probability
+  /// of having a stall (or a sequence of stalls) on a critical path
+  /// in the kernel should decrease.
+  /// Anyway, this is just a heuristic that seems to work well for some
+  /// kernels (which poorly expose parallelism in the first place).
+  double ThinThreadsThreshold = 0.1;
+
+  // Compilation options for IGC
+  // OpenCL 2.0 builtins (like atomic_load_explicit, etc.) are used by the
+  // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
+  // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
+  // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
+  // builtins.
+  static constexpr std::string_view CompilationOptions = "-cl-std=CL2.0 ";
+  static constexpr std::string_view InternalCompilationOptions =
+      "-cl-take-global-address";
+  std::string UserCompilationOptions = "";
+
+  /// Spec constants used for all modules.
+  SpecConstantsTy CommonSpecConstants;
+
+  /// Command execution mode.
+  /// Whether the runtime uses asynchronous mode or not depends on the type of
+  /// devices and whether immediate command list is fully enabled.
+  CommandModeTy CommandMode = CommandModeTy::Async;
+
+  /// Controls if we need to reduce available HW threads. We need this
+  /// adjustment on XeHPG when Level Zero debug is enabled
+  /// (ZET_ENABLE_PROGRAM_DEBUGGING=1).
+  bool ZeDebugEnabled = false;
+
+  bool Init = false; // have the options already been processed
+
+  // Allocator for long-lived allocations (e.g. spec constants)
+  BumpPtrAllocator Allocator;
+
+  L0OptionsTy() : CommonSpecConstants(Allocator) {}
+
+  /// Read environment variables
+  void processEnvironmentVars();
+
+  void init() {
+    if (!Init) {
+      processEnvironmentVars();
+      Init = true;
+    }
+  }
+
+  bool match(const StringEnvar &Var, const llvm::StringRef Matched) {
+    return Matched.equals_insensitive(Var.get());
+  }
+
+}; // L0OptionsTy
+
+} // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
new file mode 100644
index 0000000000000..9fbdafa288592
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -0,0 +1,138 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Plugin interface for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H + +#include "AsyncQueue.h" +#include "L0Defs.h" +#include "L0Device.h" +#include "L0Memory.h" +#include "L0Options.h" +#include "L0Program.h" +#include "TLS.h" + +namespace llvm::omp::target::plugin { + +/// Class implementing the LevelZero specific functionalities of the plugin. +class LevelZeroPluginTy final : public GenericPluginTy { +private: + /// Number of devices available including subdevices + uint32_t NumDevices = 0; + + /// Context (and Driver) specific data + std::list ContextList; + + /// L0 device used by each OpenMP device + using DeviceContainerTy = llvm::SmallVector; + DeviceContainerTy L0Devices; + + // Table containing per-thread information using TLS + L0ThreadTblTy ThreadTLSTable; + // Table containing per-thread information for each device using TLS + L0DeviceTLSTableTy DeviceTLSTable; + // Table containing per-thread information for each Context using TLS + L0ContextTLSTableTy ContextTLSTable; + + /// L0 plugin global options + static L0OptionsTy Options; + + std::mutex GlobalMutex; + + /// Common pool of AsyncQueue + AsyncQueuePoolTy AsyncQueuePool; + + auto &getTLS() { return ThreadTLSTable.get(); } + +public: + LevelZeroPluginTy() : GenericPluginTy(getTripleArch()) {} + virtual ~LevelZeroPluginTy() {} + + auto &getDeviceTLS(int32_t DeviceId) { return DeviceTLSTable.get(DeviceId); } + auto &getContextTLS(ze_context_handle_t Context) { + return ContextTLSTable.get(Context); + } + + static const auto &getOptions() { return Options; } + + auto &getGlobalMutex() { return GlobalMutex; } + + struct DevicesRangeTy { + using iterator = DeviceContainerTy::iterator; + + iterator BeginIt; + iterator EndIt; + + DevicesRangeTy(iterator BeginIt, iterator EndIt) + : BeginIt(BeginIt), EndIt(EndIt) {} + + auto &begin() { return BeginIt; } + auto &end() { return EndIt; } + }; + + auto getDevicesRange() { + return DevicesRangeTy(L0Devices.begin(), L0Devices.end()); + } + + /// Clean-up routine to be invoked by the destructor or + /// LevelZeroPluginTy::deinit. + void closeRTL(); + + /// Find L0 devices and initialize device properties. + /// Returns number of devices reported to omptarget. 
+ int32_t findDevices(); + + L0DeviceTy &getDeviceFromId(int32_t DeviceId) const { + assert("Invalid device ID" && DeviceId >= 0 && + DeviceId < static_cast(L0Devices.size())); + return *L0Devices[DeviceId]; + } + + uint32_t getNumRootDevices() const { return NumDevices; } + + AsyncQueueTy *getAsyncQueue() { + auto *Queue = getTLS().getAsyncQueue(); + if (!Queue) + Queue = AsyncQueuePool.get(); + return Queue; + } + + void releaseAsyncQueue(AsyncQueueTy *Queue) { + if (!Queue) + return; + Queue->reset(); + Queue->InUse = false; + if (!getTLS().releaseAsyncQueue(Queue)) + AsyncQueuePool.release(Queue); + } + + // Plugin interface + + Expected initImpl() override; + Error deinitImpl() override; + GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId, + int32_t NumDevices) override; + GenericGlobalHandlerTy *createGlobalHandler() override; + uint16_t getMagicElfBits() const override; + Triple::ArchType getTripleArch() const override; + const char *getName() const override; + Expected isELFCompatible(uint32_t DeviceId, + StringRef Image) const override; + + Error flushQueueImpl(omp_interop_val_t *Interop) override; + Error syncBarrierImpl(omp_interop_val_t *Interop) override; + Error asyncBarrierImpl(omp_interop_val_t *Interop) override; +}; + +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h new file mode 100644 index 0000000000000..520bfa688a5af --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Program.h @@ -0,0 +1,136 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Level Zero Program abstraction +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H + +#include "L0Kernel.h" + +namespace llvm::omp::target::plugin { + +class L0DeviceTy; + +/// Program data to be initialized by plugin +struct ProgramDataTy { + int Initialized = 0; + int NumDevices = 0; + int DeviceNum = -1; + uint32_t TotalEUs = 0; + uint32_t HWThreadsPerEU = 0; + uintptr_t DynamicMemoryLB = 0; + uintptr_t DynamicMemoryUB = 0; + int DeviceType = 0; + void *DynamicMemPool = nullptr; + int TeamsThreadLimit = 0; +}; + +/// Level Zero program that can contain multiple modules. +class L0ProgramTy : public DeviceImageTy { + /// Handle multiple modules within a single target image + llvm::SmallVector Modules; + + /// Map of kernel names to Modules + std::unordered_map KernelsToModuleMap; + + /// List of kernels built for this image + /// We need to delete them ourselves as the main library is not doing + /// that right now + std::list Kernels; + + /// Module that contains global data including device RTL + ze_module_handle_t GlobalModule = nullptr; + + /// Requires module link + bool RequiresModuleLink = false; + + /// Is this module library + bool IsLibModule = false; + + /// Build a single module with the given image, build option, and format. 
+ int32_t addModule(const size_t Size, const uint8_t *Image, + const std::string_view BuildOption, + ze_module_format_t Format); + /// Read file and return the size of the binary if successful. + size_t readFile(const char *FileName, std::vector &OutFile) const; + void replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device, + std::string &Options) const; + + /// Check if the image should be handled as a library module + void setLibModule(); + + L0DeviceTy &getL0Device() const; + +public: + L0ProgramTy() = delete; + + L0ProgramTy(int32_t ImageId, GenericDeviceTy &Device, + std::unique_ptr Image) + : DeviceImageTy(ImageId, Device, std::move(Image)) {} + + ~L0ProgramTy(); + + L0ProgramTy(const L0ProgramTy &other) = delete; + L0ProgramTy(L0ProgramTy &&) = delete; + L0ProgramTy &operator=(const L0ProgramTy &) = delete; + L0ProgramTy &operator=(const L0ProgramTy &&) = delete; + + static L0ProgramTy &makeL0Program(DeviceImageTy &Device) { + return static_cast(Device); + } + + /// Build modules from the target image description + int32_t buildModules(const std::string_view BuildOptions); + + /// Link modules stored in \p Modules. + int32_t linkModules(); + + /// Loads the kernels names from all modules + int32_t loadModuleKernels(); + + /// Read data from the location in the device image which corresponds to the + /// specified global variable name. + int32_t readGlobalVariable(const char *Name, size_t Size, void *HostPtr); + + /// Write data to the location in the device image which corresponds to the + /// specified global variable name. + int32_t writeGlobalVariable(const char *Name, size_t Size, + const void *HostPtr); + + /// Looks up an OpenMP declare target global variable with the given + /// \p Name and \p Size in the device environment for the current device. + /// The lookup is first done via the device offload table. If it fails, + /// then the lookup falls back to non-OpenMP specific lookup on the device. + void *getOffloadVarDeviceAddr(const char *Name) const; + + /// Returns the handle of a module that contains a given Kernel name + ze_module_handle_t findModuleFromKernelName(const char *KernelName) const { + auto K = KernelsToModuleMap.find(std::string(KernelName)); + if (K == KernelsToModuleMap.end()) + return nullptr; + + return K->second; + } + + void addKernel(L0KernelTy *Kernel) { Kernels.push_back(Kernel); } +}; + +struct L0GlobalHandlerTy final : public GenericGlobalHandlerTy { + Error getGlobalMetadataFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &DeviceGlobal) override; +}; + +bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer, + uint64_t &MinorVer); +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h new file mode 100644 index 0000000000000..0faa76171cbc9 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h @@ -0,0 +1,189 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Code for tracing L0 +// +//===----------------------------------------------------------------------===// +// clang-format off +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H + +#include "Shared/Debug.h" +#include "omptarget.h" +#include +#include + +#define STR(x) #x +#define TO_STRING(x) STR(x) + +#define DPCALL(...) \ + do { \ + if (getDebugLevel() > 1) \ + DP(__VA_ARGS__); \ + } while (0) + +#define WARNING(...) \ + do { \ + fprintf(stderr, "%s --> ", DEBUG_PREFIX); \ + fprintf(stderr, "Warning: " __VA_ARGS__); \ + } while (0) + +#define INVALID_OPTION(Name, Value) \ + WARNING("Ignoring invalid option " #Name "=%s\n", Value) + +#define CALL_ZE(Rc, Fn, ...) \ + do { \ + Rc = Fn(__VA_ARGS__); \ + } while (0) + +#define CALL_ZE_RC(Rc, Fn, ...) \ + do { \ + CALL_ZE(Rc, Fn, __VA_ARGS__); \ + if (Rc != ZE_RESULT_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, Rc, \ + getZeErrorName(Rc)); \ + } \ + } while(0) + +/// For non-thread-safe functions +#define CALL_ZE_RET_MTX(Ret, Fn, Mtx, ...) \ + do { \ + Mtx.lock(); \ + ze_result_t rc; \ + CALL_ZE(rc, Fn, __VA_ARGS__); \ + Mtx.unlock(); \ + if (rc != ZE_RESULT_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \ + getZeErrorName(rc)); \ + return Ret; \ + } \ + } while (0) + +#define CALL_ZE_RET_FAIL_MTX(Fn, Mtx, ...) \ + CALL_ZE_RET_MTX(OFFLOAD_FAIL, Fn, Mtx, __VA_ARGS__) +#define CALL_ZE_RET_NULL_MTX(Fn, Mtx, ...) \ + CALL_ZE_RET_MTX(NULL, Fn, Mtx, __VA_ARGS__) +#define CALL_ZE_RET_ZERO_MTX(Fn, Mtx, ...) \ + CALL_ZE_RET_MTX(0, Fn, Mtx, __VA_ARGS__) + +/// For thread-safe functions +#define CALL_ZE_RET(Ret, Fn, ...) \ + do { \ + ze_result_t rc; \ + CALL_ZE(rc, Fn, __VA_ARGS__); \ + if (rc != ZE_RESULT_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \ + getZeErrorName(rc)); \ + return Ret; \ + } \ + } while (0) + +#define CALL_ZE_RET_FAIL(Fn, ...) CALL_ZE_RET(OFFLOAD_FAIL, Fn, __VA_ARGS__) +#define CALL_ZE_RET_NULL(Fn, ...) CALL_ZE_RET(NULL, Fn, __VA_ARGS__) +#define CALL_ZE_RET_ZERO(Fn, ...) CALL_ZE_RET(0, Fn, __VA_ARGS__) +#define CALL_ZE_RET_VOID(Fn, ...) CALL_ZE_RET(, Fn, __VA_ARGS__) +#define CALL_ZE_RET_ERROR(Fn, ...) \ + CALL_ZE_RET( \ + Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s", \ + STR(Fn), rc, getZeErrorName(rc)), Fn, __VA_ARGS__) + + + +#define CALL_ZE_RET_FAIL_MSG(Fn, Dev, ...) \ + do { \ + ze_result_t rc; \ + CALL_ZE(rc, Fn, __VA_ARGS__); \ + if (rc != ZE_RESULT_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \ + getZeErrorName(rc)); \ + const char *err_str = nullptr; \ + rc = zeDriverGetLastErrorDescription( \ + Dev.getDriverHandle(), &err_str); \ + fprintf(stderr, "Error: %s:%s failed with %s\n", __func__, #Fn, \ + err_str); \ + } \ + } while (0) + +#define CALL_ZE_EXIT_FAIL(Fn, ...) \ + do { \ + ze_result_t rc; \ + CALL_ZE(rc, Fn, __VA_ARGS__); \ + if (rc != ZE_RESULT_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \ + getZeErrorName(rc)); \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CALL_ZE_EXT_SILENT_RET(Device, Ret, Name, ...) 
\ + do { \ + ze_result_t rc; \ + CALL_ZE_EXT_SILENT(Device, rc, Name, __VA_ARGS__); \ + if (rc != ZE_RESULT_SUCCESS) \ + return Ret; \ + } while (0) + + +#define CALL_ZE_EXT_RET_ERROR(Device, Name, ...) \ + CALL_ZE_EXT_SILENT_RET(Device, \ + Plugin::error(ErrorCode::UNKNOWN, "%s failed with code %d, %s", \ + STR(Name), rc, getZeErrorName(rc)), Name, __VA_ARGS__) + +#define FOREACH_ZE_ERROR_CODE(Fn) \ + Fn(ZE_RESULT_SUCCESS) \ + Fn(ZE_RESULT_NOT_READY) \ + Fn(ZE_RESULT_ERROR_DEVICE_LOST) \ + Fn(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY) \ + Fn(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) \ + Fn(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) \ + Fn(ZE_RESULT_ERROR_MODULE_LINK_FAILURE) \ + Fn(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET) \ + Fn(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE) \ + Fn(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS) \ + Fn(ZE_RESULT_ERROR_NOT_AVAILABLE) \ + Fn(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE) \ + Fn(ZE_RESULT_WARNING_DROPPED_DATA) \ + Fn(ZE_RESULT_ERROR_UNINITIALIZED) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_VERSION) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) \ + Fn(ZE_RESULT_ERROR_INVALID_ARGUMENT) \ + Fn(ZE_RESULT_ERROR_INVALID_NULL_HANDLE) \ + Fn(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE) \ + Fn(ZE_RESULT_ERROR_INVALID_NULL_POINTER) \ + Fn(ZE_RESULT_ERROR_INVALID_SIZE) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_SIZE) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT) \ + Fn(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT) \ + Fn(ZE_RESULT_ERROR_INVALID_ENUMERATION) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT) \ + Fn(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY) \ + Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME) \ + Fn(ZE_RESULT_ERROR_INVALID_KERNEL_NAME) \ + Fn(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) \ + Fn(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION) \ + Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION) \ + Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX) \ + Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE) \ + Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE) \ + Fn(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED) \ + Fn(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE) \ + Fn(ZE_RESULT_ERROR_OVERLAPPING_REGIONS) \ + Fn(ZE_RESULT_WARNING_ACTION_REQUIRED) \ + Fn(ZE_RESULT_ERROR_UNKNOWN) + +#define CASE_TO_STRING(Num) case Num: return #Num; +inline const char *getZeErrorName(int32_t Error) { + switch (Error) { + FOREACH_ZE_ERROR_CODE(CASE_TO_STRING) + default: + return "ZE_RESULT_ERROR_UNKNOWN"; + } +} + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H diff --git a/offload/plugins-nextgen/level_zero/include/TLS.h b/offload/plugins-nextgen/level_zero/include/TLS.h new file mode 100644 index 0000000000000..257ada0b33b37 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/TLS.h @@ -0,0 +1,82 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Thread Level Storage abstraction +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H + +#include "AsyncQueue.h" +#include "L0Memory.h" +#include "L0Trace.h" +#include "PerThreadTable.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +/// All thread-local data used by the Plugin +class L0ThreadTLSTy { + /// Async info tracking + static constexpr int32_t PerThreadQueues = 10; + AsyncQueueTy AsyncQueues[PerThreadQueues]; + int32_t UsedQueues = 0; + +public: + L0ThreadTLSTy() = default; + L0ThreadTLSTy(const L0ThreadTLSTy &) = delete; + L0ThreadTLSTy(L0ThreadTLSTy &&) = delete; + L0ThreadTLSTy &operator=(const L0ThreadTLSTy &) = delete; + L0ThreadTLSTy &operator=(const L0ThreadTLSTy &&) = delete; + ~L0ThreadTLSTy() {} + + void clear() {} + + AsyncQueueTy *getAsyncQueue() { + AsyncQueueTy *ret = nullptr; + if (UsedQueues < PerThreadQueues) { + // there's a free queue in this thread, find it + for (int32_t q = 0; q < PerThreadQueues; q++) { + if (!AsyncQueues[q].InUse) { + UsedQueues++; + ret = &AsyncQueues[q]; + break; + } + } + assert(ret && "A queue should have been found!"); + ret->InUse = true; + } + return ret; + } + + bool releaseAsyncQueue(AsyncQueueTy *queue) { + if (queue >= &AsyncQueues[0] && queue < &AsyncQueues[PerThreadQueues]) { + // it's a local queue + queue->InUse = false; + UsedQueues--; + return true; + } + return false; + } +}; + +struct L0ThreadTblTy : public PerThread { + void clear() { + PerThread::clear([](auto &Entry) { Entry.clear(); }); + } +}; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H diff --git a/offload/plugins-nextgen/level_zero/src/L0Context.cpp b/offload/plugins-nextgen/level_zero/src/L0Context.cpp new file mode 100644 index 0000000000000..3f50ffd2a7260 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/src/L0Context.cpp @@ -0,0 +1,41 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Context abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Context.h"
+#include "L0Plugin.h"
+
+namespace llvm::omp::target::plugin {
+
+L0ContextTy::L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
+                         int32_t /*DriverId*/)
+    : Plugin(Plugin), zeDriver(zeDriver) {
+  CALL_ZE_RET_VOID(zeDriverGetApiVersion, zeDriver, &APIVersion);
+  DP("Driver API version is %" PRIx32 "\n", APIVersion);
+
+  ze_context_desc_t Desc{ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+  CALL_ZE_RET_VOID(zeContextCreate, zeDriver, &Desc, &zeContext);
+
+  EventPool.init(zeContext, 0);
+  HostMemAllocator.initHostPool(*this, Plugin.getOptions());
+}
+
+StagingBufferTy &L0ContextTy::getStagingBuffer() {
+  auto &TLS = Plugin.getContextTLS(getZeContext());
+  auto &Buffer = TLS.getStagingBuffer();
+  const auto &Options = Plugin.getOptions();
+  if (!Buffer.initialized())
+    Buffer.init(getZeContext(), Options.StagingBufferSize,
+                Options.StagingBufferCount);
+  return Buffer;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
new file mode 100644
index 0000000000000..715de0d1b3c12
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -0,0 +1,1079 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericDevice instantiation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Device.h"
+#include "L0Defs.h"
+#include "L0Interop.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+L0DeviceTLSTy &L0DeviceTy::getTLS() {
+  return getPlugin().getDeviceTLS(getDeviceId());
+}
+
+// clang-format off
+/// Mapping from device arch to GPU runtime's device identifiers
+static struct {
+  DeviceArchTy arch;
+  PCIIdTy ids[10];
+} DeviceArchMap[] = {{DeviceArchTy::DeviceArch_Gen,
+                      {PCIIdTy::SKL,
+                       PCIIdTy::KBL,
+                       PCIIdTy::CFL, PCIIdTy::CFL_2,
+                       PCIIdTy::ICX,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Gen,
+                      {PCIIdTy::TGL, PCIIdTy::TGL_2,
+                       PCIIdTy::DG1,
+                       PCIIdTy::RKL,
+                       PCIIdTy::ADLS,
+                       PCIIdTy::RTL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeLPG,
+                      {PCIIdTy::MTL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeHPC,
+                      {PCIIdTy::PVC,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeHPG,
+                      {PCIIdTy::DG2_ATS_M,
+                       PCIIdTy::DG2_ATS_M_2,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Xe2LP,
+                      {PCIIdTy::LNL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Xe2HP,
+                      {PCIIdTy::BMG,
+                       PCIIdTy::None}},
+};
+constexpr int DeviceArchMapSize = sizeof(DeviceArchMap) / sizeof(DeviceArchMap[0]);
+// clang-format on
+
+DeviceArchTy L0DeviceTy::computeArch() const {
+  const auto PCIDeviceId = getPCIId();
+  if (PCIDeviceId != 0) {
+    for (int ArchIndex = 0; ArchIndex < DeviceArchMapSize; ArchIndex++) {
+      for (int i = 0;; i++) {
+        const auto Id = DeviceArchMap[ArchIndex].ids[i];
+        if (Id == PCIIdTy::None)
+          break;
+
+        auto
maskedId = static_cast(PCIDeviceId & 0xFF00); + if (maskedId == Id) + return DeviceArchMap[ArchIndex].arch; // Exact match or prefix match + } + } + } + + DP("Warning: Cannot decide device arch for %s.\n", getNameCStr()); + return DeviceArchTy::DeviceArch_None; +} + +bool L0DeviceTy::isDeviceIPorNewer(uint32_t Version) const { + ze_device_ip_version_ext_t IPVersion{}; + IPVersion.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT; + IPVersion.pNext = nullptr; + ze_device_properties_t DevicePR{}; + DevicePR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + DevicePR.pNext = &IPVersion; + CALL_ZE_RET(false, zeDeviceGetProperties, zeDevice, &DevicePR); + return IPVersion.ipVersion >= Version; +} + +/// Get default compute group ordinal. Returns Ordinal-NumQueues pair +std::pair L0DeviceTy::findComputeOrdinal() { + std::pair Ordinal{UINT32_MAX, 0}; + uint32_t Count = 0; + const auto zeDevice = getZeDevice(); + CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count, + nullptr); + ze_command_queue_group_properties_t Init{ + ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0}; + std::vector Properties(Count, Init); + CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count, + Properties.data()); + for (uint32_t I = 0; I < Count; I++) { + // TODO: add a separate set of ordinals for compute queue groups which + // support cooperative kernels + if (Properties[I].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { + Ordinal.first = I; + Ordinal.second = Properties[I].numQueues; + break; + } + } + if (Ordinal.first == UINT32_MAX) + DP("Error: no command queues are found\n"); + + return Ordinal; +} + +/// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair +std::pair L0DeviceTy::findCopyOrdinal(bool LinkCopy) { + std::pair Ordinal{UINT32_MAX, 0}; + uint32_t Count = 0; + const auto zeDevice = getZeDevice(); + CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count, + nullptr); + ze_command_queue_group_properties_t Init{ + ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0}; + std::vector Properties(Count, Init); + CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count, + Properties.data()); + + for (uint32_t I = 0; I < Count; I++) { + const auto &Flags = Properties[I].flags; + if ((Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) && + (Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) { + auto NumQueues = Properties[I].numQueues; + if (LinkCopy && NumQueues > 1) { + Ordinal = {I, NumQueues}; + DP("Found link copy command queue for device " DPxMOD + ", ordinal = %" PRIu32 ", number of queues = %" PRIu32 "\n", + DPxPTR(zeDevice), Ordinal.first, Ordinal.second); + break; + } else if (!LinkCopy && NumQueues == 1) { + Ordinal = {I, NumQueues}; + DP("Found copy command queue for device " DPxMOD ", ordinal = %" PRIu32 + "\n", + DPxPTR(zeDevice), Ordinal.first); + break; + } + } + } + return Ordinal; +} + +void L0DeviceTy::reportDeviceInfo() const { + DP("Device %" PRIu32 "\n", DeviceId); + DP("-- Name : %s\n", getNameCStr()); + DP("-- PCI ID : 0x%" PRIx32 "\n", getPCIId()); + DP("-- UUID : %s\n", getUuid().data()); + DP("-- Number of total EUs : %" PRIu32 "\n", getNumEUs()); + DP("-- Number of threads per EU : %" PRIu32 "\n", getNumThreadsPerEU()); + DP("-- EU SIMD width : %" PRIu32 "\n", getSIMDWidth()); + DP("-- Number of EUs per subslice : %" PRIu32 "\n", getNumEUsPerSubslice()); + DP("-- Number of subslices per slice: %" PRIu32 "\n", + 
getNumSubslicesPerSlice()); + DP("-- Number of slices : %" PRIu32 "\n", getNumSlices()); + DP("-- Local memory size (bytes) : %" PRIu32 "\n", + getMaxSharedLocalMemory()); + DP("-- Global memory size (bytes) : %" PRIu64 "\n", getGlobalMemorySize()); + DP("-- Cache size (bytes) : %" PRIu64 "\n", getCacheSize()); + DP("-- Max clock frequency (MHz) : %" PRIu32 "\n", getClockRate()); +} + +Error L0DeviceTy::internalInit() { + const auto &Options = getPlugin().getOptions(); + + uint32_t Count = 1; + const auto zeDevice = getZeDevice(); + CALL_ZE_RET_ERROR(zeDeviceGetProperties, zeDevice, &DeviceProperties); + CALL_ZE_RET_ERROR(zeDeviceGetComputeProperties, zeDevice, &ComputeProperties); + CALL_ZE_RET_ERROR(zeDeviceGetMemoryProperties, zeDevice, &Count, + &MemoryProperties); + CALL_ZE_RET_ERROR(zeDeviceGetCacheProperties, zeDevice, &Count, + &CacheProperties); + + DeviceName = + std::string(DeviceProperties.name, sizeof(DeviceProperties.name)); + + DP("Found a GPU device, Name = %s\n", DeviceProperties.name); + + DeviceArch = computeArch(); + // Default allocation kind for this device + AllocKind = isDiscreteDevice() ? TARGET_ALLOC_DEVICE : TARGET_ALLOC_SHARED; + + ze_kernel_indirect_access_flags_t Flags = + (AllocKind == TARGET_ALLOC_DEVICE) + ? ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE + : ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED; + IndirectAccessFlags = Flags; + + // Get the UUID + std::string uid = ""; + for (int n = 0; n < ZE_MAX_DEVICE_UUID_SIZE; n++) + uid += std::to_string(DeviceProperties.uuid.id[n]); + DeviceUuid = std::move(uid); + + ComputeOrdinal = findComputeOrdinal(); + + CopyOrdinal = findCopyOrdinal(); + + LinkCopyOrdinal = findCopyOrdinal(true); + IsAsyncEnabled = + isDiscreteDevice() && Options.CommandMode != CommandModeTy::Sync; + MemAllocator.initDevicePools(*this, getPlugin().getOptions()); + l0Context.getHostMemAllocator().updateMaxAllocSize(*this); + return Plugin::success(); +} + +Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) { + return Plugin::success(); +} + +int32_t L0DeviceTy::synchronize(__tgt_async_info *AsyncInfo, + bool ReleaseQueue) { + bool IsAsync = AsyncInfo && asyncEnabled(); + if (!IsAsync) + return OFFLOAD_SUCCESS; + + auto &Plugin = getPlugin(); + + AsyncQueueTy *AsyncQueue = (AsyncQueueTy *)AsyncInfo->Queue; + + if (!AsyncQueue->WaitEvents.empty()) { + const auto &WaitEvents = AsyncQueue->WaitEvents; + if (Plugin.getOptions().CommandMode == CommandModeTy::AsyncOrdered) { + // Only need to wait for the last event + CALL_ZE_RET_FAIL(zeEventHostSynchronize, WaitEvents.back(), UINT64_MAX); + // Synchronize on kernel event to support printf() + auto KE = AsyncQueue->KernelEvent; + if (KE && KE != WaitEvents.back()) { + CALL_ZE_RET_FAIL(zeEventHostSynchronize, KE, UINT64_MAX); + } + for (auto &Event : WaitEvents) { + releaseEvent(Event); + } + } else { // Async + // Wait for all events. We should wait and reset events in reverse order + // to avoid premature event reset. If we have a kernel event in the + // queue, it is the last event to wait for since all wait events of the + // kernel are signaled before the kernel is invoked. We always invoke + // synchronization on kernel event to support printf(). 
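// As an illustration of the reverse-order logic below (event names are
// illustrative, not from the patch): with WaitEvents = {E0, E1, K, E2} and K
// being the kernel event, the loop synchronizes on E2 and then on K and stops
// synchronizing there; E0 and E1 are only released, since the kernel already
// waited on them before it was launched. Every event is handed back through
// releaseEvent().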
+ bool WaitDone = false; + for (auto Itr = WaitEvents.rbegin(); Itr != WaitEvents.rend(); Itr++) { + if (!WaitDone) { + CALL_ZE_RET_FAIL(zeEventHostSynchronize, *Itr, UINT64_MAX); + if (*Itr == AsyncQueue->KernelEvent) + WaitDone = true; + } + releaseEvent(*Itr); + } + } + } + + // Commit delayed USM2M copies + for (auto &USM2M : AsyncQueue->USM2MList) { + std::copy_n(static_cast(std::get<0>(USM2M)), + std::get<2>(USM2M), static_cast(std::get<1>(USM2M))); + } + // Commit delayed H2M copies + for (auto &H2M : AsyncQueue->H2MList) { + std::copy_n(static_cast(std::get<0>(H2M)), std::get<2>(H2M), + static_cast(std::get<1>(H2M))); + } + if (ReleaseQueue) { + Plugin.releaseAsyncQueue(AsyncQueue); + getStagingBuffer().reset(); + AsyncInfo->Queue = nullptr; + } + return OFFLOAD_SUCCESS; +} + +int32_t L0DeviceTy::submitData(void *TgtPtr, const void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfo) { + if (Size == 0) + return OFFLOAD_SUCCESS; + + auto &Plugin = getPlugin(); + + const auto DeviceId = getDeviceId(); + bool IsAsync = AsyncInfo && asyncEnabled(); + if (IsAsync && !AsyncInfo->Queue) { + AsyncInfo->Queue = reinterpret_cast(Plugin.getAsyncQueue()); + if (!AsyncInfo->Queue) + IsAsync = false; // Couldn't get a queue, revert to sync + } + const auto TgtPtrType = getMemAllocType(TgtPtr); + if (TgtPtrType == ZE_MEMORY_TYPE_SHARED || + TgtPtrType == ZE_MEMORY_TYPE_HOST) { + std::copy_n(static_cast(HstPtr), Size, + static_cast(TgtPtr)); + } else { + const void *SrcPtr = HstPtr; + if (isDiscreteDevice() && + static_cast(Size) <= Plugin.getOptions().StagingBufferSize && + getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) { + SrcPtr = getStagingBuffer().get(IsAsync); + std::copy_n(static_cast(HstPtr), Size, + static_cast(const_cast(SrcPtr))); + } + int32_t RC; + if (IsAsync) + RC = enqueueMemCopyAsync(TgtPtr, SrcPtr, Size, AsyncInfo); + else + RC = enqueueMemCopy(TgtPtr, SrcPtr, Size, AsyncInfo); + if (RC != OFFLOAD_SUCCESS) + return RC; + } + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "%s %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(HstPtr), + DPxPTR(TgtPtr)); + + return OFFLOAD_SUCCESS; +} + +int32_t L0DeviceTy::retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfo) { + if (Size == 0) + return OFFLOAD_SUCCESS; + + auto &Plugin = getPlugin(); + const auto DeviceId = getDeviceId(); + bool IsAsync = AsyncInfo && asyncEnabled(); + if (IsAsync && !AsyncInfo->Queue) { + AsyncInfo->Queue = Plugin.getAsyncQueue(); + if (!AsyncInfo->Queue) + IsAsync = false; // Couldn't get a queue, revert to sync + } + auto AsyncQueue = + IsAsync ? static_cast(AsyncInfo->Queue) : nullptr; + auto TgtPtrType = getMemAllocType(TgtPtr); + if (TgtPtrType == ZE_MEMORY_TYPE_HOST || + TgtPtrType == ZE_MEMORY_TYPE_SHARED) { + bool CopyNow = true; + if (IsAsync) { + if (AsyncQueue->KernelEvent) { + // Delay Host/Shared USM to host memory copy since it must wait for + // kernel completion. 
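// (The delayed USM2M entries recorded here are committed later, in
// synchronize() above or in queryAsyncImpl(), once the kernel event has
// completed.)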
+ AsyncQueue->USM2MList.emplace_back(TgtPtr, HstPtr, Size); + CopyNow = false; + } + } + if (CopyNow) { + std::copy_n(static_cast(TgtPtr), Size, + static_cast(HstPtr)); + } + } else { + void *DstPtr = HstPtr; + if (isDiscreteDevice() && + static_cast(Size) <= + getPlugin().getOptions().StagingBufferSize && + getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) { + DstPtr = getStagingBuffer().get(IsAsync); + } + int32_t RC; + if (IsAsync) + RC = enqueueMemCopyAsync(DstPtr, TgtPtr, Size, AsyncInfo, + /* CopyTo */ false); + else + RC = enqueueMemCopy(DstPtr, TgtPtr, Size, AsyncInfo); + if (RC != OFFLOAD_SUCCESS) + return RC; + if (DstPtr != HstPtr) { + if (IsAsync) { + // Store delayed H2M data copies + auto &H2MList = AsyncQueue->H2MList; + H2MList.emplace_back(DstPtr, HstPtr, static_cast(Size)); + } else { + std::copy_n(static_cast(DstPtr), Size, + static_cast(HstPtr)); + } + } + } + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "%s %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(TgtPtr), + DPxPTR(HstPtr)); + + return OFFLOAD_SUCCESS; +} + +Expected +L0DeviceTy::loadBinaryImpl(std::unique_ptr &&TgtImage, + int32_t ImageId) { + auto *PGM = getProgramFromImage(TgtImage->getMemBufferRef()); + if (PGM) { + // Program already exists + return PGM; + } + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(), + "Device %" PRId32 ": Loading binary from " DPxMOD "\n", getDeviceId(), + DPxPTR(TgtImage->getBufferStart())); + + const auto &Options = getPlugin().getOptions(); + std::string CompilationOptions(Options.CompilationOptions); + CompilationOptions += " " + Options.UserCompilationOptions; + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(), + "Base L0 module compilation options: %s\n", CompilationOptions.c_str()); + + CompilationOptions += " "; + CompilationOptions += Options.InternalCompilationOptions; + auto &Program = addProgram(ImageId, std::move(TgtImage)); + + int32_t RC = Program.buildModules(CompilationOptions); + if (RC != OFFLOAD_SUCCESS) + return Plugin::check(RC, "Error in buildModules %d", RC); + + RC = Program.linkModules(); + if (RC != OFFLOAD_SUCCESS) + return Plugin::check(RC, "Error in linkModules %d", RC); + + RC = Program.loadModuleKernels(); + if (RC != OFFLOAD_SUCCESS) + return Plugin::check(RC, "Error in buildKernels %d", RC); + + return &Program; +} + +Error L0DeviceTy::unloadBinaryImpl(DeviceImageTy *Image) { + // Ignoring for now + // TODO: call properly L0Program unload + return Plugin::success(); +} + +Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo, + bool ReleaseQueue) { + if (!ReleaseQueue) { + return Plugin::error(ErrorCode::UNIMPLEMENTED, + "Support for ReleaseQueue=false in %s" + " not implemented yet\n", + __func__); + } + int32_t RC = synchronize(&AsyncInfo, ReleaseQueue); + return Plugin::check(RC, "Error in synchronizeImpl %d", RC); +} + +Expected +L0DeviceTy::hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) { + auto &AsyncInfo = *static_cast<__tgt_async_info *>(AsyncInfoWrapper); + const bool IsAsync = AsyncInfo.Queue && asyncEnabled(); + if (!IsAsync) + return false; + + auto *AsyncQueue = static_cast(AsyncInfo.Queue); + + if (AsyncQueue->WaitEvents.empty()) + return false; + + return true; +} + +Error L0DeviceTy::queryAsyncImpl(__tgt_async_info &AsyncInfo) { + const bool IsAsync = AsyncInfo.Queue && asyncEnabled(); + if (!IsAsync) + return Plugin::success(); + + auto &Plugin = getPlugin(); + auto *AsyncQueue = static_cast(AsyncInfo.Queue); + + if (!AsyncQueue->WaitEvents.empty()) + 
return Plugin::success(); + + // Commit delayed USM2M copies + for (auto &USM2M : AsyncQueue->USM2MList) { + std::copy_n(static_cast(std::get<0>(USM2M)), + std::get<2>(USM2M), static_cast(std::get<1>(USM2M))); + } + // Commit delayed H2M copies + for (auto &H2M : AsyncQueue->H2MList) { + std::copy_n(static_cast(std::get<0>(H2M)), std::get<2>(H2M), + static_cast(std::get<1>(H2M))); + } + Plugin.releaseAsyncQueue(AsyncQueue); + getStagingBuffer().reset(); + AsyncInfo.Queue = nullptr; + + return Plugin::success(); +} + +Expected L0DeviceTy::allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind) { + return dataAlloc(Size, /*Align=*/0, Kind, + /*Offset=*/0, /*UserAlloc=*/HstPtr == nullptr, + /*DevMalloc=*/false); +} + +Error L0DeviceTy::free(void *TgtPtr, TargetAllocTy Kind) { + return dataDelete(TgtPtr); +} + +Error L0DeviceTy::dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) { + int32_t RC = submitData(TgtPtr, HstPtr, Size, AsyncInfoWrapper); + return Plugin::check(RC, "Error in dataSubmitImpl %d", RC); +} + +Error L0DeviceTy::dataRetrieveImpl(void *HstPtr, const void *TgtPtr, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) { + int32_t RC = retrieveData(HstPtr, TgtPtr, Size, AsyncInfoWrapper); + return Plugin::check(RC, "Error in dataRetrieveImpl %d", RC); +} + +Error L0DeviceTy::dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) { + + L0DeviceTy &L0DstDev = L0DeviceTy::makeL0Device(DstDev); + // Use copy engine only for across-tile/device copies. + const bool UseCopyEngine = getZeDevice() != L0DstDev.getZeDevice(); + + if (asyncEnabled() && AsyncInfoWrapper.hasQueue()) { + if (enqueueMemCopyAsync(DstPtr, SrcPtr, Size, + (__tgt_async_info *)AsyncInfoWrapper)) + return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed"); + } else { + if (enqueueMemCopy(DstPtr, SrcPtr, Size, + /* AsyncInfo */ nullptr, UseCopyEngine)) + return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed"); + } + return Plugin::success(); +} + +Error L0DeviceTy::initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) { + AsyncQueueTy *Queue = AsyncInfoWrapper.getQueueAs(); + if (!Queue) { + Queue = getPlugin().getAsyncQueue(); + AsyncInfoWrapper.setQueueAs(Queue); + } + return Plugin::success(); +} + +Error L0DeviceTy::initDeviceInfoImpl(__tgt_device_info *Info) { + if (!Info->Context) + Info->Context = getZeContext(); + if (!Info->Device) + Info->Device = reinterpret_cast(getZeDevice()); + return Plugin::success(); +} + +static const char *DriverVersionToStrTable[] = { + "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", + "1.7", "1.8", "1.9", "1.10", "1.11", "1.12"}; +constexpr size_t DriverVersionToStrTableSize = + sizeof(DriverVersionToStrTable) / sizeof(DriverVersionToStrTable[0]); + +Expected L0DeviceTy::obtainInfoImpl() { + InfoTreeNode Info; + Info.add("Device Number", getDeviceId()); + Info.add("Device Name", getNameCStr(), "", DeviceInfo::NAME); + Info.add("Device Type", "GPU", "", DeviceInfo::TYPE); + Info.add("Vendor", "Intel", "", DeviceInfo::VENDOR); + Info.add("Vendor ID", getVendorId(), "", DeviceInfo::VENDOR_ID); + auto DriverVersion = getDriverAPIVersion(); + if (DriverVersion < DriverVersionToStrTableSize) + Info.add("Driver Version", DriverVersionToStrTable[DriverVersion], "", + DeviceInfo::DRIVER_VERSION); + else + Info.add("Driver Version", "Unknown", "", DeviceInfo::DRIVER_VERSION); + Info.add("Device PCI ID", getPCIId()); + Info.add("Device 
UUID", getUuid().data()); + Info.add("Number of total EUs", getNumEUs(), "", + DeviceInfo::NUM_COMPUTE_UNITS); + Info.add("Number of threads per EU", getNumThreadsPerEU()); + Info.add("EU SIMD width", getSIMDWidth()); + Info.add("Number of EUs per subslice", getNumEUsPerSubslice()); + Info.add("Number of subslices per slice", getNumSubslicesPerSlice()); + Info.add("Number of slices", getNumSlices()); + Info.add("Max Group size", getMaxGroupSize(), "", + DeviceInfo::MAX_WORK_GROUP_SIZE); + Info.add("Local memory size (bytes)", getMaxSharedLocalMemory()); + Info.add("Global memory size (bytes)", getGlobalMemorySize(), "", + DeviceInfo::GLOBAL_MEM_SIZE); + Info.add("Cache size (bytes)", getCacheSize()); + Info.add("Max Memory Allocation Size (bytes)", getMaxMemAllocSize(), "", + DeviceInfo::MAX_MEM_ALLOC_SIZE); + Info.add("Max clock frequency (MHz)", getClockRate(), "", + DeviceInfo::MAX_CLOCK_FREQUENCY); + return Info; +} + +Expected L0DeviceTy::constructKernel(const char *Name) { + // Allocate and construct the L0 kernel. + L0KernelTy *L0Kernel = getPlugin().allocate(); + if (!L0Kernel) + return Plugin::error(ErrorCode::UNKNOWN, + "Failed to allocate memory for L0 kernel"); + + new (L0Kernel) L0KernelTy(Name); + + return *L0Kernel; +} + +uint32_t L0DeviceTy::getMemAllocType(const void *Ptr) const { + ze_memory_allocation_properties_t properties = { + ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, + nullptr, // extension + ZE_MEMORY_TYPE_UNKNOWN, // type + 0, // id + 0, // page size + }; + + ze_result_t rc; + CALL_ZE(rc, zeMemGetAllocProperties, getZeContext(), Ptr, &properties, + nullptr); + + if (rc == ZE_RESULT_ERROR_INVALID_ARGUMENT) + return ZE_MEMORY_TYPE_UNKNOWN; + else + return properties.type; +} + +interop_spec_t L0DeviceTy::selectInteropPreference(int32_t InteropType, + int32_t NumPrefers, + interop_spec_t *Prefers) { + // no supported preference found, set default to level_zero, + // non-ordered unless is targetsync + return interop_spec_t{ + tgt_fr_level_zero, + {InteropType == kmp_interop_type_targetsync ? 
true : false /*inorder*/, + 0}, + 0}; +} + +Expected L0DeviceTy::createInterop(int32_t InteropContext, + interop_spec_t &InteropSpec) { + auto Ret = + new omp_interop_val_t(DeviceId, (kmp_interop_type_t)InteropContext); + Ret->fr_id = tgt_fr_level_zero; + Ret->vendor_id = omp_vendor_intel; + + if (InteropContext == kmp_interop_type_target || + InteropContext == kmp_interop_type_targetsync) { + Ret->device_info.Platform = getZeDriver(); + Ret->device_info.Device = getZeDevice(); + Ret->device_info.Context = getZeContext(); + } + + Ret->rtl_property = new L0Interop::Property(); + if (InteropContext == kmp_interop_type_targetsync) { + Ret->async_info = new __tgt_async_info(); + auto L0 = static_cast(Ret->rtl_property); + + bool InOrder = InteropSpec.attrs.inorder; + Ret->attrs.inorder = InOrder; + if (useImmForInterop()) { + auto CmdList = createImmCmdList(InOrder); + Ret->async_info->Queue = CmdList; + L0->ImmCmdList = CmdList; + } else { + Ret->async_info->Queue = createCommandQueue(InOrder); + L0->CommandQueue = + static_cast(Ret->async_info->Queue); + } + } + + return Ret; +} + +Error L0DeviceTy::releaseInterop(OmpInteropTy Interop) { + const auto DeviceId = getDeviceId(); + + if (!Interop || Interop->device_id != (intptr_t)DeviceId) { + return Plugin::error(ErrorCode::INVALID_ARGUMENT, + "Invalid/inconsistent OpenMP interop " DPxMOD "\n", + DPxPTR(Interop)); + } + auto L0 = static_cast(Interop->rtl_property); + if (Interop->async_info && Interop->async_info->Queue) { + if (useImmForInterop()) { + auto ImmCmdList = L0->ImmCmdList; + CALL_ZE_RET_ERROR(zeCommandListDestroy, ImmCmdList); + } else { + auto CmdQueue = L0->CommandQueue; + CALL_ZE_RET_ERROR(zeCommandQueueDestroy, CmdQueue); + } + } + delete L0; + delete Interop; + + return Plugin::success(); +} + +int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size, + __tgt_async_info *AsyncInfo, + bool UseCopyEngine) { + ze_command_list_handle_t CmdList = nullptr; + ze_command_queue_handle_t CmdQueue = nullptr; + + if (useImmForCopy()) { + CmdList = UseCopyEngine ? getImmCopyCmdList() : getImmCmdList(); + CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size, + nullptr, 0, nullptr); + CALL_ZE_RET_FAIL(zeCommandListHostSynchronize, CmdList, UINT64_MAX); + } else { + if (UseCopyEngine) { + CmdList = getCopyCmdList(); + CmdQueue = getCopyCmdQueue(); + } else { + CmdList = getCmdList(); + CmdQueue = getCmdQueue(); + } + + CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size, + nullptr, 0, nullptr); + CALL_ZE_RET_FAIL(zeCommandListClose, CmdList); + CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, getMutex(), + CmdQueue, 1, &CmdList, nullptr); + CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX); + CALL_ZE_RET_FAIL(zeCommandListReset, CmdList); + } + return OFFLOAD_SUCCESS; +} + +/// Enqueue non-blocking memory copy. This function is invoked only when IMM is +/// fully enabled and async mode is requested. +int32_t L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size, + __tgt_async_info *AsyncInfo, + bool CopyTo) { + const bool Ordered = + (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered); + ze_event_handle_t SignalEvent = getEvent(); + size_t NumWaitEvents = 0; + ze_event_handle_t *WaitEvents = nullptr; + AsyncQueueTy *AsyncQueue = reinterpret_cast(AsyncInfo->Queue); + if (!AsyncQueue->WaitEvents.empty()) { + // Use a single wait event if events are ordered or a kernel event exists. 
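+    // In ordered mode the commands already form a chain, so waiting on the
+    // most recent event is enough; otherwise only an outstanding kernel event
+    // has to be honored, and plain copy events need no extra wait.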
+ NumWaitEvents = 1; + if (Ordered) + WaitEvents = &AsyncQueue->WaitEvents.back(); + else if (AsyncQueue->KernelEvent) + WaitEvents = &AsyncQueue->KernelEvent; + else + NumWaitEvents = 0; + } + auto CmdList = getImmCopyCmdList(); + CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size, + SignalEvent, NumWaitEvents, WaitEvents); + AsyncQueue->WaitEvents.push_back(SignalEvent); + return OFFLOAD_SUCCESS; +} + +/// Enqueue memory fill +int32_t L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size) { + if (useImmForCopy()) { + const auto CmdList = getImmCopyCmdList(); + auto Event = getEvent(); + CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern, + PatternSize, Size, Event, 0, nullptr); + CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX); + } else { + auto CmdList = getCopyCmdList(); + const auto CmdQueue = getCopyCmdQueue(); + CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern, + PatternSize, Size, nullptr, 0, nullptr); + CALL_ZE_RET_FAIL(zeCommandListClose, CmdList); + CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList, + nullptr); + CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX); + CALL_ZE_RET_FAIL(zeCommandListReset, CmdList); + } + return OFFLOAD_SUCCESS; +} + +Error L0DeviceTy::dataFillImpl(void *TgtPtr, const void *PatternPtr, + int64_t PatternSize, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) { + // TODO: support async version + // TODO: convert enqueueMemFill to return Error code + if (enqueueMemFill(TgtPtr, PatternPtr, PatternSize, Size) == OFFLOAD_SUCCESS) + return Plugin::success(); + + return Plugin::error(error::ErrorCode::UNKNOWN, "%s failed\n", __func__); +} + +Expected L0DeviceTy::dataAlloc(size_t Size, size_t Align, int32_t Kind, + intptr_t Offset, bool UserAlloc, + bool DevMalloc, uint32_t MemAdvice, + AllocOptionTy AllocOpt) { + + const bool UseDedicatedPool = + (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH) || + (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER); + if (Kind == TARGET_ALLOC_DEFAULT) { + if (UserAlloc) + Kind = TARGET_ALLOC_DEVICE; + else if (AllocOpt == AllocOptionTy::ALLOC_OPT_HOST_MEM) + Kind = TARGET_ALLOC_HOST; + else if (UseDedicatedPool) + Kind = TARGET_ALLOC_DEVICE; + else + Kind = getAllocKind(); + } + auto &Allocator = getMemAllocator(Kind); + return Allocator.alloc(Size, Align, Kind, Offset, UserAlloc, DevMalloc, + MemAdvice, AllocOpt); +} + +Error L0DeviceTy::dataDelete(void *Ptr) { + auto &Allocator = getMemAllocator(Ptr); + return Allocator.dealloc(Ptr); +} + +int32_t L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) { + ze_result_t RC; + CALL_ZE(RC, zeContextMakeMemoryResident, getZeContext(), getZeDevice(), Mem, + Size); + if (RC != ZE_RESULT_SUCCESS) { + DP("Could not make memory " DPxMOD " resident on Level Zero device " DPxMOD + ".\n", + DPxPTR(Mem), DPxPTR(getZeDevice())); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +// Command queues related functions +/// Create a command list with given ordinal and flags +ze_command_list_handle_t L0DeviceTy::createCmdList( + ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal, + ze_command_list_flags_t Flags, const std::string_view DeviceIdStr) { + ze_command_list_desc_t cmdListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + nullptr, // extension + Ordinal, Flags}; + ze_command_list_handle_t cmdList; + CALL_ZE_RET_NULL(zeCommandListCreate, Context, Device, &cmdListDesc, + &cmdList); + 
DP("Created a command list " DPxMOD " (Ordinal: %" PRIu32 + ") for device %s.\n", + DPxPTR(cmdList), Ordinal, DeviceIdStr.data()); + return cmdList; +} + +/// Create a command list with default flags +ze_command_list_handle_t +L0DeviceTy::createCmdList(ze_context_handle_t Context, + ze_device_handle_t Device, uint32_t Ordinal, + const std::string_view DeviceIdStr) { + return (Ordinal == UINT32_MAX) + ? nullptr + : createCmdList(Context, Device, Ordinal, 0, DeviceIdStr); +} + +ze_command_list_handle_t L0DeviceTy::getCmdList() { + auto &TLS = getTLS(); + auto CmdList = TLS.getCmdList(); + if (!CmdList) { + CmdList = createCmdList(getZeContext(), getZeDevice(), getComputeEngine(), + getZeId()); + TLS.setCmdList(CmdList); + } + return CmdList; +} + +/// Create a command queue with given ordinal and flags +ze_command_queue_handle_t +L0DeviceTy::createCmdQueue(ze_context_handle_t Context, + ze_device_handle_t Device, uint32_t Ordinal, + uint32_t Index, ze_command_queue_flags_t Flags, + const std::string_view DeviceIdStr) { + ze_command_queue_desc_t cmdQueueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + nullptr, // extension + Ordinal, + Index, + Flags, // flags + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; + ze_command_queue_handle_t cmdQueue; + CALL_ZE_RET_NULL(zeCommandQueueCreate, Context, Device, &cmdQueueDesc, + &cmdQueue); + DP("Created a command queue " DPxMOD " (Ordinal: %" PRIu32 ", Index: %" PRIu32 + ", Flags: %" PRIu32 ") for device %s.\n", + DPxPTR(cmdQueue), Ordinal, Index, Flags, DeviceIdStr.data()); + return cmdQueue; +} + +/// Create a command queue with default flags +ze_command_queue_handle_t L0DeviceTy::createCmdQueue( + ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal, + uint32_t Index, const std::string_view DeviceIdStr, bool InOrder) { + ze_command_queue_flags_t Flags = InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0; + return (Ordinal == UINT32_MAX) ? nullptr + : createCmdQueue(Context, Device, Ordinal, + Index, Flags, DeviceIdStr); +} + +/// Create a new command queue for the given OpenMP device ID +ze_command_queue_handle_t L0DeviceTy::createCommandQueue(bool InOrder) { + auto cmdQueue = + createCmdQueue(getZeContext(), getZeDevice(), getComputeEngine(), + getComputeIndex(), getZeId(), InOrder); + return cmdQueue; +} + +/// Create an immediate command list +ze_command_list_handle_t +L0DeviceTy::createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder) { + ze_command_queue_flags_t Flags = InOrder ? 
ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0; + ze_command_queue_desc_t Desc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + nullptr, + Ordinal, + Index, + Flags, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; + ze_command_list_handle_t CmdList = nullptr; + CALL_ZE_RET_NULL(zeCommandListCreateImmediate, getZeContext(), getZeDevice(), + &Desc, &CmdList); + DP("Created an immediate command list " DPxMOD " (Ordinal: %" PRIu32 + ", Index: %" PRIu32 ", Flags: %" PRIu32 ") for device %s.\n", + DPxPTR(CmdList), Ordinal, Index, Flags, getZeIdCStr()); + return CmdList; +} + +/// Create an immediate command list for copying +ze_command_list_handle_t L0DeviceTy::createImmCopyCmdList() { + uint32_t Ordinal = getMainCopyEngine(); + if (Ordinal == UINT32_MAX) + Ordinal = getLinkCopyEngine(); + if (Ordinal == UINT32_MAX) + Ordinal = getComputeEngine(); + return createImmCmdList(Ordinal, /*Index*/ 0); +} + +ze_command_queue_handle_t L0DeviceTy::getCmdQueue() { + auto &TLS = getTLS(); + auto CmdQueue = TLS.getCmdQueue(); + if (!CmdQueue) { + CmdQueue = createCommandQueue(); + TLS.setCmdQueue(CmdQueue); + } + return CmdQueue; +} + +ze_command_list_handle_t L0DeviceTy::getCopyCmdList() { + // Use main copy engine if available + if (hasMainCopyEngine()) { + auto &TLS = getTLS(); + auto CmdList = TLS.getCopyCmdList(); + if (!CmdList) { + CmdList = createCmdList(getZeContext(), getZeDevice(), + getMainCopyEngine(), getZeId()); + TLS.setCopyCmdList(CmdList); + } + return CmdList; + } + // Use link copy engine if available + if (hasLinkCopyEngine()) + return getLinkCopyCmdList(); + // Use compute engine otherwise + return getCmdList(); +} + +ze_command_queue_handle_t L0DeviceTy::getCopyCmdQueue() { + // Use main copy engine if available + if (hasMainCopyEngine()) { + auto &TLS = getTLS(); + auto CmdQueue = TLS.getCopyCmdQueue(); + if (!CmdQueue) { + CmdQueue = createCmdQueue(getZeContext(), getZeDevice(), + getMainCopyEngine(), 0, getZeId()); + TLS.setCopyCmdQueue(CmdQueue); + } + return CmdQueue; + } + // Use link copy engine if available + if (hasLinkCopyEngine()) + return getLinkCopyCmdQueue(); + // Use compute engine otherwise + return getCmdQueue(); +} + +ze_command_list_handle_t L0DeviceTy::getLinkCopyCmdList() { + // Use link copy engine if available + if (hasLinkCopyEngine()) { + auto &TLS = getTLS(); + auto CmdList = TLS.getLinkCopyCmdList(); + if (!CmdList) { + CmdList = + createCmdList(getZeContext(), getZeDevice(), getLinkCopyEngine(), + ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY, getZeId()); + TLS.setLinkCopyCmdList(CmdList); + } + return CmdList; + } + // Use main copy engine if available + if (hasMainCopyEngine()) + return getCopyCmdList(); + // Use compute engine otherwise + return getCmdList(); +} + +ze_command_queue_handle_t L0DeviceTy::getLinkCopyCmdQueue() { + // Use link copy engine if available + if (hasLinkCopyEngine()) { + auto &TLS = getTLS(); + auto CmdQueue = TLS.getLinkCopyCmdQueue(); + if (!CmdQueue) { + // Try to use different copy engines for multiple threads + uint32_t Index = + __kmpc_global_thread_num(nullptr) % getNumLinkCopyQueues(); + CmdQueue = + createCmdQueue(getZeContext(), getZeDevice(), getLinkCopyEngine(), + Index, ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, getZeId()); + TLS.setLinkCopyCmdQueue(CmdQueue); + } + return CmdQueue; + } + // Use main copy engine if available + if (hasMainCopyEngine()) + return getCopyCmdQueue(); + // Use compute engine otherwise + return getCmdQueue(); +} + +ze_command_list_handle_t L0DeviceTy::getImmCmdList() { + auto &TLS = getTLS(); + auto 
CmdList = TLS.getImmCmdList(); + if (!CmdList) { + CmdList = createImmCmdList(); + TLS.setImmCmdList(CmdList); + } + return CmdList; +} + +ze_command_list_handle_t L0DeviceTy::getImmCopyCmdList() { + auto &TLS = getTLS(); + auto CmdList = TLS.getImmCopyCmdList(); + if (!CmdList) { + CmdList = createImmCopyCmdList(); + TLS.setImmCopyCmdList(CmdList); + } + return CmdList; +} + +Error L0DeviceTy::dataFence(__tgt_async_info *Async) { + const bool Ordered = + (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered); + + // Nothing to do if everything is ordered + if (Ordered) + return Plugin::success(); + + ze_command_list_handle_t CmdList = nullptr; + ze_command_queue_handle_t CmdQueue = nullptr; + + if (useImmForCopy()) { + CmdList = getImmCopyCmdList(); + CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr); + } else { + CmdList = getCopyCmdList(); + CmdQueue = getCopyCmdQueue(); + CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr); + CALL_ZE_RET_ERROR(zeCommandListClose, CmdList); + CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList, + nullptr); + CALL_ZE_RET_ERROR(zeCommandListReset, CmdList); + } + + return Plugin::success(); +} + +} // namespace llvm::omp::target::plugin diff --git a/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp new file mode 100644 index 0000000000000..e1ee9d5fa033b --- /dev/null +++ b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp @@ -0,0 +1,135 @@ +//===--- level_zero/dynamic_level_zero/level_zero.cpp ------------- C++ -*-===// +// +// Implement wrapper for level_zero API calls through dlopen +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "DLWrap.h" +#include "Shared/Debug.h" +#include "llvm/Support/DynamicLibrary.h" + +DLWRAP_INITIALIZE() + +DLWRAP_INTERNAL(zeInit, 1) +DLWRAP(zeDriverGet, 2) +DLWRAP(zeDeviceGet, 3) +DLWRAP(zeDeviceGetSubDevices, 3) +DLWRAP(zeModuleCreate, 5) +DLWRAP(zeModuleGetProperties, 2) +DLWRAP(zeModuleBuildLogDestroy, 1) +DLWRAP(zeModuleBuildLogGetString, 3) +DLWRAP(zeModuleGetKernelNames, 3) +DLWRAP(zeModuleDestroy, 1) +DLWRAP(zeCommandListAppendBarrier, 4) +DLWRAP(zeCommandListAppendLaunchKernel, 6) +DLWRAP(zeCommandListAppendLaunchCooperativeKernel, 6) +DLWRAP(zeCommandListAppendMemoryCopy, 7) +DLWRAP(zeCommandListAppendMemoryCopyRegion, 12) +DLWRAP(zeCommandListAppendMemoryFill, 8) +DLWRAP(zeCommandListAppendMemoryPrefetch, 3) +DLWRAP(zeCommandListAppendMemAdvise, 5) +DLWRAP(zeCommandListClose, 1) +DLWRAP(zeCommandListCreate, 4) +DLWRAP(zeCommandListCreateImmediate, 4) +DLWRAP(zeCommandListDestroy, 1) +DLWRAP(zeCommandListReset, 1) +DLWRAP(zeCommandQueueCreate, 4) +DLWRAP(zeCommandQueueDestroy, 1) +DLWRAP(zeCommandQueueExecuteCommandLists, 4) +DLWRAP(zeCommandQueueSynchronize, 2) +DLWRAP(zeContextCreate, 3) +DLWRAP(zeContextDestroy, 1) +DLWRAP(zeContextMakeMemoryResident, 4) +DLWRAP(zeDeviceCanAccessPeer, 3) +DLWRAP(zeDeviceGetProperties, 2) +DLWRAP(zeDeviceGetCommandQueueGroupProperties, 3) +DLWRAP(zeDeviceGetComputeProperties, 2) +DLWRAP(zeDeviceGetMemoryProperties, 3) +DLWRAP(zeDeviceGetCacheProperties, 3) +DLWRAP(zeDeviceGetGlobalTimestamps, 3) +DLWRAP(zeDriverGetApiVersion, 2) +DLWRAP(zeDriverGetExtensionFunctionAddress, 3) +DLWRAP(zeDriverGetExtensionProperties, 3) +DLWRAP(zeEventCreate, 3) +DLWRAP(zeEventDestroy, 1) +DLWRAP(zeEventHostReset, 1) +DLWRAP(zeEventHostSynchronize, 2) 
+DLWRAP(zeEventPoolCreate, 5) +DLWRAP(zeEventPoolDestroy, 1) +DLWRAP(zeEventQueryKernelTimestamp, 2) +DLWRAP(zeFenceCreate, 3) +DLWRAP(zeFenceDestroy, 1) +DLWRAP(zeFenceHostSynchronize, 2) +DLWRAP(zeKernelCreate, 3) +DLWRAP(zeKernelDestroy, 1) +DLWRAP(zeKernelGetName, 3) +DLWRAP(zeKernelGetProperties, 2) +DLWRAP(zeKernelSetArgumentValue, 4) +DLWRAP(zeKernelSetGroupSize, 4) +DLWRAP(zeKernelSetIndirectAccess, 2) +DLWRAP(zeKernelSuggestGroupSize, 7) +DLWRAP(zeKernelSuggestMaxCooperativeGroupCount, 2) +DLWRAP(zeMemAllocDevice, 6) +DLWRAP(zeMemAllocHost, 5) +DLWRAP(zeMemAllocShared, 7) +DLWRAP(zeMemFree, 2) +DLWRAP(zeMemGetAddressRange, 4) +DLWRAP(zeMemGetAllocProperties, 4) +DLWRAP(zeModuleDynamicLink, 3) +DLWRAP(zeModuleGetGlobalPointer, 4) +DLWRAP(zesDeviceEnumMemoryModules, 3) +DLWRAP(zesMemoryGetState, 2) +DLWRAP(zeCommandListHostSynchronize, 2) + +DLWRAP_FINALIZE() + +#ifndef LEVEL_ZERO_LIBRARY +#error "Level zero library not defined" +#endif + +#ifndef TARGET_NAME +#error "Missing TARGET_NAME macro" +#endif +#ifndef DEBUG_PREFIX +#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL" +#endif + +static bool loadLevelZero() { + const char *L0Library = LEVEL_ZERO_LIBRARY; + std::string ErrMsg; + + DP("Trying to load %s\n", L0Library); + auto DynlibHandle = std::make_unique( + llvm::sys::DynamicLibrary::getPermanentLibrary(L0Library, &ErrMsg)); + if (!DynlibHandle->isValid()) { + if (ErrMsg.empty()) + ErrMsg = "unknown error"; + DP("Unable to load library '%s': %s!\n", L0Library, ErrMsg.c_str()); + return false; + } + + for (size_t I = 0; I < dlwrap::size(); I++) { + const char *Sym = dlwrap::symbol(I); + + void *P = DynlibHandle->getAddressOfSymbol(Sym); + if (P == nullptr) { + DP("Unable to find '%s' in '%s'!\n", Sym, L0Library); + return false; + } + DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P); + + *dlwrap::pointer(I) = P; + } + + return true; +} + +ze_result_t ZE_APICALL zeInit(ze_init_flags_t flags) { + if (!loadLevelZero()) + return ZE_RESULT_ERROR_UNKNOWN; + return dlwrap_zeInit(flags); +} diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp new file mode 100644 index 0000000000000..53642eba20475 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp @@ -0,0 +1,625 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// GenericKernel implementation for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#include "L0Kernel.h" +#include "L0Device.h" +#include "L0Plugin.h" +#include "L0Program.h" + +namespace llvm::omp::target::plugin { + +Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice, + uint32_t NumThreads[3], uint32_t NumBlocks[3], + KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, + AsyncInfoWrapperTy &AsyncInfoWrapper) const { + + auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice); + int32_t RC = runTargetTeamRegion(l0Device, KernelArgs, + std::move(LaunchParams), AsyncInfoWrapper); + if (RC == OFFLOAD_SUCCESS) + return Plugin::success(); + return Plugin::error(error::ErrorCode::UNKNOWN, + "Error in launch Kernel %s: %d", getName(), RC); +} + +Error L0KernelTy::buildKernel(L0ProgramTy &Program) { + const auto *KernelName = getName(); + + auto Module = Program.findModuleFromKernelName(KernelName); + ze_kernel_desc_t KernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, + KernelName}; + CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel); + return Plugin::success(); +} + +Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice, + DeviceImageTy &Image) { + auto &Program = L0ProgramTy::makeL0Program(Image); + + Error Err = buildKernel(Program); + if (Err) + return Err; + Program.addKernel(this); + + return Plugin::success(); +} + +void L0KernelTy::decideKernelGroupArguments( + L0DeviceTy &Device, uint32_t NumTeams, uint32_t ThreadLimit, + TgtNDRangeDescTy *LoopLevels, uint32_t *GroupSizes, + ze_group_count_t &GroupCounts, bool HalfNumThreads, + bool IsTeamsNDRange) const { + + const KernelPropertiesTy &KernelPR = getProperties(); + + const auto DeviceId = Device.getDeviceId(); + bool MaxGroupSizeForced = false; + bool MaxGroupCountForced = false; + uint32_t MaxGroupSize = Device.getMaxGroupSize(); + const auto &Option = LevelZeroPluginTy::getOptions(); + const auto OptSubscRate = Option.SubscriptionRate; + + uint32_t SIMDWidth = KernelPR.SIMDWidth; + uint32_t KernelWidth = KernelPR.Width; + uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize; + + if (KernelMaxThreadGroupSize < MaxGroupSize) { + MaxGroupSize = KernelMaxThreadGroupSize; + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Capping maximum team size to %" PRIu32 + " due to kernel constraints.\n", + MaxGroupSize); + } + + if (ThreadLimit > 0) { + MaxGroupSizeForced = true; + MaxGroupSize = ThreadLimit; + } + + uint32_t MaxGroupCount = 0; + if (NumTeams > 0) { + MaxGroupCount = NumTeams; + MaxGroupCountForced = true; + } + + if (MaxGroupCountForced) { + // If number of teams is specified by the user, then use KernelWidth + // WIs per WG by default, so that it matches + // decideLoopKernelGroupArguments() behavior. 
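+    // For example (hypothetical values): num_teams(64) with a kernel width of
+    // 32 yields 64 work-groups of 32 work-items, unless thread_limit forces a
+    // different group size.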
+    if (!MaxGroupSizeForced) {
+      MaxGroupSize = KernelWidth;
+    }
+  } else {
+    const uint32_t NumSubslices = Device.getNumSubslices();
+    uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice();
+    if (HalfNumThreads)
+      NumThreadsPerSubslice /= 2;
+
+    MaxGroupCount = NumSubslices * NumThreadsPerSubslice;
+    if (MaxGroupSizeForced) {
+      // Set group size for the HW capacity
+      uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+      uint32_t NumGroupsPerSubslice =
+          (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup;
+      MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+    } else {
+      assert(!MaxGroupSizeForced && !MaxGroupCountForced);
+      assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) &&
+             "Invalid maxGroupSize");
+      // Maximize group size
+      while (MaxGroupSize >= KernelWidth) {
+        uint32_t NumThreadsPerGroup =
+            (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+
+        if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) {
+          uint32_t NumGroupsPerSubslice =
+              NumThreadsPerSubslice / NumThreadsPerGroup;
+          MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+          break;
+        }
+        MaxGroupSize -= KernelWidth;
+      }
+    }
+  }
+
+  uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1};
+  uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+  bool UsedReductionSubscriptionRate = false;
+  if (!MaxGroupCountForced) {
+    {
+      GRPCounts[0] *= OptSubscRate;
+    }
+
+    size_t LoopTripcount = 0;
+    if (LoopLevels) {
+      // TODO: consider other possible LoopDesc uses
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Loop descriptor provided but specific ND-range is disabled\n");
+      // TODO: get rid of this constraint
+      if (LoopLevels->NumLoops > 1) {
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "More than 1 loop found (%" PRIu32 "), ignoring loop info\n",
+             LoopLevels->NumLoops);
+      } else if (LoopLevels->Levels[0].Ub >= LoopLevels->Levels[0].Lb) {
+        LoopTripcount = (LoopLevels->Levels[0].Ub - LoopLevels->Levels[0].Lb +
+                         LoopLevels->Levels[0].Stride) /
+                        LoopLevels->Levels[0].Stride;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Loop TC = (%" PRId64 " - %" PRId64 " + %" PRId64 ") / %" PRId64
+             " = %zu\n",
+             LoopLevels->Levels[0].Ub, LoopLevels->Levels[0].Lb,
+             LoopLevels->Levels[0].Stride, LoopLevels->Levels[0].Stride,
+             LoopTripcount);
+      }
+    }
+
+    if (LoopTripcount && !UsedReductionSubscriptionRate) {
+      const size_t MaxTotalThreads = Device.getNumThreadsPerSubslice() *
+                                     Device.getNumSubslices() * SIMDWidth;
+      size_t AdjustedGroupCount =
+          IsTeamsNDRange ? (std::min)(((LoopTripcount + 7) & ~7),
+                                      MaxTotalThreads / GRPSizes[0])
+                         : ((LoopTripcount + GRPSizes[0] - 1) / GRPSizes[0]);
+      AdjustedGroupCount = std::max(AdjustedGroupCount, size_t{1});
+      AdjustedGroupCount *= OptSubscRate;
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Adjusting number of teams using the loop tripcount\n");
+      if (AdjustedGroupCount < GRPCounts[0])
+        GRPCounts[0] = AdjustedGroupCount;
+    }
+  }
+  GroupCounts.groupCountX = GRPCounts[0];
+  GroupCounts.groupCountY = GRPCounts[1];
+  GroupCounts.groupCountZ = GRPCounts[2];
+  std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+}
+
+// Return the number of total HW threads required to execute
+// a loop kernel compiled with the given SIMDWidth, and the given
+// loop(s) trip counts and group sizes.
+// Returns UINT64_MAX if computations overflow.
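+// For example (hypothetical values): trip counts {1024, 1, 1}, group sizes
+// {128, 1, 1} and SIMD width 16 give ceil(1024/128) = 8 groups with
+// ceil(128/16) = 8 HW threads each, i.e. 64 HW threads in total.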
+static uint64_t computeThreadsNeeded(const llvm::ArrayRef TripCounts, + const llvm::ArrayRef GroupSizes, + uint32_t SIMDWidth) { + assert(TripCounts.size() == 3 && "Invalid trip counts array size"); + assert(GroupSizes.size() == 3 && "Invalid group sizes array size"); + // Compute the number of groups in each dimension. + std::array GroupCount; + + for (int I = 0; I < 3; ++I) { + if (TripCounts[I] == 0 || GroupSizes[I] == 0) + return (std::numeric_limits::max)(); + GroupCount[I] = + (uint64_t(TripCounts[I]) + GroupSizes[I] - 1) / GroupSizes[I]; + if (GroupCount[I] > (std::numeric_limits::max)()) + return (std::numeric_limits::max)(); + } + for (int I = 1; I < 3; ++I) { + if ((std::numeric_limits::max)() / GroupCount[0] < GroupCount[I]) + return (std::numeric_limits::max)(); + GroupCount[0] *= GroupCount[I]; + } + // Multiplication of the group sizes must never overflow uint64_t + // for any existing device. + uint64_t LocalWorkSize = + uint64_t(GroupSizes[0]) * GroupSizes[1] * GroupSizes[2]; + uint64_t ThreadsPerWG = ((LocalWorkSize + SIMDWidth - 1) / SIMDWidth); + + // Check that the total number of threads fits uint64_t. + if ((std::numeric_limits::max)() / GroupCount[0] < ThreadsPerWG) + return (std::numeric_limits::max)(); + + return GroupCount[0] * ThreadsPerWG; +} + +int32_t L0KernelTy::decideLoopKernelGroupArguments( + L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels, + uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads, + bool &AllowCooperative) const { + + const auto DeviceId = Device.getDeviceId(); + const auto &Options = LevelZeroPluginTy::getOptions(); + const auto &KernelPR = getProperties(); + uint32_t MaxGroupSize = Device.getMaxGroupSize(); + + bool MaxGroupSizeForced = false; + if (ThreadLimit > 0) { + MaxGroupSizeForced = true; + MaxGroupSize = ThreadLimit; + } + + uint32_t GRPCounts[3] = {1, 1, 1}; + uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1}; + TgtLoopDescTy *Levels = LoopLevels->Levels; + int32_t DistributeDim = LoopLevels->DistributeDim; + assert(DistributeDim >= 0 && DistributeDim <= 2 && + "Invalid distribute dimension."); + int32_t NumLoops = LoopLevels->NumLoops; + assert((NumLoops > 0 && NumLoops <= 3) && + "Invalid loop nest description for ND partitioning"); + + // Compute global widths for X/Y/Z dimensions. + size_t TripCounts[3] = {1, 1, 1}; + + for (int32_t I = 0; I < NumLoops; I++) { + assert(Levels[I].Stride > 0 && "Invalid loop stride for ND partitioning"); + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Loop %" PRIu32 ": lower bound = %" PRId64 ", upper bound = %" PRId64 + ", Stride = %" PRId64 "\n", + I, Levels[I].Lb, Levels[I].Ub, Levels[I].Stride); + if (Levels[I].Ub < Levels[I].Lb) + TripCounts[I] = 0; + else + TripCounts[I] = + (Levels[I].Ub - Levels[I].Lb + Levels[I].Stride) / Levels[I].Stride; + } + + // Check if any of the loop has zero iterations. + if (TripCounts[0] == 0 || TripCounts[1] == 0 || TripCounts[2] == 0) { + std::fill(GroupSizes, GroupSizes + 3, 1); + std::fill(GRPCounts, GRPCounts + 3, 1); + if (DistributeDim > 0 && TripCounts[DistributeDim] != 0) { + // There is a distribute dimension, and the distribute loop + // has non-zero iterations, but some inner parallel loop + // has zero iterations. We still want to split the distribute + // loop's iterations between many WGs (of size 1), but the inner/lower + // dimensions should be 1x1. + // Note that this code is currently dead, because we are not + // hoisting the inner loops' bounds outside of the target regions. 
+ // The code is here just for completeness. + size_t DistributeTripCount = TripCounts[DistributeDim]; + if (DistributeTripCount > UINT32_MAX) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Invalid number of teams %zu due to large loop trip count\n", + DistributeTripCount); + return OFFLOAD_FAIL; + } + GRPCounts[DistributeDim] = DistributeTripCount; + } + AllowCooperative = false; + GroupCounts.groupCountX = GRPCounts[0]; + GroupCounts.groupCountY = GRPCounts[1]; + GroupCounts.groupCountZ = GRPCounts[2]; + return OFFLOAD_SUCCESS; + } + + if (!MaxGroupSizeForced) { + // Use zeKernelSuggestGroupSize to compute group sizes, + // or fallback to setting dimension 0 width to SIMDWidth. + // Note that in case of user-specified LWS GRPSizes[0] + // is already set according to the specified value. + size_t GlobalSizes[3] = {TripCounts[0], TripCounts[1], TripCounts[2]}; + if (DistributeDim > 0) { + // There is a distribute dimension. + GlobalSizes[DistributeDim - 1] *= GlobalSizes[DistributeDim]; + GlobalSizes[DistributeDim] = 1; + } + + { + if (MaxGroupSize > KernelPR.Width) { + GRPSizes[0] = KernelPR.Width; + } + if (DistributeDim == 0) { + // If there is a distribute dimension, then we do not use + // thin HW threads, since we do not know anything about + // the iteration space of the inner parallel loop regions. + // + // If there is no distribute dimension, then try to use thiner + // HW threads to get more independent HW threads executing + // the kernel - this may allow more parallelism due to + // the stalls being distributed across multiple HW threads rather + // than across SIMD lanes within one HW thread. + assert(GRPSizes[1] == 1 && GRPSizes[2] == 1 && + "Unexpected team sizes for dimensions 1 or/and 2."); + uint32_t SimdWidth = KernelPR.SIMDWidth; + uint64_t TotalThreads = Device.getTotalThreads(); + TotalThreads *= Options.ThinThreadsThreshold; + + uint32_t GRPSizePrev = GRPSizes[0]; + uint64_t ThreadsNeeded = + computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth); + while (ThreadsNeeded < TotalThreads) { + GRPSizePrev = GRPSizes[0]; + // Try to half the local work size (if possible) and see + // how many HW threads the kernel will require with this + // new local work size. + // In most implementations the initial GRPSizes[0] + // will be a power-of-two. + if (GRPSizes[0] <= 1) + break; + GRPSizes[0] >>= 1; + ThreadsNeeded = computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth); + } + GRPSizes[0] = GRPSizePrev; + } + } + } + + for (int32_t I = 0; I < NumLoops; I++) { + if (I < DistributeDim) { + GRPCounts[I] = 1; + continue; + } + size_t Trip = TripCounts[I]; + if (GRPSizes[I] >= Trip) + GRPSizes[I] = Trip; + size_t Count = (Trip + GRPSizes[I] - 1) / GRPSizes[I]; + if (Count > UINT32_MAX) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Invalid number of teams %zu due to large loop trip count\n", Count); + return OFFLOAD_FAIL; + } + GRPCounts[I] = (uint32_t)Count; + } + AllowCooperative = false; + GroupCounts.groupCountX = GRPCounts[0]; + GroupCounts.groupCountY = GRPCounts[1]; + GroupCounts.groupCountZ = GRPCounts[2]; + std::copy(GRPSizes, GRPSizes + 3, GroupSizes); + + return OFFLOAD_SUCCESS; +} + +int32_t L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams, + int32_t ThreadLimit, uint32_t *GroupSizes, + ze_group_count_t &GroupCounts, + void *LoopDesc, + bool &AllowCooperative) const { + + const auto DeviceId = Device.getDeviceId(); + const auto &KernelPR = getProperties(); + + // Read the most recent global thread limit and max teams. 
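+  // Note: both ICVs are currently hard-coded to zero below, so only the
+  // num_teams/thread_limit clause values and the device limits are honored.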
+  const auto [NumTeamsICV, ThreadLimitICV] = std::make_tuple(0, 0);
+
+  bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
+  bool HalfNumThreads =
+      LevelZeroPluginTy::getOptions().ZeDebugEnabled && IsXeHPG;
+  uint32_t KernelWidth = KernelPR.Width;
+  uint32_t SIMDWidth = KernelPR.SIMDWidth;
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth);
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth);
+  assert(SIMDWidth <= KernelWidth && "Invalid SIMD width.");
+
+  if (ThreadLimit > 0) {
+    // use thread_limit clause value by default
+    DP("Max team size is set to %" PRId32 " (thread_limit clause)\n",
+       ThreadLimit);
+  } else if (ThreadLimitICV > 0) {
+    // else use thread-limit-var ICV
+    ThreadLimit = ThreadLimitICV;
+    DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit);
+  }
+
+  size_t MaxThreadLimit = Device.getMaxGroupSize();
+  // Set correct max group size if the kernel was compiled with explicit SIMD
+  if (SIMDWidth == 1) {
+    MaxThreadLimit = Device.getNumThreadsPerSubslice();
+  }
+
+  if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) {
+    MaxThreadLimit = KernelPR.MaxThreadGroupSize;
+    DP("Capping maximum team size to %zu due to kernel constraints.\n",
+       MaxThreadLimit);
+  }
+
+  if (ThreadLimit > static_cast(MaxThreadLimit)) {
+    ThreadLimit = MaxThreadLimit;
+    DP("Max team size exceeds current maximum %zu. Adjusted\n",
+       MaxThreadLimit);
+  }
+  {
+    if (NumTeams > 0) {
+      DP("Number of teams is set to %" PRId32
+         " (num_teams clause or no teams construct)\n",
+         NumTeams);
+    } else if (NumTeamsICV > 0) {
+      // OMP_NUM_TEAMS only matters if the num_teams() clause is absent.
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "OMP_NUM_TEAMS(%" PRId32 ") is ignored\n", NumTeamsICV);
+
+      NumTeams = NumTeamsICV;
+      DP("Max number of teams is set to %" PRId32 " (OMP_NUM_TEAMS)\n",
+         NumTeams);
+    }
+
+    bool UseLoopTC = LoopDesc;
+    decideKernelGroupArguments(
+        Device, (uint32_t)NumTeams, (uint32_t)ThreadLimit,
+        UseLoopTC ? (TgtNDRangeDescTy *)LoopDesc : nullptr, GroupSizes,
+        GroupCounts, HalfNumThreads, false);
+    AllowCooperative = false;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
+                                        KernelArgsTy &KernelArgs,
+                                        KernelLaunchParamsTy LaunchParams,
+                                        __tgt_async_info *AsyncInfo) const {
+  // Libomptarget can pass negative NumTeams and ThreadLimit now after
+  // introducing __tgt_target_kernel. This happens only when we have valid
+  // LoopDesc and the region is not a teams region.
+
+  auto zeKernel = getZeKernel();
+  auto DeviceId = l0Device.getDeviceId();
+  int32_t NumArgs = KernelArgs.NumArgs;
+  int32_t NumTeams = KernelArgs.NumTeams[0];
+  int32_t ThreadLimit = KernelArgs.ThreadLimit[0];
+  void *LoopDesc = nullptr;
+
+  if (NumTeams < 0)
+    NumTeams = 0;
+  if (ThreadLimit < 0)
+    ThreadLimit = 0;
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Executing a kernel " DPxMOD "...\n", DPxPTR(zeKernel));
+
+  auto &Plugin = l0Device.getPlugin();
+  auto &Device = Plugin.getDeviceFromId(DeviceId);
+
+  auto *IdStr = Device.getZeIdCStr();
+  auto &Options = LevelZeroPluginTy::getOptions();
+  bool IsAsync = AsyncInfo && Device.asyncEnabled();
+  if (IsAsync && !AsyncInfo->Queue) {
+    AsyncInfo->Queue = reinterpret_cast(Plugin.getAsyncQueue());
+    if (!AsyncInfo->Queue)
+      IsAsync = false; // Couldn't get a queue, revert to sync
+  }
+  auto *AsyncQueue =
+      IsAsync ?
static_cast(AsyncInfo->Queue) : NULL; + + // We need to get a non-const version of the Properties structure in order to + // use its lock and be able to cache the group params and indirect flags + auto &KernelPR = const_cast(getProperties()); + // Protect from kernel preparation to submission as kernels are shared. + std::unique_lock KernelLock(KernelPR.Mtx); + + // Decide group sizes and counts + uint32_t GroupSizes[3]; + ze_group_count_t GroupCounts; + + bool AllowCooperative = false; + + // Check if we can reuse previous group parameters + bool GroupParamsReused = KernelPR.reuseGroupParams( + static_cast(LoopDesc), NumTeams, ThreadLimit, + GroupSizes, GroupCounts, AllowCooperative); + + if (!GroupParamsReused) { + auto RC = getGroupsShape(Device, NumTeams, ThreadLimit, GroupSizes, + GroupCounts, LoopDesc, AllowCooperative); + + if (RC != OFFLOAD_SUCCESS) { + return RC; + } + + KernelPR.cacheGroupParams(static_cast(LoopDesc), + NumTeams, ThreadLimit, GroupSizes, GroupCounts, + AllowCooperative); + } + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0], + GroupSizes[1], GroupSizes[2]); + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", + GroupCounts.groupCountX, GroupCounts.groupCountY, + GroupCounts.groupCountZ); + for (int32_t I = 0; I < NumArgs; I++) { + { + void *Arg = (static_cast(LaunchParams.Data))[I]; + CALL_ZE_RET_FAIL(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg), + Arg == nullptr ? nullptr : &Arg); + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Kernel Pointer argument %" PRId32 " (value: " DPxMOD + ") was set successfully for device %s.\n", + I, DPxPTR(Arg), IdStr); + } + } + + // Set Kernel Indirect flags + auto &PrevFlags = KernelPR.IndirectAccessFlags; + ze_kernel_indirect_access_flags_t Flags = 0; + Flags |= Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags(); + Flags |= Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags(); + + if (PrevFlags != Flags) { + // Combine with common access flags + const auto FinalFlags = Device.getIndirectFlags() | Flags; + CALL_ZE_RET_FAIL(zeKernelSetIndirectAccess, getZeKernel(), FinalFlags); + DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags)); + PrevFlags = Flags; + } + + if (!GroupParamsReused) { + CALL_ZE_RET_FAIL(zeKernelSetGroupSize, zeKernel, GroupSizes[0], + GroupSizes[1], GroupSizes[2]); + } + + ze_command_list_handle_t CmdList = nullptr; + ze_command_queue_handle_t CmdQueue = nullptr; + const bool UseImmCmdList = Device.useImmForCompute(); + + if (UseImmCmdList) { + CmdList = Device.getImmCmdList(); + // Command queue is not used with immediate command list + } else { + CmdList = Device.getCmdList(); + CmdQueue = Device.getCmdQueue(); + } + + if (UseImmCmdList) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Using immediate command list for kernel submission.\n"); + auto Event = Device.getEvent(); + size_t NumWaitEvents = 0; + ze_event_handle_t *WaitEvents = nullptr; + if (IsAsync && !AsyncQueue->WaitEvents.empty()) { + if (Options.CommandMode == CommandModeTy::AsyncOrdered) { + NumWaitEvents = 1; + WaitEvents = &AsyncQueue->WaitEvents.back(); + } else { + NumWaitEvents = AsyncQueue->WaitEvents.size(); + WaitEvents = AsyncQueue->WaitEvents.data(); + } + } + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Kernel depends on %zu data copying events.\n", NumWaitEvents); + if (AllowCooperative) + CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList, + 
zeKernel, &GroupCounts, Event, NumWaitEvents, + WaitEvents); + else + CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel, + &GroupCounts, Event, NumWaitEvents, WaitEvents); + KernelLock.unlock(); + if (IsAsync) { + AsyncQueue->WaitEvents.push_back(Event); + AsyncQueue->KernelEvent = Event; + } else { + CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX); + Device.releaseEvent(Event); + } + } else { + ze_event_handle_t Event = nullptr; + if (AllowCooperative) + CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList, + zeKernel, &GroupCounts, Event, 0, nullptr); + else + CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel, + &GroupCounts, Event, 0, nullptr); + KernelLock.unlock(); + CALL_ZE_RET_FAIL(zeCommandListClose, CmdList); + CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(), + CmdQueue, 1, &CmdList, nullptr); + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr); + CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX); + CALL_ZE_RET_FAIL(zeCommandListReset, CmdList); + if (Event) { + Device.releaseEvent(Event); + } + } + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel), + IdStr); + + return OFFLOAD_SUCCESS; +} + +} // namespace llvm::omp::target::plugin diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp new file mode 100644 index 0000000000000..c26e3fb328645 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp @@ -0,0 +1,647 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Memory related support for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#include "L0Memory.h" +#include "L0Device.h" +#include "L0Plugin.h" + +namespace llvm::omp::target::plugin { + +void *MemAllocatorTy::MemPoolTy::BlockTy::alloc() { + if (isFull()) + return nullptr; + if (FreeSlot != UINT32_MAX) { + const uint32_t Slot = FreeSlot; + FreeSlot = UINT32_MAX; + UsedSlots[Slot] = true; + NumUsedSlots++; + return reinterpret_cast(Base + Slot * ChunkSize); + } + for (uint32_t I = 0; I < NumSlots; I++) { + if (UsedSlots[I]) + continue; + UsedSlots[I] = true; + NumUsedSlots++; + return reinterpret_cast(Base + I * ChunkSize); + } + // Should not reach here. 
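+  // The block is known not to be full at this point, so the scan above must
+  // have found a free slot.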
+ assert(0 && "Inconsistent memory pool state"); + return nullptr; +} + +/// Deallocate the given memory +void MemAllocatorTy::MemPoolTy::BlockTy::dealloc(void *Mem) { + if (!contains(Mem)) + assert(0 && "Inconsistent memory pool state"); + const uint32_t Slot = (reinterpret_cast(Mem) - Base) / ChunkSize; + UsedSlots[Slot] = false; + NumUsedSlots--; + FreeSlot = Slot; +} + +MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator, + const L0OptionsTy &Option) { + AllocKind = Kind; + Allocator = _Allocator; + + // Read user-defined options + const auto &UserOptions = Option.MemPoolInfo.at(AllocKind); + const size_t UserAllocMax = UserOptions[0]; + const size_t UserCapacity = UserOptions[1]; + const size_t UserPoolSize = UserOptions[2]; + + BlockCapacity = UserCapacity; + PoolSizeMax = UserPoolSize << 20; // MB to B + PoolSize = 0; + + auto Context = Allocator->L0Context->getZeContext(); + const auto Device = Allocator->Device; + + // Check page size used for this allocation kind to decide minimum + // allocation size when allocating from L0. + void *Mem = Allocator->allocL0(8, 0, AllocKind); + ze_memory_allocation_properties_t AP{ + ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, nullptr, + ZE_MEMORY_TYPE_UNKNOWN, 0, 0}; + CALL_ZE_RET_VOID(zeMemGetAllocProperties, Context, Mem, &AP, nullptr); + AllocUnit = (std::max)(AP.pageSize, AllocUnit); + CALL_ZE_RET_VOID(zeMemFree, Context, Mem); + + bool IsDiscrete = false; + if (Device) { + ze_device_properties_t Properties{}; + Properties.deviceId = 0; + Properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + Properties.pNext = nullptr; + CALL_ZE_RET_VOID(zeDeviceGetProperties, Device->getZeDevice(), &Properties); + IsDiscrete = Device->isDiscreteDevice(); + + if (AllocKind == TARGET_ALLOC_SHARED && IsDiscrete) { + // Use page size as minimum chunk size for USM shared on discrete + // device. + // FIXME: pageSize is not returned correctly (=0) on some new devices, + // so use fallback value for now. + AllocMin = (std::max)(AP.pageSize, AllocUnit); + AllocUnit = AllocMin * BlockCapacity; + } + } + + // Convert MB to B and round up to power of 2 + AllocMax = AllocMin << getBucketId(UserAllocMax * (1 << 20)); + if (AllocMin >= AllocMax) { + AllocMax = 2 * AllocMin; + DP("Warning: Adjusting pool's AllocMax to %zu for %s due to device " + "requirements.\n", + AllocMax, ALLOC_KIND_TO_STR(AllocKind)); + } + assert(AllocMin < AllocMax && + "Invalid parameters while initializing memory pool"); + const auto MinSize = getBucketId(AllocMin); + const auto MaxSize = getBucketId(AllocMax); + Buckets.resize(MaxSize - MinSize + 1); + BucketStats.resize(Buckets.size(), {0, 0}); + + // Set bucket parameters + for (size_t I = 0; I < Buckets.size(); I++) { + const size_t ChunkSize = AllocMin << I; + size_t BlockSize = ChunkSize * BlockCapacity; + // On discrete device, the cost of native L0 invocation doubles when the + // the requested size doubles after certain threshold, so allocating + // larger block does not pay off at all. It is better to keep a single + // chunk in a single block in such cases. 
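+    // For example (hypothetical sizes): a 64KB chunk with a block capacity of
+    // 8 would normally get a 512KB block, but once the chunk size reaches the
+    // pre-allocation threshold each block holds a single chunk instead.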
+ if (BlockSize <= AllocUnit) { + BlockSize = AllocUnit; // Allocation unit is already large enough + } else if (IsDiscrete) { + // Do not preallocate if it does not pay off + if (ChunkSize >= L0UsmPreAllocThreshold || + (AllocKind == TARGET_ALLOC_HOST && + ChunkSize >= L0HostUsmPreAllocThreshold)) + BlockSize = ChunkSize; + } + BucketParams.emplace_back(ChunkSize, BlockSize); + } + + DP("Initialized %s pool for device " DPxMOD ": AllocUnit = %zu, " + "AllocMax = %zu, " + "Capacity = %" PRIu32 ", PoolSizeMax = %zu\n", + ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Device), AllocUnit, AllocMax, + BlockCapacity, PoolSizeMax); +} + +// Used for reduction pool +MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator, + const L0OptionsTy &Option) { + AllocKind = TARGET_ALLOC_DEVICE; + Allocator = _Allocator; + AllocMin = AllocUnit = 1024 << 6; // 64KB + AllocMax = Option.ReductionPoolInfo[0] << 20; + BlockCapacity = Option.ReductionPoolInfo[1]; + PoolSize = 0; + PoolSizeMax = (size_t)Option.ReductionPoolInfo[2] << 20; + + const auto MinSize = getBucketId(AllocMin); + const auto MaxSize = getBucketId(AllocMax); + Buckets.resize(MaxSize - MinSize + 1); + BucketStats.resize(Buckets.size(), {0, 0}); + for (size_t I = 0; I < Buckets.size(); I++) { + const size_t ChunkSize = AllocMin << I; + BucketParams.emplace_back(ChunkSize, ChunkSize * BlockCapacity); + } + + DP("Initialized reduction scratch pool for device " DPxMOD + ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n", + DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax); +} + +// Used for small memory pool with fixed parameters +MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator) { + AllocKind = TARGET_ALLOC_DEVICE; + Allocator = _Allocator; + AllocMax = AllocMin; + BlockCapacity = AllocUnit / AllocMax; + PoolSize = 0; + PoolSizeMax = (1 << 20); // this should be sufficiently large + Buckets.resize(1); + BucketStats.resize(1, {0, 0}); + BucketParams.emplace_back(AllocMax, AllocUnit); + ZeroInit = true; + DP("Initialized zero-initialized reduction counter pool for " + "device " DPxMOD ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n", + DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax); +} + +void MemAllocatorTy::MemPoolTy::printUsage() { + auto PrintNum = [](uint64_t Num) { + if (Num > 1e9) + fprintf(stderr, "%11.2e", float(Num)); + else + fprintf(stderr, "%11" PRIu64, Num); + }; + + bool HasPoolAlloc = false; + for (auto &Stat : BucketStats) { + if (Stat.first > 0 || Stat.second > 0) { + HasPoolAlloc = true; + break; + } + } + + DP("MemPool usage for %s, device " DPxMOD "\n", ALLOC_KIND_TO_STR(AllocKind), + DPxPTR(Allocator->Device)); + + if (HasPoolAlloc) { + DP("-- AllocMax=%zu(MB), Capacity=%" PRIu32 ", PoolSizeMax=%zu(MB)\n", + AllocMax >> 20, BlockCapacity, PoolSizeMax >> 20); + DP("-- %18s:%11s%11s%11s\n", "", "NewAlloc", "Reuse", "Hit(%)"); + for (size_t I = 0; I < Buckets.size(); I++) { + const auto &Stat = BucketStats[I]; + if (Stat.first > 0 || Stat.second > 0) { + DP("-- Bucket[%10zu]:", BucketParams[I].first); + PrintNum(Stat.first); + PrintNum(Stat.second); + fprintf(stderr, "%11.2f\n", + float(Stat.second) / float(Stat.first + Stat.second) * 100); + } + } + } else { + DP("-- Not used\n"); + } +} + +/// Release resources used in the pool. 
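+/// Usage statistics are printed first (when debug output is enabled), then
+/// every block in every bucket is returned to Level Zero with zeMemFree.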
+MemAllocatorTy::MemPoolTy::~MemPoolTy() { + const int DebugLevel = getDebugLevel(); + if (DebugLevel > 0) + printUsage(); + for (auto &Bucket : Buckets) { + for (auto *Block : Bucket) { + if (DebugLevel > 0) + Allocator->log(0, Block->Size, AllocKind); + CALL_ZE_RET_VOID(zeMemFree, Allocator->L0Context->getZeContext(), + reinterpret_cast(Block->Base)); + delete Block; + } + } +} + +/// Allocate the requested size of memory from this pool. +/// AllocSize is the chunk size internally used for the returned memory. +void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) { + if (Size == 0 || Size > AllocMax) + return nullptr; + + const uint32_t BucketId = getBucketId(Size); + auto &Blocks = Buckets[BucketId]; + void *Mem = nullptr; + + for (auto *Block : Blocks) { + if (Block->isFull()) + continue; + Mem = Block->alloc(); + assert(Mem && "Inconsistent state while allocating memory from pool"); + PtrToBlock.try_emplace(Mem, Block); + break; + } + + if (Mem == nullptr) { + const bool IsSmallAllocatable = + (Size <= SmallAllocMax && SmallPoolSize <= SmallPoolSizeMax); + const bool IsFull = (PoolSize > PoolSizeMax); + if (IsFull && !IsSmallAllocatable) + return nullptr; + // Bucket is empty or all blocks in the bucket are full + const auto ChunkSize = BucketParams[BucketId].first; + const auto BlockSize = BucketParams[BucketId].second; + void *Base = Allocator->allocL0(BlockSize, 0, AllocKind); + + if (ZeroInit) { + auto RC = Allocator->enqueueMemSet(Base, 0, BlockSize); + if (RC != OFFLOAD_SUCCESS) { + DP("Failed to zero-initialize pool memory\n"); + return nullptr; + } + } + + BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize); + Blocks.push_back(Block); + Mem = Block->alloc(); + PtrToBlock.try_emplace(Mem, Block); + if (IsFull) + SmallPoolSize += BlockSize; + else + PoolSize += BlockSize; + DP("New block allocation for %s pool: base = " DPxMOD + ", size = %zu, pool size = %zu\n", + ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Base), BlockSize, PoolSize); + BucketStats[BucketId].first++; + } else { + BucketStats[BucketId].second++; + } + + AllocSize = (AllocMin << BucketId); + + return Mem; +} + +/// Deallocate the specified memory and returns block size deallocated. +size_t MemAllocatorTy::MemPoolTy::dealloc(void *Ptr) { + if (PtrToBlock.count(Ptr) == 0) + return 0; + PtrToBlock[Ptr]->dealloc(Ptr); + const size_t Deallocated = PtrToBlock[Ptr]->ChunkSize; + PtrToBlock.erase(Ptr); + return Deallocated; +} + +void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base, size_t Size, + int32_t Kind, bool InPool, + bool ImplicitArg) { + const auto Inserted = + Map.emplace(Ptr, MemAllocInfoTy{Base, Size, Kind, InPool, ImplicitArg}); + // Check if we keep valid disjoint memory ranges. 
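+  // The map is ordered by pointer value, so checking the immediate
+  // predecessor and successor entries is enough to detect any overlap.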
+ [[maybe_unused]] bool Valid = Inserted.second; + if (Valid) { + if (Inserted.first != Map.begin()) { + const auto I = std::prev(Inserted.first, 1); + Valid = Valid && (uintptr_t)I->first + I->second.Size <= (uintptr_t)Ptr; + } + if (Valid) { + const auto I = std::next(Inserted.first, 1); + if (I != Map.end()) + Valid = Valid && (uintptr_t)Ptr + Size <= (uintptr_t)I->first; + } + } + assert(Valid && "Invalid overlapping memory allocation"); + if (ImplicitArg) + NumImplicitArgs[Kind]++; +} + +/// Remove allocation information for the given memory location +bool MemAllocatorTy::MemAllocInfoMapTy::remove(void *Ptr, + MemAllocInfoTy *Removed) { + const auto AllocInfo = Map.find(Ptr); + if (AllocInfo == Map.end()) + return false; + if (AllocInfo->second.ImplicitArg) + NumImplicitArgs[AllocInfo->second.Kind]--; + if (Removed) + *Removed = AllocInfo->second; + Map.erase(AllocInfo); + return true; +} + +void MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device, + const L0OptionsTy &Option) { + SupportsLargeMem = L0Device.supportsLargeMem(); + IsHostMem = false; + Device = &L0Device; + L0Context = &L0Device.getL0Context(); + for (auto Kind : {TARGET_ALLOC_DEVICE, TARGET_ALLOC_SHARED}) { + if (Option.MemPoolInfo.count(Kind) > 0) { + std::lock_guard Lock(Mtx); + Pools.emplace(std::piecewise_construct, std::forward_as_tuple(Kind), + std::forward_as_tuple(Kind, this, Option)); + } + if (getDebugLevel() > 0) + Stats.emplace(std::piecewise_construct, std::forward_as_tuple(Kind), + std::tuple<>{}); + } + ReductionPool = std::make_unique(this, Option); + CounterPool = std::make_unique(this); + updateMaxAllocSize(L0Device); +} + +void MemAllocatorTy::initHostPool(L0ContextTy &Driver, + const L0OptionsTy &Option) { + SupportsLargeMem = Driver.supportsLargeMem(); + IsHostMem = true; + this->L0Context = &Driver; + if (Option.MemPoolInfo.count(TARGET_ALLOC_HOST) > 0) { + std::lock_guard Lock(Mtx); + Pools.emplace(std::piecewise_construct, + std::forward_as_tuple(TARGET_ALLOC_HOST), + std::forward_as_tuple(TARGET_ALLOC_HOST, this, Option)); + } + if (getDebugLevel() > 0) + Stats.emplace(std::piecewise_construct, + std::forward_as_tuple(TARGET_ALLOC_HOST), std::tuple<>{}); +} + +void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) { + // Update the maximum allocation size for this Allocator + ze_device_properties_t P; + P.maxMemAllocSize = 0; + P.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + P.pNext = nullptr; + CALL_ZE_RET_VOID(zeDeviceGetProperties, L0Device.getZeDevice(), &P); + + if (IsHostMem) { + // MaxAllocSize should be the minimum of all devices from the driver + if (MaxAllocSize > P.maxMemAllocSize) { + MaxAllocSize = P.maxMemAllocSize; + DP("Updated MaxAllocSize for driver " DPxMOD " to %zu\n", + DPxPTR(L0Context), MaxAllocSize); + } + return; + } + + MaxAllocSize = P.maxMemAllocSize; + DP("Updated MaxAllocSize for device " DPxMOD " to %zu\n", DPxPTR(Device), + MaxAllocSize); +} + +/// Release resources and report statistics if requested +void MemAllocatorTy::deinit() { + std::lock_guard Lock(Mtx); + // Release RTL-owned memory + for (auto *M : MemOwned) { + auto Err = dealloc_locked(M); + if (Err) + consumeError(std::move(Err)); + } + // Release resources used in the pool + Pools.clear(); + ReductionPool.reset(nullptr); + CounterPool.reset(nullptr); + // Report memory usage if requested + if (getDebugLevel() > 0) { + for (auto &Stat : Stats) { + DP("Memory usage for %s, device " DPxMOD "\n", + ALLOC_KIND_TO_STR(Stat.first), DPxPTR(Device)); + const auto &ST = Stat.second; + if 
(ST.NumAllocs[0] == 0 && ST.NumAllocs[1] == 0) { + DP("-- Not used\n"); + continue; + } + DP("-- Allocator: %12s, %12s\n", "Native", "Pool"); + DP("-- Requested: %12zu, %12zu\n", ST.Requested[0], ST.Requested[1]); + DP("-- Allocated: %12zu, %12zu\n", ST.Allocated[0], ST.Allocated[1]); + DP("-- Freed : %12zu, %12zu\n", ST.Freed[0], ST.Freed[1]); + DP("-- InUse : %12zu, %12zu\n", ST.InUse[0], ST.InUse[1]); + DP("-- PeakUse : %12zu, %12zu\n", ST.PeakUse[0], ST.PeakUse[1]); + DP("-- NumAllocs: %12zu, %12zu\n", ST.NumAllocs[0], ST.NumAllocs[1]); + } + } + + // mark as deinitialized + L0Context = nullptr; +} + +/// Allocate memory with the specified information +Expected MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind, + intptr_t Offset, bool UserAlloc, + bool DevMalloc, uint32_t MemAdvice, + AllocOptionTy AllocOpt) { + assert((Kind == TARGET_ALLOC_DEVICE || Kind == TARGET_ALLOC_HOST || + Kind == TARGET_ALLOC_SHARED) && + "Unknown memory kind while allocating target memory"); + + std::lock_guard Lock(Mtx); + + // We do not expect meaningful Align parameter when Offset > 0, so the + // following code does not handle such case. + + size_t AllocSize = Size + Offset; + void *Mem = nullptr; + void *AllocBase = nullptr; + const bool UseScratchPool = + (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH); + const bool UseZeroInitPool = + (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER); + const bool UseDedicatedPool = UseScratchPool || UseZeroInitPool; + + if ((Pools.count(Kind) > 0 && MemAdvice == UINT32_MAX) || UseDedicatedPool) { + // Pool is enabled for the allocation kind, and we do not use any memory + // advice. We should avoid using pool if there is any meaningful memory + // advice not to affect sibling allocation in the same block. + if (Align > 0) + AllocSize += (Align - 1); + size_t PoolAllocSize = 0; + if (UseScratchPool) + AllocBase = ReductionPool->alloc(AllocSize, PoolAllocSize); + else if (UseZeroInitPool) + AllocBase = CounterPool->alloc(AllocSize, PoolAllocSize); + else + AllocBase = Pools[Kind].alloc(AllocSize, PoolAllocSize); + if (AllocBase) { + uintptr_t Base = (uintptr_t)AllocBase; + if (Align > 0) + Base = (Base + Align) & ~(Align - 1); + Mem = (void *)(Base + Offset); + AllocInfo.add(Mem, AllocBase, Size, Kind, true, UserAlloc); + log(Size, PoolAllocSize, Kind, true /* Pool */); + if (DevMalloc) + MemOwned.push_back(AllocBase); + if (UseDedicatedPool) { + DP("Allocated %zu bytes from %s pool\n", Size, + UseScratchPool ? "scratch" : "zero-initialized"); + } + return Mem; + } + } + + AllocBase = allocL0(AllocSize, Align, Kind, Size); + if (AllocBase) { + Mem = (void *)((uintptr_t)AllocBase + Offset); + AllocInfo.add(Mem, AllocBase, Size, Kind, false, UserAlloc); + if (DevMalloc) + MemOwned.push_back(AllocBase); + if (UseDedicatedPool) { + // We do not want this happen in general. + DP("Allocated %zu bytes from L0 for %s pool\n", Size, + UseScratchPool ? 
"scratch" : "zero-initialized"); + } + } + return Mem; +} + +/// Deallocate memory +Error MemAllocatorTy::dealloc_locked(void *Ptr) { + MemAllocInfoTy Info; + if (!AllocInfo.remove(Ptr, &Info)) { + return Plugin::error(ErrorCode::BACKEND_FAILURE, + "Cannot find memory allocation information for " DPxMOD + "\n", + DPxPTR(Ptr)); + } + if (Info.InPool) { + size_t DeallocSize = 0; + if (Pools.count(Info.Kind) > 0) + DeallocSize = Pools.at(Info.Kind).dealloc(Info.Base); + if (DeallocSize == 0) { + // Try reduction scratch pool + DeallocSize = ReductionPool->dealloc(Info.Base); + // Try reduction counter pool + if (DeallocSize == 0) + DeallocSize = CounterPool->dealloc(Info.Base); + if (DeallocSize == 0) { + return Plugin::error(ErrorCode::BACKEND_FAILURE, + "Cannot return memory " DPxMOD " to pool\n", + DPxPTR(Ptr)); + } + } + log(0, DeallocSize, Info.Kind, true /* Pool */); + return Plugin::success(); + } + if (!Info.Base) { + DP("Error: Cannot find base address of " DPxMOD "\n", DPxPTR(Ptr)); + return Plugin::error(ErrorCode::INVALID_ARGUMENT, + "Cannot find base address of " DPxMOD "\n", + DPxPTR(Ptr)); + } + CALL_ZE_RET_ERROR(zeMemFree, L0Context->getZeContext(), Info.Base); + log(0, Info.Size, Info.Kind); + + DP("Deleted device memory " DPxMOD " (Base: " DPxMOD ", Size: %zu)\n", + DPxPTR(Ptr), DPxPTR(Info.Base), Info.Size); + + return Plugin::success(); +} + +int32_t MemAllocatorTy::enqueueMemSet(void *Dst, int8_t Value, size_t Size) { + return Device->enqueueMemFill(Dst, &Value, sizeof(int8_t), Size); +} + +int32_t MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src, + size_t Size) { + return Device->enqueueMemCopy(Dst, Src, Size); +} + +void *MemAllocatorTy::allocL0(size_t Size, size_t Align, int32_t Kind, + size_t ActiveSize) { + void *Mem = nullptr; + ze_device_mem_alloc_desc_t DeviceDesc{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, + nullptr, 0, 0}; + ze_host_mem_alloc_desc_t HostDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, + nullptr, 0}; + + // Use relaxed allocation limit if driver supports + ze_relaxed_allocation_limits_exp_desc_t RelaxedDesc{ + ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC, nullptr, + ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE}; + if (Size > MaxAllocSize && SupportsLargeMem) { + DeviceDesc.pNext = &RelaxedDesc; + HostDesc.pNext = &RelaxedDesc; + } + + auto zeDevice = Device ? Device->getZeDevice() : 0; + auto zeContext = L0Context->getZeContext(); + bool makeResident = false; + switch (Kind) { + case TARGET_ALLOC_DEVICE: + makeResident = true; + CALL_ZE_RET_NULL(zeMemAllocDevice, zeContext, &DeviceDesc, Size, Align, + zeDevice, &Mem); + DP("Allocated a device memory " DPxMOD "\n", DPxPTR(Mem)); + break; + case TARGET_ALLOC_HOST: + CALL_ZE_RET_NULL(zeMemAllocHost, zeContext, &HostDesc, Size, Align, &Mem); + DP("Allocated a host memory " DPxMOD "\n", DPxPTR(Mem)); + break; + case TARGET_ALLOC_SHARED: + CALL_ZE_RET_NULL(zeMemAllocShared, zeContext, &DeviceDesc, &HostDesc, Size, + Align, zeDevice, &Mem); + DP("Allocated a shared memory " DPxMOD "\n", DPxPTR(Mem)); + break; + default: + assert(0 && "Invalid target data allocation kind"); + } + + size_t LoggedSize = ActiveSize ? ActiveSize : Size; + log(LoggedSize, LoggedSize, Kind); + if (makeResident) { + assert(Device && + "Device is not set for memory allocation. 
Is this a Device Pool?"); + if (Device->makeMemoryResident(Mem, Size) != OFFLOAD_SUCCESS) + Mem = nullptr; + } + return Mem; +} + +ze_event_handle_t EventPoolTy::getEvent() { + std::lock_guard Lock(*Mtx); + + if (Events.empty()) { + // Need to create a new L0 pool + ze_event_pool_desc_t Desc{ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr, 0, 0}; + Desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | Flags; + Desc.count = PoolSize; + ze_event_pool_handle_t Pool; + CALL_ZE_RET_NULL(zeEventPoolCreate, Context, &Desc, 0, nullptr, &Pool); + Pools.push_back(Pool); + + // Create events + ze_event_desc_t EventDesc{ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, 0, 0}; + EventDesc.wait = 0; + EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + for (uint32_t I = 0; I < PoolSize; I++) { + EventDesc.index = I; + ze_event_handle_t Event; + CALL_ZE_RET_NULL(zeEventCreate, Pool, &EventDesc, &Event); + Events.push_back(Event); + } + } + + auto Ret = Events.back(); + Events.pop_back(); + + return Ret; +} + +/// Return an event to the pool +void EventPoolTy::releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device) { + std::lock_guard Lock(*Mtx); + CALL_ZE_RET_VOID(zeEventHostReset, Event); + Events.push_back(Event); +} + +} // namespace llvm::omp::target::plugin diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp new file mode 100644 index 0000000000000..2e2c2cd5a5bbf --- /dev/null +++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp @@ -0,0 +1,180 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Level Zero RTL Options support +// +//===----------------------------------------------------------------------===// + +#include "omptarget.h" + +#include "L0Defs.h" +#include "L0Options.h" +#include "L0Trace.h" + +namespace llvm::omp::target::plugin { + +/// Read environment variables +void L0OptionsTy::processEnvironmentVars() { + // Compilation options for IGC + UserCompilationOptions += + std::string(" ") + + StringEnvar("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "").get(); + + // Memory pool + // LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=