diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt index b277380783500..4a2890e5ca741 100644 --- a/offload/CMakeLists.txt +++ b/offload/CMakeLists.txt @@ -150,9 +150,9 @@ if(DEFINED LIBOMPTARGET_BUILD_CUDA_PLUGIN OR message(WARNING "Option removed, use 'LIBOMPTARGET_PLUGINS_TO_BUILD' instead") endif() -set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host) +set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host level_zero) set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING - "Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".") + "Semicolon-separated list of plugins to use: cuda, amdgpu, level_zero, host or \"all\".") if(LIBOMPTARGET_PLUGINS_TO_BUILD STREQUAL "all") set(LIBOMPTARGET_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_PLUGIN_TARGETS}) @@ -176,6 +176,18 @@ if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "cuda") endif() endif() +if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64)$" AND + CMAKE_SYSTEM_NAME MATCHES "Linux|Windows")) + if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD) + message(STATUS "Not building Level Zero plugin: it is only supported on " + "Linux/Windows x86_64 or ppc64le hosts") + list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero") + endif() +endif() +if("level_zero" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD AND + NOT LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND) + list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "level_zero") +endif() message(STATUS "Building the offload library with support for " "the \"${LIBOMPTARGET_PLUGINS_TO_BUILD}\" plugins") diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake index 2a8bdebf2c1dd..dc5ea50c958a0 100644 --- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake +++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake @@ -89,4 +89,16 @@ if(LIBOMPTARGET_AMDGPU_ARCH) endif() endif() +################################################################################ +# Looking for Level0 +################################################################################ +find_path(LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIR NAMES level_zero/ze_api.h) + +if(NOT LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIR) + set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND FALSE) +else() + set(LIBOMPTARGET_DEP_LEVEL_ZERO_FOUND TRUE) + find_library(LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY NAMES ze_loader) +endif() + set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB}) diff --git a/offload/include/OpenMP/InteropAPI.h b/offload/include/OpenMP/InteropAPI.h index 53ac4be2e2e98..2553bfa930784 100644 --- a/offload/include/OpenMP/InteropAPI.h +++ b/offload/include/OpenMP/InteropAPI.h @@ -160,17 +160,12 @@ struct InteropTableEntry { Interops.push_back(obj); } - template void clear(ClearFuncTy f) { - for (auto &Obj : Interops) { - f(Obj); - } - } - /// vector interface int size() const { return Interops.size(); } iterator begin() { return Interops.begin(); } iterator end() { return Interops.end(); } iterator erase(iterator it) { return Interops.erase(it); } + void clear() { Interops.clear(); } }; struct InteropTblTy diff --git a/offload/include/PerThreadTable.h b/offload/include/PerThreadTable.h index 45b196171b4c8..0241370953c67 100644 --- a/offload/include/PerThreadTable.h +++ b/offload/include/PerThreadTable.h @@ -16,6 +16,60 @@ #include #include #include +#include + +template struct PerThread { + struct PerThreadData { + std::unique_ptr ThEntry; + }; + + std::mutex Mtx; + std::list> ThreadDataList; + + // define 
default constructors, disable copy and move constructors + PerThread() = default; + PerThread(const PerThread &) = delete; + PerThread(PerThread &&) = delete; + PerThread &operator=(const PerThread &) = delete; + PerThread &operator=(PerThread &&) = delete; + ~PerThread() { + std::lock_guard Lock(Mtx); + ThreadDataList.clear(); + } + +private: + PerThreadData &getThreadData() { + static thread_local std::shared_ptr ThData = nullptr; + if (!ThData) { + ThData = std::make_shared(); + std::lock_guard Lock(Mtx); + ThreadDataList.push_back(ThData); + } + return *ThData; + } + +protected: + ObjectType &getThreadEntry() { + auto &ThData = getThreadData(); + if (ThData.ThEntry) + return *ThData.ThEntry; + ThData.ThEntry = std::make_unique(); + return *ThData.ThEntry; + } + +public: + ObjectType &get() { return getThreadEntry(); } + + template void clear(F f) { + std::lock_guard Lock(Mtx); + for (auto ThData : ThreadDataList) { + if (!ThData->ThEntry) + continue; + f(*ThData->ThEntry); + } + ThreadDataList.clear(); + } +}; // Using an STL container (such as std::vector) indexed by thread ID has // too many race conditions issues so we store each thread entry into a @@ -23,10 +77,32 @@ // T is the container type used to store the objects, e.g., std::vector, // std::set, etc. by each thread. O is the type of the stored objects e.g., // omp_interop_val_t *, ... - template struct PerThreadTable { using iterator = typename ContainerType::iterator; + template > + struct has_iterator : std::false_type {}; + template + struct has_iterator> : std::true_type {}; + + template > + struct has_clear : std::false_type {}; + template + struct has_clear().clear())>> + : std::true_type {}; + + template > + struct has_clearAll : std::false_type {}; + template + struct has_clearAll().clearAll(1))>> + : std::true_type {}; + + template > + struct is_associative : std::false_type {}; + template + struct is_associative> + : std::true_type {}; + struct PerThreadData { size_t NElements = 0; std::unique_ptr ThEntry; @@ -71,6 +147,11 @@ template struct PerThreadTable { return ThData.NElements; } + void setNElements(size_t Size) { + auto &NElements = getThreadNElements(); + NElements = Size; + } + public: void add(ObjectType obj) { auto &Entry = getThreadEntry(); @@ -104,11 +185,81 @@ template struct PerThreadTable { for (auto ThData : ThreadDataList) { if (!ThData->ThEntry || ThData->NElements == 0) continue; - ThData->ThEntry->clear(f); + if constexpr (has_clearAll::value) { + ThData->ThEntry->clearAll(f); + } else if constexpr (has_iterator::value && + has_clear::value) { + for (auto &Obj : *ThData->ThEntry) { + if constexpr (is_associative::value) { + f(Obj.second); + } else { + f(Obj); + } + } + ThData->ThEntry->clear(); + } else { + static_assert(true, "Container type not supported"); + } ThData->NElements = 0; } ThreadDataList.clear(); } }; +template > struct ContainerValueType { + using type = typename T::value_type; +}; +template +struct ContainerValueType> { + using type = typename T::mapped_type; +}; + +template +struct PerThreadContainer + : public PerThreadTable::type> { + + // helpers + template > struct indexType { + using type = typename T::size_type; + }; + template struct indexType> { + using type = typename T::key_type; + }; + template > + struct has_resize : std::false_type {}; + template + struct has_resize().resize(1))>> + : std::true_type {}; + + template > + struct has_reserve : std::false_type {}; + template + struct has_reserve().reserve(1))>> + : std::true_type {}; + + using IndexType = typename 
indexType::type; + using ObjectType = typename ContainerValueType::type; + + // Get the object for the given index in the current thread + ObjectType &get(IndexType Index) { + auto &Entry = this->getThreadEntry(); + + // specialized code for vector-like containers + if constexpr (has_resize::value) { + if (Index >= Entry.size()) { + if constexpr (has_reserve::value && reserveSize > 0) { + if (Entry.capacity() < reserveSize) + Entry.reserve(reserveSize); + } + // If the index is out of bounds, try resize the container + Entry.resize(Index + 1); + } + } + ObjectType &Ret = Entry[Index]; + this->setNElements(Entry.size()); + return Ret; + } +}; + #endif diff --git a/offload/liboffload/API/Platform.td b/offload/liboffload/API/Platform.td index 906f899076a80..9e297efc1db6e 100644 --- a/offload/liboffload/API/Platform.td +++ b/offload/liboffload/API/Platform.td @@ -27,6 +27,7 @@ def ol_platform_backend_t : Enum { Etor<"UNKNOWN", "The backend is not recognized">, Etor<"CUDA", "The backend is CUDA">, Etor<"AMDGPU", "The backend is AMDGPU">, + Etor<"LEVEL_ZERO", "The backend is Level Zero">, Etor<"HOST", "The backend is the host">, ]; } diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 051882da7c6c7..495ebab4b8ae3 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -239,6 +239,8 @@ constexpr ol_platform_backend_t pluginNameToBackend(StringRef Name) { return OL_PLATFORM_BACKEND_AMDGPU; } else if (Name == "cuda") { return OL_PLATFORM_BACKEND_CUDA; + } else if (Name == "level_zero") { + return OL_PLATFORM_BACKEND_LEVEL_ZERO; } else { return OL_PLATFORM_BACKEND_UNKNOWN; } diff --git a/offload/plugins-nextgen/common/include/DLWrap.h b/offload/plugins-nextgen/common/include/DLWrap.h index 8934e7e701021..95ce86e123cd3 100644 --- a/offload/plugins-nextgen/common/include/DLWrap.h +++ b/offload/plugins-nextgen/common/include/DLWrap.h @@ -282,5 +282,21 @@ template constexpr void verboseAssert() { return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \ x9, x10); \ } +#define DLWRAP_INSTANTIATE_12(SYM_DEF, SYM_USE, T) \ + T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \ + typename T::template arg<1>::type x1, \ + typename T::template arg<2>::type x2, \ + typename T::template arg<3>::type x3, \ + typename T::template arg<4>::type x4, \ + typename T::template arg<5>::type x5, \ + typename T::template arg<6>::type x6, \ + typename T::template arg<7>::type x7, \ + typename T::template arg<8>::type x8, \ + typename T::template arg<9>::type x9, \ + typename T::template arg<10>::type x10, \ + typename T::template arg<11>::type x11) { \ + return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \ + x9, x10, x11); \ + } #endif // OMPTARGET_SHARED_DLWRAP_H diff --git a/offload/plugins-nextgen/level_zero/CMakeLists.txt b/offload/plugins-nextgen/level_zero/CMakeLists.txt new file mode 100644 index 0000000000000..719e46b03edaf --- /dev/null +++ b/offload/plugins-nextgen/level_zero/CMakeLists.txt @@ -0,0 +1,54 @@ +# Create the library and add the default arguments. 
+add_target_library(omptarget.rtl.level_zero LEVEL_ZERO) + +set(LEVEL_ZERO_SRC_FILES + src/L0Context.cpp + src/L0Device.cpp + src/L0Kernel.cpp + src/L0Memory.cpp + src/L0Program.cpp + src/L0Plugin.cpp + src/L0Program.cpp + src/L0Options.cpp +) + +target_sources(omptarget.rtl.level_zero PRIVATE + ${LEVEL_ZERO_SRC_FILES} +) + +target_include_directories(omptarget.rtl.level_zero PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +target_include_directories(omptarget.rtl.level_zero PRIVATE + ${LIBOMPTARGET_INCLUDE_DIR} + ${LIBOMPTARGET_DEP_LEVEL_ZERO_INCLUDE_DIR} + ${LIBOMPTARGET_LLVM_INCLUDE_DIRS} + ${LIBOMPTARGET_OMP_HEADER_DIR} +) + +cmake_path(GET LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY FILENAME LEVEL_ZERO_LIBRARY_NAME) +if (EXISTS ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY} AND NOT "level_zero" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS) + message(STATUS "Building Level Zero NG plugin linked against level_zero library") + if(UNIX) + target_link_libraries(omptarget.rtl.level_zero PRIVATE + ${LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY}) + elseif(WIN32) + # Full path to the Level Zero library is recognized as a linker option, so we + # separate directory and file name + cmake_path(GET LIBOMPTARGET_DEP_LEVEL_ZERO_LIBRARY PARENT_PATH LEVEL_ZERO_LIBRARY_PATH) + target_link_libraries(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_NAME} + ${LIBOMP_LIB_FILE}) + target_link_directories(omptarget.rtl.level_zero PRIVATE ${LEVEL_ZERO_LIBRARY_PATH}) + else() + message(FATAL_ERROR "Missing platform support") + endif() +else() + message(STATUS "Building Level Zero NG plugin for dlopened level_zero") + if(WIN32) + cmake_path(REPLACE_EXTENSION LEVEL_ZERO_LIBRARY_NAME dll) + endif() + target_compile_definitions(omptarget.rtl.level_zero PRIVATE + LEVEL_ZERO_LIBRARY="${LEVEL_ZERO_LIBRARY_NAME}") + target_sources(omptarget.rtl.level_zero PRIVATE src/L0DynWrapper.cpp) +endif() diff --git a/offload/plugins-nextgen/level_zero/include/AsyncQueue.h b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h new file mode 100644 index 0000000000000..a087a082639e4 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/AsyncQueue.h @@ -0,0 +1,53 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Async Queue wrapper for Level Zero +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H + +#include + +#include "L0Memory.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +/// Abstract queue that supports asynchronous command submission +struct AsyncQueueTy { + /// List of events attached to submitted commands + llvm::SmallVector WaitEvents; + /// Pending staging buffer to host copies + llvm::SmallVector> H2MList; + /// Pending USM memory copy commands that must wait for kernel completion + llvm::SmallVector> USM2MList; + /// Kernel event not signaled + ze_event_handle_t KernelEvent = nullptr; + /// Is this queue being used currently + bool InUse = false; + /// Clear data + void reset() { + WaitEvents.clear(); + H2MList.clear(); + USM2MList.clear(); + KernelEvent = nullptr; + } +}; + +typedef ObjPool AsyncQueuePoolTy; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_ASYNCQUEUE_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Context.h b/offload/plugins-nextgen/level_zero/include/L0Context.h new file mode 100644 index 0000000000000..29d01bb7b2a2a --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Context.h @@ -0,0 +1,141 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Level Zero Context abstraction +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H + +#include "L0Memory.h" +#include "PerThreadTable.h" + +namespace llvm::omp::target::plugin { + +class LevelZeroPluginTy; + +class L0ContextTLSTy { + StagingBufferTy StagingBuffer; + +public: + auto &getStagingBuffer() { return StagingBuffer; } + const auto &getStagingBuffer() const { return StagingBuffer; } + + void clear() { StagingBuffer.clear(); } +}; + +struct L0ContextTLSTableTy + : public PerThreadContainer< + std::unordered_map> { + void clear() { + PerThreadTable::clear([](L0ContextTLSTy &Entry) { Entry.clear(); }); + } +}; + +/// Driver and context-specific resources. We assume a single context per +/// driver. +class L0ContextTy { + /// The plugin that created this context + LevelZeroPluginTy &Plugin; + + /// Level Zero Driver handle + ze_driver_handle_t zeDriver = nullptr; + + /// Common Level Zero context + ze_context_handle_t zeContext = nullptr; + + /// API version supported by the Level Zero driver + ze_api_version_t APIVersion = ZE_API_VERSION_CURRENT; + + /// Imported external pointers. Track this only for user-directed + /// imports/releases. 
+ llvm::DenseMap ImportedPtrs; + + /// Common event pool + EventPoolTy EventPool; + + /// Host Memory allocator for this driver + MemAllocatorTy HostMemAllocator; + +public: + /// Named constants for checking the imported external pointer regions. + static constexpr int32_t ImportNotExist = -1; + static constexpr int32_t ImportUnknown = 0; + static constexpr int32_t ImportExist = 1; + + /// Create context, initialize event pool and extension functions + L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver, + int32_t DriverId); + + L0ContextTy(const L0ContextTy &) = delete; + L0ContextTy(L0ContextTy &&) = delete; + L0ContextTy &operator=(const L0ContextTy &) = delete; + L0ContextTy &operator=(const L0ContextTy &&) = delete; + + /// Release resources + ~L0ContextTy() { + EventPool.deinit(); + HostMemAllocator.deinit(); + if (zeContext) + CALL_ZE_RET_VOID(zeContextDestroy, zeContext); + } + + auto &getPlugin() const { return Plugin; } + + StagingBufferTy &getStagingBuffer(); + + /// Add imported external pointer region. + void addImported(void *Ptr, size_t Size) { + (void)ImportedPtrs.try_emplace((uintptr_t)Ptr, Size); + } + + /// Remove imported external pointer region + void removeImported(void *Ptr) { (void)ImportedPtrs.erase((uintptr_t)Ptr); } + + /// Check if imported regions contain the specified region. + int32_t checkImported(void *Ptr, size_t Size) const { + uintptr_t LB = (uintptr_t)Ptr; + uintptr_t UB = LB + Size; + // We do not expect a large number of user-directed imports, so use simple + // logic. + for (auto &I : ImportedPtrs) { + uintptr_t ILB = I.first; + uintptr_t IUB = ILB + I.second; + if (LB >= ILB && UB <= IUB) + return ImportExist; + if ((LB >= ILB && LB < IUB) || (UB > ILB && UB <= IUB)) + return ImportUnknown; + } + return ImportNotExist; + } + + ze_driver_handle_t getZeDriver() const { return zeDriver; } + + /// Return context associated with the driver + ze_context_handle_t getZeContext() const { return zeContext; } + + /// Return driver API version + ze_api_version_t getDriverAPIVersion() const { return APIVersion; } + + /// Return the event pool of this driver + auto &getEventPool() { return EventPool; } + const auto &getEventPool() const { return EventPool; } + + bool supportsLargeMem() const { + // Large memory support is available since API version 1.1 + return getDriverAPIVersion() >= ZE_API_VERSION_1_1; + } + + const MemAllocatorTy &getHostMemAllocator() const { return HostMemAllocator; } + MemAllocatorTy &getHostMemAllocator() { return HostMemAllocator; } +}; + +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0CONTEXT_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Defs.h b/offload/plugins-nextgen/level_zero/include/L0Defs.h new file mode 100644 index 0000000000000..47dc25b85ce92 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Defs.h @@ -0,0 +1,67 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// External and other auxilary definitions +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H + +#include "PluginInterface.h" +#include "Shared/Requirements.h" +#include "omptarget.h" + +enum class AllocOptionTy : int32_t { + ALLOC_OPT_NONE = 0, + ALLOC_OPT_REDUCTION_SCRATCH = 1, + ALLOC_OPT_REDUCTION_COUNTER = 2, + ALLOC_OPT_HOST_MEM = 3, + ALLOC_OPT_SLM = 4, +}; + +#ifndef EXTRACT_BITS +// MSB=63, LSB=0 +#define EXTRACT_BITS(I64, HIGH, LOW) \ + (((uint64_t)I64) >> (LOW)) & (((uint64_t)1 << ((HIGH) - (LOW) + 1)) - 1) +#endif + +namespace llvm::omp::target::plugin { + +/// Default alignmnet for allocation +constexpr size_t L0DefaultAlignment = 0; +/// Default staging buffer size for host to device copy (16KB) +constexpr size_t L0StagingBufferSize = (1 << 14); +/// Default staging buffer count +constexpr size_t L0StagingBufferCount = 64; +/// USM allocation threshold where preallocation does not pay off (128MB) +constexpr size_t L0UsmPreAllocThreshold = (128 << 20); +/// Host USM allocation threshold where preallocation does not pay off (8MB) +constexpr size_t L0HostUsmPreAllocThreshold = (8 << 20); + +using namespace error; +/// Generic L0 handle type +using ZeHandleTy = void *; + +template +static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) { + + if (Code == OFFLOAD_SUCCESS) + return Plugin::success(); + const char *Desc = "Unknown error"; + return createStringError(inconvertibleErrorCode(), + ErrFmt, Args..., Desc); +} + +#define L0_UNIMPLEMENTED_ERR \ + return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet", \ + __func__); + +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEFS_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h new file mode 100644 index 0000000000000..e6ebff0305a14 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Device.h @@ -0,0 +1,681 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// GenericDevice instatiation for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H + +#include "llvm/ADT/SmallVector.h" + +#include "PerThreadTable.h" + +#include "AsyncQueue.h" +#include "L0Context.h" +#include "L0Program.h" +#include "PluginInterface.h" +#include "TLS.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +using OmpInteropTy = omp_interop_val_t *; +class LevelZeroPluginTy; + +// clang-format off +enum class PCIIdTy : int32_t { + None = 0x0000, + SKL = 0x1900, + KBL = 0x5900, + CFL = 0x3E00, + CFL_2 = 0x9B00, + ICX = 0x8A00, + TGL = 0xFF20, + TGL_2 = 0x9A00, + DG1 = 0x4900, + RKL = 0x4C00, + ADLS = 0x4600, + RTL = 0xA700, + MTL = 0x7D00, + PVC = 0x0B00, + DG2_ATS_M = 0x4F00, + DG2_ATS_M_2 = 0x5600, + LNL = 0x6400, + BMG = 0xE200, +}; + +/// Device type enumeration common to compiler and runtime +enum class DeviceArchTy : uint64_t { + DeviceArch_None = 0, + DeviceArch_Gen = 0x0001, // Gen 9, Gen 11 or Xe + DeviceArch_XeLPG = 0x0002, + DeviceArch_XeHPC = 0x0004, + DeviceArch_XeHPG = 0x0008, + DeviceArch_Xe2LP = 0x0010, + DeviceArch_Xe2HP = 0x0020, + DeviceArch_x86_64 = 0x0100 +}; +// clang-format on + +struct L0DeviceIdTy { + ze_device_handle_t zeId; + int32_t RootId; + int32_t SubId; + int32_t CCSId; + + L0DeviceIdTy(ze_device_handle_t Device, int32_t RootId, int32_t SubId = -1, + int32_t CCSId = -1) + : zeId(Device), RootId(RootId), SubId(SubId), CCSId(CCSId) {} +}; + +class L0DeviceTLSTy { + /// Command list for each device + ze_command_list_handle_t CmdList = nullptr; + + /// Main copy command list for each device + ze_command_list_handle_t CopyCmdList = nullptr; + + /// Link copy command list for each device + ze_command_list_handle_t LinkCopyCmdList = nullptr; + + /// Command queue for each device + ze_command_queue_handle_t CmdQueue = nullptr; + + /// Main copy command queue for each device + ze_command_queue_handle_t CopyCmdQueue = nullptr; + + /// Link copy command queues for each device + ze_command_queue_handle_t LinkCopyCmdQueue = nullptr; + + /// Immediate command list for each device + ze_command_list_handle_t ImmCmdList = nullptr; + + /// Immediate copy command list for each device + ze_command_list_handle_t ImmCopyCmdList = nullptr; + +public: + L0DeviceTLSTy() = default; + ~L0DeviceTLSTy() { + // assert all fields are nullptr on destruction + assert(CmdList == nullptr && "CmdList is not nullptr on destruction"); + assert(CopyCmdList == nullptr && + "CopyCmdList is not nullptr on destruction"); + assert(LinkCopyCmdList == nullptr && + "LinkCopyCmdList is not nullptr on destruction"); + assert(CmdQueue == nullptr && "CmdQueue is not nullptr on destruction"); + assert(CopyCmdQueue == nullptr && + "CopyCmdQueue is not nullptr on destruction"); + assert(LinkCopyCmdQueue == nullptr && + "LinkCopyCmdQueue is not nullptr on destruction"); + assert(ImmCmdList == nullptr && "ImmCmdList is not nullptr on destruction"); + assert(ImmCopyCmdList == nullptr && + "ImmCopyCmdList is not nullptr on destruction"); + } + + L0DeviceTLSTy(const L0DeviceTLSTy &) = delete; + L0DeviceTLSTy(L0DeviceTLSTy &&Other) { + CmdList = std::exchange(Other.CmdList, nullptr); + CopyCmdList = std::exchange(Other.CopyCmdList, 
nullptr); + LinkCopyCmdList = std::exchange(Other.LinkCopyCmdList, nullptr); + CmdQueue = std::exchange(Other.CmdQueue, nullptr); + CopyCmdQueue = std::exchange(Other.CopyCmdQueue, nullptr); + LinkCopyCmdQueue = std::exchange(Other.LinkCopyCmdQueue, nullptr); + ImmCmdList = std::exchange(Other.ImmCmdList, nullptr); + ImmCopyCmdList = std::exchange(Other.ImmCopyCmdList, nullptr); + } + + void clear() { + // destroy all lists and queues + if (CmdList) + CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CmdList); + if (CopyCmdList) + CALL_ZE_EXIT_FAIL(zeCommandListDestroy, CopyCmdList); + if (LinkCopyCmdList) + CALL_ZE_EXIT_FAIL(zeCommandListDestroy, LinkCopyCmdList); + if (ImmCmdList) + CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCmdList); + if (ImmCopyCmdList) + CALL_ZE_EXIT_FAIL(zeCommandListDestroy, ImmCopyCmdList); + if (CmdQueue) + CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CmdQueue); + if (CopyCmdQueue) + CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, CopyCmdQueue); + if (LinkCopyCmdQueue) + CALL_ZE_EXIT_FAIL(zeCommandQueueDestroy, LinkCopyCmdQueue); + + CmdList = nullptr; + CopyCmdList = nullptr; + LinkCopyCmdList = nullptr; + CmdQueue = nullptr; + CopyCmdQueue = nullptr; + LinkCopyCmdQueue = nullptr; + ImmCmdList = nullptr; + ImmCopyCmdList = nullptr; + } + + L0DeviceTLSTy &operator=(const L0DeviceTLSTy &) = delete; + L0DeviceTLSTy &operator=(L0DeviceTLSTy &&) = delete; + + auto getCmdList() const { return CmdList; } + void setCmdList(ze_command_list_handle_t _CmdList) { CmdList = _CmdList; } + + auto getCopyCmdList() const { return CopyCmdList; } + void setCopyCmdList(ze_command_list_handle_t _CopyCmdList) { + CopyCmdList = _CopyCmdList; + } + + auto getLinkCopyCmdList() const { return LinkCopyCmdList; } + void setLinkCopyCmdList(ze_command_list_handle_t _LinkCopyCmdList) { + LinkCopyCmdList = _LinkCopyCmdList; + } + + auto getImmCmdList() const { return ImmCmdList; } + void setImmCmdList(ze_command_list_handle_t _ImmCmdList) { + ImmCmdList = _ImmCmdList; + } + + auto getImmCopyCmdList() const { return ImmCopyCmdList; } + void setImmCopyCmdList(ze_command_list_handle_t _ImmCopyCmdList) { + ImmCopyCmdList = _ImmCopyCmdList; + } + + auto getCmdQueue() const { return CmdQueue; } + void setCmdQueue(ze_command_queue_handle_t _CmdQueue) { + CmdQueue = _CmdQueue; + } + + auto getCopyCmdQueue() const { return CopyCmdQueue; } + void setCopyCmdQueue(ze_command_queue_handle_t _CopyCmdQueue) { + CopyCmdQueue = _CopyCmdQueue; + } + + auto getLinkCopyCmdQueue() const { return LinkCopyCmdQueue; } + void setLinkCopyCmdQueue(ze_command_queue_handle_t _LinkCopyCmdQueue) { + LinkCopyCmdQueue = _LinkCopyCmdQueue; + } +}; + +struct L0DeviceTLSTableTy + : public PerThreadContainer, 8> { + void clear() { + PerThreadTable::clear([](L0DeviceTLSTy &Entry) { Entry.clear(); }); + } +}; + +class L0DeviceTy final : public GenericDeviceTy { + // Level Zero Context for this Device + L0ContextTy &l0Context; + + // Level Zero handle for this Device + ze_device_handle_t zeDevice; + // Device Properties + ze_device_properties_t DeviceProperties{}; + ze_device_compute_properties_t ComputeProperties{}; + ze_device_memory_properties_t MemoryProperties{}; + ze_device_cache_properties_t CacheProperties{}; + + /// Devices' default target allocation kind for internal allocation + int32_t AllocKind = TARGET_ALLOC_DEVICE; + + DeviceArchTy DeviceArch = DeviceArchTy::DeviceArch_None; + + std::string DeviceName; + + /// Common indirect access flags for this device + ze_kernel_indirect_access_flags_t IndirectAccessFlags = 0; + + /// Device UUID for 
toplevel devices only + std::string DeviceUuid; + + /// L0 Device ID as string + std::string zeId; + + /// Command queue group ordinals for each device + std::pair ComputeOrdinal{UINT32_MAX, 0}; + /// Command queue group ordinals for copying + std::pair CopyOrdinal{UINT32_MAX, 0}; + /// Command queue group ordinals and number of queues for link copy engines + std::pair LinkCopyOrdinal{UINT32_MAX, 0}; + + /// Command queue index for each device + uint32_t ComputeIndex = 0; + + bool IsAsyncEnabled = false; + + // lock for this device + std::mutex Mutex; + + /// Contains all modules (possibly from multiple device images) to handle + /// dynamic link across multiple images + llvm::SmallVector GlobalModules; + + /// L0 programs created for this device + std::list Programs; + + /// MemAllocator for this device + MemAllocatorTy MemAllocator; + + /// The current size of the global device memory pool (managed by us). + uint64_t HeapSize = 1L << 23L /*8MB=*/; + + int32_t synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue = true); + int32_t submitData(void *TgtPtr, const void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + int32_t retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + + bool shouldSetupDeviceMemoryPool() const override { return false; } + DeviceArchTy computeArch() const; + + /// Get default compute group ordinal. Returns Ordinal-NumQueues pair + std::pair findComputeOrdinal(); + + /// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair + std::pair findCopyOrdinal(bool LinkCopy = false); + + Error internalInit(); + +public: + L0DeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices, + ze_device_handle_t zeDevice, L0ContextTy &DriverInfo, + const std::string_view zeId, int32_t ComputeIndex) + : GenericDeviceTy(Plugin, DeviceId, NumDevices, SPIRVGridValues), + l0Context(DriverInfo), zeDevice(zeDevice), zeId(zeId), + ComputeIndex(ComputeIndex) { + DeviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + DeviceProperties.pNext = nullptr; + ComputeProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES; + ComputeProperties.pNext = nullptr; + MemoryProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES; + MemoryProperties.pNext = nullptr; + CacheProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES; + CacheProperties.pNext = nullptr; + + auto Err = internalInit(); + if (Err) + FATAL_MESSAGE(DeviceId, "Couldn't initialize device: %s\n", + toString(std::move(Err)).c_str()); + } + + static L0DeviceTy &makeL0Device(GenericDeviceTy &Device) { + return static_cast(Device); + } + + auto &getPlugin() { return (LevelZeroPluginTy &)Plugin; } + L0DeviceTLSTy &getTLS(); + + Error setContext() override { return Plugin::success(); } + Error initImpl(GenericPluginTy &Plugin) override; + Error deinitImpl() override { + Programs.clear(); + return Plugin::success(); + } + + auto getZeDevice() const { return zeDevice; } + + const L0ContextTy &getL0Context() const { return l0Context; } + L0ContextTy &getL0Context() { return l0Context; } + + const std::string_view getName() const { return DeviceName; } + const char *getNameCStr() const { return DeviceName.c_str(); } + + const std::string_view getZeId() const { return zeId; } + const char *getZeIdCStr() const { return zeId.c_str(); } + + std::mutex &getMutex() { return Mutex; } + + auto getComputeIndex() const { return ComputeIndex; } + auto getIndirectFlags() const { return IndirectAccessFlags; } + + auto getNumGlobalModules() const { return 
GlobalModules.size(); } + void addGlobalModule(ze_module_handle_t Module) { + GlobalModules.push_back(Module); + } + auto getGlobalModulesArray() { return GlobalModules.data(); } + + L0ProgramTy *getProgramFromImage(MemoryBufferRef Image) { + for (auto &PGM : Programs) + if (PGM.getMemoryBuffer() == Image) + return &PGM; + return nullptr; + } + + int32_t buildAllKernels() { + for (auto &PGM : Programs) { + int32_t RC = PGM.loadModuleKernels(); + if (RC != OFFLOAD_SUCCESS) + return RC; + } + return OFFLOAD_SUCCESS; + } + + // add a new program to the device. Return a reference to the new program + auto &addProgram(int32_t ImageId, std::unique_ptr &&Image) { + Programs.emplace_back(ImageId, *this, std::move(Image)); + return Programs.back(); + } + + const auto &getLastProgram() const { return Programs.back(); } + auto &getLastProgram() { return Programs.back(); } + // Device properties getters + auto getVendorId() const { return DeviceProperties.vendorId; } + bool isGPU() const { return DeviceProperties.type == ZE_DEVICE_TYPE_GPU; } + + auto getPCIId() const { return DeviceProperties.deviceId; } + auto getNumThreadsPerEU() const { return DeviceProperties.numThreadsPerEU; } + auto getSIMDWidth() const { return DeviceProperties.physicalEUSimdWidth; } + auto getNumEUsPerSubslice() const { + return DeviceProperties.numEUsPerSubslice; + } + auto getNumSubslicesPerSlice() const { + return DeviceProperties.numSubslicesPerSlice; + } + auto getNumSlices() const { return DeviceProperties.numSlices; } + auto getNumSubslices() const { + return DeviceProperties.numSubslicesPerSlice * DeviceProperties.numSlices; + } + uint32_t getNumEUs() const { + return DeviceProperties.numEUsPerSubslice * getNumSubslices(); + } + auto getTotalThreads() const { + return DeviceProperties.numThreadsPerEU * getNumEUs(); + } + auto getNumThreadsPerSubslice() const { + return getNumEUsPerSubslice() * getNumThreadsPerEU(); + } + auto getClockRate() const { return DeviceProperties.coreClockRate; } + + auto getMaxSharedLocalMemory() const { + return ComputeProperties.maxSharedLocalMemory; + } + auto getMaxGroupSize() const { return ComputeProperties.maxTotalGroupSize; } + auto getGlobalMemorySize() const { return MemoryProperties.totalSize; } + auto getCacheSize() const { return CacheProperties.cacheSize; } + auto getMaxMemAllocSize() const { return DeviceProperties.maxMemAllocSize; } + + int32_t getAllocKind() const { return AllocKind; } + DeviceArchTy getDeviceArch() const { return DeviceArch; } + bool isDeviceArch(DeviceArchTy Arch) const { return DeviceArch == Arch; } + + static bool isDiscrete(uint32_t PCIId) { + switch (static_cast(PCIId & 0xFF00)) { + case PCIIdTy::BMG: + return true; + default: + return false; + } + } + + static bool isDiscrete(ze_device_handle_t Device) { + ze_device_properties_t PR{}; + PR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + PR.pNext = nullptr; + CALL_ZE_RET(false, zeDeviceGetProperties, Device, &PR); + return isDiscrete(PR.deviceId); + } + + bool isDiscreteDevice() { return isDiscrete(getPCIId()); } + bool isDeviceIPorNewer(uint32_t Version) const; + + const std::string_view getUuid() const { return DeviceUuid; } + + uint32_t getComputeEngine() const { return ComputeOrdinal.first; } + uint32_t getNumComputeQueues() const { return ComputeOrdinal.second; } + + bool hasMainCopyEngine() const { return CopyOrdinal.first != UINT32_MAX; } + uint32_t getMainCopyEngine() const { return CopyOrdinal.first; } + + uint32_t getLinkCopyEngine() const { return LinkCopyOrdinal.first; } + uint32_t 
getNumLinkCopyQueues() const { return LinkCopyOrdinal.second; } + bool hasLinkCopyEngine() const { return getNumLinkCopyQueues() > 0; } + + bool deviceRequiresImmCmdList() const { + return isDeviceIPorNewer(0x05004000); + } + bool asyncEnabled() const { return IsAsyncEnabled; } + bool useImmForCompute() const { return true; } + bool useImmForCopy() const { return true; } + bool useImmForInterop() const { return true; } + + void reportDeviceInfo() const; + + // Command queues related functions + /// Create a command list with given ordinal and flags + ze_command_list_handle_t createCmdList(ze_context_handle_t Context, + ze_device_handle_t Device, + uint32_t Ordinal, + ze_command_list_flags_t Flags, + const std::string_view DeviceIdStr); + + /// Create a command list with default flags + ze_command_list_handle_t createCmdList(ze_context_handle_t Context, + ze_device_handle_t Device, + uint32_t Ordinal, + const std::string_view DeviceIdStr); + + ze_command_list_handle_t getCmdList(); + + /// Create a command queue with given ordinal and flags + ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context, + ze_device_handle_t Device, + uint32_t Ordinal, uint32_t Index, + ze_command_queue_flags_t Flags, + const std::string_view DeviceIdStr); + + /// Create a command queue with default flags + ze_command_queue_handle_t createCmdQueue(ze_context_handle_t Context, + ze_device_handle_t Device, + uint32_t Ordinal, uint32_t Index, + const std::string_view DeviceIdStr, + bool InOrder = false); + + /// Create a new command queue for the given OpenMP device ID + ze_command_queue_handle_t createCommandQueue(bool InOrder = false); + + /// Create an immediate command list + ze_command_list_handle_t createImmCmdList(uint32_t Ordinal, uint32_t Index, + bool InOrder = false); + + /// Create an immediate command list for computing + ze_command_list_handle_t createImmCmdList(bool InOrder = false) { + return createImmCmdList(getComputeEngine(), getComputeIndex(), InOrder); + } + + /// Create an immediate command list for copying + ze_command_list_handle_t createImmCopyCmdList(); + ze_command_queue_handle_t getCmdQueue(); + ze_command_list_handle_t getCopyCmdList(); + ze_command_queue_handle_t getCopyCmdQueue(); + ze_command_list_handle_t getLinkCopyCmdList(); + ze_command_queue_handle_t getLinkCopyCmdQueue(); + ze_command_list_handle_t getImmCmdList(); + ze_command_list_handle_t getImmCopyCmdList(); + + /// Enqueue copy command + int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size, + __tgt_async_info *AsyncInfo = nullptr, + bool UseCopyEngine = true); + + /// Enqueue asynchronous copy command + int32_t enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size, + __tgt_async_info *AsyncInfo, bool CopyTo = true); + + /// Enqueue fill command + int32_t enqueueMemFill(void *Ptr, const void *Pattern, size_t PatternSize, + size_t Size); + + /// Driver related functions + + /// Reurn the driver handle for this device + ze_driver_handle_t getZeDriver() const { return l0Context.getZeDriver(); } + + /// Return context for this device + ze_context_handle_t getZeContext() const { return l0Context.getZeContext(); } + + /// Return driver API version for this device + ze_api_version_t getDriverAPIVersion() const { + return l0Context.getDriverAPIVersion(); + } + + /// Return an event from the driver associated to this device + ze_event_handle_t getEvent() { return l0Context.getEventPool().getEvent(); } + + /// Release event to the pool associated to this device + void releaseEvent(ze_event_handle_t Event) 
{ + l0Context.getEventPool().releaseEvent(Event, *this); + } + + StagingBufferTy &getStagingBuffer() { return l0Context.getStagingBuffer(); } + + bool supportsLargeMem() const { return l0Context.supportsLargeMem(); } + + // Allocation related routines + + /// Data alloc + Expected + dataAlloc(size_t Size, size_t Align, int32_t Kind, intptr_t Offset, + bool UserAlloc, bool DevMalloc = false, + uint32_t MemAdvice = UINT32_MAX, + AllocOptionTy AllocOpt = AllocOptionTy::ALLOC_OPT_NONE); + + /// Data delete + Error dataDelete(void *Ptr); + + /// Return the memory allocation type for the specified memory location. + uint32_t getMemAllocType(const void *Ptr) const; + + const MemAllocatorTy &getDeviceMemAllocator() const { return MemAllocator; } + MemAllocatorTy &getDeviceMemAllocator() { return MemAllocator; } + + MemAllocatorTy &getMemAllocator(int32_t Kind) { + if (Kind == TARGET_ALLOC_HOST) + return l0Context.getHostMemAllocator(); + return getDeviceMemAllocator(); + } + + MemAllocatorTy &getMemAllocator(const void *Ptr) { + bool IsHostMem = (ZE_MEMORY_TYPE_HOST == getMemAllocType(Ptr)); + if (IsHostMem) + return l0Context.getHostMemAllocator(); + return getDeviceMemAllocator(); + } + + int32_t makeMemoryResident(void *Mem, size_t Size); + + // Generic device interface implementation + Expected + loadBinaryImpl(std::unique_ptr &&TgtImage, + int32_t ImageId) override; + Error unloadBinaryImpl(DeviceImageTy *Image) override; + Expected allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind) override; + Error free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) override; + + Expected dataLockImpl(void *HstPtr, int64_t Size) override { + return Plugin::error(error::ErrorCode::UNKNOWN, + "dataLockImpl not supported"); + } + Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); } + + Expected isPinnedPtrImpl(void *, void *&, void *&, + size_t &) const override { + // Don't need to do anything, this is handled by the driver. + return false; + } + + Error dataFence(__tgt_async_info *Async) override; + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error synchronizeImpl(__tgt_async_info &AsyncInfo, + bool ReleaseQueue) override; + Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override; + Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error initDeviceInfoImpl(__tgt_device_info *Info) override; + Expected + hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override; + + Error enqueueHostCallImpl(void (*Callback)(void *), void *UserData, + AsyncInfoWrapperTy &AsyncInfo) override{ + L0_UNIMPLEMENTED_ERR} + + /* Event routines are used to ensure ordering between dataTransfers. Instead + * of adding extra events in the queues, we make sure they're ordered by + * using the events from the data submission APIs so we don't need to support + * these routines. 
+ * They still need to report succes to indicate the event are handled + * somewhere waitEvent and syncEvent should remain unimplemented + */ + Expected isEventCompleteImpl(void *EventPtr, + AsyncInfoWrapperTy &) override { + return true; + } + + Error createEventImpl(void **EventPtrStorage) override { + return Plugin::success(); + } + Error destroyEventImpl(void *EventPtr) override { return Plugin::success(); } + Error recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::success(); + } + + Error waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n", + __func__); + } + + Error syncEventImpl(void *EventPtr) override { + return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n", + __func__); + } + + Expected obtainInfoImpl() override; + + Error getDeviceStackSize(uint64_t &V) override { + V = 0; + return Plugin::success(); + } + Expected constructKernel(const char *Name) override; + + Error setDeviceStackSize(uint64_t V) override { return Plugin::success(); } + Error getDeviceHeapSize(uint64_t &V) override { + V = HeapSize; + return Plugin::success(); + } + Error setDeviceHeapSize(uint64_t V) override { + HeapSize = V; + return Plugin::success(); + } + + Expected + createInterop(int32_t InteropType, interop_spec_t &InteropSpec) override; + Error releaseInterop(omp_interop_val_t *Interop) override; + + interop_spec_t selectInteropPreference(int32_t InteropType, + int32_t NumPrefers, + interop_spec_t *Prefers) override; +}; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0DEVICE_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Interop.h b/offload/plugins-nextgen/level_zero/include/L0Interop.h new file mode 100644 index 0000000000000..69a1a5f274068 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Interop.h @@ -0,0 +1,28 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Interop support for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H + +namespace llvm::omp::target::plugin::L0Interop { + +/// Level Zero interop property +struct Property { + // Use this when command queue needs to be accessed as + // the targetsync field in interop will be changed if preferred type is sycl. 
+ ze_command_queue_handle_t CommandQueue; + ze_command_list_handle_t ImmCmdList; +}; + +} // namespace llvm::omp::target::plugin::L0Interop + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0INTEROP_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h new file mode 100644 index 0000000000000..c5a3528dd2974 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h @@ -0,0 +1,158 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// GenericKernel implementation for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H + +#include "L0Defs.h" +#include "L0Trace.h" +#include "PluginInterface.h" + +namespace llvm::omp::target::plugin { + +class L0DeviceTy; +class L0ProgramTy; + +/// Loop descriptor +struct TgtLoopDescTy { + int64_t Lb = 0; // The lower bound of the i-th loop + int64_t Ub = 0; // The upper bound of the i-th loop + int64_t Stride = 0; // The stride of the i-th loop +}; + +struct TgtNDRangeDescTy { + int32_t NumLoops = 0; // Number of loops/dimensions + int32_t DistributeDim = 0; // Dimensions lower than this one + // must end up in one WG + TgtLoopDescTy Levels[3]; // Up to 3 loops +}; + +/// Kernel properties. +struct KernelPropertiesTy { + uint32_t Width = 0; + uint32_t SIMDWidth = 0; + uint32_t MaxThreadGroupSize = 0; + + /// Cached input parameters used in the previous launch + TgtNDRangeDescTy LoopDesc; + int32_t NumTeams = -1; + int32_t ThreadLimit = -1; + + /// Cached parameters used in the previous launch + ze_kernel_indirect_access_flags_t IndirectAccessFlags = UINT32_MAX; + uint32_t GroupSizes[3] = {0, 0, 0}; + ze_group_count_t GroupCounts{0, 0, 0}; + bool AllowCooperative = false; + + std::mutex Mtx; + + static constexpr TgtNDRangeDescTy LoopDescInit = {}; + + /// Check if we can reuse group parameters. + bool reuseGroupParams(const TgtNDRangeDescTy *LoopDescPtr, + const int32_t NumTeamsIn, const int32_t ThreadLimitIn, + uint32_t *GroupSizesOut, + ze_group_count_t &GroupCountsOut, + bool &AllowCooperativeOut) const { + if (!LoopDescPtr && memcmp(&LoopDescInit, &LoopDesc, sizeof(LoopDesc))) + return false; + if (LoopDescPtr && memcmp(LoopDescPtr, &LoopDesc, sizeof(LoopDesc))) + return false; + if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit) + return false; + // Found matching input parameters. + std::copy_n(GroupSizes, 3, GroupSizesOut); + GroupCountsOut = GroupCounts; + AllowCooperativeOut = AllowCooperative; + return true; + } + + /// Update cached group parameters. + void cacheGroupParams(const TgtNDRangeDescTy *LoopDescPtr, + const int32_t NumTeamsIn, const int32_t ThreadLimitIn, + const uint32_t *GroupSizesIn, + const ze_group_count_t &GroupCountsIn, + const bool &AllowCooperativeIn) { + LoopDesc = LoopDescPtr ? 
*LoopDescPtr : LoopDescInit; + NumTeams = NumTeamsIn; + ThreadLimit = ThreadLimitIn; + std::copy_n(GroupSizesIn, 3, GroupSizes); + GroupCounts = GroupCountsIn; + AllowCooperative = AllowCooperativeIn; + } +}; + +class L0KernelTy : public GenericKernelTy { + // L0 Kernel Handle + ze_kernel_handle_t zeKernel; + // Kernel Properties + KernelPropertiesTy Properties; + auto &getProperties() { return Properties; } + + int32_t runTargetTeamRegion(L0DeviceTy &Device, KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, + __tgt_async_info *AsyncInfo) const; + + void decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams, + uint32_t ThreadLimit, + TgtNDRangeDescTy *LoopLevels, + uint32_t *GroupSizes, + ze_group_count_t &GroupCounts, + bool HalfNumThreads, + bool IsTeamsNDRange) const; + + int32_t decideLoopKernelGroupArguments( + L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels, + uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads, + bool &AllowCooperative) const; + + Error buildKernel(L0ProgramTy &Program); + +public: + /// Create a L0 kernel with a name and an execution mode. + L0KernelTy(const char *Name) : GenericKernelTy(Name), zeKernel(nullptr) {} + ~L0KernelTy() { + if (zeKernel) + CALL_ZE_RET_VOID(zeKernelDestroy, zeKernel); + } + L0KernelTy(const L0KernelTy &) = delete; + L0KernelTy(L0KernelTy &&) = delete; + L0KernelTy &operator=(const L0KernelTy &) = delete; + L0KernelTy &operator=(const L0KernelTy &&) = delete; + + const auto &getProperties() const { return Properties; } + + /// Initialize the L0 kernel. + Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override; + /// Launch the L0 kernel function. + Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], + uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, + AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + + Expected maxGroupSize(GenericDeviceTy &GenericDevice, + uint64_t DynamicMemSize) const override{ + L0_UNIMPLEMENTED_ERR} + + ze_kernel_handle_t getZeKernel() const { + return zeKernel; + } + + int32_t getGroupsShape(L0DeviceTy &Device, int32_t NumTeams, + int32_t ThreadLimit, uint32_t *GroupSizes, + ze_group_count_t &GroupCounts, void *LoopDesc, + bool &AllowCooperative) const; +}; + +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0KERNEL_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Memory.h b/offload/plugins-nextgen/level_zero/include/L0Memory.h new file mode 100644 index 0000000000000..9b02aa8568f96 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Memory.h @@ -0,0 +1,579 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Memory related support for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H + +#include +#include +#include +#include +#include +#include + +#include "L0Defs.h" +#include "L0Trace.h" + +namespace llvm::omp::target::plugin { + +class L0DeviceTy; + +#define ALLOC_KIND_TO_STR(Kind) \ + (Kind == TARGET_ALLOC_HOST \ + ? "host memory" \ + : (Kind == TARGET_ALLOC_SHARED \ + ? "shared memory" \ + : (Kind == TARGET_ALLOC_DEVICE ? "device memory" \ + : "unknown memory"))) + +// forward declarations +struct L0OptionsTy; +class L0DeviceTy; +class L0ContextTy; + +struct DynamicMemHeapTy { + /// Base address memory is allocated from + uintptr_t AllocBase = 0; + /// Minimal size served by the current heap + size_t BlockSize = 0; + /// Max size served by the current heap + size_t MaxSize = 0; + /// Available memory blocks + uint32_t NumBlocks = 0; + /// Number of block descriptors + uint32_t NumBlockDesc = 0; + /// Number of block counters + uint32_t NumBlockCounter = 0; + /// List of memory block descriptors + uint64_t *BlockDesc = nullptr; + /// List of memory block counters + uint32_t *BlockCounter = nullptr; +}; + +struct DynamicMemPoolTy { + /// Location of device memory blocks + void *PoolBase = nullptr; + /// Heap size common to all heaps + size_t HeapSize = 0; + /// Number of heaps available + uint32_t NumHeaps = 0; + /// Heap descriptors (using fixed-size array to simplify memory allocation) + DynamicMemHeapTy HeapDesc[8]; +}; + +/// Memory allocation information used in memory allocation/deallocation. +struct MemAllocInfoTy { + /// Base address allocated from compute runtime + void *Base = nullptr; + /// Allocation size known to users/libomptarget + size_t Size = 0; + /// TARGET_ALLOC kind + int32_t Kind = TARGET_ALLOC_DEFAULT; + /// Allocation from pool? + bool InPool = false; + /// Is implicit argument + bool ImplicitArg = false; + + MemAllocInfoTy() = default; + + MemAllocInfoTy(void *_Base, size_t _Size, int32_t _Kind, bool _InPool, + bool _ImplicitArg) + : Base(_Base), Size(_Size), Kind(_Kind), InPool(_InPool), + ImplicitArg(_ImplicitArg) {} +}; + +/// Responsible for all activities involving memory allocation/deallocation. +/// It contains memory pool management, memory allocation bookkeeping. +class MemAllocatorTy { + + /// Simple memory allocation statistics. Maintains numbers for pool allocation + /// and GPU RT allocation. + struct MemStatTy { + size_t Requested[2] = {0, 0}; // Requested bytes + size_t Allocated[2] = {0, 0}; // Allocated bytes + size_t Freed[2] = {0, 0}; // Freed bytes + size_t InUse[2] = {0, 0}; // Current memory in use + size_t PeakUse[2] = {0, 0}; // Peak bytes used + size_t NumAllocs[2] = {0, 0}; // Number of allocations + MemStatTy() = default; + }; + + /// Memory pool which enables reuse of already allocated blocks + /// -- Pool maintains a list of buckets each of which can allocate fixed-size + /// memory. + /// -- Each bucket maintains a list of memory blocks allocated by GPU RT. + /// -- Each memory block can allocate multiple fixed-size memory requested by + /// offload RT or user. + /// -- Memory allocation falls back to GPU RT allocation when the pool size + /// (total memory used by pool) reaches a threshold. 
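  // Editorial sketch (an assumption drawn from the description above, not code
  // from this patch): the bucket scheme routes a request to the first bucket
  // whose chunk size can hold it; chunk sizes are powers of two starting at
  // AllocMin (64B), and requests beyond AllocMax, or arriving once PoolSize
  // has reached PoolSizeMax, fall back to a direct GPU RT allocation. The
  // helper below is hypothetical and simply mirrors the getBucketId() logic
  // defined further down in MemPoolTy.
  static uint32_t exampleBucketIdSketch(size_t Size, size_t AllocMin = 1 << 6) {
    uint32_t Id = 0;
    for (size_t SZ = AllocMin; SZ < Size; ++Id)
      SZ <<= 1; // each successive bucket serves chunks twice as large
    return Id;  // exampleBucketIdSketch(64) == 0, (65) == 1, (1024) == 4
  }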
+ class MemPoolTy { + + /// Memory block maintained in each bucket + struct BlockTy { + /// Base address of this block + uintptr_t Base = 0; + /// Size of the block + size_t Size = 0; + /// Supported allocation size by this block + size_t ChunkSize = 0; + /// Total number of slots + uint32_t NumSlots = 0; + /// Number of slots in use + uint32_t NumUsedSlots = 0; + /// Cached available slot returned by the last dealloc() call + uint32_t FreeSlot = UINT32_MAX; + /// Marker for the currently used slots + std::vector UsedSlots; + + BlockTy(void *_Base, size_t _Size, size_t _ChunkSize) { + Base = reinterpret_cast(_Base); + Size = _Size; + ChunkSize = _ChunkSize; + NumSlots = Size / ChunkSize; + NumUsedSlots = 0; + UsedSlots.resize(NumSlots, false); + } + + /// Check if the current block is fully used + bool isFull() const { return NumUsedSlots == NumSlots; } + + /// Check if the given address belongs to the current block + bool contains(void *Mem) const { + auto M = reinterpret_cast(Mem); + return M >= Base && M < Base + Size; + } + + /// Allocate a single chunk from the block + void *alloc(); + + /// Deallocate the given memory + void dealloc(void *Mem); + }; // BlockTy + + /// Allocation kind for the current pool + int32_t AllocKind = TARGET_ALLOC_DEFAULT; + /// Access to the allocator + MemAllocatorTy *Allocator = nullptr; + /// Minimum supported memory allocation size from pool + size_t AllocMin = 1 << 6; // 64B + /// Maximum supported memory allocation size from pool + size_t AllocMax = 0; + /// Allocation size when the pool needs to allocate a block + size_t AllocUnit = 1 << 16; // 64KB + /// Capacity of each block in the buckets which decides number of + /// allocatable chunks from the block. Each block in the bucket can serve + /// at least BlockCapacity chunks. + /// If ChunkSize * BlockCapacity <= AllocUnit + /// BlockSize = AllocUnit + /// Otherwise, + /// BlockSize = ChunkSize * BlockCapacity + /// This simply means how much memory is over-allocated. + uint32_t BlockCapacity = 0; + /// Total memory allocated from GPU RT for this pool + size_t PoolSize = 0; + /// Maximum allowed pool size. Allocation falls back to GPU RT allocation if + /// when PoolSize reaches PoolSizeMax. + size_t PoolSizeMax = 0; + /// Small allocation size allowed in the pool even if pool size is over the + /// pool size limit + size_t SmallAllocMax = 1024; + /// Small allocation pool size + size_t SmallPoolSize = 0; + /// Small allocation pool size max (4MB) + size_t SmallPoolSizeMax = (4 << 20); + /// List of buckets + std::vector> Buckets; + /// List of bucket parameters + std::vector> BucketParams; + /// Map from allocated pointer to corresponding block. + llvm::DenseMap PtrToBlock; + /// Simple stats counting miss/hit in each bucket. + std::vector> BucketStats; + /// Need to zero-initialize after L0 allocation + bool ZeroInit = false; + + /// Get bucket ID from the specified allocation size. + uint32_t getBucketId(size_t Size) { + uint32_t Count = 0; + for (size_t SZ = AllocMin; SZ < Size; Count++) + SZ <<= 1; + return Count; + } + + public: + MemPoolTy() = default; + + /// Construct pool with allocation kind, allocator, and user options. 
+ MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator, + const L0OptionsTy &Option); + // Used for reduction pool + MemPoolTy(MemAllocatorTy *_Allocator, const L0OptionsTy &Option); + // Used for small memory pool with fixed parameters + MemPoolTy(MemAllocatorTy *_Allocator); + + MemPoolTy(const MemPoolTy &) = delete; + MemPoolTy(MemPoolTy &&) = delete; + MemPoolTy &operator=(const MemPoolTy &) = delete; + MemPoolTy &operator=(const MemPoolTy &&) = delete; + + void printUsage(); + /// Release resources used in the pool. + ~MemPoolTy(); + + /// Allocate the requested size of memory from this pool. + /// AllocSize is the chunk size internally used for the returned memory. + void *alloc(size_t Size, size_t &AllocSize); + /// Deallocate the specified memory and returns block size deallocated. + size_t dealloc(void *Ptr); + }; // MemPoolTy + + /// Allocation information maintained in the plugin + class MemAllocInfoMapTy { + /// Map from allocated pointer to allocation information + std::map Map; + /// Map from target alloc kind to number of implicit arguments + std::map NumImplicitArgs; + + public: + /// Add allocation information to the map + void add(void *Ptr, void *Base, size_t Size, int32_t Kind, + bool InPool = false, bool ImplicitArg = false); + + /// Remove allocation information for the given memory location + bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr); + + /// Finds allocation information for the given memory location + const MemAllocInfoTy *find(void *Ptr) const { + auto AllocInfo = Map.find(Ptr); + if (AllocInfo == Map.end()) + return nullptr; + else + return &AllocInfo->second; + } + + /// Check if the map contains the given pointer and offset + bool contains(const void *Ptr, size_t Size) const { + if (Map.size() == 0) + return false; + auto I = Map.upper_bound(const_cast(Ptr)); + if (I == Map.begin()) + return false; + --I; + bool Ret = (uintptr_t)I->first <= (uintptr_t)Ptr && + (uintptr_t)Ptr + (uintptr_t)Size <= + (uintptr_t)I->first + (uintptr_t)I->second.Size; + return Ret; + } + + /// Returns the number of implicit arguments for the specified allocation + /// kind. 
+    size_t getNumImplicitArgs(int32_t Kind) { return NumImplicitArgs[Kind]; }
+  }; // MemAllocInfoMapTy
+
+  /// L0 context to use
+  const L0ContextTy *L0Context = nullptr;
+  /// L0 device to use
+  L0DeviceTy *Device = nullptr;
+  /// Whether the device supports large memory allocation
+  bool SupportsLargeMem = false;
+  /// Cached max alloc size supported by device
+  uint64_t MaxAllocSize = INT64_MAX;
+  /// Map from allocation kind to memory statistics
+  std::unordered_map Stats;
+  /// Map from allocation kind to memory pool
+  std::unordered_map Pools;
+  /// Memory pool dedicated to reduction scratch space
+  std::unique_ptr ReductionPool;
+  /// Memory pool dedicated to reduction counters
+  std::unique_ptr CounterPool;
+  /// Allocation information map
+  MemAllocInfoMapTy AllocInfo;
+  /// RTL-owned memory that needs to be freed automatically
+  std::vector MemOwned;
+  /// Lock protection
+  std::mutex Mtx;
+  /// Allocator only supports host memory
+  bool IsHostMem = false;
+  // Internal deallocation function to be called when already
+  // holding the Mtx lock
+  Error dealloc_locked(void *Ptr);
+
+public:
+  MemAllocatorTy() = default;
+
+  MemAllocatorTy(const MemAllocatorTy &) = delete;
+  MemAllocatorTy(MemAllocatorTy &&) = delete;
+  MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
+  MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
+
+  /// Release resources and report statistics if requested
+  ~MemAllocatorTy() {
+    if (L0Context)
+      deinit(); // Release resources
+  }
+  void deinit();
+
+  /// Allocator only supports host memory
+  bool supportsHostMem() { return IsHostMem; }
+
+  void initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
+  void initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
+  void updateMaxAllocSize(L0DeviceTy &L0Device);
+
+  /// Allocate memory from L0 GPU RT. We use over-allocation workaround
+  /// to support target pointer with offset, and positive "ActiveSize" is
+  /// specified in such cases for correct debug logging.
+ void *allocL0(size_t Size, size_t Align, int32_t Kind, size_t ActiveSize = 0); + + /// Allocate memory with the specified information from a memory pool + Expected alloc(size_t Size, size_t Align, int32_t Kind, + intptr_t Offset, bool UserAlloc, bool DevMalloc, + uint32_t MemAdvice, AllocOptionTy AllocOpt); + + /// Deallocate memory + Error dealloc(void *Ptr) { + std::lock_guard Lock(Mtx); + return dealloc_locked(Ptr); + } + + /// Check if the given memory location and offset belongs to any allocated + /// memory + bool contains(const void *Ptr, size_t Size) { + std::lock_guard Lock(Mtx); + return AllocInfo.contains(Ptr, Size); + } + + /// Get allocation information for the specified memory location + const MemAllocInfoTy *getAllocInfo(void *Ptr) { + std::lock_guard Lock(Mtx); + return AllocInfo.find(Ptr); + } + + /// Get kernel indirect access flags using implicit argument info + ze_kernel_indirect_access_flags_t getIndirectFlags() { + std::lock_guard Lock(Mtx); + ze_kernel_indirect_access_flags_t Ret = 0; + if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_DEVICE) > 0) + Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE; + if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_HOST) > 0) + Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST; + if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_SHARED) > 0) + Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED; + return Ret; + } + + /// Log memory allocation/deallocation + void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) { + if (Stats.count(Kind) == 0) + return; // Stat is disabled + + auto &ST = Stats[Kind]; + int32_t I = Pool ? 1 : 0; + if (ReqSize > 0) { + ST.Requested[I] += ReqSize; + ST.Allocated[I] += Size; + ST.InUse[I] += Size; + ST.NumAllocs[I]++; + } else { + ST.Freed[I] += Size; + ST.InUse[I] -= Size; + } + ST.PeakUse[I] = (std::max)(ST.PeakUse[I], ST.InUse[I]); + } + + /// Perform copy operation + int32_t enqueueMemCopy(void *Dst, const void *Src, size_t Size); + + /// Perform memory fill operation + int32_t enqueueMemSet(void *Dst, int8_t Value, size_t Size); + +}; /// MemAllocatorTy + +// simple generic wrapper to reuse objects +// objects must have zero argument accessible constructor +template class ObjPool { + // Protection + std::unique_ptr Mtx; + // List of Objects + std::list Objects; + +public: + ObjPool() { Mtx.reset(new std::mutex); } + + ObjPool(const ObjPool &) = delete; + ObjPool(ObjPool &) = delete; + ObjPool &operator=(const ObjPool &) = delete; + ObjPool &operator=(const ObjPool &&) = delete; + + ObjTy *get() { + if (!Objects.empty()) { + std::lock_guard Lock(*Mtx); + if (!Objects.empty()) { + const auto Ret = Objects.back(); + Objects.pop_back(); + return Ret; + } + } + return new ObjTy(); + } + + void release(ObjTy *obj) { + std::lock_guard Lock(*Mtx); + Objects.push_back(obj); + } + + ~ObjPool() { + for (auto object : Objects) + delete object; + } +}; + +/// Common event pool used in the plugin. This event pool assumes all events +/// from the pool are host-visible and use the same event pool flag. +class EventPoolTy { + /// Size of L0 event pool created on demand + size_t PoolSize = 64; + + /// Context of the events + ze_context_handle_t Context = nullptr; + + /// Additional event pool flags common to this pull + uint32_t Flags = 0; + + /// Protection + std::unique_ptr Mtx; + + /// List of created L0 event pools + std::list Pools; + + /// List of free L0 events + std::list Events; + +#ifdef OMPT_SUPPORT + /// Event to OMPT record map. 
The timestamp information is recorded to the + /// OMPT record before the event is recycled. + std::unordered_map EventToRecord; +#endif // OMPT_SUPPORT + +public: + /// Initialize context, flags, and mutex + void init(ze_context_handle_t ContextIn, uint32_t FlagsIn) { + Context = ContextIn; + Flags = FlagsIn; + Mtx.reset(new std::mutex); + } + + /// Destroys L0 resources + void deinit() { + for (auto E : Events) + CALL_ZE_RET_VOID(zeEventDestroy, E); + for (auto P : Pools) + CALL_ZE_RET_VOID(zeEventPoolDestroy, P); + } + + /// Get a free event from the pool + ze_event_handle_t getEvent(); + + /// Return an event to the pool + void releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device); +}; + +/// Staging buffer +/// A single staging buffer is not enough when batching is enabled since there +/// can be multiple pending copy operations. +class StagingBufferTy { + /// Context for L0 calls + ze_context_handle_t Context = nullptr; + /// Max allowed size for staging buffer + size_t Size = L0StagingBufferSize; + /// Number of buffers allocated together + size_t Count = L0StagingBufferCount; + /// Buffers increasing by Count if a new buffer is required + llvm::SmallVector Buffers; + /// Next buffer location in the buffers + size_t Offset = 0; + + void *addBuffers() { + ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, + nullptr, 0}; + void *Ret = nullptr; + size_t AllocSize = Size * Count; + CALL_ZE_RET_NULL(zeMemAllocHost, Context, &AllocDesc, AllocSize, + L0DefaultAlignment, &Ret); + Buffers.push_back(Ret); + return Ret; + } + +public: + StagingBufferTy() = default; + StagingBufferTy(const StagingBufferTy &) = delete; + StagingBufferTy(StagingBufferTy &&) = delete; + StagingBufferTy &operator=(const StagingBufferTy &) = delete; + StagingBufferTy &operator=(const StagingBufferTy &&) = delete; + + ~StagingBufferTy() { + if (initialized()) + clear(); + } + + void clear() { + ze_result_t Rc; + (void)Rc; // GCC build compiler thinks Rc is unused for some reason. + for (auto Ptr : Buffers) + CALL_ZE(Rc, zeMemFree, Context, Ptr); + Context = nullptr; + } + + bool initialized() const { return Context != nullptr; } + + void init(ze_context_handle_t ContextIn, size_t SizeIn, size_t CountIn) { + Context = ContextIn; + Size = SizeIn; + Count = CountIn; + } + + void reset() { Offset = 0; } + + /// Always return the first buffer + void *get() { + if (Size == 0 || Count == 0) + return nullptr; + return Buffers.empty() ? addBuffers() : Buffers.front(); + } + + /// Return the next available buffer + void *getNext() { + void *Ret = nullptr; + if (Size == 0 || Count == 0) + return Ret; + + size_t AllocSize = Size * Count; + bool NeedToGrow = Buffers.empty() || Offset >= Buffers.size() * AllocSize; + if (NeedToGrow) + Ret = addBuffers(); + else + Ret = (void *)((uintptr_t)Buffers.back() + (Offset % AllocSize)); + + if (!Ret) + return nullptr; + + Offset += Size; + return Ret; + } + + /// Return either a fixed buffer or next buffer + void *get(bool Next) { return Next ? 
getNext() : get(); } +}; + +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Options.h b/offload/plugins-nextgen/level_zero/include/L0Options.h new file mode 100644 index 0000000000000..459eef312f076 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Options.h @@ -0,0 +1,161 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Level Zero RTL Options support +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H + +#include + +#include "Shared/EnvironmentVar.h" + +#include "L0Defs.h" + +namespace llvm::omp::target::plugin { +/// Command submission mode +enum class CommandModeTy { Sync = 0, Async, AsyncOrdered }; + +/// Specialization constants used for a module compilation. +class SpecConstantsTy { + std::vector ConstantIds; + std::vector ConstantValues; + BumpPtrAllocator &Allocator; + +public: + SpecConstantsTy(BumpPtrAllocator &Allocator) : Allocator(Allocator) {} + SpecConstantsTy(const SpecConstantsTy &) = delete; + SpecConstantsTy(SpecConstantsTy &&) = delete; + SpecConstantsTy &operator=(const SpecConstantsTy &) = delete; + SpecConstantsTy &operator=(const SpecConstantsTy &&) = delete; + SpecConstantsTy(const SpecConstantsTy &&Other) + : ConstantIds(std::move(Other.ConstantIds)), + ConstantValues(std::move(Other.ConstantValues)), + Allocator(Other.Allocator) {} + ~SpecConstantsTy() {} + + template void addConstant(uint32_t Id, T Val) { + T *ValuePtr = + reinterpret_cast(Allocator.Allocate(sizeof(T), alignof(T))); + *ValuePtr = Val; + + ConstantIds.push_back(Id); + ConstantValues.push_back(reinterpret_cast(ValuePtr)); + } + + ze_module_constants_t getModuleConstants() const { + ze_module_constants_t Tmp{static_cast(ConstantValues.size()), + ConstantIds.data(), + // Unfortunately we have to const_cast it. + // L0 data type should probably be fixed. + const_cast(ConstantValues.data())}; + return Tmp; + } +}; + +/// L0 Plugin flags +struct L0OptionFlagsTy { + uint64_t UseMemoryPool : 1; + uint64_t Reserved : 63; + L0OptionFlagsTy() : UseMemoryPool(1), Reserved(0) {} +}; + +struct L0OptionsTy { + /// Binary flags + L0OptionFlagsTy Flags; + + /// Staging buffer size + size_t StagingBufferSize = L0StagingBufferSize; + + /// Staging buffer count + size_t StagingBufferCount = L0StagingBufferCount; + + // TODO: This should probably be an array indexed by AllocKind + /// Memory pool parameters + /// MemPoolInfo[MemType] = {AllocMax(MB), Capacity, PoolSize(MB)} + std::map> MemPoolInfo = { + {TARGET_ALLOC_DEVICE, {1, 4, 256}}, + {TARGET_ALLOC_HOST, {1, 4, 256}}, + {TARGET_ALLOC_SHARED, {8, 4, 256}}}; + + /// Parameters for memory pools dedicated to reduction scratch space + std::array ReductionPoolInfo{256, 8, 8192}; + + /// Oversubscription rate for normal kernels + uint32_t SubscriptionRate = 4; + + /// Loop kernels with known ND-range may be known to have + /// few iterations and they may not exploit the offload device + /// to the fullest extent. 
+  /// Let's assume a device has N total HW threads available,
+  /// and the kernel requires M hardware threads with LWS set to L.
+  /// If (M < N * ThinThreadsThreshold), then we will try
+  /// to iteratively divide L by 2 to increase the number of HW
+  /// threads used for executing the kernel. Effectively, we will
+  /// end up with L less than the kernel's SIMD width, so the HW
+  /// threads will not use all their SIMD lanes. This (presumably) should
+  /// allow more parallelism, because the stalls in the SIMD lanes
+  /// will be distributed across more HW threads, and the probability
+  /// of having a stall (or a sequence of stalls) on a critical path
+  /// in the kernel should decrease.
+  /// Anyway, this is just a heuristic that seems to work well for some
+  /// kernels (which poorly expose parallelism in the first place).
+  double ThinThreadsThreshold = 0.1;
+
+  // Compilation options for IGC
+  // OpenCL 2.0 builtins (like atomic_load_explicit, etc.) are used by the
+  // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
+  // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
+  // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
+  // builtins.
+  static constexpr std::string_view CompilationOptions = "-cl-std=CL2.0 ";
+  static constexpr std::string_view InternalCompilationOptions =
+      "-cl-take-global-address";
+  std::string UserCompilationOptions = "";
+
+  /// Spec constants used for all modules.
+  SpecConstantsTy CommonSpecConstants;
+
+  /// Command execution mode.
+  /// Whether the runtime uses asynchronous mode or not depends on the type of
+  /// devices and whether immediate command list is fully enabled.
+  CommandModeTy CommandMode = CommandModeTy::Async;
+
+  /// Controls if we need to reduce available HW threads. We need this
+  /// adjustment on XeHPG when Level Zero debug is enabled
+  /// (ZET_ENABLE_PROGRAM_DEBUGGING=1).
+  bool ZeDebugEnabled = false;
+
+  bool Init = false; // have the options already been processed
+
+  // Allocator for long-lived allocations (e.g. spec constants)
+  BumpPtrAllocator Allocator;
+
+  L0OptionsTy() : CommonSpecConstants(Allocator) {}
+
+  /// Read environment variables
+  void processEnvironmentVars();
+
+  void init() {
+    if (!Init) {
+      processEnvironmentVars();
+      Init = true;
+    }
+  }
+
+  bool match(const StringEnvar &Var, const llvm::StringRef Matched) {
+    return Matched.equals_insensitive(Var.get());
+  }
+
+}; // L0OptionsTy
+
+} // namespace llvm::omp::target::plugin
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
diff --git a/offload/plugins-nextgen/level_zero/include/L0Plugin.h b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
new file mode 100644
index 0000000000000..9fbdafa288592
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/include/L0Plugin.h
@@ -0,0 +1,138 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Plugin interface for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H + +#include "AsyncQueue.h" +#include "L0Defs.h" +#include "L0Device.h" +#include "L0Memory.h" +#include "L0Options.h" +#include "L0Program.h" +#include "TLS.h" + +namespace llvm::omp::target::plugin { + +/// Class implementing the LevelZero specific functionalities of the plugin. +class LevelZeroPluginTy final : public GenericPluginTy { +private: + /// Number of devices available including subdevices + uint32_t NumDevices = 0; + + /// Context (and Driver) specific data + std::list ContextList; + + /// L0 device used by each OpenMP device + using DeviceContainerTy = llvm::SmallVector; + DeviceContainerTy L0Devices; + + // Table containing per-thread information using TLS + L0ThreadTblTy ThreadTLSTable; + // Table containing per-thread information for each device using TLS + L0DeviceTLSTableTy DeviceTLSTable; + // Table containing per-thread information for each Context using TLS + L0ContextTLSTableTy ContextTLSTable; + + /// L0 plugin global options + static L0OptionsTy Options; + + std::mutex GlobalMutex; + + /// Common pool of AsyncQueue + AsyncQueuePoolTy AsyncQueuePool; + + auto &getTLS() { return ThreadTLSTable.get(); } + +public: + LevelZeroPluginTy() : GenericPluginTy(getTripleArch()) {} + virtual ~LevelZeroPluginTy() {} + + auto &getDeviceTLS(int32_t DeviceId) { return DeviceTLSTable.get(DeviceId); } + auto &getContextTLS(ze_context_handle_t Context) { + return ContextTLSTable.get(Context); + } + + static const auto &getOptions() { return Options; } + + auto &getGlobalMutex() { return GlobalMutex; } + + struct DevicesRangeTy { + using iterator = DeviceContainerTy::iterator; + + iterator BeginIt; + iterator EndIt; + + DevicesRangeTy(iterator BeginIt, iterator EndIt) + : BeginIt(BeginIt), EndIt(EndIt) {} + + auto &begin() { return BeginIt; } + auto &end() { return EndIt; } + }; + + auto getDevicesRange() { + return DevicesRangeTy(L0Devices.begin(), L0Devices.end()); + } + + /// Clean-up routine to be invoked by the destructor or + /// LevelZeroPluginTy::deinit. + void closeRTL(); + + /// Find L0 devices and initialize device properties. + /// Returns number of devices reported to omptarget. 
+ int32_t findDevices(); + + L0DeviceTy &getDeviceFromId(int32_t DeviceId) const { + assert("Invalid device ID" && DeviceId >= 0 && + DeviceId < static_cast(L0Devices.size())); + return *L0Devices[DeviceId]; + } + + uint32_t getNumRootDevices() const { return NumDevices; } + + AsyncQueueTy *getAsyncQueue() { + auto *Queue = getTLS().getAsyncQueue(); + if (!Queue) + Queue = AsyncQueuePool.get(); + return Queue; + } + + void releaseAsyncQueue(AsyncQueueTy *Queue) { + if (!Queue) + return; + Queue->reset(); + Queue->InUse = false; + if (!getTLS().releaseAsyncQueue(Queue)) + AsyncQueuePool.release(Queue); + } + + // Plugin interface + + Expected initImpl() override; + Error deinitImpl() override; + GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId, + int32_t NumDevices) override; + GenericGlobalHandlerTy *createGlobalHandler() override; + uint16_t getMagicElfBits() const override; + Triple::ArchType getTripleArch() const override; + const char *getName() const override; + Expected isELFCompatible(uint32_t DeviceId, + StringRef Image) const override; + + Error flushQueueImpl(omp_interop_val_t *Interop) override; + Error syncBarrierImpl(omp_interop_val_t *Interop) override; + Error asyncBarrierImpl(omp_interop_val_t *Interop) override; +}; + +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PLUGIN_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Program.h b/offload/plugins-nextgen/level_zero/include/L0Program.h new file mode 100644 index 0000000000000..520bfa688a5af --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Program.h @@ -0,0 +1,136 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Level Zero Program abstraction +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H + +#include "L0Kernel.h" + +namespace llvm::omp::target::plugin { + +class L0DeviceTy; + +/// Program data to be initialized by plugin +struct ProgramDataTy { + int Initialized = 0; + int NumDevices = 0; + int DeviceNum = -1; + uint32_t TotalEUs = 0; + uint32_t HWThreadsPerEU = 0; + uintptr_t DynamicMemoryLB = 0; + uintptr_t DynamicMemoryUB = 0; + int DeviceType = 0; + void *DynamicMemPool = nullptr; + int TeamsThreadLimit = 0; +}; + +/// Level Zero program that can contain multiple modules. +class L0ProgramTy : public DeviceImageTy { + /// Handle multiple modules within a single target image + llvm::SmallVector Modules; + + /// Map of kernel names to Modules + std::unordered_map KernelsToModuleMap; + + /// List of kernels built for this image + /// We need to delete them ourselves as the main library is not doing + /// that right now + std::list Kernels; + + /// Module that contains global data including device RTL + ze_module_handle_t GlobalModule = nullptr; + + /// Requires module link + bool RequiresModuleLink = false; + + /// Is this module library + bool IsLibModule = false; + + /// Build a single module with the given image, build option, and format. 
+ int32_t addModule(const size_t Size, const uint8_t *Image, + const std::string_view BuildOption, + ze_module_format_t Format); + /// Read file and return the size of the binary if successful. + size_t readFile(const char *FileName, std::vector &OutFile) const; + void replaceDriverOptsWithBackendOpts(const L0DeviceTy &Device, + std::string &Options) const; + + /// Check if the image should be handled as a library module + void setLibModule(); + + L0DeviceTy &getL0Device() const; + +public: + L0ProgramTy() = delete; + + L0ProgramTy(int32_t ImageId, GenericDeviceTy &Device, + std::unique_ptr Image) + : DeviceImageTy(ImageId, Device, std::move(Image)) {} + + ~L0ProgramTy(); + + L0ProgramTy(const L0ProgramTy &other) = delete; + L0ProgramTy(L0ProgramTy &&) = delete; + L0ProgramTy &operator=(const L0ProgramTy &) = delete; + L0ProgramTy &operator=(const L0ProgramTy &&) = delete; + + static L0ProgramTy &makeL0Program(DeviceImageTy &Device) { + return static_cast(Device); + } + + /// Build modules from the target image description + int32_t buildModules(const std::string_view BuildOptions); + + /// Link modules stored in \p Modules. + int32_t linkModules(); + + /// Loads the kernels names from all modules + int32_t loadModuleKernels(); + + /// Read data from the location in the device image which corresponds to the + /// specified global variable name. + int32_t readGlobalVariable(const char *Name, size_t Size, void *HostPtr); + + /// Write data to the location in the device image which corresponds to the + /// specified global variable name. + int32_t writeGlobalVariable(const char *Name, size_t Size, + const void *HostPtr); + + /// Looks up an OpenMP declare target global variable with the given + /// \p Name and \p Size in the device environment for the current device. + /// The lookup is first done via the device offload table. If it fails, + /// then the lookup falls back to non-OpenMP specific lookup on the device. + void *getOffloadVarDeviceAddr(const char *Name) const; + + /// Returns the handle of a module that contains a given Kernel name + ze_module_handle_t findModuleFromKernelName(const char *KernelName) const { + auto K = KernelsToModuleMap.find(std::string(KernelName)); + if (K == KernelsToModuleMap.end()) + return nullptr; + + return K->second; + } + + void addKernel(L0KernelTy *Kernel) { Kernels.push_back(Kernel); } +}; + +struct L0GlobalHandlerTy final : public GenericGlobalHandlerTy { + Error getGlobalMetadataFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &DeviceGlobal) override; +}; + +bool isValidOneOmpImage(StringRef Image, uint64_t &MajorVer, + uint64_t &MinorVer); +} // namespace llvm::omp::target::plugin + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0PROGRAM_H diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h new file mode 100644 index 0000000000000..0faa76171cbc9 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h @@ -0,0 +1,189 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Code for tracing L0 +// +//===----------------------------------------------------------------------===// +// clang-format off +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H + +#include "Shared/Debug.h" +#include "omptarget.h" +#include +#include + +#define STR(x) #x +#define TO_STRING(x) STR(x) + +#define DPCALL(...) \ + do { \ + if (getDebugLevel() > 1) \ + DP(__VA_ARGS__); \ + } while (0) + +#define WARNING(...) \ + do { \ + fprintf(stderr, "%s --> ", DEBUG_PREFIX); \ + fprintf(stderr, "Warning: " __VA_ARGS__); \ + } while (0) + +#define INVALID_OPTION(Name, Value) \ + WARNING("Ignoring invalid option " #Name "=%s\n", Value) + +#define CALL_ZE(Rc, Fn, ...) \ + do { \ + Rc = Fn(__VA_ARGS__); \ + } while (0) + +#define CALL_ZE_RC(Rc, Fn, ...) \ + do { \ + CALL_ZE(Rc, Fn, __VA_ARGS__); \ + if (Rc != ZE_RESULT_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, Rc, \ + getZeErrorName(Rc)); \ + } \ + } while(0) + +/// For non-thread-safe functions +#define CALL_ZE_RET_MTX(Ret, Fn, Mtx, ...) \ + do { \ + Mtx.lock(); \ + ze_result_t rc; \ + CALL_ZE(rc, Fn, __VA_ARGS__); \ + Mtx.unlock(); \ + if (rc != ZE_RESULT_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \ + getZeErrorName(rc)); \ + return Ret; \ + } \ + } while (0) + +#define CALL_ZE_RET_FAIL_MTX(Fn, Mtx, ...) \ + CALL_ZE_RET_MTX(OFFLOAD_FAIL, Fn, Mtx, __VA_ARGS__) +#define CALL_ZE_RET_NULL_MTX(Fn, Mtx, ...) \ + CALL_ZE_RET_MTX(NULL, Fn, Mtx, __VA_ARGS__) +#define CALL_ZE_RET_ZERO_MTX(Fn, Mtx, ...) \ + CALL_ZE_RET_MTX(0, Fn, Mtx, __VA_ARGS__) + +/// For thread-safe functions +#define CALL_ZE_RET(Ret, Fn, ...) \ + do { \ + ze_result_t rc; \ + CALL_ZE(rc, Fn, __VA_ARGS__); \ + if (rc != ZE_RESULT_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \ + getZeErrorName(rc)); \ + return Ret; \ + } \ + } while (0) + +#define CALL_ZE_RET_FAIL(Fn, ...) CALL_ZE_RET(OFFLOAD_FAIL, Fn, __VA_ARGS__) +#define CALL_ZE_RET_NULL(Fn, ...) CALL_ZE_RET(NULL, Fn, __VA_ARGS__) +#define CALL_ZE_RET_ZERO(Fn, ...) CALL_ZE_RET(0, Fn, __VA_ARGS__) +#define CALL_ZE_RET_VOID(Fn, ...) CALL_ZE_RET(, Fn, __VA_ARGS__) +#define CALL_ZE_RET_ERROR(Fn, ...) \ + CALL_ZE_RET( \ + Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s", \ + STR(Fn), rc, getZeErrorName(rc)), Fn, __VA_ARGS__) + + + +#define CALL_ZE_RET_FAIL_MSG(Fn, Dev, ...) \ + do { \ + ze_result_t rc; \ + CALL_ZE(rc, Fn, __VA_ARGS__); \ + if (rc != ZE_RESULT_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \ + getZeErrorName(rc)); \ + const char *err_str = nullptr; \ + rc = zeDriverGetLastErrorDescription( \ + Dev.getDriverHandle(), &err_str); \ + fprintf(stderr, "Error: %s:%s failed with %s\n", __func__, #Fn, \ + err_str); \ + } \ + } while (0) + +#define CALL_ZE_EXIT_FAIL(Fn, ...) \ + do { \ + ze_result_t rc; \ + CALL_ZE(rc, Fn, __VA_ARGS__); \ + if (rc != ZE_RESULT_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, rc, \ + getZeErrorName(rc)); \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CALL_ZE_EXT_SILENT_RET(Device, Ret, Name, ...) 
\ + do { \ + ze_result_t rc; \ + CALL_ZE_EXT_SILENT(Device, rc, Name, __VA_ARGS__); \ + if (rc != ZE_RESULT_SUCCESS) \ + return Ret; \ + } while (0) + + +#define CALL_ZE_EXT_RET_ERROR(Device, Name, ...) \ + CALL_ZE_EXT_SILENT_RET(Device, \ + Plugin::error(ErrorCode::UNKNOWN, "%s failed with code %d, %s", \ + STR(Name), rc, getZeErrorName(rc)), Name, __VA_ARGS__) + +#define FOREACH_ZE_ERROR_CODE(Fn) \ + Fn(ZE_RESULT_SUCCESS) \ + Fn(ZE_RESULT_NOT_READY) \ + Fn(ZE_RESULT_ERROR_DEVICE_LOST) \ + Fn(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY) \ + Fn(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) \ + Fn(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) \ + Fn(ZE_RESULT_ERROR_MODULE_LINK_FAILURE) \ + Fn(ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET) \ + Fn(ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE) \ + Fn(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS) \ + Fn(ZE_RESULT_ERROR_NOT_AVAILABLE) \ + Fn(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE) \ + Fn(ZE_RESULT_WARNING_DROPPED_DATA) \ + Fn(ZE_RESULT_ERROR_UNINITIALIZED) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_VERSION) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) \ + Fn(ZE_RESULT_ERROR_INVALID_ARGUMENT) \ + Fn(ZE_RESULT_ERROR_INVALID_NULL_HANDLE) \ + Fn(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE) \ + Fn(ZE_RESULT_ERROR_INVALID_NULL_POINTER) \ + Fn(ZE_RESULT_ERROR_INVALID_SIZE) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_SIZE) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT) \ + Fn(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT) \ + Fn(ZE_RESULT_ERROR_INVALID_ENUMERATION) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION) \ + Fn(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT) \ + Fn(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY) \ + Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME) \ + Fn(ZE_RESULT_ERROR_INVALID_KERNEL_NAME) \ + Fn(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) \ + Fn(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION) \ + Fn(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION) \ + Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX) \ + Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE) \ + Fn(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE) \ + Fn(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED) \ + Fn(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE) \ + Fn(ZE_RESULT_ERROR_OVERLAPPING_REGIONS) \ + Fn(ZE_RESULT_WARNING_ACTION_REQUIRED) \ + Fn(ZE_RESULT_ERROR_UNKNOWN) + +#define CASE_TO_STRING(Num) case Num: return #Num; +inline const char *getZeErrorName(int32_t Error) { + switch (Error) { + FOREACH_ZE_ERROR_CODE(CASE_TO_STRING) + default: + return "ZE_RESULT_ERROR_UNKNOWN"; + } +} + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0TRACE_H diff --git a/offload/plugins-nextgen/level_zero/include/TLS.h b/offload/plugins-nextgen/level_zero/include/TLS.h new file mode 100644 index 0000000000000..257ada0b33b37 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/include/TLS.h @@ -0,0 +1,82 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Thread Level Storage abstraction +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H + +#include "AsyncQueue.h" +#include "L0Memory.h" +#include "L0Trace.h" +#include "PerThreadTable.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +/// All thread-local data used by the Plugin +class L0ThreadTLSTy { + /// Async info tracking + static constexpr int32_t PerThreadQueues = 10; + AsyncQueueTy AsyncQueues[PerThreadQueues]; + int32_t UsedQueues = 0; + +public: + L0ThreadTLSTy() = default; + L0ThreadTLSTy(const L0ThreadTLSTy &) = delete; + L0ThreadTLSTy(L0ThreadTLSTy &&) = delete; + L0ThreadTLSTy &operator=(const L0ThreadTLSTy &) = delete; + L0ThreadTLSTy &operator=(const L0ThreadTLSTy &&) = delete; + ~L0ThreadTLSTy() {} + + void clear() {} + + AsyncQueueTy *getAsyncQueue() { + AsyncQueueTy *ret = nullptr; + if (UsedQueues < PerThreadQueues) { + // there's a free queue in this thread, find it + for (int32_t q = 0; q < PerThreadQueues; q++) { + if (!AsyncQueues[q].InUse) { + UsedQueues++; + ret = &AsyncQueues[q]; + break; + } + } + assert(ret && "A queue should have been found!"); + ret->InUse = true; + } + return ret; + } + + bool releaseAsyncQueue(AsyncQueueTy *queue) { + if (queue >= &AsyncQueues[0] && queue < &AsyncQueues[PerThreadQueues]) { + // it's a local queue + queue->InUse = false; + UsedQueues--; + return true; + } + return false; + } +}; + +struct L0ThreadTblTy : public PerThread { + void clear() { + PerThread::clear([](auto &Entry) { Entry.clear(); }); + } +}; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_TLS_H diff --git a/offload/plugins-nextgen/level_zero/src/L0Context.cpp b/offload/plugins-nextgen/level_zero/src/L0Context.cpp new file mode 100644 index 0000000000000..3f50ffd2a7260 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/src/L0Context.cpp @@ -0,0 +1,41 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Level Zero Context abstraction
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Context.h"
+#include "L0Plugin.h"
+
+namespace llvm::omp::target::plugin {
+
+L0ContextTy::L0ContextTy(LevelZeroPluginTy &Plugin, ze_driver_handle_t zeDriver,
+                         int32_t /*DriverId*/)
+    : Plugin(Plugin), zeDriver(zeDriver) {
+  CALL_ZE_RET_VOID(zeDriverGetApiVersion, zeDriver, &APIVersion);
+  DP("Driver API version is %" PRIx32 "\n", APIVersion);
+
+  ze_context_desc_t Desc{ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+  CALL_ZE_RET_VOID(zeContextCreate, zeDriver, &Desc, &zeContext);
+
+  EventPool.init(zeContext, 0);
+  HostMemAllocator.initHostPool(*this, Plugin.getOptions());
+}
+
+StagingBufferTy &L0ContextTy::getStagingBuffer() {
+  auto &TLS = Plugin.getContextTLS(getZeContext());
+  auto &Buffer = TLS.getStagingBuffer();
+  const auto &Options = Plugin.getOptions();
+  if (!Buffer.initialized())
+    Buffer.init(getZeContext(), Options.StagingBufferSize,
+                Options.StagingBufferCount);
+  return Buffer;
+}
+
+} // namespace llvm::omp::target::plugin
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
new file mode 100644
index 0000000000000..715de0d1b3c12
--- /dev/null
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -0,0 +1,1079 @@
+//===--- Level Zero Target RTL Implementation -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GenericDevice instantiation for SPIR-V/Xe machine
+//
+//===----------------------------------------------------------------------===//
+
+#include "L0Device.h"
+#include "L0Defs.h"
+#include "L0Interop.h"
+#include "L0Plugin.h"
+#include "L0Program.h"
+#include "L0Trace.h"
+
+namespace llvm::omp::target::plugin {
+
+L0DeviceTLSTy &L0DeviceTy::getTLS() {
+  return getPlugin().getDeviceTLS(getDeviceId());
+}
+
+// clang-format off
+/// Mapping from device arch to GPU runtime's device identifiers
+static struct {
+  DeviceArchTy arch;
+  PCIIdTy ids[10];
+} DeviceArchMap[] = {{DeviceArchTy::DeviceArch_Gen,
+                      {PCIIdTy::SKL,
+                       PCIIdTy::KBL,
+                       PCIIdTy::CFL, PCIIdTy::CFL_2,
+                       PCIIdTy::ICX,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Gen,
+                      {PCIIdTy::TGL, PCIIdTy::TGL_2,
+                       PCIIdTy::DG1,
+                       PCIIdTy::RKL,
+                       PCIIdTy::ADLS,
+                       PCIIdTy::RTL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeLPG,
+                      {PCIIdTy::MTL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeHPC,
+                      {PCIIdTy::PVC,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_XeHPG,
+                      {PCIIdTy::DG2_ATS_M,
+                       PCIIdTy::DG2_ATS_M_2,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Xe2LP,
+                      {PCIIdTy::LNL,
+                       PCIIdTy::None}},
+                     {DeviceArchTy::DeviceArch_Xe2HP,
+                      {PCIIdTy::BMG,
+                       PCIIdTy::None}},
+};
+constexpr int DeviceArchMapSize = sizeof(DeviceArchMap) / sizeof(DeviceArchMap[0]);
+// clang-format on
+
+DeviceArchTy L0DeviceTy::computeArch() const {
+  const auto PCIDeviceId = getPCIId();
+  if (PCIDeviceId != 0) {
+    for (int ArchIndex = 0; ArchIndex < DeviceArchMapSize; ArchIndex++) {
+      for (int i = 0;; i++) {
+        const auto Id = DeviceArchMap[ArchIndex].ids[i];
+        if (Id == PCIIdTy::None)
+          break;
+
+        auto
maskedId = static_cast(PCIDeviceId & 0xFF00); + if (maskedId == Id) + return DeviceArchMap[ArchIndex].arch; // Exact match or prefix match + } + } + } + + DP("Warning: Cannot decide device arch for %s.\n", getNameCStr()); + return DeviceArchTy::DeviceArch_None; +} + +bool L0DeviceTy::isDeviceIPorNewer(uint32_t Version) const { + ze_device_ip_version_ext_t IPVersion{}; + IPVersion.stype = ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT; + IPVersion.pNext = nullptr; + ze_device_properties_t DevicePR{}; + DevicePR.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + DevicePR.pNext = &IPVersion; + CALL_ZE_RET(false, zeDeviceGetProperties, zeDevice, &DevicePR); + return IPVersion.ipVersion >= Version; +} + +/// Get default compute group ordinal. Returns Ordinal-NumQueues pair +std::pair L0DeviceTy::findComputeOrdinal() { + std::pair Ordinal{UINT32_MAX, 0}; + uint32_t Count = 0; + const auto zeDevice = getZeDevice(); + CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count, + nullptr); + ze_command_queue_group_properties_t Init{ + ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0}; + std::vector Properties(Count, Init); + CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count, + Properties.data()); + for (uint32_t I = 0; I < Count; I++) { + // TODO: add a separate set of ordinals for compute queue groups which + // support cooperative kernels + if (Properties[I].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { + Ordinal.first = I; + Ordinal.second = Properties[I].numQueues; + break; + } + } + if (Ordinal.first == UINT32_MAX) + DP("Error: no command queues are found\n"); + + return Ordinal; +} + +/// Get copy command queue group ordinal. Returns Ordinal-NumQueues pair +std::pair L0DeviceTy::findCopyOrdinal(bool LinkCopy) { + std::pair Ordinal{UINT32_MAX, 0}; + uint32_t Count = 0; + const auto zeDevice = getZeDevice(); + CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count, + nullptr); + ze_command_queue_group_properties_t Init{ + ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, nullptr, 0, 0, 0}; + std::vector Properties(Count, Init); + CALL_ZE_RET(Ordinal, zeDeviceGetCommandQueueGroupProperties, zeDevice, &Count, + Properties.data()); + + for (uint32_t I = 0; I < Count; I++) { + const auto &Flags = Properties[I].flags; + if ((Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) && + (Flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) { + auto NumQueues = Properties[I].numQueues; + if (LinkCopy && NumQueues > 1) { + Ordinal = {I, NumQueues}; + DP("Found link copy command queue for device " DPxMOD + ", ordinal = %" PRIu32 ", number of queues = %" PRIu32 "\n", + DPxPTR(zeDevice), Ordinal.first, Ordinal.second); + break; + } else if (!LinkCopy && NumQueues == 1) { + Ordinal = {I, NumQueues}; + DP("Found copy command queue for device " DPxMOD ", ordinal = %" PRIu32 + "\n", + DPxPTR(zeDevice), Ordinal.first); + break; + } + } + } + return Ordinal; +} + +void L0DeviceTy::reportDeviceInfo() const { + DP("Device %" PRIu32 "\n", DeviceId); + DP("-- Name : %s\n", getNameCStr()); + DP("-- PCI ID : 0x%" PRIx32 "\n", getPCIId()); + DP("-- UUID : %s\n", getUuid().data()); + DP("-- Number of total EUs : %" PRIu32 "\n", getNumEUs()); + DP("-- Number of threads per EU : %" PRIu32 "\n", getNumThreadsPerEU()); + DP("-- EU SIMD width : %" PRIu32 "\n", getSIMDWidth()); + DP("-- Number of EUs per subslice : %" PRIu32 "\n", getNumEUsPerSubslice()); + DP("-- Number of subslices per slice: %" PRIu32 "\n", + 
getNumSubslicesPerSlice()); + DP("-- Number of slices : %" PRIu32 "\n", getNumSlices()); + DP("-- Local memory size (bytes) : %" PRIu32 "\n", + getMaxSharedLocalMemory()); + DP("-- Global memory size (bytes) : %" PRIu64 "\n", getGlobalMemorySize()); + DP("-- Cache size (bytes) : %" PRIu64 "\n", getCacheSize()); + DP("-- Max clock frequency (MHz) : %" PRIu32 "\n", getClockRate()); +} + +Error L0DeviceTy::internalInit() { + const auto &Options = getPlugin().getOptions(); + + uint32_t Count = 1; + const auto zeDevice = getZeDevice(); + CALL_ZE_RET_ERROR(zeDeviceGetProperties, zeDevice, &DeviceProperties); + CALL_ZE_RET_ERROR(zeDeviceGetComputeProperties, zeDevice, &ComputeProperties); + CALL_ZE_RET_ERROR(zeDeviceGetMemoryProperties, zeDevice, &Count, + &MemoryProperties); + CALL_ZE_RET_ERROR(zeDeviceGetCacheProperties, zeDevice, &Count, + &CacheProperties); + + DeviceName = + std::string(DeviceProperties.name, sizeof(DeviceProperties.name)); + + DP("Found a GPU device, Name = %s\n", DeviceProperties.name); + + DeviceArch = computeArch(); + // Default allocation kind for this device + AllocKind = isDiscreteDevice() ? TARGET_ALLOC_DEVICE : TARGET_ALLOC_SHARED; + + ze_kernel_indirect_access_flags_t Flags = + (AllocKind == TARGET_ALLOC_DEVICE) + ? ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE + : ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED; + IndirectAccessFlags = Flags; + + // Get the UUID + std::string uid = ""; + for (int n = 0; n < ZE_MAX_DEVICE_UUID_SIZE; n++) + uid += std::to_string(DeviceProperties.uuid.id[n]); + DeviceUuid = std::move(uid); + + ComputeOrdinal = findComputeOrdinal(); + + CopyOrdinal = findCopyOrdinal(); + + LinkCopyOrdinal = findCopyOrdinal(true); + IsAsyncEnabled = + isDiscreteDevice() && Options.CommandMode != CommandModeTy::Sync; + MemAllocator.initDevicePools(*this, getPlugin().getOptions()); + l0Context.getHostMemAllocator().updateMaxAllocSize(*this); + return Plugin::success(); +} + +Error L0DeviceTy::initImpl(GenericPluginTy &Plugin) { + return Plugin::success(); +} + +int32_t L0DeviceTy::synchronize(__tgt_async_info *AsyncInfo, + bool ReleaseQueue) { + bool IsAsync = AsyncInfo && asyncEnabled(); + if (!IsAsync) + return OFFLOAD_SUCCESS; + + auto &Plugin = getPlugin(); + + AsyncQueueTy *AsyncQueue = (AsyncQueueTy *)AsyncInfo->Queue; + + if (!AsyncQueue->WaitEvents.empty()) { + const auto &WaitEvents = AsyncQueue->WaitEvents; + if (Plugin.getOptions().CommandMode == CommandModeTy::AsyncOrdered) { + // Only need to wait for the last event + CALL_ZE_RET_FAIL(zeEventHostSynchronize, WaitEvents.back(), UINT64_MAX); + // Synchronize on kernel event to support printf() + auto KE = AsyncQueue->KernelEvent; + if (KE && KE != WaitEvents.back()) { + CALL_ZE_RET_FAIL(zeEventHostSynchronize, KE, UINT64_MAX); + } + for (auto &Event : WaitEvents) { + releaseEvent(Event); + } + } else { // Async + // Wait for all events. We should wait and reset events in reverse order + // to avoid premature event reset. If we have a kernel event in the + // queue, it is the last event to wait for since all wait events of the + // kernel are signaled before the kernel is invoked. We always invoke + // synchronization on kernel event to support printf(). 
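// As an illustration of the reverse-order logic below (event names are
// illustrative, not from the patch): with WaitEvents = {E0, E1, K, E2} and K
// being the kernel event, the loop synchronizes on E2 and then on K and stops
// synchronizing there; E0 and E1 are only released, since the kernel already
// waited on them before it was launched. Every event is handed back through
// releaseEvent().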
+ bool WaitDone = false; + for (auto Itr = WaitEvents.rbegin(); Itr != WaitEvents.rend(); Itr++) { + if (!WaitDone) { + CALL_ZE_RET_FAIL(zeEventHostSynchronize, *Itr, UINT64_MAX); + if (*Itr == AsyncQueue->KernelEvent) + WaitDone = true; + } + releaseEvent(*Itr); + } + } + } + + // Commit delayed USM2M copies + for (auto &USM2M : AsyncQueue->USM2MList) { + std::copy_n(static_cast(std::get<0>(USM2M)), + std::get<2>(USM2M), static_cast(std::get<1>(USM2M))); + } + // Commit delayed H2M copies + for (auto &H2M : AsyncQueue->H2MList) { + std::copy_n(static_cast(std::get<0>(H2M)), std::get<2>(H2M), + static_cast(std::get<1>(H2M))); + } + if (ReleaseQueue) { + Plugin.releaseAsyncQueue(AsyncQueue); + getStagingBuffer().reset(); + AsyncInfo->Queue = nullptr; + } + return OFFLOAD_SUCCESS; +} + +int32_t L0DeviceTy::submitData(void *TgtPtr, const void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfo) { + if (Size == 0) + return OFFLOAD_SUCCESS; + + auto &Plugin = getPlugin(); + + const auto DeviceId = getDeviceId(); + bool IsAsync = AsyncInfo && asyncEnabled(); + if (IsAsync && !AsyncInfo->Queue) { + AsyncInfo->Queue = reinterpret_cast(Plugin.getAsyncQueue()); + if (!AsyncInfo->Queue) + IsAsync = false; // Couldn't get a queue, revert to sync + } + const auto TgtPtrType = getMemAllocType(TgtPtr); + if (TgtPtrType == ZE_MEMORY_TYPE_SHARED || + TgtPtrType == ZE_MEMORY_TYPE_HOST) { + std::copy_n(static_cast(HstPtr), Size, + static_cast(TgtPtr)); + } else { + const void *SrcPtr = HstPtr; + if (isDiscreteDevice() && + static_cast(Size) <= Plugin.getOptions().StagingBufferSize && + getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) { + SrcPtr = getStagingBuffer().get(IsAsync); + std::copy_n(static_cast(HstPtr), Size, + static_cast(const_cast(SrcPtr))); + } + int32_t RC; + if (IsAsync) + RC = enqueueMemCopyAsync(TgtPtr, SrcPtr, Size, AsyncInfo); + else + RC = enqueueMemCopy(TgtPtr, SrcPtr, Size, AsyncInfo); + if (RC != OFFLOAD_SUCCESS) + return RC; + } + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "%s %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(HstPtr), + DPxPTR(TgtPtr)); + + return OFFLOAD_SUCCESS; +} + +int32_t L0DeviceTy::retrieveData(void *HstPtr, const void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfo) { + if (Size == 0) + return OFFLOAD_SUCCESS; + + auto &Plugin = getPlugin(); + const auto DeviceId = getDeviceId(); + bool IsAsync = AsyncInfo && asyncEnabled(); + if (IsAsync && !AsyncInfo->Queue) { + AsyncInfo->Queue = Plugin.getAsyncQueue(); + if (!AsyncInfo->Queue) + IsAsync = false; // Couldn't get a queue, revert to sync + } + auto AsyncQueue = + IsAsync ? static_cast(AsyncInfo->Queue) : nullptr; + auto TgtPtrType = getMemAllocType(TgtPtr); + if (TgtPtrType == ZE_MEMORY_TYPE_HOST || + TgtPtrType == ZE_MEMORY_TYPE_SHARED) { + bool CopyNow = true; + if (IsAsync) { + if (AsyncQueue->KernelEvent) { + // Delay Host/Shared USM to host memory copy since it must wait for + // kernel completion. 
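// (The delayed USM2M entries recorded here are committed later, in
// synchronize() above or in queryAsyncImpl(), once the kernel event has
// completed.)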
+ AsyncQueue->USM2MList.emplace_back(TgtPtr, HstPtr, Size); + CopyNow = false; + } + } + if (CopyNow) { + std::copy_n(static_cast(TgtPtr), Size, + static_cast(HstPtr)); + } + } else { + void *DstPtr = HstPtr; + if (isDiscreteDevice() && + static_cast(Size) <= + getPlugin().getOptions().StagingBufferSize && + getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) { + DstPtr = getStagingBuffer().get(IsAsync); + } + int32_t RC; + if (IsAsync) + RC = enqueueMemCopyAsync(DstPtr, TgtPtr, Size, AsyncInfo, + /* CopyTo */ false); + else + RC = enqueueMemCopy(DstPtr, TgtPtr, Size, AsyncInfo); + if (RC != OFFLOAD_SUCCESS) + return RC; + if (DstPtr != HstPtr) { + if (IsAsync) { + // Store delayed H2M data copies + auto &H2MList = AsyncQueue->H2MList; + H2MList.emplace_back(DstPtr, HstPtr, static_cast(Size)); + } else { + std::copy_n(static_cast(DstPtr), Size, + static_cast(HstPtr)); + } + } + } + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "%s %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + IsAsync ? "Submitted copy" : "Copied", Size, DPxPTR(TgtPtr), + DPxPTR(HstPtr)); + + return OFFLOAD_SUCCESS; +} + +Expected +L0DeviceTy::loadBinaryImpl(std::unique_ptr &&TgtImage, + int32_t ImageId) { + auto *PGM = getProgramFromImage(TgtImage->getMemBufferRef()); + if (PGM) { + // Program already exists + return PGM; + } + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(), + "Device %" PRId32 ": Loading binary from " DPxMOD "\n", getDeviceId(), + DPxPTR(TgtImage->getBufferStart())); + + const auto &Options = getPlugin().getOptions(); + std::string CompilationOptions(Options.CompilationOptions); + CompilationOptions += " " + Options.UserCompilationOptions; + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(), + "Base L0 module compilation options: %s\n", CompilationOptions.c_str()); + + CompilationOptions += " "; + CompilationOptions += Options.InternalCompilationOptions; + auto &Program = addProgram(ImageId, std::move(TgtImage)); + + int32_t RC = Program.buildModules(CompilationOptions); + if (RC != OFFLOAD_SUCCESS) + return Plugin::check(RC, "Error in buildModules %d", RC); + + RC = Program.linkModules(); + if (RC != OFFLOAD_SUCCESS) + return Plugin::check(RC, "Error in linkModules %d", RC); + + RC = Program.loadModuleKernels(); + if (RC != OFFLOAD_SUCCESS) + return Plugin::check(RC, "Error in buildKernels %d", RC); + + return &Program; +} + +Error L0DeviceTy::unloadBinaryImpl(DeviceImageTy *Image) { + // Ignoring for now + // TODO: call properly L0Program unload + return Plugin::success(); +} + +Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo, + bool ReleaseQueue) { + if (!ReleaseQueue) { + return Plugin::error(ErrorCode::UNIMPLEMENTED, + "Support for ReleaseQueue=false in %s" + " not implemented yet\n", + __func__); + } + int32_t RC = synchronize(&AsyncInfo, ReleaseQueue); + return Plugin::check(RC, "Error in synchronizeImpl %d", RC); +} + +Expected +L0DeviceTy::hasPendingWorkImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) { + auto &AsyncInfo = *static_cast<__tgt_async_info *>(AsyncInfoWrapper); + const bool IsAsync = AsyncInfo.Queue && asyncEnabled(); + if (!IsAsync) + return false; + + auto *AsyncQueue = static_cast(AsyncInfo.Queue); + + if (AsyncQueue->WaitEvents.empty()) + return false; + + return true; +} + +Error L0DeviceTy::queryAsyncImpl(__tgt_async_info &AsyncInfo) { + const bool IsAsync = AsyncInfo.Queue && asyncEnabled(); + if (!IsAsync) + return Plugin::success(); + + auto &Plugin = getPlugin(); + auto *AsyncQueue = static_cast(AsyncInfo.Queue); + + if (!AsyncQueue->WaitEvents.empty()) + 
return Plugin::success(); + + // Commit delayed USM2M copies + for (auto &USM2M : AsyncQueue->USM2MList) { + std::copy_n(static_cast(std::get<0>(USM2M)), + std::get<2>(USM2M), static_cast(std::get<1>(USM2M))); + } + // Commit delayed H2M copies + for (auto &H2M : AsyncQueue->H2MList) { + std::copy_n(static_cast(std::get<0>(H2M)), std::get<2>(H2M), + static_cast(std::get<1>(H2M))); + } + Plugin.releaseAsyncQueue(AsyncQueue); + getStagingBuffer().reset(); + AsyncInfo.Queue = nullptr; + + return Plugin::success(); +} + +Expected L0DeviceTy::allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind) { + return dataAlloc(Size, /*Align=*/0, Kind, + /*Offset=*/0, /*UserAlloc=*/HstPtr == nullptr, + /*DevMalloc=*/false); +} + +Error L0DeviceTy::free(void *TgtPtr, TargetAllocTy Kind) { + return dataDelete(TgtPtr); +} + +Error L0DeviceTy::dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) { + int32_t RC = submitData(TgtPtr, HstPtr, Size, AsyncInfoWrapper); + return Plugin::check(RC, "Error in dataSubmitImpl %d", RC); +} + +Error L0DeviceTy::dataRetrieveImpl(void *HstPtr, const void *TgtPtr, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) { + int32_t RC = retrieveData(HstPtr, TgtPtr, Size, AsyncInfoWrapper); + return Plugin::check(RC, "Error in dataRetrieveImpl %d", RC); +} + +Error L0DeviceTy::dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) { + + L0DeviceTy &L0DstDev = L0DeviceTy::makeL0Device(DstDev); + // Use copy engine only for across-tile/device copies. + const bool UseCopyEngine = getZeDevice() != L0DstDev.getZeDevice(); + + if (asyncEnabled() && AsyncInfoWrapper.hasQueue()) { + if (enqueueMemCopyAsync(DstPtr, SrcPtr, Size, + (__tgt_async_info *)AsyncInfoWrapper)) + return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed"); + } else { + if (enqueueMemCopy(DstPtr, SrcPtr, Size, + /* AsyncInfo */ nullptr, UseCopyEngine)) + return Plugin::error(ErrorCode::UNKNOWN, "dataExchangeImpl failed"); + } + return Plugin::success(); +} + +Error L0DeviceTy::initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) { + AsyncQueueTy *Queue = AsyncInfoWrapper.getQueueAs(); + if (!Queue) { + Queue = getPlugin().getAsyncQueue(); + AsyncInfoWrapper.setQueueAs(Queue); + } + return Plugin::success(); +} + +Error L0DeviceTy::initDeviceInfoImpl(__tgt_device_info *Info) { + if (!Info->Context) + Info->Context = getZeContext(); + if (!Info->Device) + Info->Device = reinterpret_cast(getZeDevice()); + return Plugin::success(); +} + +static const char *DriverVersionToStrTable[] = { + "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", + "1.7", "1.8", "1.9", "1.10", "1.11", "1.12"}; +constexpr size_t DriverVersionToStrTableSize = + sizeof(DriverVersionToStrTable) / sizeof(DriverVersionToStrTable[0]); + +Expected L0DeviceTy::obtainInfoImpl() { + InfoTreeNode Info; + Info.add("Device Number", getDeviceId()); + Info.add("Device Name", getNameCStr(), "", DeviceInfo::NAME); + Info.add("Device Type", "GPU", "", DeviceInfo::TYPE); + Info.add("Vendor", "Intel", "", DeviceInfo::VENDOR); + Info.add("Vendor ID", getVendorId(), "", DeviceInfo::VENDOR_ID); + auto DriverVersion = getDriverAPIVersion(); + if (DriverVersion < DriverVersionToStrTableSize) + Info.add("Driver Version", DriverVersionToStrTable[DriverVersion], "", + DeviceInfo::DRIVER_VERSION); + else + Info.add("Driver Version", "Unknown", "", DeviceInfo::DRIVER_VERSION); + Info.add("Device PCI ID", getPCIId()); + Info.add("Device 
UUID", getUuid().data()); + Info.add("Number of total EUs", getNumEUs(), "", + DeviceInfo::NUM_COMPUTE_UNITS); + Info.add("Number of threads per EU", getNumThreadsPerEU()); + Info.add("EU SIMD width", getSIMDWidth()); + Info.add("Number of EUs per subslice", getNumEUsPerSubslice()); + Info.add("Number of subslices per slice", getNumSubslicesPerSlice()); + Info.add("Number of slices", getNumSlices()); + Info.add("Max Group size", getMaxGroupSize(), "", + DeviceInfo::MAX_WORK_GROUP_SIZE); + Info.add("Local memory size (bytes)", getMaxSharedLocalMemory()); + Info.add("Global memory size (bytes)", getGlobalMemorySize(), "", + DeviceInfo::GLOBAL_MEM_SIZE); + Info.add("Cache size (bytes)", getCacheSize()); + Info.add("Max Memory Allocation Size (bytes)", getMaxMemAllocSize(), "", + DeviceInfo::MAX_MEM_ALLOC_SIZE); + Info.add("Max clock frequency (MHz)", getClockRate(), "", + DeviceInfo::MAX_CLOCK_FREQUENCY); + return Info; +} + +Expected L0DeviceTy::constructKernel(const char *Name) { + // Allocate and construct the L0 kernel. + L0KernelTy *L0Kernel = getPlugin().allocate(); + if (!L0Kernel) + return Plugin::error(ErrorCode::UNKNOWN, + "Failed to allocate memory for L0 kernel"); + + new (L0Kernel) L0KernelTy(Name); + + return *L0Kernel; +} + +uint32_t L0DeviceTy::getMemAllocType(const void *Ptr) const { + ze_memory_allocation_properties_t properties = { + ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, + nullptr, // extension + ZE_MEMORY_TYPE_UNKNOWN, // type + 0, // id + 0, // page size + }; + + ze_result_t rc; + CALL_ZE(rc, zeMemGetAllocProperties, getZeContext(), Ptr, &properties, + nullptr); + + if (rc == ZE_RESULT_ERROR_INVALID_ARGUMENT) + return ZE_MEMORY_TYPE_UNKNOWN; + else + return properties.type; +} + +interop_spec_t L0DeviceTy::selectInteropPreference(int32_t InteropType, + int32_t NumPrefers, + interop_spec_t *Prefers) { + // no supported preference found, set default to level_zero, + // non-ordered unless is targetsync + return interop_spec_t{ + tgt_fr_level_zero, + {InteropType == kmp_interop_type_targetsync ? 
true : false /*inorder*/, + 0}, + 0}; +} + +Expected L0DeviceTy::createInterop(int32_t InteropContext, + interop_spec_t &InteropSpec) { + auto Ret = + new omp_interop_val_t(DeviceId, (kmp_interop_type_t)InteropContext); + Ret->fr_id = tgt_fr_level_zero; + Ret->vendor_id = omp_vendor_intel; + + if (InteropContext == kmp_interop_type_target || + InteropContext == kmp_interop_type_targetsync) { + Ret->device_info.Platform = getZeDriver(); + Ret->device_info.Device = getZeDevice(); + Ret->device_info.Context = getZeContext(); + } + + Ret->rtl_property = new L0Interop::Property(); + if (InteropContext == kmp_interop_type_targetsync) { + Ret->async_info = new __tgt_async_info(); + auto L0 = static_cast(Ret->rtl_property); + + bool InOrder = InteropSpec.attrs.inorder; + Ret->attrs.inorder = InOrder; + if (useImmForInterop()) { + auto CmdList = createImmCmdList(InOrder); + Ret->async_info->Queue = CmdList; + L0->ImmCmdList = CmdList; + } else { + Ret->async_info->Queue = createCommandQueue(InOrder); + L0->CommandQueue = + static_cast(Ret->async_info->Queue); + } + } + + return Ret; +} + +Error L0DeviceTy::releaseInterop(OmpInteropTy Interop) { + const auto DeviceId = getDeviceId(); + + if (!Interop || Interop->device_id != (intptr_t)DeviceId) { + return Plugin::error(ErrorCode::INVALID_ARGUMENT, + "Invalid/inconsistent OpenMP interop " DPxMOD "\n", + DPxPTR(Interop)); + } + auto L0 = static_cast(Interop->rtl_property); + if (Interop->async_info && Interop->async_info->Queue) { + if (useImmForInterop()) { + auto ImmCmdList = L0->ImmCmdList; + CALL_ZE_RET_ERROR(zeCommandListDestroy, ImmCmdList); + } else { + auto CmdQueue = L0->CommandQueue; + CALL_ZE_RET_ERROR(zeCommandQueueDestroy, CmdQueue); + } + } + delete L0; + delete Interop; + + return Plugin::success(); +} + +int32_t L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size, + __tgt_async_info *AsyncInfo, + bool UseCopyEngine) { + ze_command_list_handle_t CmdList = nullptr; + ze_command_queue_handle_t CmdQueue = nullptr; + + if (useImmForCopy()) { + CmdList = UseCopyEngine ? getImmCopyCmdList() : getImmCmdList(); + CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size, + nullptr, 0, nullptr); + CALL_ZE_RET_FAIL(zeCommandListHostSynchronize, CmdList, UINT64_MAX); + } else { + if (UseCopyEngine) { + CmdList = getCopyCmdList(); + CmdQueue = getCopyCmdQueue(); + } else { + CmdList = getCmdList(); + CmdQueue = getCmdQueue(); + } + + CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size, + nullptr, 0, nullptr); + CALL_ZE_RET_FAIL(zeCommandListClose, CmdList); + CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, getMutex(), + CmdQueue, 1, &CmdList, nullptr); + CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX); + CALL_ZE_RET_FAIL(zeCommandListReset, CmdList); + } + return OFFLOAD_SUCCESS; +} + +/// Enqueue non-blocking memory copy. This function is invoked only when IMM is +/// fully enabled and async mode is requested. +int32_t L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size, + __tgt_async_info *AsyncInfo, + bool CopyTo) { + const bool Ordered = + (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered); + ze_event_handle_t SignalEvent = getEvent(); + size_t NumWaitEvents = 0; + ze_event_handle_t *WaitEvents = nullptr; + AsyncQueueTy *AsyncQueue = reinterpret_cast(AsyncInfo->Queue); + if (!AsyncQueue->WaitEvents.empty()) { + // Use a single wait event if events are ordered or a kernel event exists. 
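+    // In ordered mode the commands already form a chain, so waiting on the
+    // most recent event is enough; otherwise only an outstanding kernel event
+    // has to be honored, and plain copy events need no extra wait.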
+ NumWaitEvents = 1; + if (Ordered) + WaitEvents = &AsyncQueue->WaitEvents.back(); + else if (AsyncQueue->KernelEvent) + WaitEvents = &AsyncQueue->KernelEvent; + else + NumWaitEvents = 0; + } + auto CmdList = getImmCopyCmdList(); + CALL_ZE_RET_FAIL(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size, + SignalEvent, NumWaitEvents, WaitEvents); + AsyncQueue->WaitEvents.push_back(SignalEvent); + return OFFLOAD_SUCCESS; +} + +/// Enqueue memory fill +int32_t L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern, + size_t PatternSize, size_t Size) { + if (useImmForCopy()) { + const auto CmdList = getImmCopyCmdList(); + auto Event = getEvent(); + CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern, + PatternSize, Size, Event, 0, nullptr); + CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX); + } else { + auto CmdList = getCopyCmdList(); + const auto CmdQueue = getCopyCmdQueue(); + CALL_ZE_RET_FAIL(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern, + PatternSize, Size, nullptr, 0, nullptr); + CALL_ZE_RET_FAIL(zeCommandListClose, CmdList); + CALL_ZE_RET_FAIL(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList, + nullptr); + CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX); + CALL_ZE_RET_FAIL(zeCommandListReset, CmdList); + } + return OFFLOAD_SUCCESS; +} + +Error L0DeviceTy::dataFillImpl(void *TgtPtr, const void *PatternPtr, + int64_t PatternSize, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) { + // TODO: support async version + // TODO: convert enqueueMemFill to return Error code + if (enqueueMemFill(TgtPtr, PatternPtr, PatternSize, Size) == OFFLOAD_SUCCESS) + return Plugin::success(); + + return Plugin::error(error::ErrorCode::UNKNOWN, "%s failed\n", __func__); +} + +Expected L0DeviceTy::dataAlloc(size_t Size, size_t Align, int32_t Kind, + intptr_t Offset, bool UserAlloc, + bool DevMalloc, uint32_t MemAdvice, + AllocOptionTy AllocOpt) { + + const bool UseDedicatedPool = + (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH) || + (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER); + if (Kind == TARGET_ALLOC_DEFAULT) { + if (UserAlloc) + Kind = TARGET_ALLOC_DEVICE; + else if (AllocOpt == AllocOptionTy::ALLOC_OPT_HOST_MEM) + Kind = TARGET_ALLOC_HOST; + else if (UseDedicatedPool) + Kind = TARGET_ALLOC_DEVICE; + else + Kind = getAllocKind(); + } + auto &Allocator = getMemAllocator(Kind); + return Allocator.alloc(Size, Align, Kind, Offset, UserAlloc, DevMalloc, + MemAdvice, AllocOpt); +} + +Error L0DeviceTy::dataDelete(void *Ptr) { + auto &Allocator = getMemAllocator(Ptr); + return Allocator.dealloc(Ptr); +} + +int32_t L0DeviceTy::makeMemoryResident(void *Mem, size_t Size) { + ze_result_t RC; + CALL_ZE(RC, zeContextMakeMemoryResident, getZeContext(), getZeDevice(), Mem, + Size); + if (RC != ZE_RESULT_SUCCESS) { + DP("Could not make memory " DPxMOD " resident on Level Zero device " DPxMOD + ".\n", + DPxPTR(Mem), DPxPTR(getZeDevice())); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; +} + +// Command queues related functions +/// Create a command list with given ordinal and flags +ze_command_list_handle_t L0DeviceTy::createCmdList( + ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal, + ze_command_list_flags_t Flags, const std::string_view DeviceIdStr) { + ze_command_list_desc_t cmdListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + nullptr, // extension + Ordinal, Flags}; + ze_command_list_handle_t cmdList; + CALL_ZE_RET_NULL(zeCommandListCreate, Context, Device, &cmdListDesc, + &cmdList); + 
DP("Created a command list " DPxMOD " (Ordinal: %" PRIu32 + ") for device %s.\n", + DPxPTR(cmdList), Ordinal, DeviceIdStr.data()); + return cmdList; +} + +/// Create a command list with default flags +ze_command_list_handle_t +L0DeviceTy::createCmdList(ze_context_handle_t Context, + ze_device_handle_t Device, uint32_t Ordinal, + const std::string_view DeviceIdStr) { + return (Ordinal == UINT32_MAX) + ? nullptr + : createCmdList(Context, Device, Ordinal, 0, DeviceIdStr); +} + +ze_command_list_handle_t L0DeviceTy::getCmdList() { + auto &TLS = getTLS(); + auto CmdList = TLS.getCmdList(); + if (!CmdList) { + CmdList = createCmdList(getZeContext(), getZeDevice(), getComputeEngine(), + getZeId()); + TLS.setCmdList(CmdList); + } + return CmdList; +} + +/// Create a command queue with given ordinal and flags +ze_command_queue_handle_t +L0DeviceTy::createCmdQueue(ze_context_handle_t Context, + ze_device_handle_t Device, uint32_t Ordinal, + uint32_t Index, ze_command_queue_flags_t Flags, + const std::string_view DeviceIdStr) { + ze_command_queue_desc_t cmdQueueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + nullptr, // extension + Ordinal, + Index, + Flags, // flags + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; + ze_command_queue_handle_t cmdQueue; + CALL_ZE_RET_NULL(zeCommandQueueCreate, Context, Device, &cmdQueueDesc, + &cmdQueue); + DP("Created a command queue " DPxMOD " (Ordinal: %" PRIu32 ", Index: %" PRIu32 + ", Flags: %" PRIu32 ") for device %s.\n", + DPxPTR(cmdQueue), Ordinal, Index, Flags, DeviceIdStr.data()); + return cmdQueue; +} + +/// Create a command queue with default flags +ze_command_queue_handle_t L0DeviceTy::createCmdQueue( + ze_context_handle_t Context, ze_device_handle_t Device, uint32_t Ordinal, + uint32_t Index, const std::string_view DeviceIdStr, bool InOrder) { + ze_command_queue_flags_t Flags = InOrder ? ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0; + return (Ordinal == UINT32_MAX) ? nullptr + : createCmdQueue(Context, Device, Ordinal, + Index, Flags, DeviceIdStr); +} + +/// Create a new command queue for the given OpenMP device ID +ze_command_queue_handle_t L0DeviceTy::createCommandQueue(bool InOrder) { + auto cmdQueue = + createCmdQueue(getZeContext(), getZeDevice(), getComputeEngine(), + getComputeIndex(), getZeId(), InOrder); + return cmdQueue; +} + +/// Create an immediate command list +ze_command_list_handle_t +L0DeviceTy::createImmCmdList(uint32_t Ordinal, uint32_t Index, bool InOrder) { + ze_command_queue_flags_t Flags = InOrder ? 
ZE_COMMAND_QUEUE_FLAG_IN_ORDER : 0; + ze_command_queue_desc_t Desc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + nullptr, + Ordinal, + Index, + Flags, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; + ze_command_list_handle_t CmdList = nullptr; + CALL_ZE_RET_NULL(zeCommandListCreateImmediate, getZeContext(), getZeDevice(), + &Desc, &CmdList); + DP("Created an immediate command list " DPxMOD " (Ordinal: %" PRIu32 + ", Index: %" PRIu32 ", Flags: %" PRIu32 ") for device %s.\n", + DPxPTR(CmdList), Ordinal, Index, Flags, getZeIdCStr()); + return CmdList; +} + +/// Create an immediate command list for copying +ze_command_list_handle_t L0DeviceTy::createImmCopyCmdList() { + uint32_t Ordinal = getMainCopyEngine(); + if (Ordinal == UINT32_MAX) + Ordinal = getLinkCopyEngine(); + if (Ordinal == UINT32_MAX) + Ordinal = getComputeEngine(); + return createImmCmdList(Ordinal, /*Index*/ 0); +} + +ze_command_queue_handle_t L0DeviceTy::getCmdQueue() { + auto &TLS = getTLS(); + auto CmdQueue = TLS.getCmdQueue(); + if (!CmdQueue) { + CmdQueue = createCommandQueue(); + TLS.setCmdQueue(CmdQueue); + } + return CmdQueue; +} + +ze_command_list_handle_t L0DeviceTy::getCopyCmdList() { + // Use main copy engine if available + if (hasMainCopyEngine()) { + auto &TLS = getTLS(); + auto CmdList = TLS.getCopyCmdList(); + if (!CmdList) { + CmdList = createCmdList(getZeContext(), getZeDevice(), + getMainCopyEngine(), getZeId()); + TLS.setCopyCmdList(CmdList); + } + return CmdList; + } + // Use link copy engine if available + if (hasLinkCopyEngine()) + return getLinkCopyCmdList(); + // Use compute engine otherwise + return getCmdList(); +} + +ze_command_queue_handle_t L0DeviceTy::getCopyCmdQueue() { + // Use main copy engine if available + if (hasMainCopyEngine()) { + auto &TLS = getTLS(); + auto CmdQueue = TLS.getCopyCmdQueue(); + if (!CmdQueue) { + CmdQueue = createCmdQueue(getZeContext(), getZeDevice(), + getMainCopyEngine(), 0, getZeId()); + TLS.setCopyCmdQueue(CmdQueue); + } + return CmdQueue; + } + // Use link copy engine if available + if (hasLinkCopyEngine()) + return getLinkCopyCmdQueue(); + // Use compute engine otherwise + return getCmdQueue(); +} + +ze_command_list_handle_t L0DeviceTy::getLinkCopyCmdList() { + // Use link copy engine if available + if (hasLinkCopyEngine()) { + auto &TLS = getTLS(); + auto CmdList = TLS.getLinkCopyCmdList(); + if (!CmdList) { + CmdList = + createCmdList(getZeContext(), getZeDevice(), getLinkCopyEngine(), + ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY, getZeId()); + TLS.setLinkCopyCmdList(CmdList); + } + return CmdList; + } + // Use main copy engine if available + if (hasMainCopyEngine()) + return getCopyCmdList(); + // Use compute engine otherwise + return getCmdList(); +} + +ze_command_queue_handle_t L0DeviceTy::getLinkCopyCmdQueue() { + // Use link copy engine if available + if (hasLinkCopyEngine()) { + auto &TLS = getTLS(); + auto CmdQueue = TLS.getLinkCopyCmdQueue(); + if (!CmdQueue) { + // Try to use different copy engines for multiple threads + uint32_t Index = + __kmpc_global_thread_num(nullptr) % getNumLinkCopyQueues(); + CmdQueue = + createCmdQueue(getZeContext(), getZeDevice(), getLinkCopyEngine(), + Index, ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, getZeId()); + TLS.setLinkCopyCmdQueue(CmdQueue); + } + return CmdQueue; + } + // Use main copy engine if available + if (hasMainCopyEngine()) + return getCopyCmdQueue(); + // Use compute engine otherwise + return getCmdQueue(); +} + +ze_command_list_handle_t L0DeviceTy::getImmCmdList() { + auto &TLS = getTLS(); + auto 
CmdList = TLS.getImmCmdList(); + if (!CmdList) { + CmdList = createImmCmdList(); + TLS.setImmCmdList(CmdList); + } + return CmdList; +} + +ze_command_list_handle_t L0DeviceTy::getImmCopyCmdList() { + auto &TLS = getTLS(); + auto CmdList = TLS.getImmCopyCmdList(); + if (!CmdList) { + CmdList = createImmCopyCmdList(); + TLS.setImmCopyCmdList(CmdList); + } + return CmdList; +} + +Error L0DeviceTy::dataFence(__tgt_async_info *Async) { + const bool Ordered = + (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered); + + // Nothing to do if everything is ordered + if (Ordered) + return Plugin::success(); + + ze_command_list_handle_t CmdList = nullptr; + ze_command_queue_handle_t CmdQueue = nullptr; + + if (useImmForCopy()) { + CmdList = getImmCopyCmdList(); + CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr); + } else { + CmdList = getCopyCmdList(); + CmdQueue = getCopyCmdQueue(); + CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr); + CALL_ZE_RET_ERROR(zeCommandListClose, CmdList); + CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList, + nullptr); + CALL_ZE_RET_ERROR(zeCommandListReset, CmdList); + } + + return Plugin::success(); +} + +} // namespace llvm::omp::target::plugin diff --git a/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp new file mode 100644 index 0000000000000..e1ee9d5fa033b --- /dev/null +++ b/offload/plugins-nextgen/level_zero/src/L0DynWrapper.cpp @@ -0,0 +1,135 @@ +//===--- level_zero/dynamic_level_zero/level_zero.cpp ------------- C++ -*-===// +// +// Implement wrapper for level_zero API calls through dlopen +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "DLWrap.h" +#include "Shared/Debug.h" +#include "llvm/Support/DynamicLibrary.h" + +DLWRAP_INITIALIZE() + +DLWRAP_INTERNAL(zeInit, 1) +DLWRAP(zeDriverGet, 2) +DLWRAP(zeDeviceGet, 3) +DLWRAP(zeDeviceGetSubDevices, 3) +DLWRAP(zeModuleCreate, 5) +DLWRAP(zeModuleGetProperties, 2) +DLWRAP(zeModuleBuildLogDestroy, 1) +DLWRAP(zeModuleBuildLogGetString, 3) +DLWRAP(zeModuleGetKernelNames, 3) +DLWRAP(zeModuleDestroy, 1) +DLWRAP(zeCommandListAppendBarrier, 4) +DLWRAP(zeCommandListAppendLaunchKernel, 6) +DLWRAP(zeCommandListAppendLaunchCooperativeKernel, 6) +DLWRAP(zeCommandListAppendMemoryCopy, 7) +DLWRAP(zeCommandListAppendMemoryCopyRegion, 12) +DLWRAP(zeCommandListAppendMemoryFill, 8) +DLWRAP(zeCommandListAppendMemoryPrefetch, 3) +DLWRAP(zeCommandListAppendMemAdvise, 5) +DLWRAP(zeCommandListClose, 1) +DLWRAP(zeCommandListCreate, 4) +DLWRAP(zeCommandListCreateImmediate, 4) +DLWRAP(zeCommandListDestroy, 1) +DLWRAP(zeCommandListReset, 1) +DLWRAP(zeCommandQueueCreate, 4) +DLWRAP(zeCommandQueueDestroy, 1) +DLWRAP(zeCommandQueueExecuteCommandLists, 4) +DLWRAP(zeCommandQueueSynchronize, 2) +DLWRAP(zeContextCreate, 3) +DLWRAP(zeContextDestroy, 1) +DLWRAP(zeContextMakeMemoryResident, 4) +DLWRAP(zeDeviceCanAccessPeer, 3) +DLWRAP(zeDeviceGetProperties, 2) +DLWRAP(zeDeviceGetCommandQueueGroupProperties, 3) +DLWRAP(zeDeviceGetComputeProperties, 2) +DLWRAP(zeDeviceGetMemoryProperties, 3) +DLWRAP(zeDeviceGetCacheProperties, 3) +DLWRAP(zeDeviceGetGlobalTimestamps, 3) +DLWRAP(zeDriverGetApiVersion, 2) +DLWRAP(zeDriverGetExtensionFunctionAddress, 3) +DLWRAP(zeDriverGetExtensionProperties, 3) +DLWRAP(zeEventCreate, 3) +DLWRAP(zeEventDestroy, 1) +DLWRAP(zeEventHostReset, 1) +DLWRAP(zeEventHostSynchronize, 2) 
+DLWRAP(zeEventPoolCreate, 5) +DLWRAP(zeEventPoolDestroy, 1) +DLWRAP(zeEventQueryKernelTimestamp, 2) +DLWRAP(zeFenceCreate, 3) +DLWRAP(zeFenceDestroy, 1) +DLWRAP(zeFenceHostSynchronize, 2) +DLWRAP(zeKernelCreate, 3) +DLWRAP(zeKernelDestroy, 1) +DLWRAP(zeKernelGetName, 3) +DLWRAP(zeKernelGetProperties, 2) +DLWRAP(zeKernelSetArgumentValue, 4) +DLWRAP(zeKernelSetGroupSize, 4) +DLWRAP(zeKernelSetIndirectAccess, 2) +DLWRAP(zeKernelSuggestGroupSize, 7) +DLWRAP(zeKernelSuggestMaxCooperativeGroupCount, 2) +DLWRAP(zeMemAllocDevice, 6) +DLWRAP(zeMemAllocHost, 5) +DLWRAP(zeMemAllocShared, 7) +DLWRAP(zeMemFree, 2) +DLWRAP(zeMemGetAddressRange, 4) +DLWRAP(zeMemGetAllocProperties, 4) +DLWRAP(zeModuleDynamicLink, 3) +DLWRAP(zeModuleGetGlobalPointer, 4) +DLWRAP(zesDeviceEnumMemoryModules, 3) +DLWRAP(zesMemoryGetState, 2) +DLWRAP(zeCommandListHostSynchronize, 2) + +DLWRAP_FINALIZE() + +#ifndef LEVEL_ZERO_LIBRARY +#error "Level zero library not defined" +#endif + +#ifndef TARGET_NAME +#error "Missing TARGET_NAME macro" +#endif +#ifndef DEBUG_PREFIX +#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL" +#endif + +static bool loadLevelZero() { + const char *L0Library = LEVEL_ZERO_LIBRARY; + std::string ErrMsg; + + DP("Trying to load %s\n", L0Library); + auto DynlibHandle = std::make_unique( + llvm::sys::DynamicLibrary::getPermanentLibrary(L0Library, &ErrMsg)); + if (!DynlibHandle->isValid()) { + if (ErrMsg.empty()) + ErrMsg = "unknown error"; + DP("Unable to load library '%s': %s!\n", L0Library, ErrMsg.c_str()); + return false; + } + + for (size_t I = 0; I < dlwrap::size(); I++) { + const char *Sym = dlwrap::symbol(I); + + void *P = DynlibHandle->getAddressOfSymbol(Sym); + if (P == nullptr) { + DP("Unable to find '%s' in '%s'!\n", Sym, L0Library); + return false; + } + DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P); + + *dlwrap::pointer(I) = P; + } + + return true; +} + +ze_result_t ZE_APICALL zeInit(ze_init_flags_t flags) { + if (!loadLevelZero()) + return ZE_RESULT_ERROR_UNKNOWN; + return dlwrap_zeInit(flags); +} diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp new file mode 100644 index 0000000000000..53642eba20475 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp @@ -0,0 +1,625 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// GenericKernel implementation for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#include "L0Kernel.h" +#include "L0Device.h" +#include "L0Plugin.h" +#include "L0Program.h" + +namespace llvm::omp::target::plugin { + +Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice, + uint32_t NumThreads[3], uint32_t NumBlocks[3], + KernelArgsTy &KernelArgs, + KernelLaunchParamsTy LaunchParams, + AsyncInfoWrapperTy &AsyncInfoWrapper) const { + + auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice); + int32_t RC = runTargetTeamRegion(l0Device, KernelArgs, + std::move(LaunchParams), AsyncInfoWrapper); + if (RC == OFFLOAD_SUCCESS) + return Plugin::success(); + return Plugin::error(error::ErrorCode::UNKNOWN, + "Error in launch Kernel %s: %d", getName(), RC); +} + +Error L0KernelTy::buildKernel(L0ProgramTy &Program) { + const auto *KernelName = getName(); + + auto Module = Program.findModuleFromKernelName(KernelName); + ze_kernel_desc_t KernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, + KernelName}; + CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel); + return Plugin::success(); +} + +Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice, + DeviceImageTy &Image) { + auto &Program = L0ProgramTy::makeL0Program(Image); + + Error Err = buildKernel(Program); + if (Err) + return Err; + Program.addKernel(this); + + return Plugin::success(); +} + +void L0KernelTy::decideKernelGroupArguments( + L0DeviceTy &Device, uint32_t NumTeams, uint32_t ThreadLimit, + TgtNDRangeDescTy *LoopLevels, uint32_t *GroupSizes, + ze_group_count_t &GroupCounts, bool HalfNumThreads, + bool IsTeamsNDRange) const { + + const KernelPropertiesTy &KernelPR = getProperties(); + + const auto DeviceId = Device.getDeviceId(); + bool MaxGroupSizeForced = false; + bool MaxGroupCountForced = false; + uint32_t MaxGroupSize = Device.getMaxGroupSize(); + const auto &Option = LevelZeroPluginTy::getOptions(); + const auto OptSubscRate = Option.SubscriptionRate; + + uint32_t SIMDWidth = KernelPR.SIMDWidth; + uint32_t KernelWidth = KernelPR.Width; + uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize; + + if (KernelMaxThreadGroupSize < MaxGroupSize) { + MaxGroupSize = KernelMaxThreadGroupSize; + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Capping maximum team size to %" PRIu32 + " due to kernel constraints.\n", + MaxGroupSize); + } + + if (ThreadLimit > 0) { + MaxGroupSizeForced = true; + MaxGroupSize = ThreadLimit; + } + + uint32_t MaxGroupCount = 0; + if (NumTeams > 0) { + MaxGroupCount = NumTeams; + MaxGroupCountForced = true; + } + + if (MaxGroupCountForced) { + // If number of teams is specified by the user, then use KernelWidth + // WIs per WG by default, so that it matches + // decideLoopKernelGroupArguments() behavior. 
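+    // For example (hypothetical values): num_teams(64) with a kernel width of
+    // 32 yields 64 work-groups of 32 work-items, unless thread_limit forces a
+    // different group size.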
+    if (!MaxGroupSizeForced) {
+      MaxGroupSize = KernelWidth;
+    }
+  } else {
+    const uint32_t NumSubslices = Device.getNumSubslices();
+    uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice();
+    if (HalfNumThreads)
+      NumThreadsPerSubslice /= 2;
+
+    MaxGroupCount = NumSubslices * NumThreadsPerSubslice;
+    if (MaxGroupSizeForced) {
+      // Set group size for the HW capacity
+      uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+      uint32_t NumGroupsPerSubslice =
+          (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup;
+      MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+    } else {
+      assert(!MaxGroupSizeForced && !MaxGroupCountForced);
+      assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) &&
+             "Invalid maxGroupSize");
+      // Maximize group size
+      while (MaxGroupSize >= KernelWidth) {
+        uint32_t NumThreadsPerGroup =
+            (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
+
+        if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) {
+          uint32_t NumGroupsPerSubslice =
+              NumThreadsPerSubslice / NumThreadsPerGroup;
+          MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
+          break;
+        }
+        MaxGroupSize -= KernelWidth;
+      }
+    }
+  }
+
+  uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1};
+  uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
+  bool UsedReductionSubscriptionRate = false;
+  if (!MaxGroupCountForced) {
+    {
+      GRPCounts[0] *= OptSubscRate;
+    }
+
+    size_t LoopTripcount = 0;
+    if (LoopLevels) {
+      // TODO: consider other possible LoopDesc uses
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Loop descriptor provided but specific ND-range is disabled\n");
+      // TODO: get rid of this constraint
+      if (LoopLevels->NumLoops > 1) {
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "More than 1 loop found (%" PRIu32 "), ignoring loop info\n",
+             LoopLevels->NumLoops);
+      } else if (LoopLevels->Levels[0].Ub >= LoopLevels->Levels[0].Lb) {
+        LoopTripcount = (LoopLevels->Levels[0].Ub - LoopLevels->Levels[0].Lb +
+                         LoopLevels->Levels[0].Stride) /
+                        LoopLevels->Levels[0].Stride;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Loop TC = (%" PRId64 " - %" PRId64 " + %" PRId64 ") / %" PRId64
+             " = %zu\n",
+             LoopLevels->Levels[0].Ub, LoopLevels->Levels[0].Lb,
+             LoopLevels->Levels[0].Stride, LoopLevels->Levels[0].Stride,
+             LoopTripcount);
+      }
+    }
+
+    if (LoopTripcount && !UsedReductionSubscriptionRate) {
+      const size_t MaxTotalThreads = Device.getNumThreadsPerSubslice() *
+                                     Device.getNumSubslices() * SIMDWidth;
+      size_t AdjustedGroupCount =
+          IsTeamsNDRange ? (std::min)(((LoopTripcount + 7) & ~7),
+                                      MaxTotalThreads / GRPSizes[0])
+                         : ((LoopTripcount + GRPSizes[0] - 1) / GRPSizes[0]);
+      AdjustedGroupCount = std::max(AdjustedGroupCount, size_t{1});
+      AdjustedGroupCount *= OptSubscRate;
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "Adjusting number of teams using the loop tripcount\n");
+      if (AdjustedGroupCount < GRPCounts[0])
+        GRPCounts[0] = AdjustedGroupCount;
+    }
+  }
+  GroupCounts.groupCountX = GRPCounts[0];
+  GroupCounts.groupCountY = GRPCounts[1];
+  GroupCounts.groupCountZ = GRPCounts[2];
+  std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
+}
+
+// Return the number of total HW threads required to execute
+// a loop kernel compiled with the given SIMDWidth, and the given
+// loop(s) trip counts and group sizes.
+// Returns UINT64_MAX if computations overflow.
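+// For example (hypothetical values): trip counts {1024, 1, 1}, group sizes
+// {128, 1, 1} and SIMD width 16 give ceil(1024/128) = 8 groups with
+// ceil(128/16) = 8 HW threads each, i.e. 64 HW threads in total.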
+static uint64_t computeThreadsNeeded(const llvm::ArrayRef TripCounts, + const llvm::ArrayRef GroupSizes, + uint32_t SIMDWidth) { + assert(TripCounts.size() == 3 && "Invalid trip counts array size"); + assert(GroupSizes.size() == 3 && "Invalid group sizes array size"); + // Compute the number of groups in each dimension. + std::array GroupCount; + + for (int I = 0; I < 3; ++I) { + if (TripCounts[I] == 0 || GroupSizes[I] == 0) + return (std::numeric_limits::max)(); + GroupCount[I] = + (uint64_t(TripCounts[I]) + GroupSizes[I] - 1) / GroupSizes[I]; + if (GroupCount[I] > (std::numeric_limits::max)()) + return (std::numeric_limits::max)(); + } + for (int I = 1; I < 3; ++I) { + if ((std::numeric_limits::max)() / GroupCount[0] < GroupCount[I]) + return (std::numeric_limits::max)(); + GroupCount[0] *= GroupCount[I]; + } + // Multiplication of the group sizes must never overflow uint64_t + // for any existing device. + uint64_t LocalWorkSize = + uint64_t(GroupSizes[0]) * GroupSizes[1] * GroupSizes[2]; + uint64_t ThreadsPerWG = ((LocalWorkSize + SIMDWidth - 1) / SIMDWidth); + + // Check that the total number of threads fits uint64_t. + if ((std::numeric_limits::max)() / GroupCount[0] < ThreadsPerWG) + return (std::numeric_limits::max)(); + + return GroupCount[0] * ThreadsPerWG; +} + +int32_t L0KernelTy::decideLoopKernelGroupArguments( + L0DeviceTy &Device, uint32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels, + uint32_t *GroupSizes, ze_group_count_t &GroupCounts, bool HalfNumThreads, + bool &AllowCooperative) const { + + const auto DeviceId = Device.getDeviceId(); + const auto &Options = LevelZeroPluginTy::getOptions(); + const auto &KernelPR = getProperties(); + uint32_t MaxGroupSize = Device.getMaxGroupSize(); + + bool MaxGroupSizeForced = false; + if (ThreadLimit > 0) { + MaxGroupSizeForced = true; + MaxGroupSize = ThreadLimit; + } + + uint32_t GRPCounts[3] = {1, 1, 1}; + uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1}; + TgtLoopDescTy *Levels = LoopLevels->Levels; + int32_t DistributeDim = LoopLevels->DistributeDim; + assert(DistributeDim >= 0 && DistributeDim <= 2 && + "Invalid distribute dimension."); + int32_t NumLoops = LoopLevels->NumLoops; + assert((NumLoops > 0 && NumLoops <= 3) && + "Invalid loop nest description for ND partitioning"); + + // Compute global widths for X/Y/Z dimensions. + size_t TripCounts[3] = {1, 1, 1}; + + for (int32_t I = 0; I < NumLoops; I++) { + assert(Levels[I].Stride > 0 && "Invalid loop stride for ND partitioning"); + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Loop %" PRIu32 ": lower bound = %" PRId64 ", upper bound = %" PRId64 + ", Stride = %" PRId64 "\n", + I, Levels[I].Lb, Levels[I].Ub, Levels[I].Stride); + if (Levels[I].Ub < Levels[I].Lb) + TripCounts[I] = 0; + else + TripCounts[I] = + (Levels[I].Ub - Levels[I].Lb + Levels[I].Stride) / Levels[I].Stride; + } + + // Check if any of the loop has zero iterations. + if (TripCounts[0] == 0 || TripCounts[1] == 0 || TripCounts[2] == 0) { + std::fill(GroupSizes, GroupSizes + 3, 1); + std::fill(GRPCounts, GRPCounts + 3, 1); + if (DistributeDim > 0 && TripCounts[DistributeDim] != 0) { + // There is a distribute dimension, and the distribute loop + // has non-zero iterations, but some inner parallel loop + // has zero iterations. We still want to split the distribute + // loop's iterations between many WGs (of size 1), but the inner/lower + // dimensions should be 1x1. + // Note that this code is currently dead, because we are not + // hoisting the inner loops' bounds outside of the target regions. 
+ // The code is here just for completeness. + size_t DistributeTripCount = TripCounts[DistributeDim]; + if (DistributeTripCount > UINT32_MAX) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Invalid number of teams %zu due to large loop trip count\n", + DistributeTripCount); + return OFFLOAD_FAIL; + } + GRPCounts[DistributeDim] = DistributeTripCount; + } + AllowCooperative = false; + GroupCounts.groupCountX = GRPCounts[0]; + GroupCounts.groupCountY = GRPCounts[1]; + GroupCounts.groupCountZ = GRPCounts[2]; + return OFFLOAD_SUCCESS; + } + + if (!MaxGroupSizeForced) { + // Use zeKernelSuggestGroupSize to compute group sizes, + // or fallback to setting dimension 0 width to SIMDWidth. + // Note that in case of user-specified LWS GRPSizes[0] + // is already set according to the specified value. + size_t GlobalSizes[3] = {TripCounts[0], TripCounts[1], TripCounts[2]}; + if (DistributeDim > 0) { + // There is a distribute dimension. + GlobalSizes[DistributeDim - 1] *= GlobalSizes[DistributeDim]; + GlobalSizes[DistributeDim] = 1; + } + + { + if (MaxGroupSize > KernelPR.Width) { + GRPSizes[0] = KernelPR.Width; + } + if (DistributeDim == 0) { + // If there is a distribute dimension, then we do not use + // thin HW threads, since we do not know anything about + // the iteration space of the inner parallel loop regions. + // + // If there is no distribute dimension, then try to use thiner + // HW threads to get more independent HW threads executing + // the kernel - this may allow more parallelism due to + // the stalls being distributed across multiple HW threads rather + // than across SIMD lanes within one HW thread. + assert(GRPSizes[1] == 1 && GRPSizes[2] == 1 && + "Unexpected team sizes for dimensions 1 or/and 2."); + uint32_t SimdWidth = KernelPR.SIMDWidth; + uint64_t TotalThreads = Device.getTotalThreads(); + TotalThreads *= Options.ThinThreadsThreshold; + + uint32_t GRPSizePrev = GRPSizes[0]; + uint64_t ThreadsNeeded = + computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth); + while (ThreadsNeeded < TotalThreads) { + GRPSizePrev = GRPSizes[0]; + // Try to half the local work size (if possible) and see + // how many HW threads the kernel will require with this + // new local work size. + // In most implementations the initial GRPSizes[0] + // will be a power-of-two. + if (GRPSizes[0] <= 1) + break; + GRPSizes[0] >>= 1; + ThreadsNeeded = computeThreadsNeeded(TripCounts, GRPSizes, SimdWidth); + } + GRPSizes[0] = GRPSizePrev; + } + } + } + + for (int32_t I = 0; I < NumLoops; I++) { + if (I < DistributeDim) { + GRPCounts[I] = 1; + continue; + } + size_t Trip = TripCounts[I]; + if (GRPSizes[I] >= Trip) + GRPSizes[I] = Trip; + size_t Count = (Trip + GRPSizes[I] - 1) / GRPSizes[I]; + if (Count > UINT32_MAX) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Invalid number of teams %zu due to large loop trip count\n", Count); + return OFFLOAD_FAIL; + } + GRPCounts[I] = (uint32_t)Count; + } + AllowCooperative = false; + GroupCounts.groupCountX = GRPCounts[0]; + GroupCounts.groupCountY = GRPCounts[1]; + GroupCounts.groupCountZ = GRPCounts[2]; + std::copy(GRPSizes, GRPSizes + 3, GroupSizes); + + return OFFLOAD_SUCCESS; +} + +int32_t L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams, + int32_t ThreadLimit, uint32_t *GroupSizes, + ze_group_count_t &GroupCounts, + void *LoopDesc, + bool &AllowCooperative) const { + + const auto DeviceId = Device.getDeviceId(); + const auto &KernelPR = getProperties(); + + // Read the most recent global thread limit and max teams. 
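+  // Note: both ICVs are currently hard-coded to zero below, so only the
+  // num_teams/thread_limit clause values and the device limits are honored.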
+  const auto [NumTeamsICV, ThreadLimitICV] = std::make_tuple(0, 0);
+
+  bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
+  bool HalfNumThreads =
+      LevelZeroPluginTy::getOptions().ZeDebugEnabled && IsXeHPG;
+  uint32_t KernelWidth = KernelPR.Width;
+  uint32_t SIMDWidth = KernelPR.SIMDWidth;
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth);
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth);
+  assert(SIMDWidth <= KernelWidth && "Invalid SIMD width.");
+
+  if (ThreadLimit > 0) {
+    // use thread_limit clause value by default
+    DP("Max team size is set to %" PRId32 " (thread_limit clause)\n",
+       ThreadLimit);
+  } else if (ThreadLimitICV > 0) {
+    // else use thread-limit-var ICV
+    ThreadLimit = ThreadLimitICV;
+    DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit);
+  }
+
+  size_t MaxThreadLimit = Device.getMaxGroupSize();
+  // Set correct max group size if the kernel was compiled with explicit SIMD
+  if (SIMDWidth == 1) {
+    MaxThreadLimit = Device.getNumThreadsPerSubslice();
+  }
+
+  if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) {
+    MaxThreadLimit = KernelPR.MaxThreadGroupSize;
+    DP("Capping maximum team size to %zu due to kernel constraints.\n",
+       MaxThreadLimit);
+  }
+
+  if (ThreadLimit > static_cast(MaxThreadLimit)) {
+    ThreadLimit = MaxThreadLimit;
+    DP("Max team size exceeds current maximum %zu. Adjusted\n",
+       MaxThreadLimit);
+  }
+  {
+    if (NumTeams > 0) {
+      DP("Number of teams is set to %" PRId32
+         " (num_teams clause or no teams construct)\n",
+         NumTeams);
+    } else if (NumTeamsICV > 0) {
+      // OMP_NUM_TEAMS only matters if the num_teams() clause is absent.
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "OMP_NUM_TEAMS(%" PRId32 ") is ignored\n", NumTeamsICV);
+
+      NumTeams = NumTeamsICV;
+      DP("Max number of teams is set to %" PRId32 " (OMP_NUM_TEAMS)\n",
+         NumTeams);
+    }
+
+    bool UseLoopTC = LoopDesc;
+    decideKernelGroupArguments(
+        Device, (uint32_t)NumTeams, (uint32_t)ThreadLimit,
+        UseLoopTC ? (TgtNDRangeDescTy *)LoopDesc : nullptr, GroupSizes,
+        GroupCounts, HalfNumThreads, false);
+    AllowCooperative = false;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t L0KernelTy::runTargetTeamRegion(L0DeviceTy &l0Device,
+                                        KernelArgsTy &KernelArgs,
+                                        KernelLaunchParamsTy LaunchParams,
+                                        __tgt_async_info *AsyncInfo) const {
+  // Libomptarget can pass negative NumTeams and ThreadLimit now after
+  // introducing __tgt_target_kernel. This happens only when we have valid
+  // LoopDesc and the region is not a teams region.
+
+  auto zeKernel = getZeKernel();
+  auto DeviceId = l0Device.getDeviceId();
+  int32_t NumArgs = KernelArgs.NumArgs;
+  int32_t NumTeams = KernelArgs.NumTeams[0];
+  int32_t ThreadLimit = KernelArgs.ThreadLimit[0];
+  void *LoopDesc = nullptr;
+
+  if (NumTeams < 0)
+    NumTeams = 0;
+  if (ThreadLimit < 0)
+    ThreadLimit = 0;
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+       "Executing a kernel " DPxMOD "...\n", DPxPTR(zeKernel));
+
+  auto &Plugin = l0Device.getPlugin();
+  auto &Device = Plugin.getDeviceFromId(DeviceId);
+
+  auto *IdStr = Device.getZeIdCStr();
+  auto &Options = LevelZeroPluginTy::getOptions();
+  bool IsAsync = AsyncInfo && Device.asyncEnabled();
+  if (IsAsync && !AsyncInfo->Queue) {
+    AsyncInfo->Queue = reinterpret_cast(Plugin.getAsyncQueue());
+    if (!AsyncInfo->Queue)
+      IsAsync = false; // Couldn't get a queue, revert to sync
+  }
+  auto *AsyncQueue =
+      IsAsync ?
static_cast(AsyncInfo->Queue) : NULL; + + // We need to get a non-const version of the Properties structure in order to + // use its lock and be able to cache the group params and indirect flags + auto &KernelPR = const_cast(getProperties()); + // Protect from kernel preparation to submission as kernels are shared. + std::unique_lock KernelLock(KernelPR.Mtx); + + // Decide group sizes and counts + uint32_t GroupSizes[3]; + ze_group_count_t GroupCounts; + + bool AllowCooperative = false; + + // Check if we can reuse previous group parameters + bool GroupParamsReused = KernelPR.reuseGroupParams( + static_cast(LoopDesc), NumTeams, ThreadLimit, + GroupSizes, GroupCounts, AllowCooperative); + + if (!GroupParamsReused) { + auto RC = getGroupsShape(Device, NumTeams, ThreadLimit, GroupSizes, + GroupCounts, LoopDesc, AllowCooperative); + + if (RC != OFFLOAD_SUCCESS) { + return RC; + } + + KernelPR.cacheGroupParams(static_cast(LoopDesc), + NumTeams, ThreadLimit, GroupSizes, GroupCounts, + AllowCooperative); + } + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0], + GroupSizes[1], GroupSizes[2]); + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", + GroupCounts.groupCountX, GroupCounts.groupCountY, + GroupCounts.groupCountZ); + for (int32_t I = 0; I < NumArgs; I++) { + { + void *Arg = (static_cast(LaunchParams.Data))[I]; + CALL_ZE_RET_FAIL(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg), + Arg == nullptr ? nullptr : &Arg); + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Kernel Pointer argument %" PRId32 " (value: " DPxMOD + ") was set successfully for device %s.\n", + I, DPxPTR(Arg), IdStr); + } + } + + // Set Kernel Indirect flags + auto &PrevFlags = KernelPR.IndirectAccessFlags; + ze_kernel_indirect_access_flags_t Flags = 0; + Flags |= Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags(); + Flags |= Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags(); + + if (PrevFlags != Flags) { + // Combine with common access flags + const auto FinalFlags = Device.getIndirectFlags() | Flags; + CALL_ZE_RET_FAIL(zeKernelSetIndirectAccess, getZeKernel(), FinalFlags); + DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags)); + PrevFlags = Flags; + } + + if (!GroupParamsReused) { + CALL_ZE_RET_FAIL(zeKernelSetGroupSize, zeKernel, GroupSizes[0], + GroupSizes[1], GroupSizes[2]); + } + + ze_command_list_handle_t CmdList = nullptr; + ze_command_queue_handle_t CmdQueue = nullptr; + const bool UseImmCmdList = Device.useImmForCompute(); + + if (UseImmCmdList) { + CmdList = Device.getImmCmdList(); + // Command queue is not used with immediate command list + } else { + CmdList = Device.getCmdList(); + CmdQueue = Device.getCmdQueue(); + } + + if (UseImmCmdList) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Using immediate command list for kernel submission.\n"); + auto Event = Device.getEvent(); + size_t NumWaitEvents = 0; + ze_event_handle_t *WaitEvents = nullptr; + if (IsAsync && !AsyncQueue->WaitEvents.empty()) { + if (Options.CommandMode == CommandModeTy::AsyncOrdered) { + NumWaitEvents = 1; + WaitEvents = &AsyncQueue->WaitEvents.back(); + } else { + NumWaitEvents = AsyncQueue->WaitEvents.size(); + WaitEvents = AsyncQueue->WaitEvents.data(); + } + } + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Kernel depends on %zu data copying events.\n", NumWaitEvents); + if (AllowCooperative) + CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList, + 
zeKernel, &GroupCounts, Event, NumWaitEvents, + WaitEvents); + else + CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel, + &GroupCounts, Event, NumWaitEvents, WaitEvents); + KernelLock.unlock(); + if (IsAsync) { + AsyncQueue->WaitEvents.push_back(Event); + AsyncQueue->KernelEvent = Event; + } else { + CALL_ZE_RET_FAIL(zeEventHostSynchronize, Event, UINT64_MAX); + Device.releaseEvent(Event); + } + } else { + ze_event_handle_t Event = nullptr; + if (AllowCooperative) + CALL_ZE_RET_FAIL(zeCommandListAppendLaunchCooperativeKernel, CmdList, + zeKernel, &GroupCounts, Event, 0, nullptr); + else + CALL_ZE_RET_FAIL(zeCommandListAppendLaunchKernel, CmdList, zeKernel, + &GroupCounts, Event, 0, nullptr); + KernelLock.unlock(); + CALL_ZE_RET_FAIL(zeCommandListClose, CmdList); + CALL_ZE_RET_FAIL_MTX(zeCommandQueueExecuteCommandLists, Device.getMutex(), + CmdQueue, 1, &CmdList, nullptr); + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr); + CALL_ZE_RET_FAIL(zeCommandQueueSynchronize, CmdQueue, UINT64_MAX); + CALL_ZE_RET_FAIL(zeCommandListReset, CmdList); + if (Event) { + Device.releaseEvent(Event); + } + } + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel), + IdStr); + + return OFFLOAD_SUCCESS; +} + +} // namespace llvm::omp::target::plugin diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp new file mode 100644 index 0000000000000..c26e3fb328645 --- /dev/null +++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp @@ -0,0 +1,647 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Memory related support for SPIR-V/Xe machine +// +//===----------------------------------------------------------------------===// + +#include "L0Memory.h" +#include "L0Device.h" +#include "L0Plugin.h" + +namespace llvm::omp::target::plugin { + +void *MemAllocatorTy::MemPoolTy::BlockTy::alloc() { + if (isFull()) + return nullptr; + if (FreeSlot != UINT32_MAX) { + const uint32_t Slot = FreeSlot; + FreeSlot = UINT32_MAX; + UsedSlots[Slot] = true; + NumUsedSlots++; + return reinterpret_cast(Base + Slot * ChunkSize); + } + for (uint32_t I = 0; I < NumSlots; I++) { + if (UsedSlots[I]) + continue; + UsedSlots[I] = true; + NumUsedSlots++; + return reinterpret_cast(Base + I * ChunkSize); + } + // Should not reach here. 
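+  // The block is known not to be full at this point, so the scan above must
+  // have found a free slot.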
+ assert(0 && "Inconsistent memory pool state"); + return nullptr; +} + +/// Deallocate the given memory +void MemAllocatorTy::MemPoolTy::BlockTy::dealloc(void *Mem) { + if (!contains(Mem)) + assert(0 && "Inconsistent memory pool state"); + const uint32_t Slot = (reinterpret_cast(Mem) - Base) / ChunkSize; + UsedSlots[Slot] = false; + NumUsedSlots--; + FreeSlot = Slot; +} + +MemAllocatorTy::MemPoolTy::MemPoolTy(int32_t Kind, MemAllocatorTy *_Allocator, + const L0OptionsTy &Option) { + AllocKind = Kind; + Allocator = _Allocator; + + // Read user-defined options + const auto &UserOptions = Option.MemPoolInfo.at(AllocKind); + const size_t UserAllocMax = UserOptions[0]; + const size_t UserCapacity = UserOptions[1]; + const size_t UserPoolSize = UserOptions[2]; + + BlockCapacity = UserCapacity; + PoolSizeMax = UserPoolSize << 20; // MB to B + PoolSize = 0; + + auto Context = Allocator->L0Context->getZeContext(); + const auto Device = Allocator->Device; + + // Check page size used for this allocation kind to decide minimum + // allocation size when allocating from L0. + void *Mem = Allocator->allocL0(8, 0, AllocKind); + ze_memory_allocation_properties_t AP{ + ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, nullptr, + ZE_MEMORY_TYPE_UNKNOWN, 0, 0}; + CALL_ZE_RET_VOID(zeMemGetAllocProperties, Context, Mem, &AP, nullptr); + AllocUnit = (std::max)(AP.pageSize, AllocUnit); + CALL_ZE_RET_VOID(zeMemFree, Context, Mem); + + bool IsDiscrete = false; + if (Device) { + ze_device_properties_t Properties{}; + Properties.deviceId = 0; + Properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + Properties.pNext = nullptr; + CALL_ZE_RET_VOID(zeDeviceGetProperties, Device->getZeDevice(), &Properties); + IsDiscrete = Device->isDiscreteDevice(); + + if (AllocKind == TARGET_ALLOC_SHARED && IsDiscrete) { + // Use page size as minimum chunk size for USM shared on discrete + // device. + // FIXME: pageSize is not returned correctly (=0) on some new devices, + // so use fallback value for now. + AllocMin = (std::max)(AP.pageSize, AllocUnit); + AllocUnit = AllocMin * BlockCapacity; + } + } + + // Convert MB to B and round up to power of 2 + AllocMax = AllocMin << getBucketId(UserAllocMax * (1 << 20)); + if (AllocMin >= AllocMax) { + AllocMax = 2 * AllocMin; + DP("Warning: Adjusting pool's AllocMax to %zu for %s due to device " + "requirements.\n", + AllocMax, ALLOC_KIND_TO_STR(AllocKind)); + } + assert(AllocMin < AllocMax && + "Invalid parameters while initializing memory pool"); + const auto MinSize = getBucketId(AllocMin); + const auto MaxSize = getBucketId(AllocMax); + Buckets.resize(MaxSize - MinSize + 1); + BucketStats.resize(Buckets.size(), {0, 0}); + + // Set bucket parameters + for (size_t I = 0; I < Buckets.size(); I++) { + const size_t ChunkSize = AllocMin << I; + size_t BlockSize = ChunkSize * BlockCapacity; + // On discrete device, the cost of native L0 invocation doubles when the + // the requested size doubles after certain threshold, so allocating + // larger block does not pay off at all. It is better to keep a single + // chunk in a single block in such cases. 
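+    // For example (hypothetical sizes): a 64KB chunk with a block capacity of
+    // 8 would normally get a 512KB block, but once the chunk size reaches the
+    // pre-allocation threshold each block holds a single chunk instead.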
+ if (BlockSize <= AllocUnit) { + BlockSize = AllocUnit; // Allocation unit is already large enough + } else if (IsDiscrete) { + // Do not preallocate if it does not pay off + if (ChunkSize >= L0UsmPreAllocThreshold || + (AllocKind == TARGET_ALLOC_HOST && + ChunkSize >= L0HostUsmPreAllocThreshold)) + BlockSize = ChunkSize; + } + BucketParams.emplace_back(ChunkSize, BlockSize); + } + + DP("Initialized %s pool for device " DPxMOD ": AllocUnit = %zu, " + "AllocMax = %zu, " + "Capacity = %" PRIu32 ", PoolSizeMax = %zu\n", + ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Device), AllocUnit, AllocMax, + BlockCapacity, PoolSizeMax); +} + +// Used for reduction pool +MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator, + const L0OptionsTy &Option) { + AllocKind = TARGET_ALLOC_DEVICE; + Allocator = _Allocator; + AllocMin = AllocUnit = 1024 << 6; // 64KB + AllocMax = Option.ReductionPoolInfo[0] << 20; + BlockCapacity = Option.ReductionPoolInfo[1]; + PoolSize = 0; + PoolSizeMax = (size_t)Option.ReductionPoolInfo[2] << 20; + + const auto MinSize = getBucketId(AllocMin); + const auto MaxSize = getBucketId(AllocMax); + Buckets.resize(MaxSize - MinSize + 1); + BucketStats.resize(Buckets.size(), {0, 0}); + for (size_t I = 0; I < Buckets.size(); I++) { + const size_t ChunkSize = AllocMin << I; + BucketParams.emplace_back(ChunkSize, ChunkSize * BlockCapacity); + } + + DP("Initialized reduction scratch pool for device " DPxMOD + ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n", + DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax); +} + +// Used for small memory pool with fixed parameters +MemAllocatorTy::MemPoolTy::MemPoolTy(MemAllocatorTy *_Allocator) { + AllocKind = TARGET_ALLOC_DEVICE; + Allocator = _Allocator; + AllocMax = AllocMin; + BlockCapacity = AllocUnit / AllocMax; + PoolSize = 0; + PoolSizeMax = (1 << 20); // this should be sufficiently large + Buckets.resize(1); + BucketStats.resize(1, {0, 0}); + BucketParams.emplace_back(AllocMax, AllocUnit); + ZeroInit = true; + DP("Initialized zero-initialized reduction counter pool for " + "device " DPxMOD ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n", + DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax); +} + +void MemAllocatorTy::MemPoolTy::printUsage() { + auto PrintNum = [](uint64_t Num) { + if (Num > 1e9) + fprintf(stderr, "%11.2e", float(Num)); + else + fprintf(stderr, "%11" PRIu64, Num); + }; + + bool HasPoolAlloc = false; + for (auto &Stat : BucketStats) { + if (Stat.first > 0 || Stat.second > 0) { + HasPoolAlloc = true; + break; + } + } + + DP("MemPool usage for %s, device " DPxMOD "\n", ALLOC_KIND_TO_STR(AllocKind), + DPxPTR(Allocator->Device)); + + if (HasPoolAlloc) { + DP("-- AllocMax=%zu(MB), Capacity=%" PRIu32 ", PoolSizeMax=%zu(MB)\n", + AllocMax >> 20, BlockCapacity, PoolSizeMax >> 20); + DP("-- %18s:%11s%11s%11s\n", "", "NewAlloc", "Reuse", "Hit(%)"); + for (size_t I = 0; I < Buckets.size(); I++) { + const auto &Stat = BucketStats[I]; + if (Stat.first > 0 || Stat.second > 0) { + DP("-- Bucket[%10zu]:", BucketParams[I].first); + PrintNum(Stat.first); + PrintNum(Stat.second); + fprintf(stderr, "%11.2f\n", + float(Stat.second) / float(Stat.first + Stat.second) * 100); + } + } + } else { + DP("-- Not used\n"); + } +} + +/// Release resources used in the pool. 
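+/// Usage statistics are printed first (when debug output is enabled), then
+/// every block in every bucket is returned to Level Zero with zeMemFree.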
+MemAllocatorTy::MemPoolTy::~MemPoolTy() { + const int DebugLevel = getDebugLevel(); + if (DebugLevel > 0) + printUsage(); + for (auto &Bucket : Buckets) { + for (auto *Block : Bucket) { + if (DebugLevel > 0) + Allocator->log(0, Block->Size, AllocKind); + CALL_ZE_RET_VOID(zeMemFree, Allocator->L0Context->getZeContext(), + reinterpret_cast(Block->Base)); + delete Block; + } + } +} + +/// Allocate the requested size of memory from this pool. +/// AllocSize is the chunk size internally used for the returned memory. +void *MemAllocatorTy::MemPoolTy::alloc(size_t Size, size_t &AllocSize) { + if (Size == 0 || Size > AllocMax) + return nullptr; + + const uint32_t BucketId = getBucketId(Size); + auto &Blocks = Buckets[BucketId]; + void *Mem = nullptr; + + for (auto *Block : Blocks) { + if (Block->isFull()) + continue; + Mem = Block->alloc(); + assert(Mem && "Inconsistent state while allocating memory from pool"); + PtrToBlock.try_emplace(Mem, Block); + break; + } + + if (Mem == nullptr) { + const bool IsSmallAllocatable = + (Size <= SmallAllocMax && SmallPoolSize <= SmallPoolSizeMax); + const bool IsFull = (PoolSize > PoolSizeMax); + if (IsFull && !IsSmallAllocatable) + return nullptr; + // Bucket is empty or all blocks in the bucket are full + const auto ChunkSize = BucketParams[BucketId].first; + const auto BlockSize = BucketParams[BucketId].second; + void *Base = Allocator->allocL0(BlockSize, 0, AllocKind); + + if (ZeroInit) { + auto RC = Allocator->enqueueMemSet(Base, 0, BlockSize); + if (RC != OFFLOAD_SUCCESS) { + DP("Failed to zero-initialize pool memory\n"); + return nullptr; + } + } + + BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize); + Blocks.push_back(Block); + Mem = Block->alloc(); + PtrToBlock.try_emplace(Mem, Block); + if (IsFull) + SmallPoolSize += BlockSize; + else + PoolSize += BlockSize; + DP("New block allocation for %s pool: base = " DPxMOD + ", size = %zu, pool size = %zu\n", + ALLOC_KIND_TO_STR(AllocKind), DPxPTR(Base), BlockSize, PoolSize); + BucketStats[BucketId].first++; + } else { + BucketStats[BucketId].second++; + } + + AllocSize = (AllocMin << BucketId); + + return Mem; +} + +/// Deallocate the specified memory and returns block size deallocated. +size_t MemAllocatorTy::MemPoolTy::dealloc(void *Ptr) { + if (PtrToBlock.count(Ptr) == 0) + return 0; + PtrToBlock[Ptr]->dealloc(Ptr); + const size_t Deallocated = PtrToBlock[Ptr]->ChunkSize; + PtrToBlock.erase(Ptr); + return Deallocated; +} + +void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base, size_t Size, + int32_t Kind, bool InPool, + bool ImplicitArg) { + const auto Inserted = + Map.emplace(Ptr, MemAllocInfoTy{Base, Size, Kind, InPool, ImplicitArg}); + // Check if we keep valid disjoint memory ranges. 
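+  // The map is ordered by pointer value, so checking the immediate
+  // predecessor and successor entries is enough to detect any overlap.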
+ [[maybe_unused]] bool Valid = Inserted.second; + if (Valid) { + if (Inserted.first != Map.begin()) { + const auto I = std::prev(Inserted.first, 1); + Valid = Valid && (uintptr_t)I->first + I->second.Size <= (uintptr_t)Ptr; + } + if (Valid) { + const auto I = std::next(Inserted.first, 1); + if (I != Map.end()) + Valid = Valid && (uintptr_t)Ptr + Size <= (uintptr_t)I->first; + } + } + assert(Valid && "Invalid overlapping memory allocation"); + if (ImplicitArg) + NumImplicitArgs[Kind]++; +} + +/// Remove allocation information for the given memory location +bool MemAllocatorTy::MemAllocInfoMapTy::remove(void *Ptr, + MemAllocInfoTy *Removed) { + const auto AllocInfo = Map.find(Ptr); + if (AllocInfo == Map.end()) + return false; + if (AllocInfo->second.ImplicitArg) + NumImplicitArgs[AllocInfo->second.Kind]--; + if (Removed) + *Removed = AllocInfo->second; + Map.erase(AllocInfo); + return true; +} + +void MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device, + const L0OptionsTy &Option) { + SupportsLargeMem = L0Device.supportsLargeMem(); + IsHostMem = false; + Device = &L0Device; + L0Context = &L0Device.getL0Context(); + for (auto Kind : {TARGET_ALLOC_DEVICE, TARGET_ALLOC_SHARED}) { + if (Option.MemPoolInfo.count(Kind) > 0) { + std::lock_guard Lock(Mtx); + Pools.emplace(std::piecewise_construct, std::forward_as_tuple(Kind), + std::forward_as_tuple(Kind, this, Option)); + } + if (getDebugLevel() > 0) + Stats.emplace(std::piecewise_construct, std::forward_as_tuple(Kind), + std::tuple<>{}); + } + ReductionPool = std::make_unique(this, Option); + CounterPool = std::make_unique(this); + updateMaxAllocSize(L0Device); +} + +void MemAllocatorTy::initHostPool(L0ContextTy &Driver, + const L0OptionsTy &Option) { + SupportsLargeMem = Driver.supportsLargeMem(); + IsHostMem = true; + this->L0Context = &Driver; + if (Option.MemPoolInfo.count(TARGET_ALLOC_HOST) > 0) { + std::lock_guard Lock(Mtx); + Pools.emplace(std::piecewise_construct, + std::forward_as_tuple(TARGET_ALLOC_HOST), + std::forward_as_tuple(TARGET_ALLOC_HOST, this, Option)); + } + if (getDebugLevel() > 0) + Stats.emplace(std::piecewise_construct, + std::forward_as_tuple(TARGET_ALLOC_HOST), std::tuple<>{}); +} + +void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) { + // Update the maximum allocation size for this Allocator + ze_device_properties_t P; + P.maxMemAllocSize = 0; + P.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + P.pNext = nullptr; + CALL_ZE_RET_VOID(zeDeviceGetProperties, L0Device.getZeDevice(), &P); + + if (IsHostMem) { + // MaxAllocSize should be the minimum of all devices from the driver + if (MaxAllocSize > P.maxMemAllocSize) { + MaxAllocSize = P.maxMemAllocSize; + DP("Updated MaxAllocSize for driver " DPxMOD " to %zu\n", + DPxPTR(L0Context), MaxAllocSize); + } + return; + } + + MaxAllocSize = P.maxMemAllocSize; + DP("Updated MaxAllocSize for device " DPxMOD " to %zu\n", DPxPTR(Device), + MaxAllocSize); +} + +/// Release resources and report statistics if requested +void MemAllocatorTy::deinit() { + std::lock_guard Lock(Mtx); + // Release RTL-owned memory + for (auto *M : MemOwned) { + auto Err = dealloc_locked(M); + if (Err) + consumeError(std::move(Err)); + } + // Release resources used in the pool + Pools.clear(); + ReductionPool.reset(nullptr); + CounterPool.reset(nullptr); + // Report memory usage if requested + if (getDebugLevel() > 0) { + for (auto &Stat : Stats) { + DP("Memory usage for %s, device " DPxMOD "\n", + ALLOC_KIND_TO_STR(Stat.first), DPxPTR(Device)); + const auto &ST = Stat.second; + if 
(ST.NumAllocs[0] == 0 && ST.NumAllocs[1] == 0) { + DP("-- Not used\n"); + continue; + } + DP("-- Allocator: %12s, %12s\n", "Native", "Pool"); + DP("-- Requested: %12zu, %12zu\n", ST.Requested[0], ST.Requested[1]); + DP("-- Allocated: %12zu, %12zu\n", ST.Allocated[0], ST.Allocated[1]); + DP("-- Freed : %12zu, %12zu\n", ST.Freed[0], ST.Freed[1]); + DP("-- InUse : %12zu, %12zu\n", ST.InUse[0], ST.InUse[1]); + DP("-- PeakUse : %12zu, %12zu\n", ST.PeakUse[0], ST.PeakUse[1]); + DP("-- NumAllocs: %12zu, %12zu\n", ST.NumAllocs[0], ST.NumAllocs[1]); + } + } + + // mark as deinitialized + L0Context = nullptr; +} + +/// Allocate memory with the specified information +Expected MemAllocatorTy::alloc(size_t Size, size_t Align, int32_t Kind, + intptr_t Offset, bool UserAlloc, + bool DevMalloc, uint32_t MemAdvice, + AllocOptionTy AllocOpt) { + assert((Kind == TARGET_ALLOC_DEVICE || Kind == TARGET_ALLOC_HOST || + Kind == TARGET_ALLOC_SHARED) && + "Unknown memory kind while allocating target memory"); + + std::lock_guard Lock(Mtx); + + // We do not expect meaningful Align parameter when Offset > 0, so the + // following code does not handle such case. + + size_t AllocSize = Size + Offset; + void *Mem = nullptr; + void *AllocBase = nullptr; + const bool UseScratchPool = + (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH); + const bool UseZeroInitPool = + (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER); + const bool UseDedicatedPool = UseScratchPool || UseZeroInitPool; + + if ((Pools.count(Kind) > 0 && MemAdvice == UINT32_MAX) || UseDedicatedPool) { + // Pool is enabled for the allocation kind, and we do not use any memory + // advice. We should avoid using pool if there is any meaningful memory + // advice not to affect sibling allocation in the same block. + if (Align > 0) + AllocSize += (Align - 1); + size_t PoolAllocSize = 0; + if (UseScratchPool) + AllocBase = ReductionPool->alloc(AllocSize, PoolAllocSize); + else if (UseZeroInitPool) + AllocBase = CounterPool->alloc(AllocSize, PoolAllocSize); + else + AllocBase = Pools[Kind].alloc(AllocSize, PoolAllocSize); + if (AllocBase) { + uintptr_t Base = (uintptr_t)AllocBase; + if (Align > 0) + Base = (Base + Align) & ~(Align - 1); + Mem = (void *)(Base + Offset); + AllocInfo.add(Mem, AllocBase, Size, Kind, true, UserAlloc); + log(Size, PoolAllocSize, Kind, true /* Pool */); + if (DevMalloc) + MemOwned.push_back(AllocBase); + if (UseDedicatedPool) { + DP("Allocated %zu bytes from %s pool\n", Size, + UseScratchPool ? "scratch" : "zero-initialized"); + } + return Mem; + } + } + + AllocBase = allocL0(AllocSize, Align, Kind, Size); + if (AllocBase) { + Mem = (void *)((uintptr_t)AllocBase + Offset); + AllocInfo.add(Mem, AllocBase, Size, Kind, false, UserAlloc); + if (DevMalloc) + MemOwned.push_back(AllocBase); + if (UseDedicatedPool) { + // We do not want this happen in general. + DP("Allocated %zu bytes from L0 for %s pool\n", Size, + UseScratchPool ? 
"scratch" : "zero-initialized"); + } + } + return Mem; +} + +/// Deallocate memory +Error MemAllocatorTy::dealloc_locked(void *Ptr) { + MemAllocInfoTy Info; + if (!AllocInfo.remove(Ptr, &Info)) { + return Plugin::error(ErrorCode::BACKEND_FAILURE, + "Cannot find memory allocation information for " DPxMOD + "\n", + DPxPTR(Ptr)); + } + if (Info.InPool) { + size_t DeallocSize = 0; + if (Pools.count(Info.Kind) > 0) + DeallocSize = Pools.at(Info.Kind).dealloc(Info.Base); + if (DeallocSize == 0) { + // Try reduction scratch pool + DeallocSize = ReductionPool->dealloc(Info.Base); + // Try reduction counter pool + if (DeallocSize == 0) + DeallocSize = CounterPool->dealloc(Info.Base); + if (DeallocSize == 0) { + return Plugin::error(ErrorCode::BACKEND_FAILURE, + "Cannot return memory " DPxMOD " to pool\n", + DPxPTR(Ptr)); + } + } + log(0, DeallocSize, Info.Kind, true /* Pool */); + return Plugin::success(); + } + if (!Info.Base) { + DP("Error: Cannot find base address of " DPxMOD "\n", DPxPTR(Ptr)); + return Plugin::error(ErrorCode::INVALID_ARGUMENT, + "Cannot find base address of " DPxMOD "\n", + DPxPTR(Ptr)); + } + CALL_ZE_RET_ERROR(zeMemFree, L0Context->getZeContext(), Info.Base); + log(0, Info.Size, Info.Kind); + + DP("Deleted device memory " DPxMOD " (Base: " DPxMOD ", Size: %zu)\n", + DPxPTR(Ptr), DPxPTR(Info.Base), Info.Size); + + return Plugin::success(); +} + +int32_t MemAllocatorTy::enqueueMemSet(void *Dst, int8_t Value, size_t Size) { + return Device->enqueueMemFill(Dst, &Value, sizeof(int8_t), Size); +} + +int32_t MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src, + size_t Size) { + return Device->enqueueMemCopy(Dst, Src, Size); +} + +void *MemAllocatorTy::allocL0(size_t Size, size_t Align, int32_t Kind, + size_t ActiveSize) { + void *Mem = nullptr; + ze_device_mem_alloc_desc_t DeviceDesc{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, + nullptr, 0, 0}; + ze_host_mem_alloc_desc_t HostDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, + nullptr, 0}; + + // Use relaxed allocation limit if driver supports + ze_relaxed_allocation_limits_exp_desc_t RelaxedDesc{ + ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC, nullptr, + ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE}; + if (Size > MaxAllocSize && SupportsLargeMem) { + DeviceDesc.pNext = &RelaxedDesc; + HostDesc.pNext = &RelaxedDesc; + } + + auto zeDevice = Device ? Device->getZeDevice() : 0; + auto zeContext = L0Context->getZeContext(); + bool makeResident = false; + switch (Kind) { + case TARGET_ALLOC_DEVICE: + makeResident = true; + CALL_ZE_RET_NULL(zeMemAllocDevice, zeContext, &DeviceDesc, Size, Align, + zeDevice, &Mem); + DP("Allocated a device memory " DPxMOD "\n", DPxPTR(Mem)); + break; + case TARGET_ALLOC_HOST: + CALL_ZE_RET_NULL(zeMemAllocHost, zeContext, &HostDesc, Size, Align, &Mem); + DP("Allocated a host memory " DPxMOD "\n", DPxPTR(Mem)); + break; + case TARGET_ALLOC_SHARED: + CALL_ZE_RET_NULL(zeMemAllocShared, zeContext, &DeviceDesc, &HostDesc, Size, + Align, zeDevice, &Mem); + DP("Allocated a shared memory " DPxMOD "\n", DPxPTR(Mem)); + break; + default: + assert(0 && "Invalid target data allocation kind"); + } + + size_t LoggedSize = ActiveSize ? ActiveSize : Size; + log(LoggedSize, LoggedSize, Kind); + if (makeResident) { + assert(Device && + "Device is not set for memory allocation. 
Is this a Device Pool?"); + if (Device->makeMemoryResident(Mem, Size) != OFFLOAD_SUCCESS) + Mem = nullptr; + } + return Mem; +} + +ze_event_handle_t EventPoolTy::getEvent() { + std::lock_guard Lock(*Mtx); + + if (Events.empty()) { + // Need to create a new L0 pool + ze_event_pool_desc_t Desc{ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr, 0, 0}; + Desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | Flags; + Desc.count = PoolSize; + ze_event_pool_handle_t Pool; + CALL_ZE_RET_NULL(zeEventPoolCreate, Context, &Desc, 0, nullptr, &Pool); + Pools.push_back(Pool); + + // Create events + ze_event_desc_t EventDesc{ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, 0, 0}; + EventDesc.wait = 0; + EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + for (uint32_t I = 0; I < PoolSize; I++) { + EventDesc.index = I; + ze_event_handle_t Event; + CALL_ZE_RET_NULL(zeEventCreate, Pool, &EventDesc, &Event); + Events.push_back(Event); + } + } + + auto Ret = Events.back(); + Events.pop_back(); + + return Ret; +} + +/// Return an event to the pool +void EventPoolTy::releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device) { + std::lock_guard Lock(*Mtx); + CALL_ZE_RET_VOID(zeEventHostReset, Event); + Events.push_back(Event); +} + +} // namespace llvm::omp::target::plugin diff --git a/offload/plugins-nextgen/level_zero/src/L0Options.cpp b/offload/plugins-nextgen/level_zero/src/L0Options.cpp new file mode 100644 index 0000000000000..2e2c2cd5a5bbf --- /dev/null +++ b/offload/plugins-nextgen/level_zero/src/L0Options.cpp @@ -0,0 +1,180 @@ +//===--- Level Zero Target RTL Implementation -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Level Zero RTL Options support +// +//===----------------------------------------------------------------------===// + +#include "omptarget.h" + +#include "L0Defs.h" +#include "L0Options.h" +#include "L0Trace.h" + +namespace llvm::omp::target::plugin { + +/// Read environment variables +void L0OptionsTy::processEnvironmentVars() { + // Compilation options for IGC + UserCompilationOptions += + std::string(" ") + + StringEnvar("LIBOMPTARGET_LEVEL_ZERO_COMPILATION_OPTIONS", "").get(); + + // Memory pool + // LIBOMPTARGET_LEVEL_ZERO_MEMORY_POOL=