[OpenMP][libomptarget] Improve kernel initialization in plugins

This patch modifies the plugins so that the initialization of KernelTy objects is done in the init method. Previously, part of the initialization was done in the constructKernelEntry method. That method is now named constructKernel and only allocates and constructs a KernelTy object. This patch prepares the kernel class for the new implementation of device reductions. Differential Revision: https://reviews.llvm.org/D156917
llvm · Aug 6, 2023 · b8e297d · b8e297d
1 parent 16f6f19
commit b8e297d
Show file tree

Hide file tree

Showing 5 changed files with 126 additions and 114 deletions.
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1899,20 +1899,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   uint64_t getClockFrequency() const override { return ClockFrequency; }
 
   /// Allocate and construct an AMDGPU kernel.
-  Expected<GenericKernelTy *>
-  constructKernelEntry(const __tgt_offload_entry &KernelEntry,
-                       DeviceImageTy &Image) override {
+  Expected<GenericKernelTy &>
+  constructKernel(const __tgt_offload_entry &KernelEntry,
+                  OMPTgtExecModeFlags ExecMode) override {
+    // Allocate and construct the AMDGPU kernel.
+    AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>();
+    if (!AMDGPUKernel)
+      return Plugin::error("Failed to allocate memory for AMDGPU kernel");
 
-    Expected<OMPTgtExecModeFlags> ExecModeOrErr =
-        getExecutionModeForKernel(KernelEntry.name, Image);
-    if (!ExecModeOrErr)
-      return ExecModeOrErr.takeError();
+    new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name, ExecMode);
 
-    // Allocate and initialize the AMDGPU kernel.
-    AMDGPUKernelTy *AMDKernel = Plugin::get().allocate<AMDGPUKernelTy>();
-    new (AMDKernel) AMDGPUKernelTy(KernelEntry.name, ExecModeOrErr.get());
-
-    return AMDKernel;
+    return *AMDGPUKernel;
   }
 
   /// Set the current context to this device's context. Do nothing since the

diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -746,21 +746,25 @@ Error GenericDeviceTy::registerKernelOffloadEntry(
     __tgt_offload_entry &DeviceEntry) {
   DeviceEntry = KernelEntry;
 
+  // Retrieve the execution mode.
+  auto ExecModeOrErr = getExecutionModeForKernel(KernelEntry.name, Image);
+  if (!ExecModeOrErr)
+    return ExecModeOrErr.takeError();
+
   // Create a kernel object.
-  auto KernelOrErr = constructKernelEntry(KernelEntry, Image);
+  auto KernelOrErr = constructKernel(KernelEntry, *ExecModeOrErr);
   if (!KernelOrErr)
     return KernelOrErr.takeError();
 
-  GenericKernelTy *Kernel = *KernelOrErr;
-  assert(Kernel != nullptr && "Invalid kernel");
+  GenericKernelTy &Kernel = *KernelOrErr;
 
   // Initialize the kernel.
-  if (auto Err = Kernel->init(*this, Image))
+  if (auto Err = Kernel.init(*this, Image))
     return Err;
 
   // Set the device entry address to the kernel address and store the entry on
   // the entry table.
-  DeviceEntry.addr = (void *)Kernel;
+  DeviceEntry.addr = (void *)&Kernel;
   Image.getOffloadEntryTable().addEntry(DeviceEntry);
 
   return Plugin::success();

diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -794,9 +794,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                                    __tgt_offload_entry &DeviceEntry);
 
   /// Allocate and construct a kernel object.
-  virtual Expected<GenericKernelTy *>
-  constructKernelEntry(const __tgt_offload_entry &KernelEntry,
-                       DeviceImageTy &Image) = 0;
+  virtual Expected<GenericKernelTy &>
+  constructKernel(const __tgt_offload_entry &KernelEntry,
+                  OMPTgtExecModeFlags ExecMode) = 0;
 
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
@@ -837,8 +837,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
 protected:
   /// Return the execution mode used for kernel \p Name.
-  Expected<OMPTgtExecModeFlags> getExecutionModeForKernel(StringRef Name,
-                                                          DeviceImageTy &Image);
+  virtual Expected<OMPTgtExecModeFlags>
+  getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image);
 
   /// Environment variables defined by the LLVM OpenMP implementation
   /// regarding the initial number of streams and events.

diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -37,31 +37,80 @@ struct CUDAKernelTy;
 struct CUDADeviceTy;
 struct CUDAPluginTy;
 
+/// Class implementing the CUDA device images properties.
+struct CUDADeviceImageTy : public DeviceImageTy {
+  /// Create the CUDA image with the id and the target image pointer.
+  CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
+      : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {}
+
+  /// Load the image as a CUDA module.
+  Error loadModule() {
+    assert(!Module && "Module already loaded");
+
+    CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
+    if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
+      return Err;
+
+    return Plugin::success();
+  }
+
+  /// Unload the CUDA module corresponding to the image.
+  Error unloadModule() {
+    assert(Module && "Module not loaded");
+
+    CUresult Res = cuModuleUnload(Module);
+    if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
+      return Err;
+
+    Module = nullptr;
+
+    return Plugin::success();
+  }
+
+  /// Getter of the CUDA module.
+  CUmodule getModule() const { return Module; }
+
+private:
+  /// The CUDA module that loaded the image.
+  CUmodule Module;
+};
+
 /// Class implementing the CUDA kernel functionalities which derives from the
 /// generic kernel class.
 struct CUDAKernelTy : public GenericKernelTy {
-  /// Create a CUDA kernel with a name, an execution mode, and the kernel
-  /// function.
-  CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode,
-               CUfunction Func)
-      : GenericKernelTy(Name, ExecutionMode), Func(Func) {}
+  /// Create a CUDA kernel with a name and an execution mode.
+  CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecMode)
+      : GenericKernelTy(Name, ExecMode), Func(nullptr) {}
 
-  /// Initialize the CUDA kernel
+  /// Initialize the CUDA kernel.
   Error initImpl(GenericDeviceTy &GenericDevice,
                  DeviceImageTy &Image) override {
+    CUresult Res;
+    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
+
+    // Retrieve the function pointer of the kernel.
+    Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
+    if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
+                                 getName()))
+      return Err;
+
+    // Check that the function pointer is valid.
+    if (!Func)
+      return Plugin::error("Invalid function for kernel %s", getName());
+
     int MaxThreads;
-    CUresult Res = cuFuncGetAttribute(
-        &MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
+    Res = cuFuncGetAttribute(&MaxThreads,
+                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
     if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
       return Err;
 
-    /// Set the maximum number of threads for the CUDA kernel.
+    // The maximum number of threads cannot exceed the maximum of the kernel.
     MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
 
     return Plugin::success();
   }
 
-  /// Launch the CUDA kernel function
+  /// Launch the CUDA kernel function.
   Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
                    uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
@@ -165,44 +214,6 @@ struct CUDAEventRef final : public GenericDeviceResourceRef {
   HandleTy Event;
 };
 
-/// Class implementing the CUDA device images properties.
-struct CUDADeviceImageTy : public DeviceImageTy {
-  /// Create the CUDA image with the id and the target image pointer.
-  CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {}
-
-  /// Load the image as a CUDA module.
-  Error loadModule() {
-    assert(!Module && "Module already loaded");
-
-    CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
-    if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
-      return Err;
-
-    return Plugin::success();
-  }
-
-  /// Unload the CUDA module corresponding to the image.
-  Error unloadModule() {
-    assert(Module && "Module not loaded");
-
-    CUresult Res = cuModuleUnload(Module);
-    if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
-      return Err;
-
-    Module = nullptr;
-
-    return Plugin::success();
-  }
-
-  /// Getter of the CUDA module.
-  CUmodule getModule() const { return Module; }
-
-private:
-  /// The CUDA module that loaded the image.
-  CUmodule Module;
-};
-
 /// Class implementing the CUDA device functionalities which derives from the
 /// generic device class.
 struct CUDADeviceTy : public GenericDeviceTy {
@@ -330,32 +341,17 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Allocate and construct a CUDA kernel.
-  Expected<GenericKernelTy *>
-  constructKernelEntry(const __tgt_offload_entry &KernelEntry,
-                       DeviceImageTy &Image) override {
-    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
-
-    // Retrieve the function pointer of the kernel.
-    CUfunction Func;
-    CUresult Res =
-        cuModuleGetFunction(&Func, CUDAImage.getModule(), KernelEntry.name);
-    if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
-                                 KernelEntry.name))
-      return std::move(Err);
-
-    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", DPxPTR(&KernelEntry),
-       KernelEntry.name, DPxPTR(Func));
-
-    Expected<OMPTgtExecModeFlags> ExecModeOrErr =
-        getExecutionModeForKernel(KernelEntry.name, Image);
-    if (!ExecModeOrErr)
-      return ExecModeOrErr.takeError();
-
-    // Allocate and initialize the CUDA kernel.
+  Expected<GenericKernelTy &>
+  constructKernel(const __tgt_offload_entry &KernelEntry,
+                  OMPTgtExecModeFlags ExecMode) override {
+    // Allocate and construct the CUDA kernel.
     CUDAKernelTy *CUDAKernel = Plugin::get().allocate<CUDAKernelTy>();
-    new (CUDAKernel) CUDAKernelTy(KernelEntry.name, ExecModeOrErr.get(), Func);
+    if (!CUDAKernel)
+      return Plugin::error("Failed to allocate memory for CUDA kernel");
+
+    new (CUDAKernel) CUDAKernelTy(KernelEntry.name, ExecMode);
 
-    return CUDAKernel;
+    return *CUDAKernel;
   }
 
   /// Set the current context to this device's context.

diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -49,14 +49,27 @@ using llvm::sys::DynamicLibrary;
 
 /// Class implementing kernel functionalities for GenELF64.
 struct GenELF64KernelTy : public GenericKernelTy {
-  /// Construct the kernel with a name, execution mode and a function.
-  GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode,
-                   void (*Func)(void))
-      : GenericKernelTy(Name, ExecutionMode), Func(Func) {}
+  /// Construct the kernel with a name and an execution mode.
+  GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecMode)
+      : GenericKernelTy(Name, ExecMode), Func(nullptr) {}
 
   /// Initialize the kernel.
-  Error initImpl(GenericDeviceTy &GenericDevice,
-                 DeviceImageTy &Image) override {
+  Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
+    // Functions have zero size.
+    GlobalTy Global(getName(), 0);
+
+    // Get the metadata (address) of the kernel function.
+    GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
+    if (auto Err = GHandler.getGlobalMetadataFromDevice(Device, Image, Global))
+      return Err;
+
+    // Check that the function pointer is valid.
+    if (!Global.getPtr())
+      return Plugin::error("Invalid function for kernel %s", getName());
+
+    // Save the function pointer.
+    Func = (void (*)())Global.getPtr();
+
     // Set the maximum number of threads to a single.
     MaxNumThreads = 1;
     return Plugin::success();
@@ -119,23 +132,18 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   Error deinitImpl() override { return Plugin::success(); }
 
   /// Construct the kernel for a specific image on the device.
-  Expected<GenericKernelTy *>
-  constructKernelEntry(const __tgt_offload_entry &KernelEntry,
-                       DeviceImageTy &Image) override {
-    GlobalTy Func(KernelEntry);
-
-    // Get the metadata (address) of the kernel function.
-    GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
-    if (auto Err = GHandler.getGlobalMetadataFromDevice(*this, Image, Func))
-      return std::move(Err);
-
-    // Allocate and create the kernel.
+  Expected<GenericKernelTy &>
+  constructKernel(const __tgt_offload_entry &KernelEntry,
+                  OMPTgtExecModeFlags ExecMode) override {
+    // Allocate and construct the kernel.
     GenELF64KernelTy *GenELF64Kernel =
         Plugin::get().allocate<GenELF64KernelTy>();
-    new (GenELF64Kernel) GenELF64KernelTy(
-        KernelEntry.name, OMP_TGT_EXEC_MODE_GENERIC, (void (*)())Func.getPtr());
+    if (!GenELF64Kernel)
+      return Plugin::error("Failed to allocate memory for GenELF64 kernel");
 
-    return GenELF64Kernel;
+    new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name, ExecMode);
+
+    return *GenELF64Kernel;
   }
 
   /// Set the current context to this device, which is a no-op.
@@ -312,6 +320,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   }
   Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
 
+protected:
+  /// Retrieve the execution mode for kernels. All kernels use the generic mode.
+  Expected<OMPTgtExecModeFlags>
+  getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image) override {
+    return OMP_TGT_EXEC_MODE_GENERIC;
+  }
+
 private:
   /// Grid values for Generic ELF64 plugins.
   static constexpr GV GenELF64GridValues = {