[OpenMP] Associate the KernelEnvironment with the GenericKernelTy (#7…

…0383) By associating the kernel environment with the generic kernel we can access middle-end information easily, including the launch bounds ranges that are acceptable. By constraining the number of threads accordingly, we now obey the user-provided bounds that were passed via attributes.
llvm · Oct 29, 2023 · d346c82 · d346c82
1 parent d8f5a18
commit d346c82
Show file tree

Hide file tree

Showing 10 changed files with 74 additions and 105 deletions.
diff --git a/clang/test/OpenMP/bug57757.cpp b/clang/test/OpenMP/bug57757.cpp
@@ -32,24 +32,23 @@ void foo() {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP1]], i64 0, i32 2
 // CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]])
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]])
-// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[TBAA18:![0-9]+]], !alias.scope !13, !noalias !16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[TBAA16:![0-9]+]], !alias.scope !13, !noalias !17
 // CHECK-NEXT:    switch i32 [[TMP3]], label [[DOTOMP_OUTLINED__EXIT:%.*]] [
 // CHECK-NEXT:    i32 0, label [[DOTUNTIED_JMP__I:%.*]]
 // CHECK-NEXT:    i32 1, label [[DOTUNTIED_NEXT__I:%.*]]
 // CHECK-NEXT:    ]
 // CHECK:       .untied.jmp..i:
-// CHECK-NEXT:    store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA18]], !alias.scope !13, !noalias !16
-// CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias !19
+// CHECK-NEXT:    store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA16]], !alias.scope !13, !noalias !17
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias !13
 // CHECK-NEXT:    br label [[DOTOMP_OUTLINED__EXIT]]
 // CHECK:       .untied.next..i:
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i64 0, i32 1
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 2
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 1
-// CHECK-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20:![0-9]+]], !alias.scope !16, !noalias !13
-// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA18]], !alias.scope !16, !noalias !13
-// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA21:![0-9]+]], !alias.scope !16, !noalias !13
-// CHECK-NEXT:    tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef [[TMP10]]) #[[ATTR2:[0-9]+]], !noalias !19
+// CHECK-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA19:![0-9]+]], !noalias !13
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA16]], !noalias !13
+// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA20:![0-9]+]], !noalias !13
+// CHECK-NEXT:    tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef [[TMP10]]) #[[ATTR2:[0-9]+]], !noalias !13
 // CHECK-NEXT:    br label [[DOTOMP_OUTLINED__EXIT]]
 // CHECK:       .omp_outlined..exit:
 // CHECK-NEXT:    ret i32 0

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4093,8 +4093,8 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
 
   Function *Kernel = Builder.GetInsertBlock()->getParent();
 
-  /// Manifest the launch configuration in the metadata matching the kernel
-  /// environment.
+  // Manifest the launch configuration in the metadata matching the kernel
+  // environment.
   if (MinTeamsVal > 1 || MaxTeamsVal > 0)
     writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
 

diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -411,8 +411,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
 /// generic kernel class.
 struct AMDGPUKernelTy : public GenericKernelTy {
   /// Create an AMDGPU kernel with a name and an execution mode.
-  AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-      : GenericKernelTy(Name, ExecutionMode) {}
+  AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}
 
   /// Initialize the AMDGPU kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -1978,14 +1977,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
   /// Allocate and construct an AMDGPU kernel.
   Expected<GenericKernelTy &>
-  constructKernel(const __tgt_offload_entry &KernelEntry,
-                  OMPTgtExecModeFlags ExecMode) override {
+  constructKernel(const __tgt_offload_entry &KernelEntry) override {
     // Allocate and construct the AMDGPU kernel.
     AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>();
     if (!AMDGPUKernel)
       return Plugin::error("Failed to allocate memory for AMDGPU kernel");
 
-    new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name, ExecMode);
+    new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name);
 
     return *AMDGPUKernel;
   }

diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -339,9 +339,33 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
 
   ImagePtr = &Image;
 
-  PreferredNumThreads = GenericDevice.getDefaultNumThreads();
+  // Retrieve kernel environment object for the kernel.
+  GlobalTy KernelEnv(std::string(Name) + "_kernel_environment",
+                     sizeof(KernelEnvironment), &KernelEnvironment);
+  GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
+  if (auto Err =
+          GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) {
+    [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+    DP("Failed to read kernel environment for '%s': %s\n"
+       "Using default SPMD (2) execution mode\n",
+       Name, ErrStr.data());
+    KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_SPMD;
+    KernelEnvironment.Configuration.MayUseNestedParallelism = /*Unknown=*/2;
+    KernelEnvironment.Configuration.UseGenericStateMachine = /*Unknown=*/2;
+  }
 
-  MaxNumThreads = GenericDevice.getThreadLimit();
+  // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
+  MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0
+                      ? std::min(KernelEnvironment.Configuration.MaxThreads,
+                                 int32_t(GenericDevice.getThreadLimit()))
+                      : GenericDevice.getThreadLimit();
+
+  // Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref;
+  PreferredNumThreads =
+      KernelEnvironment.Configuration.MinThreads > 0
+          ? std::max(KernelEnvironment.Configuration.MinThreads,
+                     int32_t(GenericDevice.getDefaultNumThreads()))
+          : GenericDevice.getDefaultNumThreads();
 
   return initImpl(GenericDevice, Image);
 }
@@ -890,13 +914,8 @@ Error GenericDeviceTy::registerKernelOffloadEntry(
     __tgt_offload_entry &DeviceEntry) {
   DeviceEntry = KernelEntry;
 
-  // Retrieve the execution mode.
-  auto ExecModeOrErr = getExecutionModeForKernel(KernelEntry.name, Image);
-  if (!ExecModeOrErr)
-    return ExecModeOrErr.takeError();
-
   // Create a kernel object.
-  auto KernelOrErr = constructKernel(KernelEntry, *ExecModeOrErr);
+  auto KernelOrErr = constructKernel(KernelEntry);
   if (!KernelOrErr)
     return KernelOrErr.takeError();
 
@@ -914,45 +933,6 @@ Error GenericDeviceTy::registerKernelOffloadEntry(
   return Plugin::success();
 }
 
-Expected<KernelEnvironmentTy>
-GenericDeviceTy::getKernelEnvironmentForKernel(StringRef Name,
-                                               DeviceImageTy &Image) {
-  // Create a metadata object for the kernel environment object.
-  StaticGlobalTy<KernelEnvironmentTy> KernelEnv(Name.data(),
-                                                "_kernel_environment");
-
-  // Retrieve kernel environment object for the kernel.
-  GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
-  if (auto Err = GHandler.readGlobalFromImage(*this, Image, KernelEnv))
-    return std::move(Err);
-
-  return KernelEnv.getValue();
-}
-
-Expected<OMPTgtExecModeFlags>
-GenericDeviceTy::getExecutionModeForKernel(StringRef Name,
-                                           DeviceImageTy &Image) {
-  auto KernelEnvOrError = getKernelEnvironmentForKernel(Name, Image);
-  if (!KernelEnvOrError) {
-    [[maybe_unused]] std::string ErrStr =
-        toString(KernelEnvOrError.takeError());
-    DP("Failed to read kernel environment for '%s': %s\n"
-       "Using default SPMD (2) execution mode\n",
-       Name.data(), ErrStr.data());
-    return OMP_TGT_EXEC_MODE_SPMD;
-  }
-
-  auto &KernelEnv = *KernelEnvOrError;
-  auto ExecMode = KernelEnv.Configuration.ExecMode;
-
-  // Check that the retrieved execution mode is valid.
-  if (!GenericKernelTy::isValidExecutionMode(ExecMode))
-    return Plugin::error("Invalid execution mode %d for '%s'", ExecMode,
-                         Name.data());
-
-  return ExecMode;
-}
-
 Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
                                          size_t Size, bool ExternallyLocked) {
   // Insert the new entry into the map.

diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -255,9 +255,8 @@ class DeviceImageTy {
 /// implement the necessary virtual function members.
 struct GenericKernelTy {
   /// Construct a kernel with a name and a execution mode.
-  GenericKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
-      : Name(Name), ExecutionMode(ExecutionMode), PreferredNumThreads(0),
-        MaxNumThreads(0) {}
+  GenericKernelTy(const char *Name)
+      : Name(Name), PreferredNumThreads(0), MaxNumThreads(0) {}
 
   virtual ~GenericKernelTy() {}
 
@@ -285,6 +284,11 @@ struct GenericKernelTy {
     return *ImagePtr;
   }
 
+  /// Return the kernel environment object for kernel \p Name.
+  const KernelEnvironmentTy &getKernelEnvironmentForKernel() {
+    return KernelEnvironment;
+  }
+
   /// Indicate whether an execution mode is valid.
   static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
     switch (ExecutionMode) {
@@ -299,7 +303,7 @@ struct GenericKernelTy {
 protected:
   /// Get the execution mode name of the kernel.
   const char *getExecutionModeName() const {
-    switch (ExecutionMode) {
+    switch (KernelEnvironment.Configuration.ExecMode) {
     case OMP_TGT_EXEC_MODE_SPMD:
       return "SPMD";
     case OMP_TGT_EXEC_MODE_GENERIC:
@@ -343,19 +347,20 @@ struct GenericKernelTy {
 
   /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
   bool isGenericSPMDMode() const {
-    return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+    return KernelEnvironment.Configuration.ExecMode ==
+           OMP_TGT_EXEC_MODE_GENERIC_SPMD;
   }
   bool isGenericMode() const {
-    return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC;
+    return KernelEnvironment.Configuration.ExecMode ==
+           OMP_TGT_EXEC_MODE_GENERIC;
+  }
+  bool isSPMDMode() const {
+    return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_SPMD;
   }
-  bool isSPMDMode() const { return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD; }
 
   /// The kernel name.
   const char *Name;
 
-  /// The execution flags of the kernel.
-  OMPTgtExecModeFlags ExecutionMode;
-
   /// The image that contains this kernel.
   DeviceImageTy *ImagePtr = nullptr;
 
@@ -365,6 +370,9 @@ struct GenericKernelTy {
 
   /// The maximum number of threads which the kernel could leverage.
   uint32_t MaxNumThreads;
+
+  /// The kernel environment, including execution flags.
+  KernelEnvironmentTy KernelEnvironment;
 };
 
 /// Class representing a map of host pinned allocations. We track these pinned
@@ -819,8 +827,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   /// Allocate and construct a kernel object.
   virtual Expected<GenericKernelTy &>
-  constructKernel(const __tgt_offload_entry &KernelEntry,
-                  OMPTgtExecModeFlags ExecMode) = 0;
+  constructKernel(const __tgt_offload_entry &KernelEntry) = 0;
 
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
@@ -864,10 +871,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
       UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
 
 protected:
-  /// Return the execution mode used for kernel \p Name.
-  virtual Expected<OMPTgtExecModeFlags>
-  getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image);
-
   /// Environment variables defined by the LLVM OpenMP implementation
   /// regarding the initial number of streams and events.
   UInt32Envar OMPX_InitialNumStreams;
@@ -916,10 +919,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 #endif
 
 private:
-  /// Return the kernel environment object for kernel \p Name.
-  Expected<KernelEnvironmentTy>
-  getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);
-
   DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
   DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
 };

diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -79,8 +79,7 @@ struct CUDADeviceImageTy : public DeviceImageTy {
 /// generic kernel class.
 struct CUDAKernelTy : public GenericKernelTy {
   /// Create a CUDA kernel with a name and an execution mode.
-  CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecMode)
-      : GenericKernelTy(Name, ExecMode), Func(nullptr) {}
+  CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}
 
   /// Initialize the CUDA kernel.
   Error initImpl(GenericDeviceTy &GenericDevice,
@@ -356,14 +355,13 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
   /// Allocate and construct a CUDA kernel.
   Expected<GenericKernelTy &>
-  constructKernel(const __tgt_offload_entry &KernelEntry,
-                  OMPTgtExecModeFlags ExecMode) override {
+  constructKernel(const __tgt_offload_entry &KernelEntry) override {
     // Allocate and construct the CUDA kernel.
     CUDAKernelTy *CUDAKernel = Plugin::get().allocate<CUDAKernelTy>();
     if (!CUDAKernel)
       return Plugin::error("Failed to allocate memory for CUDA kernel");
 
-    new (CUDAKernel) CUDAKernelTy(KernelEntry.name, ExecMode);
+    new (CUDAKernel) CUDAKernelTy(KernelEntry.name);
 
     return *CUDAKernel;
   }

diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -25,6 +25,7 @@
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/Support/DynamicLibrary.h"
 
@@ -51,8 +52,7 @@ using llvm::sys::DynamicLibrary;
 /// Class implementing kernel functionalities for GenELF64.
 struct GenELF64KernelTy : public GenericKernelTy {
   /// Construct the kernel with a name and an execution mode.
-  GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecMode)
-      : GenericKernelTy(Name, ExecMode), Func(nullptr) {}
+  GenELF64KernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}
 
   /// Initialize the kernel.
   Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -71,6 +71,10 @@ struct GenELF64KernelTy : public GenericKernelTy {
     // Save the function pointer.
     Func = (void (*)())Global.getPtr();
 
+    KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_GENERIC;
+    KernelEnvironment.Configuration.MayUseNestedParallelism = /* Unknown */ 2;
+    KernelEnvironment.Configuration.UseGenericStateMachine = /* Unknown */ 2;
+
     // Set the maximum number of threads to a single.
     MaxNumThreads = 1;
     return Plugin::success();
@@ -137,15 +141,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
 
   /// Construct the kernel for a specific image on the device.
   Expected<GenericKernelTy &>
-  constructKernel(const __tgt_offload_entry &KernelEntry,
-                  OMPTgtExecModeFlags ExecMode) override {
+  constructKernel(const __tgt_offload_entry &KernelEntry) override {
     // Allocate and construct the kernel.
     GenELF64KernelTy *GenELF64Kernel =
         Plugin::get().allocate<GenELF64KernelTy>();
     if (!GenELF64Kernel)
       return Plugin::error("Failed to allocate memory for GenELF64 kernel");
 
-    new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name, ExecMode);
+    new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name);
 
     return *GenELF64Kernel;
   }
@@ -325,13 +328,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   }
   Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
 
-protected:
-  /// Retrieve the execution mode for kernels. All kernels use the generic mode.
-  Expected<OMPTgtExecModeFlags>
-  getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image) override {
-    return OMP_TGT_EXEC_MODE_GENERIC;
-  }
-
 private:
   /// Grid values for Generic ELF64 plugins.
   static constexpr GV GenELF64GridValues = {

diff --git a/openmp/libomptarget/test/offloading/default_thread_limit.c b/openmp/libomptarget/test/offloading/default_thread_limit.c
@@ -48,8 +48,7 @@ int main() {
   for (int i = 0; i < N; ++i) {
     optnone();
   }
-// FIXME: Use the attribute value to imply a thread_limit
-// DEFAULT: {{(128|256)}} (MaxFlatWorkGroupSize: 42
+// DEFAULT: 42 (MaxFlatWorkGroupSize: 42
 #pragma omp target ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
 #pragma omp teams distribute parallel for
   for (int i = 0; i < N; ++i) {

diff --git a/openmp/libomptarget/test/offloading/thread_state_1.c b/openmp/libomptarget/test/offloading/thread_state_1.c
@@ -26,8 +26,8 @@ int main() {
       }
     }
   }
-  if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt > 1 && i_lvl == 2 && i_tid == 0 &&
-      i_nt == 1) {
+  if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt >= 1 && i_lvl == 2 &&
+      i_tid == 0 && i_nt == 1) {
     // CHECK: Success
     printf("Success\n");
     return 0;

diff --git a/openmp/libomptarget/test/offloading/thread_state_2.c b/openmp/libomptarget/test/offloading/thread_state_2.c
@@ -28,8 +28,8 @@ int main() {
       }
     }
   }
-  if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt > 1 && i_lvl == 2 && i_tid == 0 &&
-      i_nt == 1) {
+  if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt >= 1 && i_lvl == 2 &&
+      i_tid == 0 && i_nt == 1) {
     // CHECK: Success
     printf("Success\n");
     return 0;