[OpenMP] Pass min/max thread and team count to the OMPIRBuilder (#70247)
We now provide the information about the min/max thread and team count
to the OMPIRBuilder, regardless of the source. That means we unify the
handling of `thread_limit`, `num_teams`, and `num_threads` with the
target-specific attributes (`__launch_bounds__` and
`amdgpu_flat_work_group_size`). This is in preparation for passing the
values to the runtime, and for allowing the middle end (OpenMP-opt) to
tighten the values where appropriate. There is no "real" change
introduced by this commit.
jdoerfert committed Oct 26, 2023
1 parent 57cebc7 commit 0ba57c8
Showing 12 changed files with 1,322 additions and 1,165 deletions.
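
To make the unification concrete, here is a minimal OpenMP C++ sketch in the spirit of the test updated below (clang/test/OpenMP/ompx_attributes_codegen.cpp). Only the AMD pragma appears verbatim in the visible hunks; the CUDA spelling is an assumption.

    // Sketch: target-specific launch bounds attached via ompx_attribute now feed
    // the OMPIRBuilder as min/max thread and team counts, alongside the
    // thread_limit / num_teams / num_threads clauses.
    void func() {
      // AMD: threads per team bounded to the range [10, 20].
    #pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
      {}

      // CUDA (assumed spelling): at most 45 threads per team, at least 90 teams.
    #pragma omp target ompx_attribute(__attribute__((launch_bounds(45, 90))))
      {}
    }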
81 changes: 54 additions & 27 deletions clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -6021,15 +6021,46 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
};

// Get NumTeams and ThreadLimit attributes
int32_t DefaultValTeams = -1;
uint32_t DefaultValThreads = UINT32_MAX;
getNumTeamsExprForTargetDirective(CGF, D, DefaultValTeams);
getNumThreadsExprForTargetDirective(CGF, D, DefaultValThreads,
int32_t DefaultValMinTeams = 1;
int32_t DefaultValMaxTeams = -1;
uint32_t DefaultValMinThreads = 1;
uint32_t DefaultValMaxThreads = UINT32_MAX;

getNumTeamsExprForTargetDirective(CGF, D, DefaultValMinTeams,
DefaultValMaxTeams);
getNumThreadsExprForTargetDirective(CGF, D, DefaultValMaxThreads,
/*UpperBoundOnly=*/true);

OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction,
DefaultValTeams, DefaultValThreads,
IsOffloadEntry, OutlinedFn, OutlinedFnID);
for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
for (auto *A : C->getAttrs()) {
int32_t MinThreadsVal = 1, MaxThreadsVal = 0;
int32_t MinBlocksVal = 1, MaxBlocksVal = -1;
if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &MaxThreadsVal,
&MinBlocksVal, &MaxBlocksVal);
else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
CGM.handleAMDGPUFlatWorkGroupSizeAttr(
nullptr, Attr, /*ReqdWGS=*/nullptr, &MinThreadsVal, &MaxThreadsVal);
else
continue;

DefaultValMinThreads =
std::max(DefaultValMinThreads, uint32_t(MinThreadsVal));
DefaultValMaxThreads =
DefaultValMaxThreads
? std::min(DefaultValMaxThreads, uint32_t(MaxThreadsVal))
: MaxThreadsVal;
DefaultValMinTeams = DefaultValMinTeams
? std::max(DefaultValMinTeams, MinBlocksVal)
: MinBlocksVal;
DefaultValMaxTeams = std::min(DefaultValMaxTeams, MaxBlocksVal);
}
}

OMPBuilder.emitTargetRegionFunction(
EntryInfo, GenerateOutlinedFunction, DefaultValMinTeams,
DefaultValMaxTeams, DefaultValMinThreads, DefaultValMaxThreads,
IsOffloadEntry, OutlinedFn, OutlinedFnID);

if (!OutlinedFn)
return;
@@ -6038,14 +6069,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(

for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
for (auto *A : C->getAttrs()) {
if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
CGM.handleCUDALaunchBoundsAttr(OutlinedFn, Attr);
else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
CGM.handleAMDGPUFlatWorkGroupSizeAttr(OutlinedFn, Attr);
else if (auto *Attr = dyn_cast<AMDGPUWavesPerEUAttr>(A))
if (auto *Attr = dyn_cast<AMDGPUWavesPerEUAttr>(A))
CGM.handleAMDGPUWavesPerEUAttr(OutlinedFn, Attr);
else
llvm_unreachable("Unexpected attribute kind");
}
}
}
@@ -6103,8 +6128,8 @@ const Stmt *CGOpenMPRuntime::getSingleCompoundChild(ASTContext &Ctx,
}

const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
CodeGenFunction &CGF, const OMPExecutableDirective &D,
int32_t &DefaultVal) {
CodeGenFunction &CGF, const OMPExecutableDirective &D, int32_t &MinTeamsVal,
int32_t &MaxTeamsVal) {

OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
assert(isOpenMPTargetExecutionDirective(DirectiveKind) &&
@@ -6125,22 +6150,22 @@ const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
if (NumTeams->isIntegerConstantExpr(CGF.getContext()))
if (auto Constant =
NumTeams->getIntegerConstantExpr(CGF.getContext()))
DefaultVal = Constant->getExtValue();
MinTeamsVal = MaxTeamsVal = Constant->getExtValue();
return NumTeams;
}
DefaultVal = 0;
MinTeamsVal = MaxTeamsVal = 0;
return nullptr;
}
if (isOpenMPParallelDirective(NestedDir->getDirectiveKind()) ||
isOpenMPSimdDirective(NestedDir->getDirectiveKind())) {
DefaultVal = 1;
MinTeamsVal = MaxTeamsVal = 1;
return nullptr;
}
DefaultVal = 1;
MinTeamsVal = MaxTeamsVal = 1;
return nullptr;
}
// A value of -1 is used to check if we need to emit no teams region
DefaultVal = -1;
MinTeamsVal = MaxTeamsVal = -1;
return nullptr;
}
case OMPD_target_teams_loop:
@@ -6154,18 +6179,18 @@ const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
D.getSingleClause<OMPNumTeamsClause>()->getNumTeams();
if (NumTeams->isIntegerConstantExpr(CGF.getContext()))
if (auto Constant = NumTeams->getIntegerConstantExpr(CGF.getContext()))
DefaultVal = Constant->getExtValue();
MinTeamsVal = MaxTeamsVal = Constant->getExtValue();
return NumTeams;
}
DefaultVal = 0;
MinTeamsVal = MaxTeamsVal = 0;
return nullptr;
}
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_parallel_loop:
case OMPD_target_simd:
DefaultVal = 1;
MinTeamsVal = MaxTeamsVal = 1;
return nullptr;
case OMPD_parallel:
case OMPD_for:
@@ -6240,8 +6265,9 @@ llvm::Value *CGOpenMPRuntime::emitNumTeamsForTargetDirective(
"Clauses associated with the teams directive expected to be emitted "
"only for the host!");
CGBuilderTy &Bld = CGF.Builder;
int32_t DefaultNT = -1;
const Expr *NumTeams = getNumTeamsExprForTargetDirective(CGF, D, DefaultNT);
int32_t MinNT = -1, MaxNT = -1;
const Expr *NumTeams =
getNumTeamsExprForTargetDirective(CGF, D, MinNT, MaxNT);
if (NumTeams != nullptr) {
OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();

@@ -6271,7 +6297,8 @@ llvm::Value *CGOpenMPRuntime::emitNumTeamsForTargetDirective(
}
}

return llvm::ConstantInt::get(CGF.Int32Ty, DefaultNT);
assert(MinNT == MaxNT && "Num threads ranges require handling here.");
return llvm::ConstantInt::get(CGF.Int32Ty, MinNT);
}

/// Check for a num threads constant value (stored in \p DefaultVal), or
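
As a reading aid for the CGOpenMPRuntime.cpp hunks above, the following standalone sketch captures the merging rule the new code applies when folding attribute-derived bounds into the clause-derived defaults. The helper name is hypothetical and not part of the patch; treating 0 as "no upper bound yet" mirrors the hunk.

    #include <algorithm>
    #include <cstdint>

    // Hypothetical helper mirroring the hunk above: lower bounds only ever
    // grow, upper bounds only ever shrink, and 0 means "no upper bound
    // recorded yet".
    static void mergeThreadBounds(uint32_t &MinThreads, uint32_t &MaxThreads,
                                  int32_t AttrMinThreads, int32_t AttrMaxThreads) {
      MinThreads = std::max(MinThreads, uint32_t(AttrMinThreads));
      MaxThreads = MaxThreads ? std::min(MaxThreads, uint32_t(AttrMaxThreads))
                              : uint32_t(AttrMaxThreads);
    }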
3 changes: 2 additions & 1 deletion clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -637,7 +637,8 @@ class CGOpenMPRuntime {
/// Otherwise, return nullptr.
const Expr *getNumTeamsExprForTargetDirective(CodeGenFunction &CGF,
const OMPExecutableDirective &D,
int32_t &DefaultVal);
int32_t &MinTeamsVal,
int32_t &MaxTeamsVal);
llvm::Value *emitNumTeamsForTargetDirective(CodeGenFunction &CGF,
const OMPExecutableDirective &D);

14 changes: 11 additions & 3 deletions clang/lib/CodeGen/CodeGenModule.h
@@ -1543,15 +1543,23 @@ class CodeGenModule : public CodeGenTypeCache {
void moveLazyEmissionStates(CodeGenModule *NewBuilder);

/// Emit the IR encoding to attach the CUDA launch bounds attribute to \p F.
/// If \p MaxThreadsVal is not nullptr, the max threads value is stored in it,
/// if a valid one was found.
void handleCUDALaunchBoundsAttr(llvm::Function *F,
const CUDALaunchBoundsAttr *A);
const CUDALaunchBoundsAttr *A,
int32_t *MaxThreadsVal = nullptr,
int32_t *MinBlocksVal = nullptr,
int32_t *MaxClusterRankVal = nullptr);

/// Emit the IR encoding to attach the AMD GPU flat-work-group-size attribute
/// to \p F. Alternatively, the work group size can be taken from a \p
/// ReqdWGS.
/// ReqdWGS. If \p MinThreadsVal is not nullptr, the min threads value is
/// stored in it, if a valid one was found. If \p MaxThreadsVal is not
/// nullptr, the max threads value is stored in it, if a valid one was found.
void handleAMDGPUFlatWorkGroupSizeAttr(
llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *A,
const ReqdWorkGroupSizeAttr *ReqdWGS = nullptr);
const ReqdWorkGroupSizeAttr *ReqdWGS = nullptr,
int32_t *MinThreadsVal = nullptr, int32_t *MaxThreadsVal = nullptr);

/// Emit the IR encoding to attach the AMD GPU waves-per-eu attribute to \p F.
void handleAMDGPUWavesPerEUAttr(llvm::Function *F,
10 changes: 8 additions & 2 deletions clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -594,7 +594,8 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
const ReqdWorkGroupSizeAttr *ReqdWGS) {
const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
int32_t *MaxThreadsVal) {
unsigned Min = 0;
unsigned Max = 0;
if (FlatWGS) {
@@ -607,8 +608,13 @@ void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
if (Min != 0) {
assert(Min <= Max && "Min must be less than or equal Max");

if (MinThreadsVal)
*MinThreadsVal = Min;
if (MaxThreadsVal)
*MaxThreadsVal = Max;
std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
if (F)
F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
} else
assert(Max == 0 && "Max must be zero");
}
45 changes: 32 additions & 13 deletions clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -287,33 +287,52 @@ bool NVPTXTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
}
}

void CodeGenModule::handleCUDALaunchBoundsAttr(
llvm::Function *F, const CUDALaunchBoundsAttr *Attr) {
void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
const CUDALaunchBoundsAttr *Attr,
int32_t *MaxThreadsVal,
int32_t *MinBlocksVal,
int32_t *MaxClusterRankVal) {
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
llvm::APSInt MaxThreads(32);
MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
if (MaxThreads > 0)
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
MaxThreads.getExtValue());
if (MaxThreads > 0) {
if (MaxThreadsVal)
*MaxThreadsVal = MaxThreads.getExtValue();
if (F) {
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
MaxThreads.getExtValue());
}
}

// min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
// was not specified in __launch_bounds__ or if the user specified a 0 value,
// we don't have to add a PTX directive.
if (Attr->getMinBlocks()) {
llvm::APSInt MinBlocks(32);
MinBlocks = Attr->getMinBlocks()->EvaluateKnownConstInt(getContext());
if (MinBlocks > 0)
// Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm",
MinBlocks.getExtValue());
if (MinBlocks > 0) {
if (MinBlocksVal)
*MinBlocksVal = MinBlocks.getExtValue();
if (F) {
// Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm",
MinBlocks.getExtValue());
}
}
}
if (Attr->getMaxBlocks()) {
llvm::APSInt MaxBlocks(32);
MaxBlocks = Attr->getMaxBlocks()->EvaluateKnownConstInt(getContext());
if (MaxBlocks > 0)
// Create !{<func-ref>, metadata !"maxclusterrank", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank",
MaxBlocks.getExtValue());
if (MaxBlocks > 0) {
if (MaxClusterRankVal)
*MaxClusterRankVal = MaxBlocks.getExtValue();
if (F) {
// Create !{<func-ref>, metadata !"maxclusterrank", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank",
MaxBlocks.getExtValue());
}
}
}
}

34 changes: 22 additions & 12 deletions clang/test/OpenMP/ompx_attributes_codegen.cpp
@@ -1,16 +1,17 @@
// REQUIRES: amdgpu-registered-target

// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA
// expected-no-diagnostics


// Check that the target attributes are set on the generated kernel
void func() {
// CHECK: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l15() #0
// CHECK: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l17()
// CHECK: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l19() #4
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l16() #0
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18()
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20() #4

#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
{}
@@ -21,11 +22,20 @@ void func() {
{}
}

// CHECK: attributes #0
// CHECK-SAME: "amdgpu-flat-work-group-size"="10,20"
// CHECK: attributes #4
// CHECK-SAME: "amdgpu-flat-work-group-size"="3,17"
// CHECK-SAME: "amdgpu-waves-per-eu"="3,7"
// AMD: attributes #0
// AMD-SAME: "amdgpu-flat-work-group-size"="10,20"
// AMD-SAME: "omp_target_thread_limit"="20"
// AMD: "omp_target_thread_limit"="45"
// AMD: attributes #4
// AMD-SAME: "amdgpu-flat-work-group-size"="3,17"
// AMD-SAME: "amdgpu-waves-per-eu"="3,7"
// AMD-SAME: "omp_target_thread_limit"="17"

// CHECK: !{ptr @__omp_offloading[[HASH]]_l17, !"maxntidx", i32 45}
// CHECK: !{ptr @__omp_offloading[[HASH]]_l17, !"minctasm", i32 90}
// It is unclear if we should use the AMD annotations for other targets, we do for now.
// NVIDIA: "omp_target_thread_limit"="20"
// NVIDIA: "omp_target_thread_limit"="45"
// NVIDIA: "omp_target_thread_limit"="17"
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l16, !"maxntidx", i32 20}
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l18, !"minctasm", i32 90}
// NVIDIA: !{ptr @__omp_offloading[[HASH2]]_l18, !"maxntidx", i32 45}
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l20, !"maxntidx", i32 17}
