AMDGPU: Invert ABI attribute handling
Previously we assumed all callable functions did not need any
implicitly passed inputs, and added attributes to functions to
indicate when they were necessary. Requiring attributes for
correctness is pretty ugly, and it makes supporting indirect and
external calls more complicated.

This inverts the direction of the attributes, so an undecorated
function is assumed to need all implicit inputs. This enables
AMDGPUAttributor by default to mark when functions are proven not to
need a given input, and it strips the equivalent functionality from
the legacy AMDGPUAnnotateKernelFeatures pass.
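
For illustration, a minimal sketch of what the inverted, attributor-style marking looks like. The negated attribute name "amdgpu-no-workgroup-id-y" and the helper below are assumptions for this example, not taken from this commit, and a real implementation would also have to account for callees and address-taken functions:

#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

// Sketch only. Under the old scheme, a pass had to prove a function *does*
// use an implicit input and add e.g. "amdgpu-work-group-id-y" for
// correctness. Under the inverted scheme, an undecorated function
// conservatively receives every implicit input, and an attribute
// (hypothetical name "amdgpu-no-workgroup-id-y") is only added once the
// input is proven unused, so a missed attribute costs performance, not
// correctness.
static void markUnusedWorkGroupIDY(llvm::Function &F) {
  for (const llvm::Instruction &I : llvm::instructions(F)) {
    if (const auto *II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
      if (II->getIntrinsicID() == llvm::Intrinsic::amdgcn_workgroup_id_y)
        return; // Input is used; leave the function undecorated.
    }
  }
  // NOTE: this ignores calls entirely; a real analysis must also look
  // through callees before concluding the input is unused.
  F.addFnAttr("amdgpu-no-workgroup-id-y");
}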

However, AMDGPUAnnotateKernelFeatures is not fully removed at this
point, although it should be in the future. It is still necessary for
the two hacky amdgpu-calls and amdgpu-stack-objects attributes, which
would be better served by a trivial analysis of the IR during
selection. Additionally, AMDGPUAnnotateKernelFeatures still redundantly
handles the uniform-work-group-size attribute; that handling will be
removed in a future commit.
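
As a rough idea of the trivial analysis alluded to above (a sketch only; the struct and helper names are made up, and it assumes the two attributes carry no more information than "has a call" and "has a stack object"):

#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"

// Sketch: recompute what "amdgpu-calls" and "amdgpu-stack-objects" encode by
// scanning the IR directly at selection time instead of relying on attributes
// added by an earlier pass.
struct CallsAndStackInfo {
  bool HasCalls = false;        // roughly what "amdgpu-calls" records
  bool HasStackObjects = false; // roughly what "amdgpu-stack-objects" records
};

static CallsAndStackInfo scanFunction(const llvm::Function &F) {
  CallsAndStackInfo Info;
  for (const llvm::Instruction &I : llvm::instructions(F)) {
    Info.HasCalls |= llvm::isa<llvm::CallBase>(I);
    Info.HasStackObjects |= llvm::isa<llvm::AllocaInst>(I);
  }
  return Info;
}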

At this point, when not using -amdgpu-fixed-function-abi, we still
modify the ABI based on these newly negated attributes. In the
future, this option will be removed and the locations for implicit
inputs will always be fixed. We will then use the new attributes only
to avoid passing the values when they are unnecessary.
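
Under that eventual fixed ABI, the lowering side would only need a query like the following to decide whether a caller must actually initialize a given implicit input register (again a sketch; the attribute name is assumed, and absence of the attribute gives the conservative "needed" answer):

#include "llvm/IR/Function.h"

// Sketch: with a fixed function ABI the register slot for an implicit input
// never moves, so the negated attribute only controls whether the caller can
// skip materializing the value. No attribute means "assume it is needed".
static bool calleeNeedsWorkGroupIDY(const llvm::Function &Callee) {
  return !Callee.hasFnAttribute("amdgpu-no-workgroup-id-y"); // assumed name
}
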
arsenm committed Sep 9, 2021
1 parent bfa2a81 commit 722b8e0
Showing 23 changed files with 959 additions and 831 deletions.
234 changes: 6 additions & 228 deletions llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -6,8 +6,9 @@
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
/// \file This pass propagates the uniform-work-group-size attribute from
/// kernels to leaf functions when possible. It also adds additional attributes
/// to hint ABI lowering optimizations later.
//
//===----------------------------------------------------------------------===//

@@ -25,22 +26,14 @@
using namespace llvm;

namespace {
static constexpr StringLiteral ImplicitAttrNames[] = {
// X ids unnecessarily propagated to kernels.
"amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
"amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
"amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
"amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
"amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"};

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
const TargetMachine *TM = nullptr;
SmallVector<CallGraphNode*, 8> NodeList;

bool addFeatureAttributes(Function &F);
bool processUniformWorkGroupAttribute();
bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
bool addFeatureAttributes(Function &F);

public:
static char ID;
@@ -58,12 +51,6 @@ class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
AU.setPreservesAll();
CallGraphSCCPass::getAnalysisUsage(AU);
}

static bool visitConstantExpr(const ConstantExpr *CE);
static bool visitConstantExprsRecursively(
const Constant *EntryC,
SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
bool HasApertureRegs);
};

} // end anonymous namespace
@@ -75,137 +62,6 @@ char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
"Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

static bool isDSAddress(const Constant *C) {
const GlobalValue *GV = dyn_cast<GlobalValue>(C);
if (!GV)
return false;
unsigned AS = GV->getAddressSpace();
return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
return castRequiresQueuePtr(SrcAS);
}

return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
const Constant *EntryC,
SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
bool IsFunc, bool HasApertureRegs) {

if (!ConstantExprVisited.insert(EntryC).second)
return false;

SmallVector<const Constant *, 16> Stack;
Stack.push_back(EntryC);

while (!Stack.empty()) {
const Constant *C = Stack.pop_back_val();

// We need to trap on DS globals in non-entry functions.
if (IsFunc && isDSAddress(C))
return true;

// Check this constant expression.
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
if (!HasApertureRegs && visitConstantExpr(CE))
return true;
}

// Visit all sub-expressions.
for (const Use &U : C->operands()) {
const auto *OpC = dyn_cast<Constant>(U);
if (!OpC)
continue;

if (!ConstantExprVisited.insert(OpC).second)
continue;

Stack.push_back(OpC);
}
}

return false;
}

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
bool &NonKernelOnly,
bool &IsQueuePtr) {
switch (ID) {
case Intrinsic::amdgcn_workitem_id_x:
NonKernelOnly = true;
return "amdgpu-work-item-id-x";
case Intrinsic::amdgcn_workgroup_id_x:
NonKernelOnly = true;
return "amdgpu-work-group-id-x";
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
return "amdgpu-work-item-id-y";
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
return "amdgpu-work-item-id-z";
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
return "amdgpu-work-group-id-y";
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
return "amdgpu-work-group-id-z";
case Intrinsic::amdgcn_dispatch_ptr:
return "amdgpu-dispatch-ptr";
case Intrinsic::amdgcn_dispatch_id:
return "amdgpu-dispatch-id";
case Intrinsic::amdgcn_implicitarg_ptr:
return "amdgpu-implicitarg-ptr";
case Intrinsic::amdgcn_queue_ptr:
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private:
// TODO: Does not require queue ptr on gfx9+
case Intrinsic::trap:
case Intrinsic::debugtrap:
IsQueuePtr = true;
return "amdgpu-queue-ptr";
default:
return "";
}
}

static bool handleAttr(Function &Parent, const Function &Callee,
StringRef Name) {
if (Callee.hasFnAttribute(Name)) {
Parent.addFnAttr(Name);
return true;
}
return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
bool &NeedQueuePtr) {
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;

for (StringRef AttrName : ImplicitAttrNames)
handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
bool Changed = false;

@@ -257,28 +113,10 @@ bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
}

bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
bool HasApertureRegs = ST.hasApertureRegs();
SmallPtrSet<const Constant *, 8> ConstantExprVisited;

bool HaveStackObjects = false;
bool Changed = false;
bool NeedQueuePtr = false;
bool HaveCall = false;
bool HasIndirectCall = false;
bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
CallingConv::ID CC = F.getCallingConv();
bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);

// If this function hasAddressTaken() = true
// then add all attributes corresponding to the implicit args.
if (CallingConvSupportsAllImplicits &&
F.hasAddressTaken(nullptr, true, true, true)) {
for (StringRef AttrName : ImplicitAttrNames) {
F.addFnAttr(AttrName);
}
Changed = true;
}

for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
@@ -293,59 +131,21 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {

// Note the occurrence of indirect call.
if (!Callee) {
if (!CB->isInlineAsm()) {
HasIndirectCall = true;
if (!CB->isInlineAsm())
HaveCall = true;
}

continue;
}

Intrinsic::ID IID = Callee->getIntrinsicID();
if (IID == Intrinsic::not_intrinsic) {
HaveCall = true;
copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
Changed = true;
} else {
bool NonKernelOnly = false;

StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
NeedQueuePtr);
if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
F.addFnAttr(AttrName);
Changed = true;
}
}
}

if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
continue;

if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
NeedQueuePtr = true;
continue;
}
}

for (const Use &U : I.operands()) {
const auto *OpC = dyn_cast<Constant>(U);
if (!OpC)
continue;

if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
HasApertureRegs)) {
NeedQueuePtr = true;
break;
}
}
}
}

if (NeedQueuePtr) {
F.addFnAttr("amdgpu-queue-ptr");
Changed = true;
}

// TODO: We could refine this to captured pointers that could possibly be
// accessed by flat instructions. For now this is mostly a poor way of
// estimating whether there are calls before argument lowering.
@@ -359,28 +159,6 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
Changed = true;
}

// This pass cannot copy attributes from callees to callers
// if there is an indirect call and in thus such cases,
// hasAddressTaken() would be false for kernels and functions
// making an indirect call (if they are themselves not indirectly called).
// We must tag all such kernels/functions with all implicits attributes
// for correctness.
// e.g.
// 1. Kernel K1 makes an indirect call to function F1.
// Without detecting an indirect call in K1, this pass will not
// add all implicit args to K1 (which is incorrect).
// 2. Kernel K1 makes direct call to F1 which makes indirect call to function
// F2.
// Without detecting an indirect call in F1 (whose hasAddressTaken() is
// false), the pass will not add all implicit args to F1 (which is
// essential for correctness).
if (CallingConvSupportsAllImplicits && HasIndirectCall) {
for (StringRef AttrName : ImplicitAttrNames) {
F.addFnAttr(AttrName);
}
Changed = true;
}

return Changed;
}

7 changes: 6 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -978,8 +978,13 @@ void AMDGPUPassConfig::addIRPasses() {
}

void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().getArch() == Triple::amdgcn)
if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
addPass(createAMDGPUAttributorPass());

// FIXME: This pass adds 2 hacky attributes that can be replaced with an
// analysis, and should be removed.
addPass(createAMDGPUAnnotateKernelFeaturesPass());
}

if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
EnableLowerKernelArguments)
