AMDGPU: Invert ABI attribute handling
Previously we assumed all callable functions did not need any
implicitly passed inputs, and added attributes to functions to
indicate when they were necessary. Requiring attributes for
correctness is pretty ugly, and it makes supporting indirect and
external calls more complicated.

This inverts the direction of the attributes, so an undecorated
function is assumed to need all implicit inputs. This enables
AMDGPUAttributor by default to mark when functions are proven not to
need a given input, and it strips the equivalent functionality from
the legacy AMDGPUAnnotateKernelFeatures pass.
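
For illustration, a minimal sketch of what the inverted, attributor-style marking looks like. The negated attribute name "amdgpu-no-workgroup-id-y" and the helper below are assumptions for this example, not taken from this commit, and a real implementation would also have to account for callees and address-taken functions:

#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

// Sketch only. Under the old scheme, a pass had to prove a function *does*
// use an implicit input and add e.g. "amdgpu-work-group-id-y" for
// correctness. Under the inverted scheme, an undecorated function
// conservatively receives every implicit input, and an attribute
// (hypothetical name "amdgpu-no-workgroup-id-y") is only added once the
// input is proven unused, so a missed attribute costs performance, not
// correctness.
static void markUnusedWorkGroupIDY(llvm::Function &F) {
  for (const llvm::Instruction &I : llvm::instructions(F)) {
    if (const auto *II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
      if (II->getIntrinsicID() == llvm::Intrinsic::amdgcn_workgroup_id_y)
        return; // Input is used; leave the function undecorated.
    }
  }
  // NOTE: this ignores calls entirely; a real analysis must also look
  // through callees before concluding the input is unused.
  F.addFnAttr("amdgpu-no-workgroup-id-y");
}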

However, AMDGPUAnnotateKernelFeatures is not fully removed at this
point, although it should be in the future. It is still necessary for
the two hacky amdgpu-calls and amdgpu-stack-objects attributes, which
would be better served by a trivial analysis of the IR during
selection. Additionally, AMDGPUAnnotateKernelFeatures still redundantly
handles the uniform-work-group-size attribute; that handling will be
removed in a future commit.
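
As a rough idea of the trivial analysis alluded to above (a sketch only; the struct and helper names are made up, and it assumes the two attributes carry no more information than "has a call" and "has a stack object"):

#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"

// Sketch: recompute what "amdgpu-calls" and "amdgpu-stack-objects" encode by
// scanning the IR directly at selection time instead of relying on attributes
// added by an earlier pass.
struct CallsAndStackInfo {
  bool HasCalls = false;        // roughly what "amdgpu-calls" records
  bool HasStackObjects = false; // roughly what "amdgpu-stack-objects" records
};

static CallsAndStackInfo scanFunction(const llvm::Function &F) {
  CallsAndStackInfo Info;
  for (const llvm::Instruction &I : llvm::instructions(F)) {
    Info.HasCalls |= llvm::isa<llvm::CallBase>(I);
    Info.HasStackObjects |= llvm::isa<llvm::AllocaInst>(I);
  }
  return Info;
}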

At this point, when not using -amdgpu-fixed-function-abi, we still
modify the ABI based on these newly negated attributes. In the
future, this option will be removed and the locations for implicit
inputs will always be fixed. We will then use the new attributes only
to avoid passing the values when they are unnecessary.
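
Under that eventual fixed ABI, the lowering side would only need a query like the following to decide whether a caller must actually initialize a given implicit input register (again a sketch; the attribute name is assumed, and absence of the attribute gives the conservative "needed" answer):

#include "llvm/IR/Function.h"

// Sketch: with a fixed function ABI the register slot for an implicit input
// never moves, so the negated attribute only controls whether the caller can
// skip materializing the value. No attribute means "assume it is needed".
static bool calleeNeedsWorkGroupIDY(const llvm::Function &Callee) {
  return !Callee.hasFnAttribute("amdgpu-no-workgroup-id-y"); // assumed name
}
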
arsenm committed Sep 9, 2021
1 parent bfa2a81 commit 722b8e0
Showing 23 changed files with 959 additions and 831 deletions.
234 changes: 6 additions & 228 deletions llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -6,8 +6,9 @@
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
/// \file This pass propagates the uniform-work-group-size attribute from
/// kernels to leaf functions when possible. It also adds additional attributes
/// to hint ABI lowering optimizations later.
//
//===----------------------------------------------------------------------===//

@@ -25,22 +26,14 @@
using namespace llvm;

namespace {
static constexpr StringLiteral ImplicitAttrNames[] = {
// X ids unnecessarily propagated to kernels.
"amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
"amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
"amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
"amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
"amdgpu-queue-ptr", "amdgpu-implicitarg-ptr"};

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
const TargetMachine *TM = nullptr;
SmallVector<CallGraphNode*, 8> NodeList;

bool addFeatureAttributes(Function &F);
bool processUniformWorkGroupAttribute();
bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
bool addFeatureAttributes(Function &F);

public:
static char ID;
@@ -58,12 +51,6 @@ class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
AU.setPreservesAll();
CallGraphSCCPass::getAnalysisUsage(AU);
}

static bool visitConstantExpr(const ConstantExpr *CE);
static bool visitConstantExprsRecursively(
const Constant *EntryC,
SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
bool HasApertureRegs);
};

} // end anonymous namespace
@@ -75,137 +62,6 @@ char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
"Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

static bool isDSAddress(const Constant *C) {
const GlobalValue *GV = dyn_cast<GlobalValue>(C);
if (!GV)
return false;
unsigned AS = GV->getAddressSpace();
return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
return castRequiresQueuePtr(SrcAS);
}

return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
const Constant *EntryC,
SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
bool IsFunc, bool HasApertureRegs) {

if (!ConstantExprVisited.insert(EntryC).second)
return false;

SmallVector<const Constant *, 16> Stack;
Stack.push_back(EntryC);

while (!Stack.empty()) {
const Constant *C = Stack.pop_back_val();

// We need to trap on DS globals in non-entry functions.
if (IsFunc && isDSAddress(C))
return true;

// Check this constant expression.
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
if (!HasApertureRegs && visitConstantExpr(CE))
return true;
}

// Visit all sub-expressions.
for (const Use &U : C->operands()) {
const auto *OpC = dyn_cast<Constant>(U);
if (!OpC)
continue;

if (!ConstantExprVisited.insert(OpC).second)
continue;

Stack.push_back(OpC);
}
}

return false;
}

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
bool &NonKernelOnly,
bool &IsQueuePtr) {
switch (ID) {
case Intrinsic::amdgcn_workitem_id_x:
NonKernelOnly = true;
return "amdgpu-work-item-id-x";
case Intrinsic::amdgcn_workgroup_id_x:
NonKernelOnly = true;
return "amdgpu-work-group-id-x";
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
return "amdgpu-work-item-id-y";
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
return "amdgpu-work-item-id-z";
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
return "amdgpu-work-group-id-y";
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
return "amdgpu-work-group-id-z";
case Intrinsic::amdgcn_dispatch_ptr:
return "amdgpu-dispatch-ptr";
case Intrinsic::amdgcn_dispatch_id:
return "amdgpu-dispatch-id";
case Intrinsic::amdgcn_implicitarg_ptr:
return "amdgpu-implicitarg-ptr";
case Intrinsic::amdgcn_queue_ptr:
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private:
// TODO: Does not require queue ptr on gfx9+
case Intrinsic::trap:
case Intrinsic::debugtrap:
IsQueuePtr = true;
return "amdgpu-queue-ptr";
default:
return "";
}
}

static bool handleAttr(Function &Parent, const Function &Callee,
StringRef Name) {
if (Callee.hasFnAttribute(Name)) {
Parent.addFnAttr(Name);
return true;
}
return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
bool &NeedQueuePtr) {
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;

for (StringRef AttrName : ImplicitAttrNames)
handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
bool Changed = false;

@@ -257,28 +113,10 @@ bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
}

bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
bool HasApertureRegs = ST.hasApertureRegs();
SmallPtrSet<const Constant *, 8> ConstantExprVisited;

bool HaveStackObjects = false;
bool Changed = false;
bool NeedQueuePtr = false;
bool HaveCall = false;
bool HasIndirectCall = false;
bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
CallingConv::ID CC = F.getCallingConv();
bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);

// If this function hasAddressTaken() = true
// then add all attributes corresponding to the implicit args.
if (CallingConvSupportsAllImplicits &&
F.hasAddressTaken(nullptr, true, true, true)) {
for (StringRef AttrName : ImplicitAttrNames) {
F.addFnAttr(AttrName);
}
Changed = true;
}

for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
@@ -293,59 +131,21 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {

// Note the occurrence of indirect call.
if (!Callee) {
if (!CB->isInlineAsm()) {
HasIndirectCall = true;
if (!CB->isInlineAsm())
HaveCall = true;
}

continue;
}

Intrinsic::ID IID = Callee->getIntrinsicID();
if (IID == Intrinsic::not_intrinsic) {
HaveCall = true;
copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
Changed = true;
} else {
bool NonKernelOnly = false;

StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
NeedQueuePtr);
if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
F.addFnAttr(AttrName);
Changed = true;
}
}
}

if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
continue;

if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
NeedQueuePtr = true;
continue;
}
}

for (const Use &U : I.operands()) {
const auto *OpC = dyn_cast<Constant>(U);
if (!OpC)
continue;

if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
HasApertureRegs)) {
NeedQueuePtr = true;
break;
}
}
}
}

if (NeedQueuePtr) {
F.addFnAttr("amdgpu-queue-ptr");
Changed = true;
}

// TODO: We could refine this to captured pointers that could possibly be
// accessed by flat instructions. For now this is mostly a poor way of
// estimating whether there are calls before argument lowering.
@@ -359,28 +159,6 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
Changed = true;
}

// This pass cannot copy attributes from callees to callers
// if there is an indirect call and in thus such cases,
// hasAddressTaken() would be false for kernels and functions
// making an indirect call (if they are themselves not indirectly called).
// We must tag all such kernels/functions with all implicits attributes
// for correctness.
// e.g.
// 1. Kernel K1 makes an indirect call to function F1.
// Without detecting an indirect call in K1, this pass will not
// add all implicit args to K1 (which is incorrect).
// 2. Kernel K1 makes direct call to F1 which makes indirect call to function
// F2.
// Without detecting an indirect call in F1 (whose hasAddressTaken() is
// false), the pass will not add all implicit args to F1 (which is
// essential for correctness).
if (CallingConvSupportsAllImplicits && HasIndirectCall) {
for (StringRef AttrName : ImplicitAttrNames) {
F.addFnAttr(AttrName);
}
Changed = true;
}

return Changed;
}

7 changes: 6 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -978,8 +978,13 @@ void AMDGPUPassConfig::addIRPasses() {
}

void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().getArch() == Triple::amdgcn)
if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
addPass(createAMDGPUAttributorPass());

// FIXME: This pass adds 2 hacky attributes that can be replaced with an
// analysis, and should be removed.
addPass(createAMDGPUAnnotateKernelFeaturesPass());
}

if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
EnableLowerKernelArguments)
