[AMDGPU] Promote generic pointer kernel arguments into global
The new pass walks a kernel's pointer arguments and the loads from them.
If a loaded value is itself a pointer, and that pointer is not modified
in the kernel before the load, the loaded pointer is promoted to the
global address space. The pass then continues recursively on the
promoted pointers.

Differential Revision: https://reviews.llvm.org/D111464
rampitec committed Oct 12, 2021
1 parent 7a29496 commit 9cf995b
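
As an illustration, a minimal sketch of the rewrite on a hypothetical kernel (typed-pointer IR, as in tree at the time; the .global/.flat value names follow the suffixes the pass emits):

; Before: %arg is a global pointer to a flat pointer, and the pointee is
; not clobbered between function entry and the load.
define amdgpu_kernel void @kern(float* addrspace(1)* nocapture readonly %arg) {
  %p = load float*, float* addrspace(1)* %arg
  store float 1.0, float* %p
  ret void
}

; After the pass, the loaded flat pointer is round-tripped through the
; global address space, and Infer Address Spaces later folds the casts away:
;   %p.global = addrspacecast float* %p to float addrspace(1)*
;   %p.flat = addrspacecast float addrspace(1)* %p.global to float*
;   store float 1.0, float* %p.flat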
Showing 6 changed files with 559 additions and 1 deletion.
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -102,6 +102,15 @@ FunctionPass *createAMDGPULowerKernelArgumentsPass();
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
extern char &AMDGPULowerKernelArgumentsID;

FunctionPass *createAMDGPUPromoteKernelArgumentsPass();
void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &);
extern char &AMDGPUPromoteKernelArgumentsID;

struct AMDGPUPromoteKernelArgumentsPass
    : PassInfoMixin<AMDGPUPromoteKernelArgumentsPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

ModulePass *createAMDGPULowerKernelAttributesPass();
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
extern char &AMDGPULowerKernelAttributesID;
195 changes: 195 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -0,0 +1,195 @@
//===-- AMDGPUPromoteKernelArguments.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass recursively promotes generic pointer arguments of a kernel
/// into the global address space.
///
/// The pass walks a kernel's pointer arguments and the loads from them. If a
/// loaded value is itself a pointer, and that pointer is not modified in the
/// kernel before the load, the loaded pointer is promoted to the global
/// address space. The pass then continues recursively on the promoted
/// pointers.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-promote-kernel-arguments"

using namespace llvm;

namespace {

class AMDGPUPromoteKernelArguments : public FunctionPass {
  MemorySSA *MSSA;

  Instruction *ArgCastInsertPt;

  SmallVector<Value *> Ptrs;

  void enqueueUsers(Value *Ptr);

  bool promotePointer(Value *Ptr);

public:
  static char ID;

  AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}

  bool run(Function &F, MemorySSA &MSSA);

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MemorySSAWrapperPass>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
  SmallVector<User *> PtrUsers(Ptr->users());

  while (!PtrUsers.empty()) {
    Instruction *U = dyn_cast<Instruction>(PtrUsers.pop_back_val());
    if (!U)
      continue;

    switch (U->getOpcode()) {
    default:
      break;
    case Instruction::Load: {
      LoadInst *LD = cast<LoadInst>(U);
      PointerType *PT = dyn_cast<PointerType>(LD->getType());
      if (!PT ||
          (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
           PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
           PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) ||
          LD->getPointerOperand()->stripInBoundsOffsets() != Ptr)
        break;
      const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(LD);
      // TODO: This load probably can be promoted to the constant address
      // space.
      if (MSSA->isLiveOnEntryDef(MA))
        Ptrs.push_back(LD);
      break;
    }
    case Instruction::GetElementPtr:
    case Instruction::AddrSpaceCast:
    case Instruction::BitCast:
      if (U->getOperand(0)->stripInBoundsOffsets() == Ptr)
        PtrUsers.append(U->user_begin(), U->user_end());
      break;
    }
  }
}

bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
  enqueueUsers(Ptr);

  PointerType *PT = cast<PointerType>(Ptr->getType());
  if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
    return false;

  bool IsArg = isa<Argument>(Ptr);
  IRBuilder<> B(IsArg ? ArgCastInsertPt
                      : &*std::next(cast<Instruction>(Ptr)->getIterator()));

  // Cast the pointer to the global address space and back to flat, and let
  // the Infer Address Spaces pass do all the necessary rewriting.
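  // For example, for a loaded flat pointer %p this emits:
  //   %p.global = addrspacecast float* %p to float addrspace(1)*
  //   %p.flat = addrspacecast float addrspace(1)* %p.global to float*
  // and then redirects every use of %p except the new cast to %p.flat.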
  PointerType *NewPT =
      PointerType::getWithSamePointeeType(PT, AMDGPUAS::GLOBAL_ADDRESS);
  Value *Cast =
      B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global"));
  Value *CastBack =
      B.CreateAddrSpaceCast(Cast, PT, Twine(Ptr->getName(), ".flat"));
  Ptr->replaceUsesWithIf(CastBack,
                         [Cast](Use &U) { return U.getUser() != Cast; });

  return true;
}

// Skip leading static allocas when choosing the insertion point.
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);

    // If this is a dynamic alloca, the value may depend on the loaded
    // kernargs, so the loads will need to be inserted before it.
    if (!AI || !AI->isStaticAlloca())
      break;
  }

  return InsPt;
}

bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
  if (skipFunction(F))
    return false;

  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  ArgCastInsertPt = &*getInsertPt(*F.begin());
  this->MSSA = &MSSA;

  for (Argument &Arg : F.args()) {
    if (Arg.use_empty())
      continue;

    PointerType *PT = dyn_cast<PointerType>(Arg.getType());
    if (!PT || (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
                PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
                PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS))
      continue;

    Ptrs.push_back(&Arg);
  }

  bool Changed = false;
  while (!Ptrs.empty()) {
    Value *Ptr = Ptrs.pop_back_val();
    Changed |= promotePointer(Ptr);
  }

  return Changed;
}

bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
  MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
  return run(F, MSSA);
}

INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
                      "AMDGPU Promote Kernel Arguments", false, false)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
                    "AMDGPU Promote Kernel Arguments", false, false)

char AMDGPUPromoteKernelArguments::ID = 0;

FunctionPass *llvm::createAMDGPUPromoteKernelArgumentsPass() {
  return new AMDGPUPromoteKernelArguments();
}

PreservedAnalyses
AMDGPUPromoteKernelArgumentsPass::run(Function &F,
                                      FunctionAnalysisManager &AM) {
  MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
  if (AMDGPUPromoteKernelArguments().run(F, MSSA)) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    PA.preserve<MemorySSAAnalysis>();
    return PA;
  }
  return PreservedAnalyses::all();
}
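
For reference, a hypothetical test-style invocation of the pass (the new-PM pass name comes from the registration in AMDGPUTargetMachine.cpp below; the kernel is illustrative only):

; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-kernel-arguments %s | FileCheck %s

; CHECK-LABEL: @promote_ptr
; CHECK: %p.global = addrspacecast float* %p to float addrspace(1)*
define amdgpu_kernel void @promote_ptr(float* addrspace(1)* nocapture readonly %arg) {
  %p = load float*, float* addrspace(1)* %arg
  store float 0.0, float* %p
  ret void
}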
28 changes: 27 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -306,6 +306,11 @@ static cl::opt<bool> EnablePreRAOptimizations(
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -339,6 +344,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
@@ -533,6 +539,8 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
  bool PromoteKernelArguments =
      EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
@@ -574,7 +582,14 @@

  Builder.addExtension(
      PassManagerBuilder::EP_CGSCCOptimizerLate,
      [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
                                          legacy::PassManagerBase &PM) {
        // Add the promote kernel arguments pass to the opt pipeline right
        // before infer address spaces, which is needed to do the actual
        // address space rewriting.
        if (PromoteKernelArguments)
          PM.add(createAMDGPUPromoteKernelArgumentsPass());

        // Add the infer address spaces pass to the opt pipeline after
        // inlining but before SROA to increase SROA opportunities.
        PM.add(createInferAddressSpacesPass());
@@ -651,6 +666,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
          PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-kernel-arguments") {
          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
          return true;
        }
        return false;
      });

@@ -702,6 +721,13 @@

        FunctionPassManager FPM;

        // Add the promote kernel arguments pass to the opt pipeline right
        // before infer address spaces, which is needed to do the actual
        // address space rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add the infer address spaces pass to the opt pipeline after
        // inlining but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -83,6 +83,7 @@ add_llvm_target(AMDGPUCodeGen
  AMDGPUPrintfRuntimeBinding.cpp
  AMDGPUPromoteAlloca.cpp
  AMDGPUPropagateAttributes.cpp
  AMDGPUPromoteKernelArguments.cpp
  AMDGPURegBankCombiner.cpp
  AMDGPURegisterBankInfo.cpp
  AMDGPUReplaceLDSUseWithPointer.cpp
10 changes: 10 additions & 0 deletions llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
@@ -408,6 +408,11 @@
; GCN-O2-NEXT: OpenMP specific optimizations
; GCN-O2-NEXT: Deduce function attributes
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Memory SSA
; GCN-O2-NEXT: AMDGPU Promote Kernel Arguments
; GCN-O2-NEXT: Infer address spaces
; GCN-O2-NEXT: AMDGPU Kernel Attributes
; GCN-O2-NEXT: FunctionPass Manager
@@ -766,6 +771,11 @@
; GCN-O3-NEXT: Deduce function attributes
; GCN-O3-NEXT: Promote 'by reference' arguments to scalars
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Memory SSA
; GCN-O3-NEXT: AMDGPU Promote Kernel Arguments
; GCN-O3-NEXT: Infer address spaces
; GCN-O3-NEXT: AMDGPU Kernel Attributes
; GCN-O3-NEXT: FunctionPass Manager
