-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][PromoteAlloca] Whole-function alloca promotion to vector #84735
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) Changes: Add a new mode to PromoteAllocaToVector so it considers the whole function before promoting allocas.
Passed internal performance testing. Full diff: https://github.com/llvm/llvm-project/pull/84735.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index b1b15e9915aea3..582f06a3d755a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -32,6 +32,7 @@
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
@@ -39,6 +40,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -64,10 +66,21 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
cl::desc("Maximum byte size to consider promote alloca to vector"),
cl::init(0));
+static cl::opt<bool> PromoteAllocaWholeFn(
+ "promote-alloca-vector-whole-function",
+ cl::desc("Collect, analyze and sort all allocas within a function before promoting them to vector. When disabled, allocas are promoted one by one in isolation"),
+ cl::init(true));
+static cl::opt<unsigned>
+ LoopUserWeight("promote-alloca-vector-loop-user-weight",
+ cl::desc("The bonus weight of users of allocas within loop "
+ "when sorting profitable alloca for whole-function alloca promotion to vector"),
+ cl::init(4));
+
// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
const TargetMachine &TM;
+ LoopInfo &LI;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
@@ -101,8 +114,11 @@ class AMDGPUPromoteAllocaImpl {
bool tryPromoteAllocaToVector(AllocaInst &I);
bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
+ void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
+
public:
- AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
+ AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
+
const Triple &TT = TM.getTargetTriple();
IsAMDGCN = TT.getArch() == Triple::amdgcn;
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
@@ -122,7 +138,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
if (skipFunction(F))
return false;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
- return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+ return AMDGPUPromoteAllocaImpl(
+ TPC->getTM<TargetMachine>(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
.run(F, /*PromoteToLDS*/ true);
return false;
}
@@ -131,6 +149,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
};
@@ -145,7 +164,9 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
if (skipFunction(F))
return false;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
- return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+ return AMDGPUPromoteAllocaImpl(
+ TPC->getTM<TargetMachine>(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
.run(F, /*PromoteToLDS*/ false);
return false;
}
@@ -156,6 +177,7 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
};
@@ -186,18 +208,23 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
"AMDGPU promote alloca to vector or LDS", false, false)
-INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
- "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+ "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+ "AMDGPU promote alloca to vector", false, false)
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
FunctionAnalysisManager &AM) {
- bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS*/ true);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -208,7 +235,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
- bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS*/ false);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -225,6 +253,49 @@ FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
return new AMDGPUPromoteAllocaToVector();
}
+void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
+ SmallVectorImpl<AllocaInst *> &Allocas) {
+ DenseMap<AllocaInst *, unsigned> Scores;
+
+ LLVM_DEBUG(dbgs() << "Before sorting allocas:\n"; for (auto *A
+ : Allocas) dbgs()
+ << " " << *A << "\n";);
+
+ for (auto *Alloca : Allocas) {
+ LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
+ unsigned &Score = Scores[Alloca];
+ // Increment score by one for each user + a bonus for users within loops.
+ //
+ // Look through GEPs and bitcasts for additional users.
+ SmallVector<User *, 8> WorkList;
+ WorkList.append(Alloca->user_begin(), Alloca->user_end());
+ while (!WorkList.empty()) {
+ auto *Inst = dyn_cast<Instruction>(WorkList.pop_back_val());
+ if (!Inst)
+ continue;
+
+ if (isa<BitCastInst>(Inst) || isa<GetElementPtrInst>(Inst)) {
+ WorkList.append(Inst->user_begin(), Inst->user_end());
+ continue;
+ }
+
+ unsigned UserScore =
+ 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
+ LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
+ Score += UserScore;
+ }
+ LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
+ }
+
+ sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
+ return Scores.at(A) > Scores.at(B);
+ });
+
+ LLVM_DEBUG(dbgs() << "After sorting allocas:\n"; for (auto *A
+ : Allocas) dbgs()
+ << " " << *A << "\n";);
+}
+
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
Mod = F.getParent();
DL = &Mod->getDataLayout();
@@ -237,6 +308,13 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
+ // Use up to 1/4 of available register budget for vectorization.
+ // FIXME: Increase the limit for whole function budgets? Perhaps x2?
+ unsigned VectorizationBudget =
+ (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+ : (MaxVGPRs * 32)) /
+ 4;
+
SmallVector<AllocaInst *, 16> Allocas;
for (Instruction &I : F.getEntryBlock()) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
@@ -248,11 +326,32 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
}
}
+ if (PromoteAllocaWholeFn)
+ sortAllocasToPromote(Allocas);
+
bool Changed = false;
for (AllocaInst *AI : Allocas) {
- if (tryPromoteAllocaToVector(*AI))
+ const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
+ if (AllocaCost > VectorizationBudget) {
+ LLVM_DEBUG(dbgs() << " Alloca too big for vectorization: " << *AI
+ << "\n");
+ return false;
+ }
+
+ if (tryPromoteAllocaToVector(*AI)) {
Changed = true;
- else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
+ if (PromoteAllocaWholeFn) {
+ // When PromoteAllocaWholeFn is enabled, the budget applies for the
+ // whole function. Otherwise it's per alloca.
+ assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
+ "Underflow!");
+ VectorizationBudget -= AllocaCost;
+ LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
+ << VectorizationBudget << "\n");
+ if (VectorizationBudget == 0)
+ break;
+ }
+ } else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
Changed = true;
}
@@ -641,16 +740,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
ArrayTy->getNumElements());
}
- // Use up to 1/4 of available register budget for vectorization.
- unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
- : (MaxVGPRs * 32);
-
- if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
- LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " << MaxVGPRs
- << " registers available\n");
- return false;
- }
-
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 5007f77316f5b4..0ff5dd3680dfab 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -195,13 +195,13 @@
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU atomic optimizations
; GCN-O1-NEXT: Expand Atomic instructions
-; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Natural Loop Information
+; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Cycle Info Analysis
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU IR optimizations
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
-; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Canonicalize natural loops
; GCN-O1-NEXT: Scalar Evolution Analysis
; GCN-O1-NEXT: Loop Pass Manager
@@ -470,9 +470,9 @@
; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations
; GCN-O1-OPTS-NEXT: Expand Atomic instructions
-; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
+; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Canonicalize natural loops
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
@@ -775,9 +775,9 @@
; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: AMDGPU atomic optimizations
; GCN-O2-NEXT: Expand Atomic instructions
-; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
+; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O2-NEXT: Scalar Evolution Analysis
; GCN-O2-NEXT: Straight line strength reduction
@@ -1084,9 +1084,9 @@
; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: AMDGPU atomic optimizations
; GCN-O3-NEXT: Expand Atomic instructions
-; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
+; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O3-NEXT: Scalar Evolution Analysis
; GCN-O3-NEXT: Straight line strength reduction
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
new file mode 100644
index 00000000000000..49778b22ce4b8a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
@@ -0,0 +1,72 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -promote-alloca-vector-whole-function -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: Before sorting allocas:
+; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4
+; CHECK-NEXT: => Final Score:1
+; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1
+; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4
+; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1
+; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4
+; CHECK-NEXT: => Final Score:4
+; CHECK-NEXT: After sorting allocas:
+; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+define amdgpu_kernel void @simple_users_scores() #0 {
+entry:
+ ; should get a score of 1
+ %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+ ; should get a score of 4
+ %manyusers = alloca [4 x i64], align 4, addrspace(5)
+
+ store i32 42, ptr addrspace(5) %simpleuser
+
+ %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
+ %v0 = load i8, ptr addrspace(5) %manyusers.1
+ %v0.ext = zext i8 %v0 to i32
+ store i32 %v0.ext, ptr addrspace(5) %manyusers.1
+
+ %manyusers.2 = bitcast ptr addrspace(5) %manyusers to ptr addrspace(5)
+ %v1 = load i8, ptr addrspace(5) %manyusers.2
+ %v1.ext = zext i8 %v0 to i32
+ store i32 %v1.ext, ptr addrspace(5) %manyusers.2
+
+ ret void
+}
+
+; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1
+; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4
+; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1
+; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4
+; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT: => Final Score:30
+define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
+entry:
+ ; should get a score of 1
+ %stack = alloca [4 x i64], align 4, addrspace(5)
+ %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4
+ %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8
+
+ store i32 42, ptr addrspace(5) %stack
+ br label %loop.outer
+
+loop.outer:
+ store i32 32, ptr addrspace(5) %stack
+ %outer.cmp = load i1, ptr addrspace(5) %stack.1
+ br label %loop.inner
+
+loop.inner:
+ store i32 32, ptr addrspace(5) %stack.1
+ %inner.cmp = load i1, ptr addrspace(5) %stack.2
+ br i1 %inner.cmp, label %loop.inner, label %loop.outer
+
+exit:
+ store i32 64, ptr addrspace(5) %stack.2
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
index 64d7c7868ca8de..c7797d492ca16c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -promote-alloca-vector-whole-function=0 < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -promote-alloca-vector-whole-function=0 -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
target datalayout = "A5"
|
✅ With the latest revision this PR passed the C/C++ code formatter.
WorkList.append(Inst->user_begin(), Inst->user_end());
continue;
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if we should account for the size here somehow. Do we end up preferring many accesses of small allocations, or fewer accesses of larger ones?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently it just works on users, so if we have a big alloca with 2 users and a small one with 3, it'll promote the small one first then may run out of the budget for the big one.
Making it aware of size is a good idea, but I'm not sure how big the impact should be. It's tricky to tune this because while big allocas may be better to promote, they might cause more spills which negates all the benefits of the transform as well.
I'd say adding size awareness can wait, but if we want to add it now, I'd suggest adding some small penalty for small allocas (under a certain threshold, e.g. 8 bytes) and adding a small boost to bigger allocas (e.g. above 64 bytes)
Force-pushed from 45a2e87 to fa46a0c (Compare)
[AMDGPU][PromoteAlloca] Whole-function alloca promotion to vector (llvm#84735): Update PromoteAllocaToVector so it considers the whole function before promoting allocas. Allocas are scored & sorted so the highest value ones are seen first. The budget is now per function instead of per alloca. Passed internal performance testing.
Add a new mode to PromoteAllocaToVector so it considers the whole function before promoting allocas.
Passed internal performance testing.