-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][PromoteAlloca] Whole-function alloca promotion to vector #84735
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) Changes: Add a new mode to PromoteAllocaToVector so it considers the whole function before promoting allocas.
Passed internal performance testing. Full diff: https://github.com/llvm/llvm-project/pull/84735.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index b1b15e9915aea3..582f06a3d755a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -32,6 +32,7 @@
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
@@ -39,6 +40,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -64,10 +66,21 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
cl::desc("Maximum byte size to consider promote alloca to vector"),
cl::init(0));
+static cl::opt<bool> PromoteAllocaWholeFn(
+ "promote-alloca-vector-whole-function",
+ cl::desc("Collect, analyze and sort all allocas within a function before promoting them to vector. When disabled, allocas are promoted one by one in isolation"),
+ cl::init(true));
+static cl::opt<unsigned>
+ LoopUserWeight("promote-alloca-vector-loop-user-weight",
+ cl::desc("The bonus weight of users of allocas within loop "
+ "when sorting profitable alloca for whole-function alloca promotion to vector"),
+ cl::init(4));
+
// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
const TargetMachine &TM;
+ LoopInfo &LI;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
@@ -101,8 +114,11 @@ class AMDGPUPromoteAllocaImpl {
bool tryPromoteAllocaToVector(AllocaInst &I);
bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
+ void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
+
public:
- AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
+ AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
+
const Triple &TT = TM.getTargetTriple();
IsAMDGCN = TT.getArch() == Triple::amdgcn;
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
@@ -122,7 +138,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
if (skipFunction(F))
return false;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
- return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+ return AMDGPUPromoteAllocaImpl(
+ TPC->getTM<TargetMachine>(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
.run(F, /*PromoteToLDS*/ true);
return false;
}
@@ -131,6 +149,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
};
@@ -145,7 +164,9 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
if (skipFunction(F))
return false;
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
- return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+ return AMDGPUPromoteAllocaImpl(
+ TPC->getTM<TargetMachine>(),
+ getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
.run(F, /*PromoteToLDS*/ false);
return false;
}
@@ -156,6 +177,7 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
};
@@ -186,18 +208,23 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
"AMDGPU promote alloca to vector or LDS", false, false)
-INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
- "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+ "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+ "AMDGPU promote alloca to vector", false, false)
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
FunctionAnalysisManager &AM) {
- bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS*/ true);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -208,7 +235,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
- bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS*/ false);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -225,6 +253,49 @@ FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
return new AMDGPUPromoteAllocaToVector();
}
+void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
+ SmallVectorImpl<AllocaInst *> &Allocas) {
+ DenseMap<AllocaInst *, unsigned> Scores;
+
+ LLVM_DEBUG(dbgs() << "Before sorting allocas:\n"; for (auto *A
+ : Allocas) dbgs()
+ << " " << *A << "\n";);
+
+ for (auto *Alloca : Allocas) {
+ LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
+ unsigned &Score = Scores[Alloca];
+ // Increment score by one for each user + a bonus for users within loops.
+ //
+ // Look through GEPs and bitcasts for additional users.
+ SmallVector<User *, 8> WorkList;
+ WorkList.append(Alloca->user_begin(), Alloca->user_end());
+ while (!WorkList.empty()) {
+ auto *Inst = dyn_cast<Instruction>(WorkList.pop_back_val());
+ if (!Inst)
+ continue;
+
+ if (isa<BitCastInst>(Inst) || isa<GetElementPtrInst>(Inst)) {
+ WorkList.append(Inst->user_begin(), Inst->user_end());
+ continue;
+ }
+
+ unsigned UserScore =
+ 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
+ LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
+ Score += UserScore;
+ }
+ LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
+ }
+
+ sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
+ return Scores.at(A) > Scores.at(B);
+ });
+
+ LLVM_DEBUG(dbgs() << "After sorting allocas:\n"; for (auto *A
+ : Allocas) dbgs()
+ << " " << *A << "\n";);
+}
+
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
Mod = F.getParent();
DL = &Mod->getDataLayout();
@@ -237,6 +308,13 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
+ // Use up to 1/4 of available register budget for vectorization.
+ // FIXME: Increase the limit for whole function budgets? Perhaps x2?
+ unsigned VectorizationBudget =
+ (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+ : (MaxVGPRs * 32)) /
+ 4;
+
SmallVector<AllocaInst *, 16> Allocas;
for (Instruction &I : F.getEntryBlock()) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
@@ -248,11 +326,32 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
}
}
+ if (PromoteAllocaWholeFn)
+ sortAllocasToPromote(Allocas);
+
bool Changed = false;
for (AllocaInst *AI : Allocas) {
- if (tryPromoteAllocaToVector(*AI))
+ const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
+ if (AllocaCost > VectorizationBudget) {
+ LLVM_DEBUG(dbgs() << " Alloca too big for vectorization: " << *AI
+ << "\n");
+ return false;
+ }
+
+ if (tryPromoteAllocaToVector(*AI)) {
Changed = true;
- else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
+ if (PromoteAllocaWholeFn) {
+ // When PromoteAllocaWholeFn is enabled, the budget applies for the
+ // whole function. Otherwise it's per alloca.
+ assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
+ "Underflow!");
+ VectorizationBudget -= AllocaCost;
+ LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
+ << VectorizationBudget << "\n");
+ if (VectorizationBudget == 0)
+ break;
+ }
+ } else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
Changed = true;
}
@@ -641,16 +740,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
ArrayTy->getNumElements());
}
- // Use up to 1/4 of available register budget for vectorization.
- unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
- : (MaxVGPRs * 32);
-
- if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
- LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " << MaxVGPRs
- << " registers available\n");
- return false;
- }
-
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 5007f77316f5b4..0ff5dd3680dfab 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -195,13 +195,13 @@
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU atomic optimizations
; GCN-O1-NEXT: Expand Atomic instructions
-; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Natural Loop Information
+; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Cycle Info Analysis
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU IR optimizations
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
-; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Canonicalize natural loops
; GCN-O1-NEXT: Scalar Evolution Analysis
; GCN-O1-NEXT: Loop Pass Manager
@@ -470,9 +470,9 @@
; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations
; GCN-O1-OPTS-NEXT: Expand Atomic instructions
-; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
+; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Canonicalize natural loops
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
@@ -775,9 +775,9 @@
; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: AMDGPU atomic optimizations
; GCN-O2-NEXT: Expand Atomic instructions
-; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
+; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O2-NEXT: Scalar Evolution Analysis
; GCN-O2-NEXT: Straight line strength reduction
@@ -1084,9 +1084,9 @@
; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: AMDGPU atomic optimizations
; GCN-O3-NEXT: Expand Atomic instructions
-; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
+; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O3-NEXT: Scalar Evolution Analysis
; GCN-O3-NEXT: Straight line strength reduction
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
new file mode 100644
index 00000000000000..49778b22ce4b8a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
@@ -0,0 +1,72 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -promote-alloca-vector-whole-function -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: Before sorting allocas:
+; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4
+; CHECK-NEXT: => Final Score:1
+; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1
+; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4
+; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1
+; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4
+; CHECK-NEXT: => Final Score:4
+; CHECK-NEXT: After sorting allocas:
+; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+define amdgpu_kernel void @simple_users_scores() #0 {
+entry:
+ ; should get a score of 1
+ %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+ ; should get a score of 4
+ %manyusers = alloca [4 x i64], align 4, addrspace(5)
+
+ store i32 42, ptr addrspace(5) %simpleuser
+
+ %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
+ %v0 = load i8, ptr addrspace(5) %manyusers.1
+ %v0.ext = zext i8 %v0 to i32
+ store i32 %v0.ext, ptr addrspace(5) %manyusers.1
+
+ %manyusers.2 = bitcast ptr addrspace(5) %manyusers to ptr addrspace(5)
+ %v1 = load i8, ptr addrspace(5) %manyusers.2
+ %v1.ext = zext i8 %v0 to i32
+ store i32 %v1.ext, ptr addrspace(5) %manyusers.2
+
+ ret void
+}
+
+; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1
+; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4
+; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1
+; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4
+; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT: => Final Score:30
+define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
+entry:
+ ; should get a score of 1
+ %stack = alloca [4 x i64], align 4, addrspace(5)
+ %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4
+ %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8
+
+ store i32 42, ptr addrspace(5) %stack
+ br label %loop.outer
+
+loop.outer:
+ store i32 32, ptr addrspace(5) %stack
+ %outer.cmp = load i1, ptr addrspace(5) %stack.1
+ br label %loop.inner
+
+loop.inner:
+ store i32 32, ptr addrspace(5) %stack.1
+ %inner.cmp = load i1, ptr addrspace(5) %stack.2
+ br i1 %inner.cmp, label %loop.inner, label %loop.outer
+
+exit:
+ store i32 64, ptr addrspace(5) %stack.2
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
index 64d7c7868ca8de..c7797d492ca16c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -promote-alloca-vector-whole-function=0 < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' -promote-alloca-vector-whole-function=0 -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s
target datalayout = "A5"
|
✅ With the latest revision this PR passed the C/C++ code formatter.
WorkList.append(Inst->user_begin(), Inst->user_end());
continue;
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if we should account for the size here somehow. Do we end up preferring many accesses of small allocations, or fewer accesses of larger ones?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently it just works on users, so if we have a big alloca with 2 users and a small one with 3, it'll promote the small one first then may run out of the budget for the big one.
Making it aware of size is a good idea, but I'm not sure how big the impact should be. It's tricky to tune this because while big allocas may be better to promote, they might cause more spills which negates all the benefits of the transform as well.
I'd say adding size awareness can wait, but if we want to add it now, I'd suggest adding some small penalty for small allocas (under a certain threshold, e.g. 8 bytes) and adding a small boost to bigger allocas (e.g. above 64 bytes)
Force-pushed from 45a2e87 to fa46a0c (Compare)
[AMDGPU][PromoteAlloca] Whole-function alloca promotion to vector (llvm#84735): Update PromoteAllocaToVector so it considers the whole function before promoting allocas. Allocas are scored & sorted so the highest value ones are seen first. The budget is now per function instead of per alloca. Passed internal performance testing.
Add a new mode to PromoteAllocaToVector so it considers the whole function before promoting allocas.
Passed internal performance testing.