llvm · nhaehnle · Dec 4, 2025 · Nov 7, 2025 · Nov 5, 2025
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -122,6 +122,7 @@ class AMDGPUPromoteAllocaImpl {
   /// Check whether we have enough local memory for promotion.
   bool hasSufficientLocalMem(const Function &F);
 
+  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
   bool tryPromoteAllocaToVector(AllocaInst &I);
   bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
 
@@ -791,16 +792,13 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
   return I;
 }
 
-// FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
-  LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
-
+FixedVectorType *
+AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
   if (DisablePromoteAllocaToVector) {
-    LLVM_DEBUG(dbgs() << "  Promote alloca to vector is disabled\n");
-    return false;
+    LLVM_DEBUG(dbgs() << "  Promote alloca to vectors is disabled\n");
+    return nullptr;
   }
 
-  Type *AllocaTy = Alloca.getAllocatedType();
   auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
   if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
     uint64_t NumElems = 1;
@@ -832,10 +830,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       }
     }
   }
-
   if (!VectorTy) {
     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
-    return false;
+    return nullptr;
   }
 
   const unsigned MaxElements =
@@ -845,9 +842,29 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       VectorTy->getNumElements() < 2) {
     LLVM_DEBUG(dbgs() << "  " << *VectorTy
                       << " has an unsupported number of elements\n");
-    return false;
+    return nullptr;
   }
 
+  Type *VecEltTy = VectorTy->getElementType();
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return nullptr;
+  }
+
+  return VectorTy;
+}
+
+// FIXME: Should try to pick the most likely to be profitable allocas first.
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
+  LLVM_DEBUG(dbgs() << "Trying to promote to vectors: " << Alloca << '\n');
+
+  Type *AllocaTy = Alloca.getAllocatedType();
+  FixedVectorType *VectorTy = getVectorTypeForAlloca(AllocaTy);
+  if (!VectorTy)
+    return false;
+
   std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
   SmallVector<Instruction *> WorkList;
   SmallVector<Instruction *> UsersToRemove;
@@ -869,13 +886,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
   Type *VecEltTy = VectorTy->getElementType();
-  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
-  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
-    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
-                         "does not match the type's size\n");
-    return false;
-  }
-  unsigned ElementSize = ElementSizeInBits / 8;
+  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
   assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());

diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll
@@ -1,19 +1,19 @@
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
 
 ; Show that what the alloca promotion pass will do for non-atomic load/store.
 
 ; OPT-LABEL: @vector_alloca_not_atomic(
 ;
-; OPT: extractelement <3 x i32> <i32 0, i32 1, i32 2>, i64 %index
-define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i64 %index) {
+; OPT: extractelement <3 x i32> <i32 0, i32 1, i32 2>, i32 %index
+define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i32 %index) {
 entry:
   %alloca = alloca [3 x i32], addrspace(5)
   %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
   %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
   store i32 0, ptr addrspace(5) %alloca
   store i32 1, ptr addrspace(5) %a1
   store i32 2, ptr addrspace(5) %a2
-  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index
   %data = load i32, ptr addrspace(5) %tmp
   store i32 %data, ptr addrspace(1) %out
   ret void
@@ -26,15 +26,15 @@ entry:
 ; OPT: store i32 1, ptr addrspace(5)
 ; OPT: store i32 2, ptr addrspace(5)
 ; OPT: load atomic i32, ptr addrspace(5)
-define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i64 %index) {
+define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i32 %index) {
 entry:
   %alloca = alloca [3 x i32], addrspace(5)
   %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
   %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
   store i32 0, ptr addrspace(5) %alloca
   store i32 1, ptr addrspace(5) %a1
   store i32 2, ptr addrspace(5) %a2
-  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index
   %data = load atomic i32, ptr addrspace(5) %tmp acquire, align 4
   store i32 %data, ptr addrspace(1) %out
   ret void
@@ -47,15 +47,15 @@ entry:
 ; OPT: store atomic i32 1, ptr addrspace(5)
 ; OPT: store atomic i32 2, ptr addrspace(5)
 ; OPT: load i32, ptr addrspace(5)
-define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i64 %index) {
+define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i32 %index) {
 entry:
   %alloca = alloca [3 x i32], addrspace(5)
   %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
   %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
   store atomic i32 0, ptr addrspace(5) %alloca release, align 4
   store atomic i32 1, ptr addrspace(5) %a1 release, align 4
   store atomic i32 2, ptr addrspace(5) %a2  release, align 4
-  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index
   %data = load i32, ptr addrspace(5) %tmp
   store i32 %data, ptr addrspace(1) %out
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -72,7 +72,8 @@ entry:
 ; OPT-NOT:   alloca
 ; OPT: bb2:
 ; OPT:  %promotealloca = phi <6 x float> [ zeroinitializer, %bb ], [ %0, %bb2 ]
-; OPT:  %0 = insertelement <6 x float> %promotealloca, float %tmp71, i32 %tmp10
+; OPT:  [[TMP:%tmp7.*]] = load float, ptr addrspace(1) %tmp5, align 4
+; OPT:  %0 = insertelement <6 x float> %promotealloca, float [[TMP]], i32 %tmp10
 ; OPT: .preheader:
 ; OPT:  %bc = bitcast <6 x float> %0 to <6 x i32>
 ; OPT:  %1 = extractelement <6 x i32> %bc, i32 %tmp20
@@ -132,7 +133,8 @@ bb15:                                             ; preds = %.preheader
 ; OPT-NOT:   alloca
 ; OPT: bb2:
 ; OPT:  %promotealloca = phi <6 x double> [ zeroinitializer, %bb ], [ %0, %bb2 ]
-; OPT:  %0 = insertelement <6 x double> %promotealloca, double %tmp71, i32 %tmp10
+; OPT:  [[TMP:%tmp7.*]] = load double, ptr addrspace(1) %tmp5, align 8
+; OPT:  %0 = insertelement <6 x double> %promotealloca, double [[TMP]], i32 %tmp10
 ; OPT: .preheader:
 ; OPT:  %bc = bitcast <6 x double> %0 to <6 x i64>
 ; OPT:  %1 = extractelement <6 x i64> %bc, i32 %tmp20