From a69372f3079fde869defca1897894b75311ba7e1 Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Fri, 31 Oct 2025 11:13:00 +0800 Subject: [PATCH 1/3] [AMDGPU] Enable i8 GEP promotion for vector allocas --- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 15 ++++++++++++-- .../AMDGPU/promote-alloca-vector-gep.ll | 20 +++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index ddabd25894414..793c0237cdf38 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -456,10 +456,21 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, const auto &VarOffset = VarOffsets.front(); APInt OffsetQuot; APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); - if (Rem != 0 || OffsetQuot.isZero()) - return nullptr; + + Value *Scaled = nullptr; + if (Rem != 0 || OffsetQuot.isZero()) { + unsigned ElemSizeShift = Log2_64(VecElemSize); + Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift); + if (Instruction *NewInst = dyn_cast<Instruction>(Scaled)) + NewInsts.push_back(NewInst); + OffsetQuot = APInt(BW, 1); + Rem = 0; + } Value *Offset = VarOffset.first; + if (Scaled) + Offset = Scaled; + auto *OffsetType = dyn_cast<IntegerType>(Offset->getType()); if (!OffsetType) return nullptr; diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll index 76e1868b3c4b9..65bddaba8dd14 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll @@ -250,6 +250,26 @@ bb2: store i32 0, ptr addrspace(5) %extractelement ret void } + +define amdgpu_kernel void @scalar_alloca_vector_gep_i8(ptr %buffer, float %data, i32 %index) { +; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8( +; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i32 [[INDEX:%.*]]) { +; 
CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison +; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]] +; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16 +; CHECK-NEXT: ret void +; + %alloca = alloca <3 x float>, align 16, addrspace(5) + %vec = load <3 x float>, ptr %buffer + store <3 x float> %vec, ptr addrspace(5) %alloca + %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index + store float %data, ptr addrspace(5) %elt, align 4 + %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 + store <3 x float> %updated, ptr %buffer, align 16 + ret void +} ;. ; CHECK: [[META0]] = !{} ; CHECK: [[RNG1]] = !{i32 0, i32 1025} From 6a287406b4902ceedb95207ca1fe0ca76731f0a7 Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Mon, 10 Nov 2025 16:22:08 +0800 Subject: [PATCH 2/3] [AMDGPU] Use computeKnownBits to check if it points the middle of an element --- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 18 +++-- .../AMDGPU/promote-alloca-vector-gep.ll | 68 ++++++++++++++++++- 2 files changed, 76 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 793c0237cdf38..fc610fc74cbc7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -456,21 +456,25 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, const auto &VarOffset = VarOffsets.front(); APInt OffsetQuot; APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); - - Value *Scaled = nullptr; + Value *Offset = VarOffset.first; if (Rem != 0 || OffsetQuot.isZero()) { unsigned ElemSizeShift = Log2_64(VecElemSize); - Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift); + SimplifyQuery SQ(DL); + SQ.CxtI = GEP; + KnownBits KB = 
computeKnownBits(VarOffset.first, SQ); + // Bail out if the index may point into the middle of an element. + if (KB.countMinTrailingZeros() < ElemSizeShift) + return nullptr; + + Value *Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift); if (Instruction *NewInst = dyn_cast<Instruction>(Scaled)) NewInsts.push_back(NewInst); + + Offset = Scaled; OffsetQuot = APInt(BW, 1); Rem = 0; } - Value *Offset = VarOffset.first; - if (Scaled) - Offset = Scaled; - auto *OffsetType = dyn_cast<IntegerType>(Offset->getType()); if (!OffsetType) return nullptr; diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll index 65bddaba8dd14..d4088188f774d 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll @@ -251,11 +251,12 @@ bb2: ret void } -define amdgpu_kernel void @scalar_alloca_vector_gep_i8(ptr %buffer, float %data, i32 %index) { -; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8( -; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i32 [[INDEX:%.*]]) { +define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(ptr %buffer, float %data, i1 %idx_sel) { +; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4( +; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 +; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]] ; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16 @@ -264,6 +265,67 @@ define amdgpu_kernel void @scalar_alloca_vector_gep_i8(ptr %buffer, float %data, %alloca = alloca <3 x float>, align 16, addrspace(5) %vec = load <3 x float>, ptr %buffer store <3 x float> %vec, ptr addrspace(5) 
%alloca + %index = select i1 %idx_sel, i32 0, i32 4 + %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index + store float %data, ptr addrspace(5) %elt, align 4 + %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 + store <3 x float> %updated, ptr %buffer, align 16 + ret void +} + +define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(ptr %buffer, float %data, i1 %idx_sel) { +; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8( +; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison +; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 +; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]] +; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16 +; CHECK-NEXT: ret void +; + %alloca = alloca <3 x float>, align 16, addrspace(5) + %vec = load <3 x float>, ptr %buffer + store <3 x float> %vec, ptr addrspace(5) %alloca + %index = select i1 %idx_sel, i32 4, i32 8 + %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index + store float %data, ptr addrspace(5) %elt, align 4 + %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 + store <3 x float> %updated, ptr %buffer, align 16 + ret void +} + +define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(ptr %buffer, float %data, i1 %idx_sel) { +; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote( +; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr 
addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]] +; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x <3 x float>], ptr addrspace(3) @scalar_alloca_vector_gep_i8_4_or_5_no_promote.alloca, i32 0, i32 [[TMP14]] +; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 +; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(3) [[TMP15]], align 16 +; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5 +; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP15]], i32 [[INDEX]] +; CHECK-NEXT: store float [[DATA]], ptr addrspace(3) [[ELT]], align 4 +; CHECK-NEXT: [[UPDATED:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP15]], align 16 +; CHECK-NEXT: store <3 x float> [[UPDATED]], ptr [[BUFFER]], align 16 +; CHECK-NEXT: ret void +; + %alloca = alloca <3 x float>, align 16, addrspace(5) + %vec = load <3 x float>, ptr %buffer + store <3 x float> %vec, ptr addrspace(5) %alloca + %index = select i1 %idx_sel, i32 4, i32 5 %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index store float %data, ptr addrspace(5) %elt, align 4 %updated = load <3 x float>, ptr addrspace(5) 
%alloca, align 16 From 19584caf2f6e657d155400ba4eef95eeb9f6fe8f Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Thu, 20 Nov 2025 10:46:06 +0800 Subject: [PATCH 3/3] Remove unnecessary check. --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index fc610fc74cbc7..d9487bbaa497d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -457,7 +457,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, APInt OffsetQuot; APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); Value *Offset = VarOffset.first; - if (Rem != 0 || OffsetQuot.isZero()) { + if (Rem != 0) { unsigned ElemSizeShift = Log2_64(VecElemSize); SimplifyQuery SQ(DL); SQ.CxtI = GEP;