From a69372f3079fde869defca1897894b75311ba7e1 Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Fri, 31 Oct 2025 11:13:00 +0800 Subject: [PATCH 1/3] [AMDGPU] Enable i8 GEP promotion for vector allocas --- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 15 ++++++++++++-- .../AMDGPU/promote-alloca-vector-gep.ll | 20 +++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index ddabd25894414..793c0237cdf38 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -456,10 +456,21 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, const auto &VarOffset = VarOffsets.front(); APInt OffsetQuot; APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); - if (Rem != 0 || OffsetQuot.isZero()) - return nullptr; + + Value *Scaled = nullptr; + if (Rem != 0 || OffsetQuot.isZero()) { + unsigned ElemSizeShift = Log2_64(VecElemSize); + Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift); + if (Instruction *NewInst = dyn_cast<Instruction>(Scaled)) + NewInsts.push_back(NewInst); + OffsetQuot = APInt(BW, 1); + Rem = 0; + } Value *Offset = VarOffset.first; + if (Scaled) + Offset = Scaled; + auto *OffsetType = dyn_cast<IntegerType>(Offset->getType()); if (!OffsetType) return nullptr; diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll index 76e1868b3c4b9..65bddaba8dd14 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll @@ -250,6 +250,26 @@ bb2: store i32 0, ptr addrspace(5) %extractelement ret void } + +define amdgpu_kernel void @scalar_alloca_vector_gep_i8(ptr %buffer, float %data, i32 %index) { +; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8( +; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i32 [[INDEX:%.*]]) { +; 
CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison +; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]] +; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16 +; CHECK-NEXT: ret void +; + %alloca = alloca <3 x float>, align 16, addrspace(5) + %vec = load <3 x float>, ptr %buffer + store <3 x float> %vec, ptr addrspace(5) %alloca + %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index + store float %data, ptr addrspace(5) %elt, align 4 + %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 + store <3 x float> %updated, ptr %buffer, align 16 + ret void +} ;. ; CHECK: [[META0]] = !{} ; CHECK: [[RNG1]] = !{i32 0, i32 1025} From 6a287406b4902ceedb95207ca1fe0ca76731f0a7 Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Mon, 10 Nov 2025 16:22:08 +0800 Subject: [PATCH 2/3] [AMDGPU] Use computeKnownBits to check if it points the middle of an element --- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 18 +++-- .../AMDGPU/promote-alloca-vector-gep.ll | 68 ++++++++++++++++++- 2 files changed, 76 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 793c0237cdf38..fc610fc74cbc7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -456,21 +456,25 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, const auto &VarOffset = VarOffsets.front(); APInt OffsetQuot; APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); - - Value *Scaled = nullptr; + Value *Offset = VarOffset.first; if (Rem != 0 || OffsetQuot.isZero()) { unsigned ElemSizeShift = Log2_64(VecElemSize); - Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift); + SimplifyQuery SQ(DL); + SQ.CxtI = GEP; + KnownBits KB = 
computeKnownBits(VarOffset.first, SQ); + // Bail out if the index may point into the middle of an element. + if (KB.countMinTrailingZeros() < ElemSizeShift) + return nullptr; + + Value *Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift); if (Instruction *NewInst = dyn_cast<Instruction>(Scaled)) NewInsts.push_back(NewInst); + + Offset = Scaled; OffsetQuot = APInt(BW, 1); Rem = 0; } - Value *Offset = VarOffset.first; - if (Scaled) - Offset = Scaled; - auto *OffsetType = dyn_cast<IntegerType>(Offset->getType()); if (!OffsetType) return nullptr; diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll index 65bddaba8dd14..d4088188f774d 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll @@ -251,11 +251,12 @@ bb2: ret void } -define amdgpu_kernel void @scalar_alloca_vector_gep_i8(ptr %buffer, float %data, i32 %index) { -; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8( -; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i32 [[INDEX:%.*]]) { +define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(ptr %buffer, float %data, i1 %idx_sel) { +; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4( +; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 +; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]] ; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16 @@ -264,6 +265,67 @@ define amdgpu_kernel void @scalar_alloca_vector_gep_i8(ptr %buffer, float %data, %alloca = alloca <3 x float>, align 16, addrspace(5) %vec = load <3 x float>, ptr %buffer store <3 x float> %vec, ptr addrspace(5) 
%alloca + %index = select i1 %idx_sel, i32 0, i32 4 + %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index + store float %data, ptr addrspace(5) %elt, align 4 + %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 + store <3 x float> %updated, ptr %buffer, align 16 + ret void +} + +define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(ptr %buffer, float %data, i1 %idx_sel) { +; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8( +; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison +; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 +; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]] +; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16 +; CHECK-NEXT: ret void +; + %alloca = alloca <3 x float>, align 16, addrspace(5) + %vec = load <3 x float>, ptr %buffer + store <3 x float> %vec, ptr addrspace(5) %alloca + %index = select i1 %idx_sel, i32 4, i32 8 + %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index + store float %data, ptr addrspace(5) %elt, align 4 + %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 + store <3 x float> %updated, ptr %buffer, align 16 + ret void +} + +define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(ptr %buffer, float %data, i1 %idx_sel) { +; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote( +; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr 
addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]] +; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x <3 x float>], ptr addrspace(3) @scalar_alloca_vector_gep_i8_4_or_5_no_promote.alloca, i32 0, i32 [[TMP14]] +; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 +; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(3) [[TMP15]], align 16 +; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5 +; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP15]], i32 [[INDEX]] +; CHECK-NEXT: store float [[DATA]], ptr addrspace(3) [[ELT]], align 4 +; CHECK-NEXT: [[UPDATED:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP15]], align 16 +; CHECK-NEXT: store <3 x float> [[UPDATED]], ptr [[BUFFER]], align 16 +; CHECK-NEXT: ret void +; + %alloca = alloca <3 x float>, align 16, addrspace(5) + %vec = load <3 x float>, ptr %buffer + store <3 x float> %vec, ptr addrspace(5) %alloca + %index = select i1 %idx_sel, i32 4, i32 5 %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index store float %data, ptr addrspace(5) %elt, align 4 %updated = load <3 x float>, ptr addrspace(5) 
%alloca, align 16 From 19584caf2f6e657d155400ba4eef95eeb9f6fe8f Mon Sep 17 00:00:00 2001 From: Harrison Hao Date: Thu, 20 Nov 2025 10:46:06 +0800 Subject: [PATCH 3/3] Remove unnecessary check. --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index fc610fc74cbc7..d9487bbaa497d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -457,7 +457,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, APInt OffsetQuot; APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); Value *Offset = VarOffset.first; - if (Rem != 0 || OffsetQuot.isZero()) { + if (Rem != 0) { unsigned ElemSizeShift = Log2_64(VecElemSize); SimplifyQuery SQ(DL); SQ.CxtI = GEP;