Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -456,10 +456,25 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
const auto &VarOffset = VarOffsets.front();
APInt OffsetQuot;
APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
if (Rem != 0 || OffsetQuot.isZero())
return nullptr;

Value *Offset = VarOffset.first;
if (Rem != 0) {
unsigned ElemSizeShift = Log2_64(VecElemSize);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to validate VecElemSize is a power of 2?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, but I think it is not necessary to explicitly check whether the element size is a power of two, because it is already covered by the existing check here:

Type *VecEltTy = VectorTy->getElementType();
unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
"does not match the type's size\n");
return false;
}

If the element type is not naturally aligned, it will return false, which also rejects non-power-of-two element sizes such as i24.

SimplifyQuery SQ(DL);
SQ.CxtI = GEP;
KnownBits KB = computeKnownBits(VarOffset.first, SQ);
// Bail out if the index may point into the middle of an element.
if (KB.countMinTrailingZeros() < ElemSizeShift)
return nullptr;

Value *Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift);
if (Instruction *NewInst = dyn_cast<Instruction>(Scaled))
NewInsts.push_back(NewInst);

Offset = Scaled;
OffsetQuot = APInt(BW, 1);
Rem = 0;
}

auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
if (!OffsetType)
return nullptr;
Expand Down
82 changes: 82 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,88 @@ bb2:
store i32 0, ptr addrspace(5) %extractelement
ret void
}

; Variable byte offset whose two possible values (0 and 4) are both multiples
; of the 4-byte element size of the <3 x float> alloca. The CHECK lines show
; the alloca is promoted to a vector: the byte offset is rescaled to an
; element index with `lshr i32 %index, 2` and fed straight into insertelement.
define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(ptr %buffer, float %data, i1 %idx_sel) {
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(
; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
; CHECK-NEXT: ret void
;
%alloca = alloca <3 x float>, align 16, addrspace(5)
%vec = load <3 x float>, ptr %buffer
store <3 x float> %vec, ptr addrspace(5) %alloca
; Divergent byte offset: either element 0 (offset 0) or element 1 (offset 4).
%index = select i1 %idx_sel, i32 0, i32 4
%elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
store float %data, ptr addrspace(5) %elt, align 4
%updated = load <3 x float>, ptr addrspace(5) %alloca, align 16
store <3 x float> %updated, ptr %buffer, align 16
ret void
}

; Same as the 0-or-4 test but with a nonzero minimum offset (4 or 8 bytes),
; exercising the case where the variable offset has no zero candidate. Both
; values are still element-aligned, so the CHECK lines show promotion with
; the same lshr-by-2 rescaling into insertelement.
define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(ptr %buffer, float %data, i1 %idx_sel) {
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(
; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
; CHECK-NEXT: ret void
;
%alloca = alloca <3 x float>, align 16, addrspace(5)
%vec = load <3 x float>, ptr %buffer
store <3 x float> %vec, ptr addrspace(5) %alloca
; Divergent byte offset: element 1 (offset 4) or element 2 (offset 8).
%index = select i1 %idx_sel, i32 4, i32 8
%elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
store float %data, ptr addrspace(5) %elt, align 4
%updated = load <3 x float>, ptr addrspace(5) %alloca, align 16
store <3 x float> %updated, ptr %buffer, align 16
ret void
}

; Negative test: the offset may be 5, which is not a multiple of the 4-byte
; element size, so the index can point into the middle of an element and
; promotion to a vector must be rejected. The CHECK lines show the alloca is
; instead moved to LDS (the addrspace(3) per-workitem slot of a
; [1024 x <3 x float>] array indexed by the flattened workitem id), and the
; i8 GEP with the unaligned offset is kept as-is.
define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(ptr %buffer, float %data, i1 %idx_sel) {
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(
; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x <3 x float>], ptr addrspace(3) @scalar_alloca_vector_gep_i8_4_or_5_no_promote.alloca, i32 0, i32 [[TMP14]]
; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(3) [[TMP15]], align 16
; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5
; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP15]], i32 [[INDEX]]
; CHECK-NEXT: store float [[DATA]], ptr addrspace(3) [[ELT]], align 4
; CHECK-NEXT: [[UPDATED:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP15]], align 16
; CHECK-NEXT: store <3 x float> [[UPDATED]], ptr [[BUFFER]], align 16
; CHECK-NEXT: ret void
;
%alloca = alloca <3 x float>, align 16, addrspace(5)
%vec = load <3 x float>, ptr %buffer
store <3 x float> %vec, ptr addrspace(5) %alloca
; Offset 5 is mid-element for a 4-byte element type — vector promotion must bail.
%index = select i1 %idx_sel, i32 4, i32 5
%elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
store float %data, ptr addrspace(5) %elt, align 4
%updated = load <3 x float>, ptr addrspace(5) %alloca, align 16
store <3 x float> %updated, ptr %buffer, align 16
ret void
}
;.
; CHECK: [[META0]] = !{}
; CHECK: [[RNG1]] = !{i32 0, i32 1025}
Expand Down