Skip to content

Commit 6a28740

Browse files
committed
[AMDGPU] Use computeKnownBits to check whether the index points into the middle of an element
1 parent a69372f commit 6a28740

File tree

2 files changed

+76
-10
lines changed

2 files changed

+76
-10
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -456,21 +456,25 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
456456
const auto &VarOffset = VarOffsets.front();
457457
APInt OffsetQuot;
458458
APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
459-
460-
Value *Scaled = nullptr;
459+
Value *Offset = VarOffset.first;
461460
if (Rem != 0 || OffsetQuot.isZero()) {
462461
unsigned ElemSizeShift = Log2_64(VecElemSize);
463-
Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift);
462+
SimplifyQuery SQ(DL);
463+
SQ.CxtI = GEP;
464+
KnownBits KB = computeKnownBits(VarOffset.first, SQ);
465+
// Bail out if the index may point into the middle of an element.
466+
if (KB.countMinTrailingZeros() < ElemSizeShift)
467+
return nullptr;
468+
469+
Value *Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift);
464470
if (Instruction *NewInst = dyn_cast<Instruction>(Scaled))
465471
NewInsts.push_back(NewInst);
472+
473+
Offset = Scaled;
466474
OffsetQuot = APInt(BW, 1);
467475
Rem = 0;
468476
}
469477

470-
Value *Offset = VarOffset.first;
471-
if (Scaled)
472-
Offset = Scaled;
473-
474478
auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
475479
if (!OffsetType)
476480
return nullptr;

llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll

Lines changed: 65 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -251,11 +251,12 @@ bb2:
251251
ret void
252252
}
253253

254-
define amdgpu_kernel void @scalar_alloca_vector_gep_i8(ptr %buffer, float %data, i32 %index) {
255-
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8(
256-
; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i32 [[INDEX:%.*]]) {
254+
define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(ptr %buffer, float %data, i1 %idx_sel) {
255+
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(
256+
; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
257257
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
258258
; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
259+
; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4
259260
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
260261
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
261262
; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
@@ -264,6 +265,67 @@ define amdgpu_kernel void @scalar_alloca_vector_gep_i8(ptr %buffer, float %data,
264265
%alloca = alloca <3 x float>, align 16, addrspace(5)
265266
%vec = load <3 x float>, ptr %buffer
266267
store <3 x float> %vec, ptr addrspace(5) %alloca
268+
%index = select i1 %idx_sel, i32 0, i32 4
269+
%elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
270+
store float %data, ptr addrspace(5) %elt, align 4
271+
%updated = load <3 x float>, ptr addrspace(5) %alloca, align 16
272+
store <3 x float> %updated, ptr %buffer, align 16
273+
ret void
274+
}
275+
276+
define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(ptr %buffer, float %data, i1 %idx_sel) {
277+
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(
278+
; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
279+
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
280+
; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
281+
; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8
282+
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
283+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
284+
; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
285+
; CHECK-NEXT: ret void
286+
;
287+
%alloca = alloca <3 x float>, align 16, addrspace(5)
288+
%vec = load <3 x float>, ptr %buffer
289+
store <3 x float> %vec, ptr addrspace(5) %alloca
290+
%index = select i1 %idx_sel, i32 4, i32 8
291+
%elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
292+
store float %data, ptr addrspace(5) %elt, align 4
293+
%updated = load <3 x float>, ptr addrspace(5) %alloca, align 16
294+
store <3 x float> %updated, ptr %buffer, align 16
295+
ret void
296+
}
297+
298+
define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(ptr %buffer, float %data, i1 %idx_sel) {
299+
; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(
300+
; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
301+
; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
302+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
303+
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
304+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
305+
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
306+
; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
307+
; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
308+
; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
309+
; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
310+
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
311+
; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
312+
; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
313+
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
314+
; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
315+
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x <3 x float>], ptr addrspace(3) @scalar_alloca_vector_gep_i8_4_or_5_no_promote.alloca, i32 0, i32 [[TMP14]]
316+
; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
317+
; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(3) [[TMP15]], align 16
318+
; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5
319+
; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP15]], i32 [[INDEX]]
320+
; CHECK-NEXT: store float [[DATA]], ptr addrspace(3) [[ELT]], align 4
321+
; CHECK-NEXT: [[UPDATED:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP15]], align 16
322+
; CHECK-NEXT: store <3 x float> [[UPDATED]], ptr [[BUFFER]], align 16
323+
; CHECK-NEXT: ret void
324+
;
325+
%alloca = alloca <3 x float>, align 16, addrspace(5)
326+
%vec = load <3 x float>, ptr %buffer
327+
store <3 x float> %vec, ptr addrspace(5) %alloca
328+
%index = select i1 %idx_sel, i32 4, i32 5
267329
%elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
268330
store float %data, ptr addrspace(5) %elt, align 4
269331
%updated = load <3 x float>, ptr addrspace(5) %alloca, align 16

0 commit comments

Comments
 (0)