Skip to content

Commit

Permalink
[LV] Don't consider the latch block as ScalarPredicatedBB.
Browse files Browse the repository at this point in the history
The conditional branch from the loop latch will be replaced by a
single branch controlling the loop, so there is no extra overhead from
scalarization. This improves the cost esimates in some cases.
  • Loading branch information
fhahn committed Apr 29, 2024
1 parent a1e9608 commit 9c3f5fe
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 22 deletions.
6 changes: 5 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6876,11 +6876,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// In cases of scalarized and predicated instructions, there will be VF
// predicated blocks in the vectorized loop. Each branch around these
// blocks requires also an extract of its vector compare i1 element.
// Note that the conditional branch from the loop latch will be replaced by
// a single branch controlling the loop, so there is no extra overhead from
// scalarization.
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
if (VF.isVector() && BI->isConditional() &&
(PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
BI->getParent() != TheLoop->getLoopLatch())
ScalarPredicatedBB = true;

if (ScalarPredicatedBB) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -763,54 +763,86 @@ define void @latch_branch_cost(ptr %dst) {
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
; PRED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], <i64 99, i64 99, i64 99, i64 99>
; PRED-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], <i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99>
; PRED-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
; PRED-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; PRED: pred.store.if:
; PRED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; PRED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
; PRED-NEXT: store i8 0, ptr [[TMP3]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
; PRED: pred.store.continue:
; PRED-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
; PRED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
; PRED-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
; PRED: pred.store.if1:
; PRED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
; PRED-NEXT: store i8 0, ptr [[TMP6]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]]
; PRED: pred.store.continue2:
; PRED-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
; PRED-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
; PRED-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
; PRED: pred.store.if3:
; PRED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
; PRED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
; PRED-NEXT: store i8 0, ptr [[TMP9]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]]
; PRED: pred.store.continue4:
; PRED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
; PRED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
; PRED: pred.store.if5:
; PRED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
; PRED-NEXT: store i8 0, ptr [[TMP12]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
; PRED-NEXT: br label [[PRED_STORE_CONTINUE7]]
; PRED: pred.store.continue6:
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; PRED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; PRED-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
; PRED-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
; PRED: pred.store.if7:
; PRED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
; PRED-NEXT: store i8 0, ptr [[TMP15]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE8]]
; PRED: pred.store.continue8:
; PRED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
; PRED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
; PRED: pred.store.if9:
; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
; PRED-NEXT: store i8 0, ptr [[TMP18]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE10]]
; PRED: pred.store.continue10:
; PRED-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
; PRED-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
; PRED: pred.store.if11:
; PRED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
; PRED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP20]]
; PRED-NEXT: store i8 0, ptr [[TMP21]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE12]]
; PRED: pred.store.continue12:
; PRED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
; PRED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE6]]
; PRED: pred.store.if13:
; PRED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
; PRED-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
; PRED-NEXT: store i8 0, ptr [[TMP24]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
; PRED: pred.store.continue14:
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104
; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PRED: scalar.ph:
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 104, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; PRED-NEXT: br label [[FOR_BODY:%.*]]
; PRED: loop:
; PRED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; PRED-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDVARS_IV]]
; PRED-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
; PRED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
; PRED-NEXT: store i8 0, ptr [[GEP]], align 1
; PRED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[IV]], 1
; PRED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
; PRED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; PRED: exit:
Expand Down
61 changes: 57 additions & 4 deletions llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
Original file line number Diff line number Diff line change
Expand Up @@ -856,16 +856,67 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
; PRED-LABEL: define void @exit_cond_zext_iv(
; PRED-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
; PRED-NEXT: entry:
; PRED-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; PRED: vector.scevcheck:
; PRED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
; PRED-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
; PRED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
; PRED-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
; PRED-NEXT: [[TMP3:%.*]] = add i32 1, [[TMP2]]
; PRED-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
; PRED-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
; PRED-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
; PRED-NEXT: br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; PRED: vector.ph:
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
; PRED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1
; PRED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
; PRED-NEXT: br label [[LOOP:%.*]]
; PRED: loop:
; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
; PRED-NEXT: [[IV_CONV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_EXT:%.*]], [[LOOP]] ]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; PRED-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
; PRED-NEXT: [[TMP7:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
; PRED-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
; PRED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; PRED: pred.store.if:
; PRED-NEXT: [[IV_CONV:%.*]] = add i64 [[INDEX]], 0
; PRED-NEXT: [[GEP:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV]], i32 2
; PRED-NEXT: store i32 0, ptr [[GEP]], align 8
; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
; PRED: pred.store.continue:
; PRED-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
; PRED-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
; PRED: pred.store.if5:
; PRED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 1
; PRED-NEXT: [[TMP13:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP12]], i32 2
; PRED-NEXT: store i32 0, ptr [[TMP13]], align 8
; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
; PRED: pred.store.continue6:
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; PRED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; PRED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; PRED: scalar.ph:
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
; PRED-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
; PRED-NEXT: br label [[LOOP1:%.*]]
; PRED: loop:
; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP1]] ]
; PRED-NEXT: [[IV_CONV1:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_EXT:%.*]], [[LOOP1]] ]
; PRED-NEXT: [[GEP1:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV1]], i32 2
; PRED-NEXT: store i32 0, ptr [[GEP1]], align 8
; PRED-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
; PRED-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64
; PRED-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]]
; PRED-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]]
; PRED-NEXT: br i1 [[C]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
; PRED: exit:
; PRED-NEXT: ret void
;
Expand Down Expand Up @@ -913,4 +964,6 @@ attributes #0 = { "target-features"="+sve" }
; PRED: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
; PRED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
; PRED: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
; PRED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
; PRED: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
;.

0 comments on commit 9c3f5fe

Please sign in to comment.