diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f6dcdbee590e1..c44d90f0998eb 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6876,11 +6876,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // In cases of scalarized and predicated instructions, there will be VF // predicated blocks in the vectorized loop. Each branch around these // blocks requires also an extract of its vector compare i1 element. + // Note that the conditional branch from the loop latch will be replaced by + // a single branch controlling the loop, so there is no extra overhead from + // scalarization. bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); if (VF.isVector() && BI->isConditional() && (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || - PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) + PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) && + BI->getParent() != TheLoop->getLoopLatch()) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index a78696cadaafb..14b5ee2440806 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -763,9 +763,9 @@ define void @latch_branch_cost(ptr %dst) { ; PRED-NEXT: br label [[VECTOR_BODY:%.*]] ; PRED: vector.body: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], <i64 99, i64 99, i64 99, i64 99> -; PRED-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] +; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], <i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99> +; PRED-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 ; PRED-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; PRED: pred.store.if: ; PRED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 @@ -773,7 +773,7 @@ define void @latch_branch_cost(ptr %dst) { ; PRED-NEXT: store i8 0, ptr [[TMP3]], align 1 ; PRED-NEXT: br label [[PRED_STORE_CONTINUE]] ; PRED: pred.store.continue: -; PRED-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; PRED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 ; PRED-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] ; PRED: pred.store.if1: ; PRED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 @@ -781,7 +781,7 @@ define void @latch_branch_cost(ptr %dst) { ; PRED-NEXT: store i8 0, ptr [[TMP6]], align 1 ; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]] ; PRED: pred.store.continue2: -; PRED-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; PRED-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 ; PRED-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] ; PRED: pred.store.if3: ; PRED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2 @@ -789,28 +789,60 @@ define void @latch_branch_cost(ptr %dst) { ; PRED-NEXT: store i8 0, ptr [[TMP9]], align 1 ; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]] ; PRED: pred.store.continue4: -; PRED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 -; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; PRED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] ; PRED: pred.store.if5: ; PRED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3 ; PRED-NEXT: 
[[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] ; PRED-NEXT: store i8 0, ptr [[TMP12]], align 1 -; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]] +; PRED-NEXT: br label [[PRED_STORE_CONTINUE7]] ; PRED: pred.store.continue6: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> -; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 -; PRED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; PRED-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; PRED-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] +; PRED: pred.store.if7: +; PRED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4 +; PRED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]] +; PRED-NEXT: store i8 0, ptr [[TMP15]], align 1 +; PRED-NEXT: br label [[PRED_STORE_CONTINUE8]] +; PRED: pred.store.continue8: +; PRED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; PRED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; PRED: pred.store.if9: +; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5 +; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]] +; PRED-NEXT: store i8 0, ptr [[TMP18]], align 1 +; PRED-NEXT: br label [[PRED_STORE_CONTINUE10]] +; PRED: pred.store.continue10: +; PRED-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; PRED-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; PRED: pred.store.if11: +; PRED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6 +; PRED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP20]] +; PRED-NEXT: store i8 0, ptr [[TMP21]], align 1 +; PRED-NEXT: br label [[PRED_STORE_CONTINUE12]] +; PRED: pred.store.continue12: +; PRED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; PRED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF13:%.*]], 
label [[PRED_STORE_CONTINUE6]] +; PRED: pred.store.if13: +; PRED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7 +; PRED-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]] +; PRED-NEXT: store i8 0, ptr [[TMP24]], align 1 +; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]] +; PRED: pred.store.continue14: +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8> +; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104 +; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 104, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; PRED-NEXT: br label [[FOR_BODY:%.*]] ; PRED: loop: -; PRED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; PRED-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDVARS_IV]] -; PRED-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 -; PRED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] +; PRED-NEXT: store i8 0, ptr [[GEP]], align 1 +; PRED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[IV]], 1 ; PRED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 ; PRED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; PRED: exit: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index af5decb0d340e..c85ae6dba73ed 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -856,16 +856,67 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED-LABEL: define void @exit_cond_zext_iv( ; PRED-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; PRED-NEXT: entry: +; PRED-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) +; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; PRED: vector.scevcheck: +; PRED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) +; PRED-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 +; PRED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 +; PRED-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32 +; PRED-NEXT: [[TMP3:%.*]] = add i32 1, [[TMP2]] +; PRED-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1 +; PRED-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 +; PRED-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] +; PRED-NEXT: br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; PRED: vector.ph: +; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1 +; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 +; PRED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1 +; PRED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer ; PRED-NEXT: br label [[LOOP:%.*]] -; PRED: loop: -; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] -; PRED-NEXT: [[IV_CONV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_EXT:%.*]], [[LOOP]] ] +; PRED: vector.body: +; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] +; PRED-NEXT: 
[[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; PRED-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1> +; PRED-NEXT: [[TMP7:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]] +; PRED-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; PRED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; PRED: pred.store.if: +; PRED-NEXT: [[IV_CONV:%.*]] = add i64 [[INDEX]], 0 ; PRED-NEXT: [[GEP:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV]], i32 2 ; PRED-NEXT: store i32 0, ptr [[GEP]], align 8 +; PRED-NEXT: br label [[PRED_STORE_CONTINUE]] +; PRED: pred.store.continue: +; PRED-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; PRED-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; PRED: pred.store.if5: +; PRED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 1 +; PRED-NEXT: [[TMP13:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP12]], i32 2 +; PRED-NEXT: store i32 0, ptr [[TMP13]], align 8 +; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]] +; PRED: pred.store.continue6: +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; PRED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; PRED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; PRED: middle.block: +; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; PRED: scalar.ph: +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: br label [[LOOP1:%.*]] +; PRED: loop: +; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ 
[[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP1]] ] +; PRED-NEXT: [[IV_CONV1:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_EXT:%.*]], [[LOOP1]] ] +; PRED-NEXT: [[GEP1:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV1]], i32 2 +; PRED-NEXT: store i32 0, ptr [[GEP1]], align 8 ; PRED-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; PRED-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64 ; PRED-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]] -; PRED-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]] +; PRED-NEXT: br i1 [[C]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] ; PRED: exit: ; PRED-NEXT: ret void ; @@ -913,4 +964,6 @@ attributes #0 = { "target-features"="+sve" } ; PRED: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} ; PRED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} ; PRED: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} +; PRED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; PRED: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} ;.