Skip to content

Commit a8810d5

Browse files
committed
[VPlan] Use BlockFrequencyInfo in getPredBlockCostDivisor
1 parent ffa3283 commit a8810d5

29 files changed

+689
-1188
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -578,8 +578,10 @@ class InnerLoopVectorizer {
578578
/// The profitability analysis.
579579
LoopVectorizationCostModel *Cost;
580580

581-
/// BFI and PSI are used to check for profile guided size optimizations.
581+
/// Used to calculate the probability of predicated blocks in
582+
/// getPredBlockCostDivisor.
582583
BlockFrequencyInfo *BFI;
584+
/// Used to check for profile guided size optimizations.
583585
ProfileSummaryInfo *PSI;
584586

585587
/// Structure to hold information about generated runtime checks, responsible
@@ -900,7 +902,7 @@ class LoopVectorizationCostModel {
900902
InterleavedAccessInfo &IAI,
901903
ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
902904
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
903-
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
905+
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), BFI(BFI), TheFunction(F),
904906
Hints(Hints), InterleaveInfo(IAI) {
905907
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
906908
initializeVScaleForTuning();
@@ -1249,6 +1251,17 @@ class LoopVectorizationCostModel {
12491251
/// Superset of instructions that return true for isScalarWithPredication.
12501252
bool isPredicatedInst(Instruction *I) const;
12511253

1254+
/// A helper function that returns how much we should divide the cost of a
1255+
/// predicated block by. Typically this is the reciprocal of the block
1256+
/// probability, i.e. if we return X we are assuming the predicated block will
1257+
/// execute once for every X iterations of the loop header so the block should
1258+
/// only contribute 1/X of its cost to the total cost calculation, but when
1259+
/// optimizing for code size it will just be 1 as code size costs don't depend
1260+
/// on execution probabilities.
1261+
inline unsigned
1262+
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
1263+
const BasicBlock *BB) const;
1264+
12521265
/// Return the costs for our two available strategies for lowering a
12531266
/// div/rem operation which requires speculating at least one lane.
12541267
/// First result is for scalarization (will be invalid for scalable
@@ -1711,6 +1724,8 @@ class LoopVectorizationCostModel {
17111724
/// Interface to emit optimization remarks.
17121725
OptimizationRemarkEmitter *ORE;
17131726

1727+
const BlockFrequencyInfo *BFI;
1728+
17141729
const Function *TheFunction;
17151730

17161731
/// Loop Vectorize Hint.
@@ -2863,6 +2878,19 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
28632878
}
28642879
}
28652880

2881+
/// Returns the value the cost of predicated block \p BB should be divided by.
///
/// For TTI::TCK_CodeSize the result is always 1, since the block's cost is
/// not scaled in that mode. Otherwise the result is the block frequency of
/// the loop header divided by the block frequency of \p BB, both taken from
/// BFI, clamped to the range [1, UINT_MAX] so that callers can always divide
/// by the result safely.
///
/// \param CostKind The kind of cost being computed.
/// \param BB       The predicated block whose cost is being scaled.
/// \returns A non-zero divisor.
unsigned LoopVectorizationCostModel::getPredBlockCostDivisor(
    TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const {
  if (CostKind == TTI::TCK_CodeSize)
    return 1;

  const uint64_t HeaderFreq =
      BFI->getBlockFreq(TheLoop->getHeader()).getFrequency();
  const uint64_t BBFreq = BFI->getBlockFreq(BB).getFrequency();
  assert(HeaderFreq >= BBFreq &&
         "Header has smaller block freq than dominated BB?");

  // Never divide by zero. NOTE(review): whether BFI can actually report a
  // frequency of 0 for BB is not visible from here; if it does, conservatively
  // apply no discount.
  if (BBFreq == 0)
    return 1;

  // Reuse the frequencies queried above instead of asking BFI again; this also
  // keeps the locals used when asserts are compiled out.
  const uint64_t Divisor = HeaderFreq / BBFreq;

  // A quotient of 0 is only possible if the asserted invariant does not hold
  // (release builds); callers divide by the result, so never hand back 0.
  if (Divisor == 0)
    return 1;

  // Saturate rather than silently truncate the 64-bit quotient: a truncated
  // value could wrap to 0 and make callers divide by zero.
  const uint64_t MaxDivisor = ~0U;
  if (Divisor > MaxDivisor)
    return ~0U;
  return static_cast<unsigned>(Divisor);
}
2893+
28662894
std::pair<InstructionCost, InstructionCost>
28672895
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
28682896
ElementCount VF) const {
@@ -2899,7 +2927,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
28992927
// Scale the cost by the probability of executing the predicated blocks.
29002928
// This assumes the predicated block for each vector lane is equally
29012929
// likely.
2902-
ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
2930+
ScalarizationCost =
2931+
ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
29032932
}
29042933
InstructionCost SafeDivisorCost = 0;
29052934

@@ -5031,7 +5060,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
50315060
}
50325061

50335062
// Scale the total scalar cost by block probability.
5034-
ScalarCost /= getPredBlockCostDivisor(CostKind);
5063+
ScalarCost /= getPredBlockCostDivisor(CostKind, PredInst->getParent());
50355064

50365065
// Compute the discount. A non-negative discount means the vector version
50375066
// of the instruction costs more, and scalarizing would be beneficial.
@@ -5084,7 +5113,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
50845113
// cost by the probability of executing it. blockNeedsPredication from
50855114
// Legal is used so as to not include all blocks in tail folded loops.
50865115
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5087-
BlockCost /= getPredBlockCostDivisor(CostKind);
5116+
BlockCost /= getPredBlockCostDivisor(CostKind, BB);
50885117

50895118
Cost += BlockCost;
50905119
}
@@ -5162,7 +5191,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
51625191
// conditional branches, but may not be executed for each vector lane. Scale
51635192
// the cost by the probability of executing the predicated block.
51645193
if (isPredicatedInst(I)) {
5165-
Cost /= getPredBlockCostDivisor(CostKind);
5194+
Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
51665195

51675196
// Add the cost of an i1 extract and a branch
51685197
auto *VecI1Ty =
@@ -6710,6 +6739,11 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
67106739
SkipCostComputation.contains(UI);
67116740
}
67126741

6742+
/// Returns the divisor to apply to the cost of predicated block \p BB for
/// \p CostKind by forwarding the query to the cost model (CM).
unsigned VPCostContext::getPredBlockCostDivisor(
    TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const {
  const unsigned Divisor = CM.getPredBlockCostDivisor(CostKind, BB);
  return Divisor;
}
6746+
67136747
InstructionCost
67146748
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
67156749
VPCostContext &CostCtx) const {
@@ -10273,9 +10307,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
1027310307

1027410308
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
1027510309
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10276-
BFI = nullptr;
10277-
if (PSI && PSI->hasProfileSummary())
10278-
BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10310+
BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
1027910311
LoopVectorizeResult Result = runImpl(F);
1028010312
if (!Result.MadeAnyChange)
1028110313
return PreservedAnalyses::all();

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -855,7 +855,9 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
855855
// For the scalar case, we may not always execute the original predicated
856856
// block. Thus, scale the block's cost by the probability of executing it.
857857
if (VF.isScalar())
858-
return ThenCost / getPredBlockCostDivisor(Ctx.CostKind);
858+
if (auto *VPIRBB = dyn_cast<VPIRBasicBlock>(Then))
859+
return ThenCost / Ctx.getPredBlockCostDivisor(Ctx.CostKind,
860+
VPIRBB->getIRBasicBlock());
859861

860862
return ThenCost;
861863
}

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
5050
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
5151
int64_t Step);
5252

53-
/// A helper function that returns how much we should divide the cost of a
54-
/// predicated block by. Typically this is the reciprocal of the block
55-
/// probability, i.e. if we return X we are assuming the predicated block will
56-
/// execute once for every X iterations of the loop header so the block should
57-
/// only contribute 1/X of its cost to the total cost calculation, but when
58-
/// optimizing for code size it will just be 1 as code size costs don't depend
59-
/// on execution probabilities.
60-
///
61-
/// TODO: We should use actual block probability here, if available. Currently,
62-
/// we always assume predicated blocks have a 50% chance of executing.
63-
inline unsigned
64-
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
65-
return CostKind == TTI::TCK_CodeSize ? 1 : 2;
66-
}
67-
6853
/// A range of powers-of-2 vectorization factors with fixed start and
6954
/// adjustable end. The range includes start and excludes end, e.g.,:
7055
/// [1, 16) = {1, 2, 4, 8}
@@ -378,6 +363,9 @@ struct VPCostContext {
378363
InstructionCost getScalarizationOverhead(Type *ResultTy,
379364
ArrayRef<const VPValue *> Operands,
380365
ElementCount VF);
366+
367+
unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
368+
const BasicBlock *BB) const;
381369
};
382370

383371
/// This class can be used to assign names to VPValues. For VPValues without

llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
2525
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_UDIV_CONTINUE2]] ]
2626
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
2727
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4
28-
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
28+
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], splat (i64 1)
2929
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
3030
; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
3131
; CHECK: pred.udiv.if:
@@ -65,7 +65,7 @@ for.body:
6565
%r = phi i64 [ 0, %entry ], [ %var6, %for.inc ]
6666
%var0 = getelementptr inbounds i64, ptr %a, i64 %i
6767
%var2 = load i64, ptr %var0, align 4
68-
%cond0 = icmp sgt i64 %var2, 0
68+
%cond0 = icmp sgt i64 %var2, 1
6969
br i1 %cond0, label %if.then, label %for.inc
7070

7171
if.then:

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 18 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -611,63 +611,18 @@ exit:
611611
define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
612612
; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store(
613613
; COMMON-SAME: ptr [[DST:%.*]]) {
614-
; COMMON-NEXT: [[ENTRY:.*:]]
615-
; COMMON-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
616-
; COMMON: [[VECTOR_PH]]:
617-
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
618-
; COMMON: [[VECTOR_BODY]]:
619-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
620-
; COMMON: [[PRED_STORE_IF]]:
621-
; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0
622-
; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1
623-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]]
624-
; COMMON: [[PRED_STORE_CONTINUE]]:
625-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
626-
; COMMON: [[PRED_STORE_IF1]]:
627-
; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1
628-
; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1
629-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]]
630-
; COMMON: [[PRED_STORE_CONTINUE2]]:
631-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
632-
; COMMON: [[PRED_STORE_IF3]]:
633-
; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2
634-
; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1
635-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE4]]
636-
; COMMON: [[PRED_STORE_CONTINUE4]]:
637-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
638-
; COMMON: [[PRED_STORE_IF5]]:
639-
; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3
640-
; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1
641-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]]
642-
; COMMON: [[PRED_STORE_CONTINUE6]]:
643-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
644-
; COMMON: [[PRED_STORE_IF7]]:
645-
; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4
646-
; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1
647-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]]
648-
; COMMON: [[PRED_STORE_CONTINUE8]]:
649-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
650-
; COMMON: [[PRED_STORE_IF9]]:
651-
; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5
652-
; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1
653-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]]
654-
; COMMON: [[PRED_STORE_CONTINUE10]]:
655-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
656-
; COMMON: [[PRED_STORE_IF11]]:
657-
; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6
658-
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
659-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
660-
; COMMON: [[PRED_STORE_CONTINUE12]]:
661-
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
662-
; COMMON: [[PRED_STORE_IF13]]:
663-
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
664-
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
665-
; COMMON-NEXT: br label %[[EXIT]]
614+
; COMMON-NEXT: [[ENTRY:.*]]:
615+
; COMMON-NEXT: br label %[[EXIT1:.*]]
616+
; COMMON: [[EXIT1]]:
617+
; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[EXIT1]] ]
618+
; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8
619+
; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
620+
; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
621+
; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
622+
; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
623+
; COMMON-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[EXIT1]]
666624
; COMMON: [[EXIT]]:
667-
; COMMON-NEXT: br label %[[MIDDLE_BLOCK:.*]]
668-
; COMMON: [[MIDDLE_BLOCK]]:
669-
; COMMON-NEXT: br [[EXIT1:label %.*]]
670-
; COMMON: [[SCALAR_PH]]:
625+
; COMMON-NEXT: ret void
671626
;
672627
entry:
673628
br label %loop
@@ -863,7 +818,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
863818
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
864819
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
865820
; DEFAULT-NEXT: [[TMP80:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
866-
; DEFAULT-NEXT: br i1 [[TMP80]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
821+
; DEFAULT-NEXT: br i1 [[TMP80]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
867822
; DEFAULT: [[MIDDLE_BLOCK]]:
868823
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
869824
; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -1051,7 +1006,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
10511006
; PRED-NEXT: [[TMP84:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
10521007
; PRED-NEXT: [[TMP85:%.*]] = xor i1 [[TMP84]], true
10531008
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
1054-
; PRED-NEXT: br i1 [[TMP85]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
1009+
; PRED-NEXT: br i1 [[TMP85]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
10551010
; PRED: [[MIDDLE_BLOCK]]:
10561011
; PRED-NEXT: br [[EXIT:label %.*]]
10571012
; PRED: [[SCALAR_PH]]:
@@ -1109,7 +1064,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
11091064
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
11101065
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
11111066
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
1112-
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
1067+
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
11131068
; DEFAULT: [[MIDDLE_BLOCK]]:
11141069
; DEFAULT-NEXT: br label %[[SCALAR_PH]]
11151070
; DEFAULT: [[SCALAR_PH]]:
@@ -1157,7 +1112,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
11571112
; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
11581113
; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
11591114
; PRED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
1160-
; PRED-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
1115+
; PRED-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
11611116
; PRED: [[MIDDLE_BLOCK]]:
11621117
; PRED-NEXT: br [[EXIT:label %.*]]
11631118
; PRED: [[SCALAR_PH]]:
@@ -1240,8 +1195,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
12401195
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
12411196
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
12421197
; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
1243-
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 8)
1244-
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
1198+
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
12451199
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
12461200
; DEFAULT: [[VECTOR_MEMCHECK]]:
12471201
; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
@@ -1283,7 +1237,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
12831237
; DEFAULT-NEXT: store <vscale x 4 x i8> [[TMP23]], ptr [[TMP24]], align 1
12841238
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
12851239
; DEFAULT-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1286-
; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
1240+
; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
12871241
; DEFAULT: [[MIDDLE_BLOCK]]:
12881242
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
12891243
; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -1345,7 +1299,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
13451299
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
13461300
; PRED-NEXT: [[TMP28:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
13471301
; PRED-NEXT: [[TMP29:%.*]] = xor i1 [[TMP28]], true
1348-
; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
1302+
; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
13491303
; PRED: [[MIDDLE_BLOCK]]:
13501304
; PRED-NEXT: br [[EXIT:label %.*]]
13511305
; PRED: [[SCALAR_PH]]:

llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
5757
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane ir<%cmp3>
5858
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
5959
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
60-
; CHECK: LV: Minimum required TC for runtime checks to be profitable:160
61-
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160)
60+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:128
61+
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 128)
6262
; CHECK-NEXT: LV: Too many memory checks needed.
6363
entry:
6464
%p1 = alloca [1024 x i8]
@@ -105,7 +105,7 @@ loop.header:
105105
%gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
106106
%l = load i64, ptr %gep.src, align 1
107107
%t = trunc i64 %l to i1
108-
br i1 %t, label %exit.0, label %loop.latch
108+
br i1 %t, label %exit.0, label %loop.latch, !prof !0
109109

110110
loop.latch:
111111
%iv.next = add i64 %iv, 1
@@ -120,4 +120,6 @@ exit.1:
120120
ret i64 0
121121
}
122122

123+
!0 = !{!"branch_weights", i32 1, i32 1}
124+
123125
attributes #1 = { "target-features"="+sve" vscale_range(1,16) }

0 commit comments

Comments
 (0)