Skip to content

Commit 319b4e7

Browse files
committed
[VPlan] Use BlockFrequencyInfo in getPredBlockCostDivisor
1 parent 994ac69 commit 319b4e7

34 files changed

+731
-1123
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -578,8 +578,10 @@ class InnerLoopVectorizer {
578578
/// The profitability analysis.
579579
LoopVectorizationCostModel *Cost;
580580

581-
/// BFI and PSI are used to check for profile guided size optimizations.
581+
/// Used to calculate the probability of predicated blocks in
582+
/// getPredBlockCostDivisor.
582583
BlockFrequencyInfo *BFI;
584+
/// Used to check for profile guided size optimizations.
583585
ProfileSummaryInfo *PSI;
584586

585587
/// Structure to hold information about generated runtime checks, responsible
@@ -900,7 +902,7 @@ class LoopVectorizationCostModel {
900902
InterleavedAccessInfo &IAI,
901903
ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
902904
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
903-
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
905+
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), BFI(BFI), TheFunction(F),
904906
Hints(Hints), InterleaveInfo(IAI) {
905907
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
906908
initializeVScaleForTuning();
@@ -1249,6 +1251,17 @@ class LoopVectorizationCostModel {
12491251
/// Superset of instructions that return true for isScalarWithPredication.
12501252
bool isPredicatedInst(Instruction *I) const;
12511253

1254+
/// A helper function that returns how much we should divide the cost of a
1255+
/// predicated block by. Typically this is the reciprocal of the block
1256+
/// probability, i.e. if we return X we are assuming the predicated block will
1257+
/// execute once for every X iterations of the loop header so the block should
1258+
/// only contribute 1/X of its cost to the total cost calculation, but when
1259+
/// optimizing for code size it will just be 1 as code size costs don't depend
1260+
/// on execution probabilities.
1261+
inline unsigned
1262+
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
1263+
const BasicBlock *BB) const;
1264+
12521265
/// Return the costs for our two available strategies for lowering a
12531266
/// div/rem operation which requires speculating at least one lane.
12541267
/// First result is for scalarization (will be invalid for scalable
@@ -1711,6 +1724,8 @@ class LoopVectorizationCostModel {
17111724
/// Interface to emit optimization remarks.
17121725
OptimizationRemarkEmitter *ORE;
17131726

1727+
const BlockFrequencyInfo *BFI;
1728+
17141729
const Function *TheFunction;
17151730

17161731
/// Loop Vectorize Hint.
@@ -2866,6 +2881,19 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
28662881
}
28672882
}
28682883

2884+
/// Returns the divisor to apply to the cost of the predicated block \p BB.
///
/// For code-size costing the divisor is always 1. Otherwise it is the block
/// frequency of the loop header divided by the block frequency of \p BB, both
/// taken from BFI. The division is an integer division, so the result is
/// truncated.
unsigned LoopVectorizationCostModel::getPredBlockCostDivisor(
    TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const {
  // Code size does not depend on how often a block executes.
  if (CostKind == TTI::TCK_CodeSize)
    return 1;

  uint64_t HeaderFreq = BFI->getBlockFreq(TheLoop->getHeader()).getFrequency();
  uint64_t BBFreq = BFI->getBlockFreq(BB).getFrequency();
  assert(HeaderFreq >= BBFreq &&
         "Header has smaller block freq than dominated BB?");

  // Dividing by a zero frequency would be undefined behaviour. Without a
  // usable frequency for BB, fall back to not scaling the cost at all.
  if (BBFreq == 0)
    return 1;

  // Reuse the frequencies queried above instead of asking BFI again; this
  // also keeps the locals used in builds where assert() compiles away.
  return HeaderFreq / BBFreq;
}
2896+
28692897
std::pair<InstructionCost, InstructionCost>
28702898
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
28712899
ElementCount VF) const {
@@ -2902,7 +2930,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
29022930
// Scale the cost by the probability of executing the predicated blocks.
29032931
// This assumes the predicated block for each vector lane is equally
29042932
// likely.
2905-
ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
2933+
ScalarizationCost =
2934+
ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
29062935
}
29072936

29082937
InstructionCost SafeDivisorCost = 0;
@@ -5035,7 +5064,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
50355064
}
50365065

50375066
// Scale the total scalar cost by block probability.
5038-
ScalarCost /= getPredBlockCostDivisor(CostKind);
5067+
ScalarCost /= getPredBlockCostDivisor(CostKind, PredInst->getParent());
50395068

50405069
// Compute the discount. A non-negative discount means the vector version
50415070
// of the instruction costs more, and scalarizing would be beneficial.
@@ -5088,7 +5117,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
50885117
// cost by the probability of executing it. blockNeedsPredication from
50895118
// Legal is used so as to not include all blocks in tail folded loops.
50905119
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5091-
BlockCost /= getPredBlockCostDivisor(CostKind);
5120+
BlockCost /= getPredBlockCostDivisor(CostKind, BB);
50925121

50935122
Cost += BlockCost;
50945123
}
@@ -5167,7 +5196,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
51675196
// conditional branches, but may not be executed for each vector lane. Scale
51685197
// the cost by the probability of executing the predicated block.
51695198
if (isPredicatedInst(I)) {
5170-
Cost /= getPredBlockCostDivisor(CostKind);
5199+
Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
51715200

51725201
// Add the cost of an i1 extract and a branch
51735202
auto *VecI1Ty =
@@ -6727,6 +6756,11 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
67276756
SkipCostComputation.contains(UI);
67286757
}
67296758

6759+
/// Forwards to the cost model (CM) to obtain the divisor for the cost of the
/// predicated block \p BB under \p CostKind.
unsigned VPCostContext::getPredBlockCostDivisor(
    TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const {
  const unsigned Divisor = CM.getPredBlockCostDivisor(CostKind, BB);
  return Divisor;
}
6763+
67306764
InstructionCost
67316765
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
67326766
VPCostContext &CostCtx) const {
@@ -10310,9 +10344,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
1031010344

1031110345
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
1031210346
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10313-
BFI = nullptr;
10314-
if (PSI && PSI->hasProfileSummary())
10315-
BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10347+
BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
1031610348
LoopVectorizeResult Result = runImpl(F);
1031710349
if (!Result.MadeAnyChange)
1031810350
return PreservedAnalyses::all();

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -855,7 +855,9 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
855855
// For the scalar case, we may not always execute the original predicated
856856
// block. Thus, scale the block's cost by the probability of executing it.
857857
if (VF.isScalar())
858-
return ThenCost / getPredBlockCostDivisor(Ctx.CostKind);
858+
if (auto *VPIRBB = dyn_cast<VPIRBasicBlock>(Then))
859+
return ThenCost / Ctx.getPredBlockCostDivisor(Ctx.CostKind,
860+
VPIRBB->getIRBasicBlock());
859861

860862
return ThenCost;
861863
}

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
5050
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
5151
int64_t Step);
5252

53-
/// A helper function that returns how much we should divide the cost of a
54-
/// predicated block by. Typically this is the reciprocal of the block
55-
/// probability, i.e. if we return X we are assuming the predicated block will
56-
/// execute once for every X iterations of the loop header so the block should
57-
/// only contribute 1/X of its cost to the total cost calculation, but when
58-
/// optimizing for code size it will just be 1 as code size costs don't depend
59-
/// on execution probabilities.
60-
///
61-
/// TODO: We should use actual block probability here, if available. Currently,
62-
/// we always assume predicated blocks have a 50% chance of executing.
63-
inline unsigned
64-
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
65-
return CostKind == TTI::TCK_CodeSize ? 1 : 2;
66-
}
67-
6853
/// A range of powers-of-2 vectorization factors with fixed start and
6954
/// adjustable end. The range includes start and excludes end, e.g.,:
7055
/// [1, 16) = {1, 2, 4, 8}
@@ -378,6 +363,9 @@ struct VPCostContext {
378363
InstructionCost getScalarizationOverhead(Type *ResultTy,
379364
ArrayRef<const VPValue *> Operands,
380365
ElementCount VF);
366+
367+
unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
368+
const BasicBlock *BB) const;
381369
};
382370

383371
/// This class can be used to assign names to VPValues. For VPValues without

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3170,7 +3170,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
31703170
// Scale the cost by the probability of executing the predicated blocks.
31713171
// This assumes the predicated block for each vector lane is equally
31723172
// likely.
3173-
ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
3173+
ScalarCost /= Ctx.getPredBlockCostDivisor(Ctx.CostKind, UI->getParent());
31743174
return ScalarCost;
31753175
}
31763176
case Instruction::Load:

llvm/test/Other/new-pm-defaults.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,8 @@
261261
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
262262
; CHECK-O-NEXT: Running pass: InjectTLIMappings
263263
; CHECK-O-NEXT: Running pass: LoopVectorizePass
264+
; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
265+
; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
264266
; CHECK-O-NEXT: Running pass: InferAlignmentPass
265267
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
266268
; CHECK-O-NEXT: Running pass: InstCombinePass

llvm/test/Other/new-pm-lto-defaults.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@
128128
; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on foo
129129
; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo
130130
; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo
131+
; CHECK-O23SZ-NEXT: Running analysis: BlockFrequencyAnalysis on foo
132+
; CHECK-O23SZ-NEXT: Running analysis: BranchProbabilityAnalysis on foo
131133
; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
132134
; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo
133135
; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo

llvm/test/Other/new-pm-thinlto-postlink-defaults.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,8 @@
180180
; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
181181
; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings
182182
; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
183+
; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
184+
; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
183185
; CHECK-POSTLINK-O-NEXT: Running pass: InferAlignmentPass
184186
; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
185187
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass

llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
2525
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_UDIV_CONTINUE2]] ]
2626
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
2727
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4
28-
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
28+
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], splat (i64 1)
2929
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
3030
; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
3131
; CHECK: pred.udiv.if:
@@ -65,7 +65,7 @@ for.body:
6565
%r = phi i64 [ 0, %entry ], [ %var6, %for.inc ]
6666
%var0 = getelementptr inbounds i64, ptr %a, i64 %i
6767
%var2 = load i64, ptr %var0, align 4
68-
%cond0 = icmp sgt i64 %var2, 0
68+
%cond0 = icmp sgt i64 %var2, 1
6969
br i1 %cond0, label %if.then, label %for.inc
7070

7171
if.then:

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 13 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -612,63 +612,18 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
612612
;
613613
; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store(
614614
; COMMON-SAME: ptr [[DST:%.*]]) {
615-
; COMMON-NEXT: [[ENTRY:.*:]]
616-
; COMMON-NEXT: br label %[[VECTOR_PH:.*]]
617-
; COMMON: [[VECTOR_PH]]:
618-
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
619-
; COMMON: [[VECTOR_BODY]]:
620-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
621-
; COMMON: [[PRED_STORE_IF]]:
622-
; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0
623-
; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1
624-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]]
625-
; COMMON: [[PRED_STORE_CONTINUE]]:
626-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
627-
; COMMON: [[PRED_STORE_IF1]]:
628-
; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1
629-
; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1
630-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]]
631-
; COMMON: [[PRED_STORE_CONTINUE2]]:
632-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
633-
; COMMON: [[PRED_STORE_IF3]]:
634-
; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2
635-
; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1
636-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE4]]
637-
; COMMON: [[PRED_STORE_CONTINUE4]]:
638-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
639-
; COMMON: [[PRED_STORE_IF5]]:
640-
; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3
641-
; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1
642-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]]
643-
; COMMON: [[PRED_STORE_CONTINUE6]]:
644-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
645-
; COMMON: [[PRED_STORE_IF7]]:
646-
; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4
647-
; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1
648-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]]
649-
; COMMON: [[PRED_STORE_CONTINUE8]]:
650-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
651-
; COMMON: [[PRED_STORE_IF9]]:
652-
; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5
653-
; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1
654-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]]
655-
; COMMON: [[PRED_STORE_CONTINUE10]]:
656-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
657-
; COMMON: [[PRED_STORE_IF11]]:
658-
; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6
659-
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
660-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
661-
; COMMON: [[PRED_STORE_CONTINUE12]]:
662-
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
663-
; COMMON: [[PRED_STORE_IF13]]:
664-
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
665-
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
666-
; COMMON-NEXT: br label %[[EXIT]]
667-
; COMMON: [[EXIT]]:
668-
; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
669-
; COMMON: [[SCALAR_PH]]:
670-
; COMMON-NEXT: br [[EXIT1:label %.*]]
671-
; COMMON: [[SCALAR_PH1:.*:]]
615+
; COMMON-NEXT: [[ENTRY:.*]]:
616+
; COMMON-NEXT: br label %[[EXIT1:.*]]
617+
; COMMON: [[EXIT1]]:
618+
; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[EXIT1]] ]
619+
; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8
620+
; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
621+
; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
622+
; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
623+
; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
624+
; COMMON-NEXT: br i1 [[EC]], label %[[SCALAR_PH1:.*]], label %[[EXIT1]]
625+
; COMMON: [[SCALAR_PH1]]:
626+
; COMMON-NEXT: ret void
672627
;
673628
entry:
674629
br label %loop
@@ -1241,8 +1196,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
12411196
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
12421197
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
12431198
; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
1244-
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 8)
1245-
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
1199+
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
12461200
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
12471201
; DEFAULT: [[VECTOR_MEMCHECK]]:
12481202
; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()

llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
5757
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane ir<%cmp3>
5858
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
5959
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
60-
; CHECK: LV: Minimum required TC for runtime checks to be profitable:160
61-
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160)
60+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:128
61+
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 128)
6262
; CHECK-NEXT: LV: Too many memory checks needed.
6363
entry:
6464
%p1 = alloca [1024 x i8]
@@ -105,7 +105,7 @@ loop.header:
105105
%gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
106106
%l = load i64, ptr %gep.src, align 1
107107
%t = trunc i64 %l to i1
108-
br i1 %t, label %exit.0, label %loop.latch
108+
br i1 %t, label %exit.0, label %loop.latch, !prof !0
109109

110110
loop.latch:
111111
%iv.next = add i64 %iv, 1
@@ -120,4 +120,6 @@ exit.1:
120120
ret i64 0
121121
}
122122

123+
!0 = !{!"branch_weights", i32 1, i32 1}
124+
123125
attributes #1 = { "target-features"="+sve" vscale_range(1,16) }

0 commit comments

Comments
 (0)