Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1231,6 +1231,13 @@ class TargetTransformInfo {
LLVM_ABI bool
shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;

/// \return True if vectorization factors wider than those matching the
/// largest element type should be chosen conservatively. This only makes
/// sense when shouldMaximizeVectorBandwidth returns true.
/// \p K Register Kind for vectorization.
LLVM_ABI bool shouldMaximizeVectorBandwidthConservatively(
TargetTransformInfo::RegisterKind K) const;

/// \return The minimum vectorization factor for types of given element
/// bit width, or 0 if there is no minimum VF. The returned value only
/// applies when shouldMaximizeVectorBandwidth returns true.
Expand Down
5 changes: 5 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,11 @@ class TargetTransformInfoImplBase {
return false;
}

/// Default implementation of the conservative-bandwidth hook: opt out.
/// Targets override this to request that, when maximizing vector bandwidth,
/// wider VFs are only chosen after an additional conservative comparison.
/// \p K the register kind (fixed-width or scalable) being vectorized for.
virtual bool shouldMaximizeVectorBandwidthConservatively(
TargetTransformInfo::RegisterKind K) const {
return false;
}

virtual ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const {
return ElementCount::get(0, IsScalable);
}
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,11 @@ bool TargetTransformInfo::shouldMaximizeVectorBandwidth(
return TTIImpl->shouldMaximizeVectorBandwidth(K);
}

// Thin forwarding wrapper: delegates to the target's TTI implementation.
// See the documentation on the declaration in TargetTransformInfo.h.
bool TargetTransformInfo::shouldMaximizeVectorBandwidthConservatively(
TargetTransformInfo::RegisterKind K) const {
return TTIImpl->shouldMaximizeVectorBandwidthConservatively(K);
}

ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth,
bool IsScalable) const {
return TTIImpl->getMinimumVF(ElemWidth, IsScalable);
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,8 @@ class LoopVectorizationPlanner {
///
/// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
/// been retired.
InstructionCost cost(VPlan &Plan, ElementCount VF) const;
InstructionCost cost(VPlan &Plan, ElementCount VF,
bool CountsVecCalcOnly = false) const;

/// Precompute costs for certain instructions using the legacy cost model. The
/// function is used to bring up the VPlan-based cost model to initially avoid
Expand Down
77 changes: 63 additions & 14 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,11 @@ static cl::opt<bool> MaximizeBandwidth(
cl::desc("Maximize bandwidth when selecting vectorization factor which "
"will be determined by the smallest type in loop."));

static cl::opt<bool> MaximizeBandwidthConservatively(
"vectorizer-maximize-bandwidth-conservatively", cl::init(false), cl::Hidden,
cl::desc("When MaximizeBandwidth is enabled, a larger vector factor is "
"chosen conservatively."));

static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
Expand Down Expand Up @@ -962,9 +967,16 @@ class LoopVectorizationCostModel {
/// user options, for the given register kind.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);

/// \return True if maximizing vector bandwidth should be applied
/// conservatively by the target or user options, for the given register kind.
/// This only makes sense when useMaxBandwidth returns true.
bool useMaxBandwidthConservatively(TargetTransformInfo::RegisterKind RegKind);

/// \return True if register pressure should be calculated for the given VF.
bool shouldCalculateRegPressureForVF(ElementCount VF);

bool isVFForMaxBandwidth(ElementCount VF);

/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
/// 64 bit loop indices.
Expand Down Expand Up @@ -3812,11 +3824,15 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {

// Returns true if register pressure should be computed for \p VF. Pressure
// is only evaluated for the extra, wider VFs unlocked by MaxBandwidth
// (delegates entirely to isVFForMaxBandwidth).
bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF(
ElementCount VF) {
// Only calculate register pressure for VFs enabled by MaxBandwidth.
return isVFForMaxBandwidth(VF);
}

bool LoopVectorizationCostModel::isVFForMaxBandwidth(ElementCount VF) {
if (!useMaxBandwidth(VF.isScalable()
? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector))
return false;
// Only calculate register pressure for VFs enabled by MaxBandwidth.
return ElementCount::isKnownGT(
VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
: MaxPermissibleVFWithoutMaxBW.FixedVF);
Expand All @@ -3830,6 +3846,13 @@ bool LoopVectorizationCostModel::useMaxBandwidth(
Legal->hasVectorCallVariants())));
}

// Returns true if maximizing vector bandwidth should be applied
// conservatively for the given register kind. An explicit command-line
// setting always wins; absent one, defer to the target's preference.
bool LoopVectorizationCostModel::useMaxBandwidthConservatively(
    TargetTransformInfo::RegisterKind RegKind) {
  if (MaximizeBandwidthConservatively.getNumOccurrences() > 0)
    return MaximizeBandwidthConservatively;
  // Option left at its default (false): ask the target hook instead.
  return TTI.shouldMaximizeVectorBandwidthConservatively(RegKind);
}

ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
unsigned EstimatedVF = VF.getKnownMinValue();
Expand Down Expand Up @@ -6923,13 +6946,16 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
return Cost;
}

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
bool CountsVecCalcOnly) const {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
InstructionCost Cost;

if (!CountsVecCalcOnly)
Cost += precomputeCosts(Plan, VF, CostCtx);

// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
Cost += Plan.cost(VF, CostCtx, CountsVecCalcOnly);
#ifndef NDEBUG
unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
Expand Down Expand Up @@ -7105,8 +7131,25 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
continue;
}

if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
BestFactor = CurrentFactor;
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) {
if (CM.isVFForMaxBandwidth(VF) &&
CM.useMaxBandwidthConservatively(
VF.isScalable() ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector)) {
if (ElementCount::isKnownLT(BestFactor.Width, VF) &&
llvm::find(VFs, BestFactor.Width)) {
VectorizationFactor BestFactorVecCalc(
BestFactor.Width, cost(*P, BestFactor.Width, true), ScalarCost);
VectorizationFactor CurrentFactorVecCalc(VF, cost(*P, VF, true),
ScalarCost);
if (isMoreProfitable(CurrentFactorVecCalc, BestFactorVecCalc,
P->hasScalarTail()))
BestFactor = CurrentFactor;
}
} else {
BestFactor = CurrentFactor;
}
}

// If profitable add it to ProfitableVF list.
if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
Expand All @@ -7131,13 +7174,19 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
// legacy cost model doesn't properly model costs for such loops.
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
CostCtx, OrigLoop,
BestFactor.Width) ||
planContainsAdditionalSimplifications(
getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
" VPlan cost model and legacy cost model disagreed");
if (!CM.isVFForMaxBandwidth(LegacyVF.Width) ||
!CM.useMaxBandwidthConservatively(
LegacyVF.Width.isScalable()
? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector))
assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
CostCtx, OrigLoop,
BestFactor.Width) ||
planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
CostCtx, OrigLoop,
LegacyVF.Width)) &&
" VPlan cost model and legacy cost model disagreed");
assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
"when vectorizing, the scalar cost must be computed.");
#endif
Expand Down
54 changes: 41 additions & 13 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -800,10 +800,34 @@ void VPRegionBlock::execute(VPTransformState *State) {
State->Lane.reset();
}

// Returns the cost of this block. When \p CountsVecCalcOnly is true, only
// recipes that perform vector computation are counted (used by the
// conservative max-bandwidth comparison); all other recipes — notably the
// memory recipes, which are absent from the list below — contribute nothing.
InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx,
                                   bool CountsVecCalcOnly) {
  InstructionCost Cost = 0;
  for (VPRecipeBase &R : Recipes) {
    if (!CountsVecCalcOnly) {
      // Normal mode: every recipe contributes its cost.
      Cost += R.cost(VF, Ctx);
      continue;
    }
    switch (R.getVPDefID()) {
    case VPDef::VPActiveLaneMaskPHISC:
    case VPDef::VPBlendSC:
    case VPDef::VPFirstOrderRecurrencePHISC:
    case VPDef::VPPartialReductionSC:
    case VPDef::VPReductionPHISC:
    case VPDef::VPReductionSC:
    case VPDef::VPWidenCallSC:
    case VPDef::VPWidenCanonicalIVSC:
    case VPDef::VPWidenCastSC:
    case VPDef::VPWidenGEPSC:
    case VPDef::VPWidenIntOrFpInductionSC:
    case VPDef::VPWidenIntrinsicSC:
    case VPDef::VPWidenPHISC:
    case VPDef::VPWidenPointerInductionSC:
    case VPDef::VPWidenSC:
    case VPDef::VPWidenSelectSC:
      Cost += R.cost(VF, Ctx);
      break;
    default:
      // Explicit default keeps -Wswitch quiet and documents that all other
      // recipe kinds are intentionally excluded from this cost.
      break;
    }
  }
  return Cost;
}

Expand All @@ -826,11 +850,12 @@ const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
return Pred->getExitingBasicBlock();
}

InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) {
if (!isReplicator()) {
InstructionCost Cost = 0;
for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
Cost += Block->cost(VF, Ctx);
Cost += Block->cost(VF, Ctx, CountsVecCalcOnly);
InstructionCost BackedgeCost =
ForceTargetInstructionCost.getNumOccurrences()
? InstructionCost(ForceTargetInstructionCost.getNumOccurrences())
Expand All @@ -853,7 +878,7 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
// uniform condition.
using namespace llvm::VPlanPatternMatch;
VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
InstructionCost ThenCost = Then->cost(VF, Ctx);
InstructionCost ThenCost = Then->cost(VF, Ctx, CountsVecCalcOnly);

// For the scalar case, we may not always execute the original predicated
// block, Thus, scale the block's cost by the probability of executing it.
Expand Down Expand Up @@ -1016,19 +1041,22 @@ void VPlan::execute(VPTransformState *State) {
}
}

InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) {
// For now only return the cost of the vector loop region, ignoring any other
// blocks, like the preheader or middle blocks, except for checking them for
// recipes with invalid costs.
InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);
InstructionCost Cost =
getVectorLoopRegion()->cost(VF, Ctx, CountsVecCalcOnly);

// If the cost of the loop region is invalid or any recipe in the skeleton
// outside loop regions are invalid return an invalid cost.
if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(getEntry())),
[&VF, &Ctx](VPBasicBlock *VPBB) {
return !VPBB->cost(VF, Ctx).isValid();
}))
if (!Cost.isValid() ||
any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(getEntry())),
[&VF, &Ctx, &CountsVecCalcOnly](VPBasicBlock *VPBB) {
return !VPBB->cost(VF, Ctx, CountsVecCalcOnly).isValid();
}))
return InstructionCost::getInvalid();

return Cost;
Expand Down
12 changes: 8 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,8 @@ class LLVM_ABI_FOR_TEST VPBlockBase {
virtual void execute(VPTransformState *State) = 0;

/// Return the cost of the block.
virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly = false) = 0;

/// Return true if it is legal to hoist instructions into this block.
bool isLegalToHoistInto() {
Expand Down Expand Up @@ -3716,7 +3717,8 @@ class LLVM_ABI_FOR_TEST VPBasicBlock : public VPBlockBase {
void execute(VPTransformState *State) override;

/// Return the cost of this VPBasicBlock.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) override;

/// Return the position of the first non-phi node recipe in the block.
iterator getFirstNonPhi();
Expand Down Expand Up @@ -3897,7 +3899,8 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
void execute(VPTransformState *State) override;

// Return the cost of this region.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
Expand Down Expand Up @@ -4022,7 +4025,8 @@ class VPlan {
void execute(VPTransformState *State);

/// Return the cost of this plan.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
bool CountsVecCalcOnly = false);

VPBasicBlock *getEntry() { return Entry; }
const VPBasicBlock *getEntry() const { return Entry; }
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
; REQUIRES: asserts
; RUN: opt < %s -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
; RUN: opt < %s -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -vectorizer-maximize-bandwidth -vectorizer-maximize-bandwidth-conservatively -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=CHECK-CONS

define void @f(i32 %n, ptr noalias %a, ptr %b, ptr %c) {
; The following loop is an example where choosing a larger vector width reduces
; the number of instructions but may lead to performance degradation due to the
; FP pipeline becoming a bottleneck.
;
; void f(int n, short *restrict a, long *b, double *c) {
;   for (int i = 0; i < n; i++) {
;     a[i] = b[i] + c[i];
;   }
; }

; In the usual cost model, vscale x 8 is chosen.
; CHECK: Cost for VF vscale x 2: 8 (Estimated cost per lane: 4.0)
; CHECK: Cost for VF vscale x 4: 14 (Estimated cost per lane: 3.5)
; CHECK: Cost for VF vscale x 8: 26 (Estimated cost per lane: 3.2)
; CHECK: LV: Selecting VF: vscale x 8.

; In a conservative cost model, a larger vector width is chosen only if it is
; superior when compared solely based on the cost of the FP pipeline, in
; addition to the usual model.
; CHECK-CONS: Cost for VF vscale x 2: 3 (Estimated cost per lane: 1.5)
; CHECK-CONS: Cost for VF vscale x 4: 7 (Estimated cost per lane: 1.8)
; CHECK-CONS: Cost for VF vscale x 8: 15 (Estimated cost per lane: 1.9)
; CHECK-CONS: LV: Selecting VF: vscale x 2.

entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %wide.trip.count = zext nneg i32 %n to i64
  br label %for.body

for.cond.cleanup.loopexit: ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

; Loop body: the widest type is the i64/double pair (sitofp + fadd + fptosi),
; while the narrowest is the i16 store — this spread is what lets
; -vectorizer-maximize-bandwidth consider the wider VFs.
for.body: ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds nuw i64, ptr %b, i64 %indvars.iv
  %0 = load i64, ptr %arrayidx, align 8
  %conv = sitofp i64 %0 to double
  %arrayidx2 = getelementptr inbounds nuw double, ptr %c, i64 %indvars.iv
  %1 = load double, ptr %arrayidx2, align 8
  %add = fadd double %1, %conv
  %conv3 = fptosi double %add to i16
  %arrayidx5 = getelementptr inbounds nuw i16, ptr %a, i64 %indvars.iv
  store i16 %conv3, ptr %arrayidx5, align 2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}
Loading