diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index c4ba8e9857dc4..abf087281fe41 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1231,6 +1231,13 @@ class TargetTransformInfo { LLVM_ABI bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; + /// \return True if vectorization factors wider than those matching the + /// largest element type should be chosen conservatively. This only makes + /// sense when shouldMaximizeVectorBandwidth returns true. + /// \p K Register Kind for vectorization. + LLVM_ABI bool shouldMaximizeVectorBandwidthConservatively( + TargetTransformInfo::RegisterKind K) const; + /// \return The minimum vectorization factor for types of given element /// bit width, or 0 if there is no minimum VF. The returned value only /// applies when shouldMaximizeVectorBandwidth returns true. diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 43813d2f3acb5..6651505be9b86 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -597,6 +597,11 @@ class TargetTransformInfoImplBase { return false; } + virtual bool shouldMaximizeVectorBandwidthConservatively( + TargetTransformInfo::RegisterKind K) const { + return false; + } + virtual ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const { return ElementCount::get(0, IsScalable); } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 4ac8f03e6dbf5..0485581b8006c 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -803,6 +803,11 @@ bool TargetTransformInfo::shouldMaximizeVectorBandwidth( return TTIImpl->shouldMaximizeVectorBandwidth(K); } +bool 
TargetTransformInfo::shouldMaximizeVectorBandwidthConservatively( + TargetTransformInfo::RegisterKind K) const { + return TTIImpl->shouldMaximizeVectorBandwidthConservatively(K); +} + ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth, bool IsScalable) const { return TTIImpl->getMinimumVF(ElemWidth, IsScalable); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 838476dcae661..c747920b0a318 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -474,7 +474,8 @@ class LoopVectorizationPlanner { /// /// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has /// been retired. - InstructionCost cost(VPlan &Plan, ElementCount VF) const; + InstructionCost cost(VPlan &Plan, ElementCount VF, + bool CountsVecCalcOnly = false) const; /// Precompute costs for certain instructions using the legacy cost model. 
The /// function is used to bring up the VPlan-based cost model to initially avoid diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a0f306c12754f..a70c21353139d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -263,6 +263,11 @@ static cl::opt<bool> MaximizeBandwidth( cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop.")); +static cl::opt<bool> MaximizeBandwidthConservatively( + "vectorizer-maximize-bandwidth-conservatively", cl::init(false), cl::Hidden, + cl::desc("When MaximizeBandwidth is enabled, a larger vector factor is " + "chosen conservatively.")); + static cl::opt<bool> EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop")); @@ -962,9 +967,16 @@ class LoopVectorizationCostModel { /// user options, for the given register kind. bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind); + /// \return True if maximizing vector bandwidth should be applied + /// conservatively by the target or user options, for the given register kind. + /// This only makes sense when useMaxBandwidth returns true. + bool useMaxBandwidthConservatively(TargetTransformInfo::RegisterKind RegKind); + /// \return True if register pressure should be calculated for the given VF. bool shouldCalculateRegPressureForVF(ElementCount VF); + bool isVFForMaxBandwidth(ElementCount VF); + /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices.
@@ -3812,11 +3824,15 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF( ElementCount VF) { + // Only calculate register pressure for VFs enabled by MaxBandwidth. + return isVFForMaxBandwidth(VF); +} + +bool LoopVectorizationCostModel::isVFForMaxBandwidth(ElementCount VF) { if (!useMaxBandwidth(VF.isScalable() ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector)) return false; - // Only calculate register pressure for VFs enabled by MaxBandwidth. return ElementCount::isKnownGT( VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF : MaxPermissibleVFWithoutMaxBW.FixedVF); @@ -3830,6 +3846,13 @@ bool LoopVectorizationCostModel::useMaxBandwidth( Legal->hasVectorCallVariants()))); } +bool LoopVectorizationCostModel::useMaxBandwidthConservatively( + TargetTransformInfo::RegisterKind RegKind) { + return MaximizeBandwidthConservatively || + (MaximizeBandwidthConservatively.getNumOccurrences() == 0 && + TTI.shouldMaximizeVectorBandwidthConservatively(RegKind)); +} + ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount( ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const { unsigned EstimatedVF = VF.getKnownMinValue(); @@ -6923,13 +6946,16 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, return Cost; } -InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, - ElementCount VF) const { +InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF, + bool CountsVecCalcOnly) const { VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind); - InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); + InstructionCost Cost; + + if (!CountsVecCalcOnly) + Cost += precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. 
- Cost += Plan.cost(VF, CostCtx); + Cost += Plan.cost(VF, CostCtx, CountsVecCalcOnly); #ifndef NDEBUG unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning()); LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost @@ -7105,8 +7131,25 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { continue; } - if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) - BestFactor = CurrentFactor; + if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) { + if (CM.isVFForMaxBandwidth(VF) && + CM.useMaxBandwidthConservatively( + VF.isScalable() ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector)) { + if (ElementCount::isKnownLT(BestFactor.Width, VF) && + llvm::is_contained(VFs, BestFactor.Width)) { + VectorizationFactor BestFactorVecCalc( + BestFactor.Width, cost(*P, BestFactor.Width, true), ScalarCost); + VectorizationFactor CurrentFactorVecCalc(VF, cost(*P, VF, true), + ScalarCost); + if (isMoreProfitable(CurrentFactorVecCalc, BestFactorVecCalc, + P->hasScalarTail())) + BestFactor = CurrentFactor; + } + } else { + BestFactor = CurrentFactor; + } + } // If profitable add it to ProfitableVF list. if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail())) @@ -7131,13 +7174,19 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The // legacy cost model doesn't properly model costs for such loops.
- assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || - planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), - CostCtx, OrigLoop, - BestFactor.Width) || - planContainsAdditionalSimplifications( - getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) && - " VPlan cost model and legacy cost model disagreed"); + if (!CM.isVFForMaxBandwidth(LegacyVF.Width) || + !CM.useMaxBandwidthConservatively( + LegacyVF.Width.isScalable() + ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector)) + assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || + planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), + CostCtx, OrigLoop, + BestFactor.Width) || + planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width), + CostCtx, OrigLoop, + LegacyVF.Width)) && + " VPlan cost model and legacy cost model disagreed"); assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && "when vectorizing, the scalar cost must be computed."); #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f972efa07eb7e..3470de8e56871 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -800,10 +800,34 @@ void VPRegionBlock::execute(VPTransformState *State) { State->Lane.reset(); } -InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) { +InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx, + bool CountsVecCalcOnly) { InstructionCost Cost = 0; - for (VPRecipeBase &R : Recipes) - Cost += R.cost(VF, Ctx); + for (VPRecipeBase &R : Recipes) { + if (!CountsVecCalcOnly) + Cost += R.cost(VF, Ctx); + else { + switch (R.getVPDefID()) { + case VPDef::VPActiveLaneMaskPHISC: + case VPDef::VPBlendSC: + case VPDef::VPFirstOrderRecurrencePHISC: + case VPDef::VPPartialReductionSC: + case VPDef::VPReductionPHISC: + case VPDef::VPReductionSC: + case VPDef::VPWidenCallSC: 
+ case VPDef::VPWidenCanonicalIVSC: + case VPDef::VPWidenCastSC: + case VPDef::VPWidenGEPSC: + case VPDef::VPWidenIntOrFpInductionSC: + case VPDef::VPWidenIntrinsicSC: + case VPDef::VPWidenPHISC: + case VPDef::VPWidenPointerInductionSC: + case VPDef::VPWidenSC: + case VPDef::VPWidenSelectSC: + Cost += R.cost(VF, Ctx); + } + } + } return Cost; } @@ -826,11 +850,12 @@ const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const { return Pred->getExitingBasicBlock(); } -InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) { +InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx, + bool CountsVecCalcOnly) { if (!isReplicator()) { InstructionCost Cost = 0; for (VPBlockBase *Block : vp_depth_first_shallow(getEntry())) - Cost += Block->cost(VF, Ctx); + Cost += Block->cost(VF, Ctx, CountsVecCalcOnly); InstructionCost BackedgeCost = ForceTargetInstructionCost.getNumOccurrences() ? InstructionCost(ForceTargetInstructionCost.getNumOccurrences()) @@ -853,7 +878,7 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) { // uniform condition. using namespace llvm::VPlanPatternMatch; VPBasicBlock *Then = cast(getEntry()->getSuccessors()[0]); - InstructionCost ThenCost = Then->cost(VF, Ctx); + InstructionCost ThenCost = Then->cost(VF, Ctx, CountsVecCalcOnly); // For the scalar case, we may not always execute the original predicated // block, Thus, scale the block's cost by the probability of executing it. @@ -1016,19 +1041,22 @@ void VPlan::execute(VPTransformState *State) { } } -InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { +InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx, + bool CountsVecCalcOnly) { // For now only return the cost of the vector loop region, ignoring any other // blocks, like the preheader or middle blocks, expect for checking them for // recipes with invalid costs. 
- InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx); + InstructionCost Cost = + getVectorLoopRegion()->cost(VF, Ctx, CountsVecCalcOnly); // If the cost of the loop region is invalid or any recipe in the skeleton // outside loop regions are invalid return an invalid cost. - if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>( - vp_depth_first_shallow(getEntry())), - [&VF, &Ctx](VPBasicBlock *VPBB) { - return !VPBB->cost(VF, Ctx).isValid(); - })) + if (!Cost.isValid() || + any_of(VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(getEntry())), + [&VF, &Ctx, &CountsVecCalcOnly](VPBasicBlock *VPBB) { + return !VPBB->cost(VF, Ctx, CountsVecCalcOnly).isValid(); + })) return InstructionCost::getInvalid(); return Cost; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index d6bc462a0dfab..88f4f5dd24eaa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -340,7 +340,8 @@ class LLVM_ABI_FOR_TEST VPBlockBase { virtual void execute(VPTransformState *State) = 0; /// Return the cost of the block. - virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0; + virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx, + bool CountsVecCalcOnly = false) = 0; /// Return true if it is legal to hoist instructions into this block. bool isLegalToHoistInto() { @@ -3716,7 +3717,8 @@ class LLVM_ABI_FOR_TEST VPBasicBlock : public VPBlockBase { void execute(VPTransformState *State) override; /// Return the cost of this VPBasicBlock. - InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override; + InstructionCost cost(ElementCount VF, VPCostContext &Ctx, + bool CountsVecCalcOnly) override; /// Return the position of the first non-phi node recipe in the block. iterator getFirstNonPhi(); @@ -3897,7 +3899,8 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase { void execute(VPTransformState *State) override; // Return the cost of this region.
- InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override; + InstructionCost cost(ElementCount VF, VPCostContext &Ctx, + bool CountsVecCalcOnly) override; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with @@ -4022,7 +4025,8 @@ class VPlan { void execute(VPTransformState *State); /// Return the cost of this plan. - InstructionCost cost(ElementCount VF, VPCostContext &Ctx); + InstructionCost cost(ElementCount VF, VPCostContext &Ctx, + bool CountsVecCalcOnly = false); VPBasicBlock *getEntry() { return Entry; } const VPBasicBlock *getEntry() const { return Entry; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-conservatively.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-conservatively.ll new file mode 100644 index 0000000000000..441669c5f6dc6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-conservatively.ll @@ -0,0 +1,58 @@ +; REQUIRES: asserts +; RUN: opt < %s -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -debug-only=loop-vectorize 2>&1 | FileCheck %s +; RUN: opt < %s -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -vectorizer-maximize-bandwidth -vectorizer-maximize-bandwidth-conservatively -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=CHECK-CONS + +define void @f(i32 %n, ptr noalias %a, ptr %b, ptr %c) { +; The following loop is an example where choosing a larger vector width reduces +; the number of instructions but may lead to performance degradation due to the +; FP pipeline becoming a bottleneck. +; +; void f(int n, short *restrict a, long *b, double *c) { +; for (int i = 0; i < n; i++) { +; a[i] = b[i] + c[i]; +; } +; } + +; In the usual cost model, vscale x 8 is chosen. 
+; CHECK: Cost for VF vscale x 2: 8 (Estimated cost per lane: 4.0) +; CHECK: Cost for VF vscale x 4: 14 (Estimated cost per lane: 3.5) +; CHECK: Cost for VF vscale x 8: 26 (Estimated cost per lane: 3.2) +; CHECK: LV: Selecting VF: vscale x 8. + +; In a conservative cost model, a larger vector width is chosen only if it is +; superior when compared solely based on the cost of the FP pipeline, in +; addition to the usual model. +; CHECK-CONS: Cost for VF vscale x 2: 3 (Estimated cost per lane: 1.5) +; CHECK-CONS: Cost for VF vscale x 4: 7 (Estimated cost per lane: 1.8) +; CHECK-CONS: Cost for VF vscale x 8: 15 (Estimated cost per lane: 1.9) +; CHECK-CONS: LV: Selecting VF: vscale x 2. + +entry: + %cmp10 = icmp sgt i32 %n, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw i64, ptr %b, i64 %indvars.iv + %0 = load i64, ptr %arrayidx, align 8 + %conv = sitofp i64 %0 to double + %arrayidx2 = getelementptr inbounds nuw double, ptr %c, i64 %indvars.iv + %1 = load double, ptr %arrayidx2, align 8 + %add = fadd double %1, %conv + %conv3 = fptosi double %add to i16 + %arrayidx5 = getelementptr inbounds nuw i16, ptr %a, i64 %indvars.iv + store i16 %conv3, ptr %arrayidx5, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}