From 8720c85b69a96d0f5c7f23cf7eb1687c84cc9c6f Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 22 Sep 2025 15:19:26 +0100 Subject: [PATCH] [LV] Make ScalarEpilogue a LoopVectorizeHint Move ScalarEpilogueLowering from LoopVectorizationCostModel into LoopVectorizeHints, as it is really a kind of hint that's dependent on the user's preferences. The patch has the side-effect of costing the scalar-epilogue correctly in some corner cases. --- .../Vectorize/LoopVectorizationLegality.h | 50 +++- .../Vectorize/LoopVectorizationLegality.cpp | 94 +++++++- .../Transforms/Vectorize/LoopVectorize.cpp | 213 ++++-------------- .../LoopVectorize/RISCV/dead-ops-cost.ll | 4 +- .../LoopVectorize/RISCV/strided-accesses.ll | 6 +- 5 files changed, 192 insertions(+), 175 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index 405d4a742f37b..28bc90c6cf046 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -64,7 +64,8 @@ class LoopVectorizeHints { HK_FORCE, HK_ISVECTORIZED, HK_PREDICATE, - HK_SCALABLE + HK_SCALABLE, + HK_SCALAREPILOGUE }; /// Hint - associates name and validation with the hint value. @@ -97,6 +98,9 @@ class LoopVectorizeHints { /// Says whether we should use fixed width or scalable vectorization. Hint Scalable; + /// Hint specifying how we should lower the scalar epilogue. + Hint ScalarEpilogue; + /// Return the loop metadata prefix. static StringRef Prefix() { return "llvm.loop."; } @@ -121,6 +125,33 @@ class LoopVectorizeHints { SK_PreferScalable = 1 }; + /// Whether it is allowed to have the original scalar loop execute at least + /// once. 
This may be needed as a fallback loop in case runtime + /// aliasing/dependence checks fail, or to handle the tail/remainder + /// iterations when the trip count is unknown or doesn't divide by the VF, or + /// as a peel-loop to handle gaps in interleave-groups. Under optsize and when + /// the trip count is very small we don't allow any iterations to execute in + /// the scalar loop. + enum ScalarEpilogueKind { + // The default: allowing scalar epilogues. + SEK_Allowed, + + // Vectorization with OptForSize: don't allow epilogues. + SEK_NotAllowedOptSize, + + // A special case of vectorisation with OptForSize: loops with a very small + // trip count are considered for vectorization under OptForSize, thereby + // making sure the cost of their loop body is dominant, free of runtime + // guards and scalar iteration overheads. + SEK_NotAllowedLowTripLoop, + + // Loop hint predicate indicating an epilogue is undesired. + SEK_NotNeededUsePredicate, + + // Directive indicating we must either tail fold or not vectorize + SEK_NotAllowedUsePredicate + }; + LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced, OptimizationRemarkEmitter &ORE, const TargetTransformInfo *TTI = nullptr); @@ -156,6 +187,21 @@ class LoopVectorizeHints { return FK_Disabled; return (ForceKind)Force.Value; } + ScalarEpilogueKind getScalarEpilogue() const { + return static_cast(ScalarEpilogue.Value); + } + bool isScalarEpilogueAllowed() const { + return ScalarEpilogue.Value == SEK_Allowed; + } + void setScalarEpilogue(ScalarEpilogueKind SEK) { ScalarEpilogue.Value = SEK; } + + // Determine how to lower the scalar epilogue and set it. Depends on 1) + // optimising for minimum code-size, 2) predicate compiler options, 3) loop + // hints forcing predication, and 4) a TTI hook that analyses whether the loop + // is suitable for predication. 
+ void setScalarEpilogue(ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, + TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, + InterleavedAccessInfo *IAI); /// \return true if scalable vectorization has been explicitly disabled. bool isScalableVectorizationDisabled() const { @@ -196,6 +242,8 @@ class LoopVectorizeHints { /// Interface to emit optimization remarks. OptimizationRemarkEmitter &ORE; + + const TargetTransformInfo *TTI; }; /// This holds vectorization requirements that must be verified late in diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index ff35db14f7094..28ed2d6edf65e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -83,6 +83,34 @@ static cl::opt EnableHistogramVectorization( "enable-histogram-loop-vectorization", cl::init(false), cl::Hidden, cl::desc("Enables autovectorization of some loops containing histograms")); +// Option prefer-predicate-over-epilogue indicates that an epilogue is +// undesired, that predication is preferred, and this lists all options. I.e., +// the vectorizer will try to fold the tail-loop (epilogue) into the vector body +// and predicate the instructions accordingly. 
If tail-folding fails, there are +// different fallback strategies depending on these values: +enum class PreferPredicateTy { + ScalarEpilogue = 0, + PredicateElseScalarEpilogue, + PredicateOrDontVectorize +}; + +static cl::opt PreferPredicateOverEpilogue( + "prefer-predicate-over-epilogue", + cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, + cl::desc("Tail-folding and predication preferences over creating a scalar " + "epilogue loop."), + cl::values( + clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", + "Don't tail-predicate loops, create scalar epilogue"), + clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, + "predicate-else-scalar-epilogue", + "prefer tail-folding, create scalar epilogue if tail " + "folding fails."), + clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, + "predicate-dont-vectorize", + "prefers tail-folding, don't attempt vectorization if " + "tail-folding fails."))); + /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -100,6 +128,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { case HK_PREDICATE: case HK_SCALABLE: return (Val == 0 || Val == 1); + case HK_SCALAREPILOGUE: + return Val <= SEK_NotAllowedUsePredicate; } return false; } @@ -114,7 +144,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, IsVectorized("isvectorized", 0, HK_ISVECTORIZED), Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE), - TheLoop(L), ORE(ORE) { + ScalarEpilogue("scalarepilogue", SEK_Allowed, HK_SCALAREPILOGUE), + TheLoop(L), ORE(ORE), TTI(TTI) { // Populate values with existing loop metadata. 
getHintsFromMetadata(); @@ -302,8 +333,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, - &IsVectorized, &Predicate, &Scalable}; + Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, + &Predicate, &Scalable, &ScalarEpilogue}; for (auto *H : Hints) { if (Name == H->Name) { if (H->validate(Val)) @@ -315,6 +346,63 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { } } +void LoopVectorizeHints::setScalarEpilogue(ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, + TargetLibraryInfo *TLI, + LoopVectorizationLegality &LVL, + InterleavedAccessInfo *IAI) { + // 1) OptSize takes precedence over all other options, i.e. if this is set, + // don't look at hints or options, and don't request a scalar epilogue. + // (For PGSO, as shouldOptimizeForSize isn't currently accessible from + // LoopAccessInfo (due to code dependency and not being able to reliably get + // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection + // of strides in LoopAccessInfo::analyzeLoop() and vectorize without + // versioning when the vectorization is forced, unlike hasOptSize. So revert + // back to the old way and vectorize with versioning when forced. See D81345.) 
+ Function *F = TheLoop->getHeader()->getParent(); + if (F->hasOptSize() || + (llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI, + PGSOQueryType::IRPass) && + getForce() != LoopVectorizeHints::FK_Enabled)) { + ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedOptSize; + return; + } + + // 2) If set, obey the directives + if (PreferPredicateOverEpilogue.getNumOccurrences()) { + switch (PreferPredicateOverEpilogue) { + case PreferPredicateTy::ScalarEpilogue: + ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed; + return; + case PreferPredicateTy::PredicateElseScalarEpilogue: + ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate; + return; + case PreferPredicateTy::PredicateOrDontVectorize: + ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedUsePredicate; + return; + }; + } + + // 3) If set, obey the hints + switch (getPredicate()) { + case LoopVectorizeHints::FK_Enabled: + ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate; + return; + case LoopVectorizeHints::FK_Disabled: + ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed; + return; + }; + + // 4) if the TTI hook indicates this is profitable, request predication. + TailFoldingInfo TFI(TLI, &LVL, IAI); + if (TTI->preferPredicateOverEpilogue(&TFI)) { + ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate; + return; + } + + ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed; +} + // Return true if the inner loop \p Lp is uniform with regard to the outer loop // \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes // executing the inner loop will execute the same iterations). 
This check is diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ca092dcfcb492..7ed66969dd489 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -197,37 +197,6 @@ static cl::opt VectorizeMemoryCheckThreshold( "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks")); -// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, -// that predication is preferred, and this lists all options. I.e., the -// vectorizer will try to fold the tail-loop (epilogue) into the vector body -// and predicate the instructions accordingly. If tail-folding fails, there are -// different fallback strategies depending on these values: -namespace PreferPredicateTy { - enum Option { - ScalarEpilogue = 0, - PredicateElseScalarEpilogue, - PredicateOrDontVectorize - }; -} // namespace PreferPredicateTy - -static cl::opt PreferPredicateOverEpilogue( - "prefer-predicate-over-epilogue", - cl::init(PreferPredicateTy::ScalarEpilogue), - cl::Hidden, - cl::desc("Tail-folding and predication preferences over creating a scalar " - "epilogue loop."), - cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, - "scalar-epilogue", - "Don't tail-predicate loops, create scalar epilogue"), - clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, - "predicate-else-scalar-epilogue", - "prefer tail-folding, create scalar epilogue if tail " - "folding fails."), - clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, - "predicate-dont-vectorize", - "prefers tail-folding, don't attempt vectorization if " - "tail-folding fails."))); - static cl::opt ForceTailFoldingStyle( "force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), @@ -854,30 +823,6 @@ static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, } // end 
namespace llvm namespace llvm { - -// Loop vectorization cost-model hints how the scalar epilogue loop should be -// lowered. -enum ScalarEpilogueLowering { - - // The default: allowing scalar epilogues. - CM_ScalarEpilogueAllowed, - - // Vectorization with OptForSize: don't allow epilogues. - CM_ScalarEpilogueNotAllowedOptSize, - - // A special case of vectorisation with OptForSize: loops with a very small - // trip count are considered for vectorization under OptForSize, thereby - // making sure the cost of their loop body is dominant, free of runtime - // guards and scalar iteration overheads. - CM_ScalarEpilogueNotAllowedLowTripLoop, - - // Loop hint predicate indicating an epilogue is undesired. - CM_ScalarEpilogueNotNeededUsePredicate, - - // Directive indicating we must either tail fold or not vectorize - CM_ScalarEpilogueNotAllowedUsePredicate -}; - /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. 
This can happen because of @@ -889,19 +834,17 @@ class LoopVectorizationCostModel { friend class LoopVectorizationPlanner; public: - LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, - PredicatedScalarEvolution &PSE, LoopInfo *LI, - LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, + LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, - const LoopVectorizeHints *Hints, + LoopVectorizeHints &Hints, InterleavedAccessInfo &IAI, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) - : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), - TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), - Hints(Hints), InterleaveInfo(IAI) { + : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), + AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) { if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors) initializeVScaleForTuning(); CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput; @@ -970,7 +913,7 @@ class LoopVectorizationCostModel { /// the IsOrdered flag of RdxDesc is set and we do not allow reordering /// of FP operations. bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { - return !Hints->allowReordering() && RdxDesc.isOrdered(); + return !Hints.allowReordering() && RdxDesc.isOrdered(); } /// \returns The smallest bitwidth each instruction can be represented with. @@ -1280,7 +1223,7 @@ class LoopVectorizationCostModel { /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. 
bool requiresScalarEpilogue(bool IsVectorizing) const { - if (!isScalarEpilogueAllowed()) { + if (!Hints.isScalarEpilogueAllowed()) { LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); return false; } @@ -1301,12 +1244,6 @@ class LoopVectorizationCostModel { return false; } - /// Returns true if a scalar epilogue is not allowed due to optsize or a - /// loop hint annotation. - bool isScalarEpilogueAllowed() const { - return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; - } - /// Returns the TailFoldingStyle that is best for the current loop. TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { if (!ChosenTailFoldingStyle) @@ -1345,8 +1282,9 @@ class LoopVectorizationCostModel { return; // If for some reason EVL mode is unsupported, fallback to a scalar epilogue // if it's allowed, or DataWithoutLaneMask otherwise. - if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed || - ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) + if (Hints.isScalarEpilogueAllowed() || + Hints.getScalarEpilogue() == + LoopVectorizeHints::SEK_NotNeededUsePredicate) ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None}; else ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask, @@ -1558,15 +1496,6 @@ class LoopVectorizationCostModel { DenseMap> PredicatedBBsAfterVectorization; - /// Records whether it is allowed to have the original scalar loop execute at - /// least once. This may be needed as a fallback loop in case runtime - /// aliasing/dependence checks fail, or to handle the tail/remainder - /// iterations when the trip count is unknown or doesn't divide by the VF, - /// or as a peel-loop to handle gaps in interleave-groups. - /// Under optsize and when the trip count is very small we don't allow any - /// iterations to execute in the scalar loop. - ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - /// Control finally chosen tail folding style. 
The first element is used if /// the IV update may overflow, the second element - if it does not. std::optional> @@ -1713,8 +1642,8 @@ class LoopVectorizationCostModel { const Function *TheFunction; - /// Loop Vectorize Hint. - const LoopVectorizeHints *Hints; + /// Loop Vectorize Hints. + LoopVectorizeHints &Hints; /// The interleave access information contains groups of interleaved accesses /// with the same stride and close to each other. @@ -2976,7 +2905,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( Legal->isMaskRequired(I); bool LoadAccessWithGapsRequiresEpilogMasking = isa(I) && Group->requiresScalarEpilogue() && - !isScalarEpilogueAllowed(); + !Hints.isScalarEpilogueAllowed(); bool StoreAccessWithGapsRequiresMasking = isa(I) && !Group->isFull(); if (!PredicatedAccessRequiresMasking && @@ -3313,7 +3242,7 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) return false; - if (Hints->isScalableVectorizationDisabled()) { + if (Hints.isScalableVectorizationDisabled()) { reportVectorizationInfo("Scalable vectorization is explicitly disabled", "ScalableVectorizationDisabled", ORE, TheLoop); return false; @@ -3538,21 +3467,21 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return FixedScalableVFPair::getNone(); } - switch (ScalarEpilogueStatus) { - case CM_ScalarEpilogueAllowed: + switch (Hints.getScalarEpilogue()) { + case LoopVectorizeHints::SEK_Allowed: return computeFeasibleMaxVF(MaxTC, UserVF, false); - case CM_ScalarEpilogueNotAllowedUsePredicate: + case LoopVectorizeHints::SEK_NotAllowedUsePredicate: [[fallthrough]]; - case CM_ScalarEpilogueNotNeededUsePredicate: + case LoopVectorizeHints::SEK_NotNeededUsePredicate: LLVM_DEBUG( dbgs() << "LV: vector predicate hint/switch found.\n" << "LV: Not allowing scalar epilogue, creating predicated " << "vector loop.\n"); break; - case 
CM_ScalarEpilogueNotAllowedLowTripLoop: + case LoopVectorizeHints::SEK_NotAllowedLowTripLoop: // fallthrough as a special case of OptForSize - case CM_ScalarEpilogueNotAllowedOptSize: - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) + case LoopVectorizeHints::SEK_NotAllowedOptSize: + if (Hints.getScalarEpilogue() == LoopVectorizeHints::SEK_NotAllowedOptSize) LLVM_DEBUG( dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); else @@ -3636,7 +3565,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // If we have a low-trip-count, and the fixed-width VF is known to divide // the trip count but the scalable factor does not, use the fixed-width // factor in preference to allow the generation of a non-predicated loop. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop && + if (Hints.getScalarEpilogue() == + LoopVectorizeHints::SEK_NotAllowedLowTripLoop && NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) { LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will " "remain for any chosen VF.\n"); @@ -3677,14 +3607,16 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. 
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + if (Hints.getScalarEpilogue() == + LoopVectorizeHints::SEK_NotNeededUsePredicate) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + Hints.setScalarEpilogue(LoopVectorizeHints::SEK_Allowed); return MaxFactors; } - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { + if (Hints.getScalarEpilogue() == + LoopVectorizeHints::SEK_NotAllowedUsePredicate) { LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); return FixedScalableVFPair::getNone(); } @@ -4344,7 +4276,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( return Result; } - if (!CM.isScalarEpilogueAllowed()) { + if (!Hints.isScalarEpilogueAllowed()) { LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " "epilogue is allowed.\n"); return Result; @@ -4538,7 +4470,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - if (!CM.isScalarEpilogueAllowed()) + if (!Hints.isScalarEpilogueAllowed()) return 1; if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), @@ -5288,7 +5220,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, // Calculate the cost of the whole interleaved group. bool UseMaskForGaps = - (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || + (Group->requiresScalarEpilogue() && !Hints.isScalarEpilogueAllowed()) || (isa(I) && !Group->isFull()); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices, @@ -8718,7 +8650,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // single VPInterleaveRecipe at its insertion point. 
VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan, InterleaveGroups, RecipeBuilder, - CM.isScalarEpilogueAllowed()); + Hints.isScalarEpilogueAllowed()); // Replace VPValues for known constant strides. VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE, @@ -9155,7 +9087,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks( const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks(); if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) { assert((!CM.OptForSize || - CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) && + Hints.getForce() == LoopVectorizeHints::FK_Enabled) && "Cannot SCEV check stride or overflow when optimizing for size"); VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock, HasBranchWeights); @@ -9169,7 +9101,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks( if (CM.OptForSize) { assert( - CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled && + Hints.getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); ORE->emit([&]() { @@ -9227,55 +9159,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { State.set(this, DerivedIV, VPLane(0)); } -// Determine how to lower the scalar epilogue, which depends on 1) optimising -// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing -// predication, and 4) a TTI hook that analyses whether the loop is suitable -// for predication. -static ScalarEpilogueLowering getScalarEpilogueLowering( - Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, - LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { - // 1) OptSize takes precedence over all other options, i.e. if this is set, - // don't look at hints or options, and don't request a scalar epilogue. 
- // (For PGSO, as shouldOptimizeForSize isn't currently accessible from - // LoopAccessInfo (due to code dependency and not being able to reliably get - // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection - // of strides in LoopAccessInfo::analyzeLoop() and vectorize without - // versioning when the vectorization is forced, unlike hasOptSize. So revert - // back to the old way and vectorize with versioning when forced. See D81345.) - if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, - PGSOQueryType::IRPass) && - Hints.getForce() != LoopVectorizeHints::FK_Enabled)) - return CM_ScalarEpilogueNotAllowedOptSize; - - // 2) If set, obey the directives - if (PreferPredicateOverEpilogue.getNumOccurrences()) { - switch (PreferPredicateOverEpilogue) { - case PreferPredicateTy::ScalarEpilogue: - return CM_ScalarEpilogueAllowed; - case PreferPredicateTy::PredicateElseScalarEpilogue: - return CM_ScalarEpilogueNotNeededUsePredicate; - case PreferPredicateTy::PredicateOrDontVectorize: - return CM_ScalarEpilogueNotAllowedUsePredicate; - }; - } - - // 3) If set, obey the hints - switch (Hints.getPredicate()) { - case LoopVectorizeHints::FK_Enabled: - return CM_ScalarEpilogueNotNeededUsePredicate; - case LoopVectorizeHints::FK_Disabled: - return CM_ScalarEpilogueAllowed; - }; - - // 4) if the TTI hook indicates this is profitable, request predication. - TailFoldingInfo TFI(TLI, &LVL, IAI); - if (TTI->preferPredicateOverEpilogue(&TFI)) - return CM_ScalarEpilogueNotNeededUsePredicate; - - return CM_ScalarEpilogueAllowed; -} - // Process the loop in the VPlan-native vectorization path. 
This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -9296,11 +9179,9 @@ static bool processLoopInVPlanNativePath( Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = - getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); - - LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI, PSI, BFI); + Hints.setScalarEpilogue(PSI, BFI, TLI, *LVL, &IAI); + LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, + Hints, IAI, PSI, BFI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. @@ -9421,7 +9302,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, PredicatedScalarEvolution &PSE, VPCostContext &CostCtx, VPlan &Plan, - ScalarEpilogueLowering SEL, + const LoopVectorizeHints &Hints, std::optional VScale) { InstructionCost TotalCost = Checks.getCost(); if (!TotalCost.isValid()) @@ -9498,7 +9379,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, // epilogue is allowed, choose the next closest multiple of VF. This should // partly compensate for ignoring the epilogue cost. uint64_t MinTC = std::max(MinTC1, MinTC2); - if (SEL == CM_ScalarEpilogueAllowed) + if (Hints.isScalarEpilogueAllowed()) MinTC = alignTo(MinTC, IntVF); VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); @@ -9926,8 +9807,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. 
- ScalarEpilogueLowering SEL = - getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); + Hints.setScalarEpilogue(PSI, BFI, TLI, LVL, &IAI); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. @@ -9943,12 +9823,13 @@ bool LoopVectorizePass::processLoop(Loop *L) { LLVM_DEBUG(dbgs() << "\n"); // Predicate tail-folded loops are efficient even when the loop // iteration count is low. However, setting the epilogue policy to - // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops + // `SEK_NotAllowedLowTripLoop` prevents vectorizing loops // with runtime checks. It's more effective to let // `isOutsideLoopWorkProfitable` determine if vectorization is // beneficial for the loop. - if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) - SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; + if (Hints.getScalarEpilogue() != + LoopVectorizeHints::SEK_NotNeededUsePredicate) + Hints.setScalarEpilogue(LoopVectorizeHints::SEK_NotAllowedLowTripLoop); } } @@ -9999,8 +9880,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { } // Use the cost model. - LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, - F, &Hints, IAI, PSI, BFI); + LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, + Hints, IAI, PSI, BFI); // Use the planner for vectorization. 
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, ORE); @@ -10046,7 +9927,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.CostKind); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, - LVP.getPlanFor(VF.Width), SEL, + LVP.getPlanFor(VF.Width), Hints, CM.getVScaleForTuning())) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index 65373e4128b7f..49cc0b7ad5def 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -83,7 +83,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 6) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 8) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: @@ -168,7 +168,7 @@ define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 770, [[UMAX3]] ; CHECK-NEXT: [[SMAX4:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[SMAX4]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP2]], 19 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP2]], 24 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll 
b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index 4c84913eea23d..e7df49c9062e2 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -706,7 +706,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-UF2-NEXT: [[P21:%.*]] = ptrtoint ptr [[P2:%.*]] to i64 ; NOSTRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 -; NOSTRIDED-UF2-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 12) +; NOSTRIDED-UF2-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 16) ; NOSTRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]] ; NOSTRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; NOSTRIDED-UF2: vector.scevcheck: @@ -862,7 +862,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: entry: ; STRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 -; STRIDED-UF2-NEXT: [[UMAX9:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 79) +; STRIDED-UF2-NEXT: [[UMAX9:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 80) ; STRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX9]] ; STRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; STRIDED-UF2: vector.scevcheck: @@ -1224,7 +1224,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: entry: ; STRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 -; STRIDED-UF2-NEXT: [[UMAX6:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 28) +; STRIDED-UF2-NEXT: [[UMAX6:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 32) ; STRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX6]] 
; STRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; STRIDED-UF2: vector.memcheck: