diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0c81f0bfd3a006..68889bb7823340 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -162,7 +162,31 @@ class IntrinsicCostAttributes { bool skipScalarizationCost() const { return ScalarizationCost.isValid(); } }; -enum class PredicationStyle { None, Data, DataAndControlFlow }; +enum class TailFoldingStyle { + /// Don't use tail folding + None, + /// Use predicate only to mask operations on data in the loop. + /// When the VL is not known to be a power-of-2, this method requires a + /// runtime overflow check for the i + VL in the loop because it compares the + /// scalar induction variable against the tripcount rounded up by VL which may + /// overflow. When the VL is a power-of-2, both the increment and uprounded + /// tripcount will overflow to 0, which does not require a runtime check + /// since the loop is exited when the loop induction variable equals the + /// uprounded trip-count, which are both 0. + Data, + /// Same as Data, but avoids using the get.active.lane.mask intrinsic to + /// calculate the mask and instead implements this with a + /// splat/stepvector/cmp. + /// FIXME: Can this kind be removed now that SelectionDAGBuilder expands the + /// active.lane.mask intrinsic when it is not natively supported? + DataWithoutLaneMask, + /// Use predicate to control both data and control flow. + /// This method always requires a runtime overflow check for the i + VL + /// increment inside the loop, because it uses the result directly in the + /// active.lane.mask to calculate the mask for the next iteration. If the + /// increment overflows, the mask is no longer correct. 
+ DataAndControlFlow, +}; class TargetTransformInfo; typedef TargetTransformInfo TTI; @@ -516,13 +540,8 @@ class TargetTransformInfo { LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI) const; - /// Query the target whether lowering of the llvm.get.active.lane.mask - /// intrinsic is supported and how the mask should be used. A return value - /// of PredicationStyle::Data indicates the mask is used as data only, - /// whereas PredicationStyle::DataAndControlFlow indicates we should also use - /// the mask for control flow in the loop. If unsupported the return value is - /// PredicationStyle::None. - PredicationStyle emitGetActiveLaneMask() const; + /// Query the target what the preferred style of tail folding is. + TailFoldingStyle getPreferredTailFoldingStyle() const; // Parameters that control the loop peeling transformation struct PeelingPreferences { @@ -1616,7 +1635,7 @@ class TargetTransformInfo::Concept { AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI) = 0; - virtual PredicationStyle emitGetActiveLaneMask() = 0; + virtual TailFoldingStyle getPreferredTailFoldingStyle() = 0; virtual std::optional instCombineIntrinsic( InstCombiner &IC, IntrinsicInst &II) = 0; virtual std::optional simplifyDemandedUseBitsIntrinsic( @@ -2016,8 +2035,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { InterleavedAccessInfo *IAI) override { return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI); } - PredicationStyle emitGetActiveLaneMask() override { - return Impl.emitGetActiveLaneMask(); + TailFoldingStyle getPreferredTailFoldingStyle() override { + return Impl.getPreferredTailFoldingStyle(); } std::optional instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 21d10482cf3688..480be9f723f23a 100644 --- 
a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -171,8 +171,8 @@ class TargetTransformInfoImplBase { return false; } - PredicationStyle emitGetActiveLaneMask() const { - return PredicationStyle::None; + TailFoldingStyle getPreferredTailFoldingStyle() const { + return TailFoldingStyle::DataWithoutLaneMask; } std::optional instCombineIntrinsic(InstCombiner &IC, diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 77dd3157d07064..ce1caafb92fb9d 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -630,8 +630,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI); } - PredicationStyle emitGetActiveLaneMask() { - return BaseT::emitGetActiveLaneMask(); + TailFoldingStyle getPreferredTailFoldingStyle() { + return BaseT::getPreferredTailFoldingStyle(); } std::optional instCombineIntrinsic(InstCombiner &IC, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index e8e0fcef725682..e9c01e68fde2c9 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -312,8 +312,8 @@ bool TargetTransformInfo::preferPredicateOverEpilogue( return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI); } -PredicationStyle TargetTransformInfo::emitGetActiveLaneMask() const { - return TTIImpl->emitGetActiveLaneMask(); +TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle() const { + return TTIImpl->getPreferredTailFoldingStyle(); } std::optional diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index a22ba4720e0240..a987c8ef6a7fa2 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ 
b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -347,10 +347,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { return ST->hasSVE() ? 5 : 0; } - PredicationStyle emitGetActiveLaneMask() const { + TailFoldingStyle getPreferredTailFoldingStyle() const { if (ST->hasSVE()) - return PredicationStyle::DataAndControlFlow; - return PredicationStyle::None; + return TailFoldingStyle::DataAndControlFlow; + return TailFoldingStyle::DataWithoutLaneMask; } bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 048790afb4962a..53953cebaa1e5a 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2286,15 +2286,15 @@ bool ARMTTIImpl::preferPredicateOverEpilogue( return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI()); } -PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const { +TailFoldingStyle ARMTTIImpl::getPreferredTailFoldingStyle() const { if (!ST->hasMVEIntegerOps() || !EnableTailPredication) - return PredicationStyle::None; + return TailFoldingStyle::DataWithoutLaneMask; // Intrinsic @llvm.get.active.lane.mask is supported. // It is used in the MVETailPredication pass, which requires the number of // elements processed by this vector loop to setup the tail-predicated // loop. 
- return PredicationStyle::Data; + return TailFoldingStyle::Data; } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 69b7a31d487ce0..ede400f9ecbc9e 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -312,7 +312,7 @@ class ARMTTIImpl : public BasicTTIImplBase { TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE); - PredicationStyle emitGetActiveLaneMask() const; + TailFoldingStyle getPreferredTailFoldingStyle() const; void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index c0da4bda949493..143079c470fb97 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -76,9 +76,9 @@ class RISCVTTIImpl : public BasicTTIImplBase { bool shouldExpandReduction(const IntrinsicInst *II) const; bool supportsScalableVectors() const { return ST->hasVInstructions(); } bool enableScalableVectorization() const { return ST->hasVInstructions(); } - PredicationStyle emitGetActiveLaneMask() const { - return ST->hasVInstructions() ? PredicationStyle::Data - : PredicationStyle::None; + TailFoldingStyle getPreferredTailFoldingStyle() const { + return ST->hasVInstructions() ? 
TailFoldingStyle::Data + : TailFoldingStyle::DataWithoutLaneMask; } std::optional getMaxVScale() const; std::optional getVScaleForTuning() const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2c8fd17a9bdb67..40d5867f3df524 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1538,14 +1538,23 @@ class LoopVectorizationCostModel { return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } + /// Returns the TailFoldingStyle that is best for the current loop. + TailFoldingStyle getTailFoldingStyle() const { + if (!CanFoldTailByMasking) + return TailFoldingStyle::None; + + return TTI.getPreferredTailFoldingStyle(); + } + /// Returns true if all loop blocks should be masked to fold tail loop. - bool foldTailByMasking() const { return FoldTailByMasking; } + bool foldTailByMasking() const { + return getTailFoldingStyle() != TailFoldingStyle::None; + } /// Returns true if were tail-folding and want to use the active lane mask /// for vector loop control flow. bool useActiveLaneMaskForControlFlow() const { - return FoldTailByMasking && - TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow; + return getTailFoldingStyle() == TailFoldingStyle::DataAndControlFlow; } /// Returns true if the instructions in this block requires predication @@ -1715,7 +1724,7 @@ class LoopVectorizationCostModel { ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool FoldTailByMasking = false; + bool CanFoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the @@ -5134,7 +5143,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // by masking. 
// FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { - FoldTailByMasking = true; + CanFoldTailByMasking = true; return MaxFactors; } @@ -5292,7 +5301,7 @@ bool LoopVectorizationCostModel::isMoreProfitable( unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); - if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && + if (!A.Width.isScalable() && !B.Width.isScalable() && foldTailByMasking() && MaxTripCount) { // If we are folding the tail and the trip count is a known (possibly small) // constant, the trip count will be rounded up to an integer number of @@ -8098,8 +8107,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. - PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); - if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) + TailFoldingStyle Style = CM.getTailFoldingStyle(); + if (Style == TailFoldingStyle::DataAndControlFlow) return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); // Introduce the early-exit compare IV <= BTC to form header block mask. @@ -8115,7 +8124,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (EmitGetActiveLaneMask != PredicationStyle::None) { + if (Style != TailFoldingStyle::None && + Style != TailFoldingStyle::DataWithoutLaneMask) { VPValue *TC = Plan->getOrCreateTripCount(); BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, nullptr, "active.lane.mask"); @@ -8712,8 +8722,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // Add the necessary canonical IV and branch recipes required to control the // loop. 
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW, - bool UseLaneMaskForLoopControlFlow) { + TailFoldingStyle Style) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddVPValue(StartIdx); @@ -8725,6 +8734,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar // IV by VF * UF. + bool HasNUW = Style == TailFoldingStyle::None; auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, @@ -8734,7 +8744,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); - if (UseLaneMaskForLoopControlFlow) { + if (Style == TailFoldingStyle::DataAndControlFlow) { // Create the active lane mask instruction in the vplan preheader. VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); @@ -8887,8 +8897,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking(), - CM.useActiveLaneMaskForControlFlow()); + CM.getTailFoldingStyle()); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9198,7 +9207,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true, CM.useActiveLaneMaskForControlFlow()); + CM.getTailFoldingStyle()); return Plan; }