diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 505fb435e91e6..25bf49db0e073 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3908,7 +3908,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( continue; VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { @@ -4166,7 +4166,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( @@ -6876,7 +6876,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(), + OrigLoop); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7110,12 +7111,13 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The // legacy cost model doesn't properly model costs for such loops. assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || + !Legal->getLAI()->getSymbolicStrides().empty() || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) || @@ -8441,7 +8443,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // and mulacc-reduction are implemented. if (!CM.foldTailWithEVL()) { VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + *CM.PSE.getSE(), OrigLoop); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -9911,7 +9913,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind, *CM.PSE.getSE()); + CM.CostKind, *CM.PSE.getSE(), L); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 2aaabd9ebdd04..965426f86ff21 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -350,13 +350,14 @@ struct VPCostContext { SmallPtrSet SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; ScalarEvolution &SE; + const Loop *L; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, TargetTransformInfo::TargetCostKind CostKind, - ScalarEvolution &SE) + ScalarEvolution &SE, const Loop *L) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind), SE(SE) {} + CostKind(CostKind), SE(SE), L(L) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9a63c802047ea..bde62dd6dd4bc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3167,26 +3167,30 @@ bool VPReplicateRecipe::shouldPack() const { }); } -/// Returns true if \p Ptr is a pointer computation for which the legacy cost -/// model computes a SCEV expression when computing the address cost. -static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { +/// Returns a SCEV expression for \p Ptr if it is a pointer computation for +/// which the legacy cost model computes a SCEV expression when computing the +/// address cost. Computing SCEVs for VPValues is incomplete and returns +/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In +/// those cases we fall back to the legacy cost model. Otherwise return nullptr. +static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE, + const Loop *L) { auto *PtrR = Ptr->getDefiningRecipe(); if (!PtrR || !((isa(PtrR) && cast(PtrR)->getOpcode() == Instruction::GetElementPtr) || isa(PtrR) || match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue())))) - return false; + return nullptr; // We are looking for a GEP where all indices are either loop invariant or // inductions. for (VPValue *Opd : drop_begin(PtrR->operands())) { if (!Opd->isDefinedOutsideLoopRegions() && !isa(Opd)) - return false; + return nullptr; } - return true; + return vputils::getSCEVExprForVPValue(Ptr, SE, L); } /// Returns true if \p V is used as part of the address of another load or @@ -3354,9 +3358,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, bool IsLoad = UI->getOpcode() == Instruction::Load; const VPValue *PtrOp = getOperand(!IsLoad); - // TODO: Handle cases where we need to pass a SCEV to - // getAddressComputationCost. - if (shouldUseAddressAccessSCEV(PtrOp)) + const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L); + if (isa_and_nonnull(PtrSCEV)) break; Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); @@ -3374,7 +3377,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, InstructionCost ScalarCost = ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, - nullptr, Ctx.CostKind); + PtrSCEV, Ctx.CostKind); if (isSingleScalar()) return ScalarCost; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 4db92e7def3ed..54348c6e34488 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -75,7 +75,8 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { B == Plan.getBackedgeTakenCount(); } -const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { +const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V, + ScalarEvolution &SE, const Loop *L) { if (V->isLiveIn()) { if (Value *LiveIn = V->getLiveInIRValue()) return SE.getSCEV(LiveIn); @@ -86,6 +87,52 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { return TypeSwitch(V->getDefiningRecipe()) .Case( [](const VPExpandSCEVRecipe *R) { return R->getSCEV(); }) + .Case([&SE, L](const VPCanonicalIVPHIRecipe *R) { + if (!L) + return SE.getCouldNotCompute(); + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L, + SCEV::FlagAnyWrap); + }) + .Case([&SE, L](const VPDerivedIVRecipe *R) { + const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L); + const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L); + if (any_of(ArrayRef({Start, IV, Scale}), IsaPred)) + return SE.getCouldNotCompute(); + + return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()), + SE.getMulExpr(IV, SE.getTruncateOrSignExtend( + Scale, IV->getType()))); + }) + .Case([&SE, L](const VPScalarIVStepsRecipe *R) { + const SCEV *IV = getSCEVExprForVPValue(R->getOperand(0), SE, L); + const SCEV *Step = getSCEVExprForVPValue(R->getOperand(1), SE, L); + if (isa(IV) || isa(Step)) + return SE.getCouldNotCompute(); + return SE.getMulExpr(SE.getTruncateOrSignExtend(IV, Step->getType()), + Step); + }) + .Case([&SE, L](const VPReplicateRecipe *R) { + if (R->getOpcode() != Instruction::GetElementPtr) + return SE.getCouldNotCompute(); + + const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L); + if (isa(Base)) + return SE.getCouldNotCompute(); + + SmallVector IndexExprs; + for (VPValue *Index : drop_begin(R->operands())) { + const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L); + if (isa(IndexExpr)) + return SE.getCouldNotCompute(); + IndexExprs.push_back(IndexExpr); + } + + Type *SrcElementTy = cast(R->getUnderlyingInstr()) + ->getSourceElementType(); + return SE.getGEPExpr(Base, IndexExprs, SrcElementTy); + }) .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 37cd413da9079..c21a0e70c1392 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr); /// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no /// SCEV expression could be constructed. -const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE); +const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE, + const Loop *L = nullptr); /// Returns true if \p VPV is a single scalar, either because it produces the /// same value for all lanes or only has its first lane used.