From 721c122b2105770a83f19318388a7ae6abebc5a3 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 22 Sep 2025 09:52:30 +0100 Subject: [PATCH 1/3] [VPlan] Compute cost of more replicating loads/stores in ::computeCost. Update VPReplicateRecipe::computeCost to compute costs of more replicating loads/stores. There are 2 cases that require extra checks to match the legacy cost model: 1. If the pointer is based on an induction, the legacy cost model passes its SCEV to getAddressComputationCost. In those cases, still fall back to the legacy cost model. SCEV computations will be added as a follow-up. 2. If a load is used as part of an address of another load, the legacy cost model skips the scalarization overhead. Those cases are currently handled by an isUsedByLoadStoreAddress helper. Note that getScalarizationOverhead also needs updating, because when the legacy cost model computes the scalarization overhead, scalars have not been collected yet, so we can't check for replicating recipes to skip their cost, except other loads. This again can be further improved by modeling inserts/extracts explicitly and consistently, and computing costs for those operations directly where needed. --- .../Transforms/Vectorize/LoopVectorize.cpp | 16 ++- llvm/lib/Transforms/Vectorize/VPlan.cpp | 11 +- llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 16 ++- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 116 ++++++++++++++++-- 4 files changed, 131 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ca092dcfcb492..bc6f6c5da3aa5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3925,7 +3925,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( if (VF.isScalar()) continue; - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { @@ -4182,7 +4183,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, + *CM.PSE.getSE()); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( @@ -6871,7 +6873,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -7082,7 +7084,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // simplifications not accounted for in the legacy cost model. If that's the // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. 
- VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The @@ -8704,7 +8707,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -10043,7 +10047,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind); + CM.CostKind, *CM.PSE.getSE()); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1e6f1e3aeb0ac..52a5a9053346d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1778,8 +1778,10 @@ VPCostContext::getOperandInfo(VPValue *V) const { return TTI::getOperandInfo(V->getLiveInIRValue()); } -InstructionCost VPCostContext::getScalarizationOverhead( - Type *ResultTy, ArrayRef Operands, ElementCount VF) { +InstructionCost +VPCostContext::getScalarizationOverhead(Type *ResultTy, + ArrayRef Operands, + ElementCount VF, bool Skip) { if (VF.isScalar()) return 0; @@ -1799,7 +1801,10 @@ InstructionCost VPCostContext::getScalarizationOverhead( SmallPtrSet UniqueOperands; SmallVector Tys; for (auto *Op : Operands) { - if (Op->isLiveIn() || isa(Op) || + if (Op->isLiveIn() || + (!Skip && isa(Op)) || + (isa(Op) && + cast(Op)->getOpcode() == Instruction::Load) || !UniqueOperands.insert(Op).second) continue; Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index fe59774b7c838..2a8baec74b72b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -349,12 +349,14 @@ struct VPCostContext { LoopVectorizationCostModel &CM; SmallPtrSet SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; + ScalarEvolution &SE; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, - TargetTransformInfo::TargetCostKind CostKind) + TargetTransformInfo::TargetCostKind CostKind, + ScalarEvolution &SE) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind) {} + CostKind(CostKind), SE(SE) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. @@ -374,10 +376,12 @@ struct VPCostContext { /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy /// and \p Operands with \p VF. This is a convenience wrapper for the - /// type-based getScalarizationOverhead API. 
- InstructionCost getScalarizationOverhead(Type *ResultTy, - ArrayRef Operands, - ElementCount VF); + /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR + /// is true, always compute the cost of scalarizing replicating operands. + InstructionCost + getScalarizationOverhead(Type *ResultTy, ArrayRef Operands, + ElementCount VF, + bool AlwaysIncludeReplicatingR = false); }; /// This class can be used to assign names to VPValues. For VPValues without diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 8e9c3db50319f..3029599487bf9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3075,6 +3075,63 @@ bool VPReplicateRecipe::shouldPack() const { }); } +/// Returns true if \p Ptr is a pointer computation for which the legacy cost +/// model computes a SCEV expression when comping the address cost. +static bool shouldUseAddressAccessSCEV(VPValue *Ptr) { + auto *PtrR = Ptr->getDefiningRecipe(); + if (!PtrR || !((isa(PtrR) && + cast(PtrR)->getOpcode() == + Instruction::GetElementPtr) || + isa(PtrR))) + return false; + + // We are looking for a gep with all loop invariant indices except for one + // which should be an induction variable. + unsigned NumOperands = PtrR->getNumOperands(); + for (unsigned Idx = 1; Idx < NumOperands; ++Idx) { + VPValue *Opd = PtrR->getOperand(Idx); + if (!(Opd->isDefinedOutsideLoopRegions()) && + !isa(Opd)) + return false; + } + + return true; +} + +/// Returns true of \p V is used as part of the address of another load or +/// store. +static bool isUsedByLoadStoreAddress(const VPUser *V) { + SmallPtrSet Seen; + SmallVector WorkList = {V}; + + while (!WorkList.empty()) { + auto *Cur = dyn_cast(WorkList.pop_back_val()); + if (!Cur || !Seen.insert(Cur).second) + continue; + + for (VPUser *U : Cur->users()) { + if (auto *InterleaveR = dyn_cast(U)) + if (InterleaveR->getAddr() == Cur) + return true; + if (auto *RepR = dyn_cast(U)) { + if (RepR->getOpcode() == Instruction::Load && + RepR->getOperand(0) == Cur) + return true; + if (RepR->getOpcode() == Instruction::Store && + RepR->getOperand(1) == Cur) + return true; + } + if (auto *MemR = dyn_cast(U)) { + if (MemR->getAddr() == Cur && MemR->isConsecutive()) + return true; + } + } + + append_range(WorkList, cast(Cur)->users()); + } + return false; +} + InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { Instruction *UI = cast(getUnderlyingValue()); @@ -3182,21 +3239,54 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, } case Instruction::Load: case Instruction::Store: { - if (isSingleScalar()) { - bool IsLoad = UI->getOpcode() == Instruction::Load; - Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); - Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); - const Align Alignment = getLoadStoreAlignment(UI); - unsigned AS = getLoadStoreAddressSpace(UI); - TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); - InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( - UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); - return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( - ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); - } + if (VF.isScalable() && !isSingleScalar()) + return InstructionCost::getInvalid(); + // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. 
- break; + if (getParent()->getParent() && getParent()->getParent()->isReplicator()) + break; + + bool IsLoad = UI->getOpcode() == Instruction::Load; + // TODO: Handle cases where we need to pass a SCEV to + // getAddressComputationCost. + if (shouldUseAddressAccessSCEV(getOperand(!IsLoad))) + break; + + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo); + + Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF); + + InstructionCost ScalarCost = + ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + PtrTy, &Ctx.SE, nullptr, Ctx.CostKind); + if (isSingleScalar()) + return ScalarCost; + + SmallVector OpsToScalarize; + Type *ResultTy = Type::getVoidTy(getParent()->getPlan()->getContext()); + // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we + // don't assign scalarization overhead in general, if the target prefers + // vectorized addressing or the loaded value is used as part of an address + // of another load or store. + if (Ctx.TTI.prefersVectorizedAddressing() || + !isUsedByLoadStoreAddress(this)) { + if (!(IsLoad && !Ctx.TTI.prefersVectorizedAddressing()) && + !(!IsLoad && Ctx.TTI.supportsEfficientVectorElementLoadStore())) + append_range(OpsToScalarize, operands()); + + if (!Ctx.TTI.supportsEfficientVectorElementLoadStore()) + ResultTy = Ctx.Types.inferScalarType(this); + } + + return (ScalarCost * VF.getFixedValue()) + + Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true); } } From 6c25bbc7aacf89ab28d375867132ca35a09d0b43 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 24 Sep 2025 14:19:25 +0100 Subject: [PATCH 2/3] !fixup address latest comments, thanks! 
--- llvm/lib/Transforms/Vectorize/VPlan.cpp | 12 ++++----- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 27 ++++++++++--------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 88862ef037909..140a7464d4dd8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1758,10 +1758,9 @@ VPCostContext::getOperandInfo(VPValue *V) const { return TTI::getOperandInfo(V->getLiveInIRValue()); } -InstructionCost -VPCostContext::getScalarizationOverhead(Type *ResultTy, - ArrayRef Operands, - ElementCount VF, bool Skip) { +InstructionCost VPCostContext::getScalarizationOverhead( + Type *ResultTy, ArrayRef Operands, ElementCount VF, + bool AlwaysIncludeReplicatingR) { if (VF.isScalar()) return 0; @@ -1782,9 +1781,8 @@ VPCostContext::getScalarizationOverhead(Type *ResultTy, SmallVector Tys; for (auto *Op : Operands) { if (Op->isLiveIn() || - (!Skip && isa(Op)) || - (isa(Op) && - cast(Op)->getOpcode() == Instruction::Load) || + (!AlwaysIncludeReplicatingR && + isa(Op)) || !UniqueOperands.insert(Op).second) continue; Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 5993359bdba9e..81b6546ebe21a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3069,7 +3069,7 @@ bool VPReplicateRecipe::shouldPack() const { } /// Returns true if \p Ptr is a pointer computation for which the legacy cost -/// model computes a SCEV expression when comping the address cost. +/// model computes a SCEV expression when computing the address cost. static bool shouldUseAddressAccessSCEV(VPValue *Ptr) { auto *PtrR = Ptr->getDefiningRecipe(); if (!PtrR || !((isa(PtrR) && @@ -3078,12 +3078,12 @@ static bool shouldUseAddressAccessSCEV(VPValue *Ptr) { isa(PtrR))) return false; - // We are looking for a gep with all loop invariant indices except for one + // We are looking for a GEP with all loop invariant indices except for one // which should be an induction variable. unsigned NumOperands = PtrR->getNumOperands(); for (unsigned Idx = 1; Idx < NumOperands; ++Idx) { VPValue *Opd = PtrR->getOperand(Idx); - if (!(Opd->isDefinedOutsideLoopRegions()) && + if (!Opd->isDefinedOutsideLoopRegions() && !isa(Opd)) return false; } @@ -3091,7 +3091,7 @@ static bool shouldUseAddressAccessSCEV(VPValue *Ptr) { return true; } -/// Returns true of \p V is used as part of the address of another load or +/// Returns true if \p V is used as part of the address of another load or /// store. static bool isUsedByLoadStoreAddress(const VPUser *V) { SmallPtrSet Seen; @@ -3103,7 +3103,7 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) { continue; for (VPUser *U : Cur->users()) { - if (auto *InterleaveR = dyn_cast(U)) + if (auto *InterleaveR = dyn_cast(U)) if (InterleaveR->getAddr() == Cur) return true; if (auto *RepR = dyn_cast(U)) { @@ -3237,7 +3237,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. 
- if (getParent()->getParent() && getParent()->getParent()->isReplicator()) + const VPRegionBlock *ParentRegion = getParent()->getParent(); + if (ParentRegion && ParentRegion->isReplicator()) break; bool IsLoad = UI->getOpcode() == Instruction::Load; @@ -3263,18 +3264,20 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, return ScalarCost; SmallVector OpsToScalarize; - Type *ResultTy = Type::getVoidTy(getParent()->getPlan()->getContext()); + Type *ResultTy = Type::getVoidTy(PtrTy->getContext()); // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we // don't assign scalarization overhead in general, if the target prefers // vectorized addressing or the loaded value is used as part of an address // of another load or store. - if (Ctx.TTI.prefersVectorizedAddressing() || - !isUsedByLoadStoreAddress(this)) { - if (!(IsLoad && !Ctx.TTI.prefersVectorizedAddressing()) && - !(!IsLoad && Ctx.TTI.supportsEfficientVectorElementLoadStore())) + bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); + if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) { + bool EfficientVectorLoadStore = + Ctx.TTI.supportsEfficientVectorElementLoadStore(); + if (!(IsLoad && !PreferVectorizedAddressing) && + !(!IsLoad && EfficientVectorLoadStore)) append_range(OpsToScalarize, operands()); - if (!Ctx.TTI.supportsEfficientVectorElementLoadStore()) + if (!EfficientVectorLoadStore) ResultTy = Ctx.Types.inferScalarType(this); } From 462b4563d46684a50b6d26e6d2a76d5a02c97530 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 27 Sep 2025 20:33:42 +0100 Subject: [PATCH 3/3] !fixup updated comment, add variable --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 85cee34601af3..6a74bc9c5ccca 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3070,7 +3070,7 @@ bool VPReplicateRecipe::shouldPack() const { /// Returns true if \p Ptr is a pointer computation for which the legacy cost /// model computes a SCEV expression when computing the address cost. -static bool shouldUseAddressAccessSCEV(VPValue *Ptr) { +static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { auto *PtrR = Ptr->getDefiningRecipe(); if (!PtrR || !((isa(PtrR) && cast(PtrR)->getOpcode() == @@ -3078,11 +3078,9 @@ static bool shouldUseAddressAccessSCEV(VPValue *Ptr) { isa(PtrR))) return false; - // We are looking for a GEP with all loop invariant indices except for one - // which should be an induction variable. - unsigned NumOperands = PtrR->getNumOperands(); - for (unsigned Idx = 1; Idx < NumOperands; ++Idx) { - VPValue *Opd = PtrR->getOperand(Idx); + // We are looking for a GEP where all indices are either loop invariant or + // inductions. + for (VPValue *Opd : drop_begin(PtrR->operands())) { if (!Opd->isDefinedOutsideLoopRegions() && !isa(Opd)) return false; @@ -3242,13 +3240,14 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, break; bool IsLoad = UI->getOpcode() == Instruction::Load; + const VPValue *PtrOp = getOperand(!IsLoad); // TODO: Handle cases where we need to pass a SCEV to // getAddressComputationCost. - if (shouldUseAddressAccessSCEV(getOperand(!IsLoad))) + if (shouldUseAddressAccessSCEV(PtrOp)) break; Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? 
this : getOperand(0)); - Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp); const Align Alignment = getLoadStoreAlignment(UI); unsigned AS = getLoadStoreAddressSpace(UI); TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));