diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index c85ef3e131068..edec066083abd 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -28,6 +28,7 @@ class DataLayout;
 class Loop;
 class raw_ostream;
 class TargetTransformInfo;
+class MemorySSA;
 
 /// Collection of parameters shared beetween the Loop Vectorizer and the
 /// Loop Access Analysis.
@@ -181,11 +182,12 @@ class MemoryDepChecker {
   };
 
   MemoryDepChecker(PredicatedScalarEvolution &PSE, AssumptionCache *AC,
-                   DominatorTree *DT, const Loop *L,
+                   MemorySSA *MSSA, DominatorTree *DT, AAResults *AA,
+                   const Loop *L,
                    const DenseMap<Value *, const SCEV *> &SymbolicStrides,
                    unsigned MaxTargetVectorWidthInBits,
                    std::optional<ScalarEvolution::LoopGuards> &LoopGuards)
-      : PSE(PSE), AC(AC), DT(DT), InnermostLoop(L),
+      : PSE(PSE), AC(AC), DT(DT), MSSA(MSSA), AA(AA), InnermostLoop(L),
         SymbolicStrides(SymbolicStrides),
         MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits),
         LoopGuards(LoopGuards) {}
@@ -292,6 +294,14 @@ class MemoryDepChecker {
     return PointerBounds;
   }
 
+  /// Returns true if the load \p L can be hoisted out of this loop because it
+  /// follows a memory induction variable pattern. This assumes an alias
+  /// runtime check will be emitted before hoisting.
+  bool
+  isInvariantLoadHoistable(LoadInst *L, ScalarEvolution &SE, StoreInst **S,
+                           const SCEV **Step,
+                           SmallVectorImpl<Instruction *> *Instructions) const;
+
   DominatorTree *getDT() const {
     assert(DT && "requested DT, but it is not available");
     return DT;
@@ -312,6 +322,8 @@ class MemoryDepChecker {
   AssumptionCache *AC;
   DominatorTree *DT;
+  MemorySSA *MSSA;
+  AAResults *AA;
 
   const Loop *InnermostLoop;
 
@@ -692,7 +704,7 @@ class LoopAccessInfo {
                           const TargetTransformInfo *TTI,
                           const TargetLibraryInfo *TLI, AAResults *AA,
                           DominatorTree *DT, LoopInfo *LI, AssumptionCache *AC,
-                          bool AllowPartial = false);
+                          MemorySSA *MSSA, bool AllowPartial = false);
 
   /// Return true we can analyze the memory accesses in the loop and there are
   /// no memory dependence cycles. Note that for dependences between loads &
@@ -786,7 +798,8 @@ class LoopAccessInfo {
   /// Analyze the loop. Returns true if all memory access in the loop can be
   /// vectorized.
   bool analyzeLoop(AAResults *AA, const LoopInfo *LI,
-                   const TargetLibraryInfo *TLI, DominatorTree *DT);
+                   const TargetLibraryInfo *TLI, DominatorTree *DT,
+                   MemorySSA *MSSA);
 
   /// Check if the structure of the loop allows it to be analyzed by this
   /// pass.
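For context, the memory-induction pattern that the new isInvariantLoadHoistable query above is meant to recognize looks roughly like the following source-level sketch (illustrative only, not part of the patch; the function and variable names are made up for this example):

// Illustrative sketch: a counter kept at a loop-invariant address and
// bumped by a loop-invariant step on every iteration. Once a runtime
// alias check proves nothing else in the loop clobbers *count, the load
// can be hoisted and the counter promoted to a scalar induction variable.
void example(int *dst, const int *src, int *count, int n) {
  for (int i = 0; i < n; ++i) {
    dst[i] = src[i] * 2;     // unrelated strided work
    *count = *count + 1;     // load + store through the invariant pointer
  }
}
// After promotion, the loop effectively becomes:
//   int c = *count;                     // hoisted load (preheader)
//   for (int i = 0; i < n; ++i) {
//     dst[i] = src[i] * 2;
//     c += 1;                           // scalar IV
//     *count = c;                       // store stays in the loop
//   }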
@@ -963,12 +976,15 @@ class LoopAccessInfoManager { TargetTransformInfo *TTI; const TargetLibraryInfo *TLI = nullptr; AssumptionCache *AC; + MemorySSA *MSSA; public: LoopAccessInfoManager(ScalarEvolution &SE, AAResults &AA, DominatorTree &DT, LoopInfo &LI, TargetTransformInfo *TTI, - const TargetLibraryInfo *TLI, AssumptionCache *AC) - : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI), AC(AC) {} + const TargetLibraryInfo *TLI, AssumptionCache *AC, + MemorySSA *MSSA) + : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI), AC(AC), MSSA(MSSA) { + } LLVM_ABI const LoopAccessInfo &getInfo(Loop &L, bool AllowPartial = false); diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 5d88e5f54e3d6..4a8871ddbb7eb 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -1777,6 +1778,232 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, return Diff == 1; } +/// Collects all subexpressions that appear within a given SCEV tree. +struct SCEVSubexprCollector : public SCEVVisitor { + SmallPtrSet &Subs; + SCEVSubexprCollector(SmallPtrSet &S) : Subs(S) {} + + template void visitOperands(Operands operands) { + for (auto *Op : operands) + visit(Op); + } + void visitConstant(const SCEVConstant *C) { Subs.insert(C); } + void visitUnknown(const SCEVUnknown *U) { Subs.insert(U); } + void visitAddExpr(const SCEVAddExpr *E) { + Subs.insert(E); + for (auto *Op : E->operands()) + visit(Op); + } + void visitMulExpr(const SCEVMulExpr *E) { + Subs.insert(E); + for (auto *Op : E->operands()) + visit(Op); + } + void visitAddRecExpr(const SCEVAddRecExpr *E) { + Subs.insert(E); + for (auto *Op : E->operands()) + visit(Op); + } + void visitSMaxExpr(const SCEVSMaxExpr *E) { + Subs.insert(E); + for (auto *Op : E->operands()) + visit(Op); + } + void visitSMinExpr(const SCEVSMinExpr *E) { + Subs.insert(E); + for (auto *Op : E->operands()) + visit(Op); + } + void visitUMinExpr(const SCEVUMinExpr *E) { + Subs.insert(E); + for (auto *Op : E->operands()) + visit(Op); + } + void visitUMaxExpr(const SCEVUMaxExpr *E) { + Subs.insert(E); + for (auto *Op : E->operands()) + visit(Op); + } + void visitMinMaxExpr(const SCEVMinMaxExpr *E) { + Subs.insert(E); + for (auto *Op : E->operands()) + visit(Op); + } + void visitUDivExpr(const SCEVUDivExpr *E) { + Subs.insert(E); + visit(E->getLHS()); + visit(E->getRHS()); + } + void visitZeroExtendExpr(const SCEVZeroExtendExpr *E) { + Subs.insert(E); + visit(E->getOperand()); + } + void visitSignExtendExpr(const SCEVSignExtendExpr *E) { + Subs.insert(E); + visit(E->getOperand()); + } + void visitTruncateExpr(const SCEVTruncateExpr *E) { + Subs.insert(E); + visit(E->getOperand()); + } + void visitCouldNotCompute(const SCEVCouldNotCompute *E) { Subs.insert(E); } + void visitVScale(const SCEVVScale *E) { + Subs.insert(E); + visitOperands(E->operands()); + } + void visitPtrToIntExpr(const SCEVPtrToIntExpr *E) { + Subs.insert(E); + visitOperands(E->operands()); + } + void visitSequentialUMinExpr(const SCEVSequentialUMinExpr *E) { + Subs.insert(E); + visitOperands(E->operands()); + } +}; + +bool MemoryDepChecker::isInvariantLoadHoistable( + LoadInst *L, 
ScalarEvolution &SE, StoreInst **S, const SCEV **StepSCEV, + SmallVectorImpl *Instructions) const { + assert(L != nullptr); + assert(InnermostLoop->isLoopInvariant(L->getPointerOperand())); + + if (!MSSA) + return false; + + MemoryAccess *MA = MSSA->getMemoryAccess(L); + auto QLoc = MemoryLocation::get(L); + + SmallVector Stores; + SmallVector Loads; + + for (auto &&I : *InnermostLoop->getHeader()) { + if (auto *Store = dyn_cast(&I)) { + AliasResult AR = AA->alias(MemoryLocation::get(Store), QLoc); + if (AR == AliasResult::MustAlias) + Stores.push_back(Store); + } + if (auto *Load = dyn_cast(&I)) { + AliasResult AR = AA->alias(MemoryLocation::get(Load), QLoc); + if (AR == AliasResult::MustAlias) + Loads.push_back(Load); + } + } + + if (Loads.size() != 1 || Loads[0]->isVolatile() || Stores.size() != 1 || + Stores[0]->isVolatile()) + return false; + + // I have the memory PHI, so I know where is the backedge + // I have to find all memory accesses to the same cell (that I care) + // There should be a single memory use and a single memorydef + // memory use should have MemoryPhi as transitive clobber + // backedge should have the MemoryDef as a transitive clobber (must-alias) (?) + MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(MA); + while (auto *MD = dyn_cast(Clobber)) { + Instruction *DefI = MD->getMemoryInst(); + + if (!DefI) + return false; + + AliasResult AR = AA->alias(MemoryLocation::get(DefI), QLoc); + + Clobber = MD->getDefiningAccess(); + + // We assume runtime aliasing check will be used + if (AR == AliasResult::MustAlias) + return false; + } + + MemoryAccess *MS = MSSA->getMemoryAccess(Stores[0]); + MemoryAccess *StoreClobber = MSSA->getWalker()->getClobberingMemoryAccess(MS); + while (true) { + if (isa(StoreClobber)) + break; + if (auto *MD = dyn_cast(StoreClobber)) { + Instruction *DefI = MD->getMemoryInst(); + + if (!DefI) + return false; + + AliasResult AR = AA->alias(MemoryLocation::get(DefI), QLoc); + + StoreClobber = MD->getDefiningAccess(); + + if (AR == AliasResult::MustAlias) + return false; + } + } + + if (!SE.isSCEVable(Stores[0]->getValueOperand()->getType())) + return false; + + const SCEV *LoadSCEV = SE.getUnknown(L); + const SCEV *StoreSCEV = SE.getSCEV(Stores[0]->getValueOperand()); + + auto Step = SE.getMinusSCEV(StoreSCEV, LoadSCEV); + + if (isa(Step) || + !SE.isLoopInvariant(Step, InnermostLoop)) + return false; + + SmallVector WL; + + SmallPtrSet Slice; + SmallPtrSet Subs; + SCEVSubexprCollector Collector(Subs); + Collector.visit(StoreSCEV); + + // Register all instructions that matches the SCEV + // to allow its removal when hoisting it and + // re-expanding the SCEV + auto enqueueIfMatches = [&](Value *X) { + if (auto *XI = dyn_cast(X)) { + const SCEV *SX = SE.getSCEV(XI); + if (Subs.contains(SX) && Slice.insert(XI).second) + WL.push_back(XI); + } + }; + + enqueueIfMatches(Stores[0]->getValueOperand()); + + while (!WL.empty()) { + Instruction *I = WL.pop_back_val(); + + for (Value *Op : I->operands()) { + if (isa(Op) || isa(Op)) + continue; + enqueueIfMatches(Op); + } + } + + auto hasExternalUsers = + [&Stores](const SmallPtrSetImpl &Slice) { + for (Instruction *I : Slice) + for (Use &U : I->uses()) + if (auto *UserI = dyn_cast(U.getUser())) { + if (isa(UserI)) + continue; + if (!Slice.count(UserI) && + !std::count(Stores.begin(), Stores.end(), UserI)) + return true; + } + return false; + }; + + if (hasExternalUsers(Slice)) + return false; + + if (S) + *S = Stores[0]; + if (StepSCEV) + *StepSCEV = Step; + + if (Instructions) + 
Instructions->insert(Instructions->end(), Slice.begin(), Slice.end()); + + return true; +} + void MemoryDepChecker::addAccess(StoreInst *SI) { visitPointers(SI->getPointerOperand(), *InnermostLoop, [this, SI](Value *Ptr) { @@ -2102,6 +2329,19 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( int64_t StrideBPtrInt = *StrideBPtr; LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt << " Sink induction step: " << StrideBPtrInt << "\n"); + + if (!StrideAPtrInt && !StrideBPtrInt && !(AIsWrite && BIsWrite) && + (AIsWrite || BIsWrite) && !isa(APtr) && + InnermostLoop->isLoopInvariant(APtr) && + InnermostLoop->isLoopInvariant(BPtr)) { + LoadInst *L = dyn_cast(AIsWrite ? BInst : AInst); + if (InnermostLoop->isLoopInvariant(L->getPointerOperand())) + if (L && isInvariantLoadHoistable(L, SE, nullptr, nullptr, nullptr)) + ShouldRetryWithRuntimeChecks = true; + + return MemoryDepChecker::Dependence::Unknown; + } + // At least Src or Sink are loop invariant and the other is strided or // invariant. We can generate a runtime check to disambiguate the accesses. if (!StrideAPtrInt || !StrideBPtrInt) @@ -2505,7 +2745,7 @@ bool LoopAccessInfo::canAnalyzeLoop() { bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, const TargetLibraryInfo *TLI, - DominatorTree *DT) { + DominatorTree *DT, MemorySSA *MSSA) { // Holds the Load and Store instructions. SmallVector Loads; SmallVector Stores; @@ -2715,9 +2955,15 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, // See if there is an unsafe dependency between a load to a uniform address and // store to the same uniform address. if (UniformStores.contains(Ptr)) { - LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform " - "load and uniform store to the same address!\n"); - HasLoadStoreDependenceInvolvingLoopInvariantAddress = true; + auto &SE = *PSE->getSE(); + if (TheLoop->isLoopInvariant(LD->getPointerOperand()) && + !getDepChecker().isInvariantLoadHoistable(LD, SE, nullptr, nullptr, + nullptr)) { + LLVM_DEBUG( + dbgs() << "LAA: Found an unsafe dependency between a uniform " + "load and uniform store to the same address!\n"); + HasLoadStoreDependenceInvolvingLoopInvariantAddress = true; + } } MemoryLocation Loc = MemoryLocation::get(LD); @@ -3064,7 +3310,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetTransformInfo *TTI, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI, - AssumptionCache *AC, bool AllowPartial) + AssumptionCache *AC, MemorySSA *MSSA, + bool AllowPartial) : PSE(std::make_unique(*SE, *L)), PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) { unsigned MaxTargetVectorWidthInBits = std::numeric_limits::max(); @@ -3075,11 +3322,12 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2; DepChecker = std::make_unique( - *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits, LoopGuards); + *PSE, AC, MSSA, DT, AA, L, SymbolicStrides, MaxTargetVectorWidthInBits, + LoopGuards); PtrRtChecking = std::make_unique(*DepChecker, SE, LoopGuards); if (canAnalyzeLoop()) - CanVecMem = analyzeLoop(AA, LI, TLI, DT); + CanVecMem = analyzeLoop(AA, LI, TLI, DT, MSSA); } void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { @@ -3145,7 +3393,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L, // or if it was created with a different value of AllowPartial. 
if (Inserted || It->second->hasAllowPartial() != AllowPartial) It->second = std::make_unique(&L, &SE, TTI, TLI, &AA, &DT, - &LI, AC, AllowPartial); + &LI, AC, MSSA, AllowPartial); return *It->second; } @@ -3189,7 +3437,9 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F, auto &TTI = FAM.getResult(F); auto &TLI = FAM.getResult(F); auto &AC = FAM.getResult(F); - return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC); + auto &MSSA = FAM.getResult(F); + return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC, + &MSSA.getMSSA()); } AnalysisKey LoopAccessAnalysis::Key; diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 0c8b9043fcbbb..ebda2c96b75e6 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -680,12 +680,14 @@ class LoopDistributeForLoop { // Currently, we only distribute to isolate the part of the loop with // dependence cycles to enable partial vectorization. - if (LAI->canVectorizeMemory()) + if (!LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress() && + LAI->canVectorizeMemory()) return fail("MemOpsCanBeVectorized", "memory operations are safe for vectorization"); auto *Dependences = LAI->getDepChecker().getDependences(); - if (!Dependences || Dependences->empty()) + if (!LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress() && + (!Dependences || Dependences->empty())) return fail("NoUnsafeDeps", "no unsafe dependences to isolate"); LLVM_DEBUG(dbgs() << "LDist: Found a candidate loop: " diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index 04039b885f3c5..72a2dcb294a57 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -1010,7 +1010,7 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM, // this pass will simplify all loops that contain inner loops, // regardless of whether anything ends up being flattened. 
LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr, - &AR.AC); + &AR.AC, AR.MSSA); for (Loop *InnerLoop : LN.getLoops()) { auto *OuterLoop = InnerLoop->getParentLoop(); if (!OuterLoop) diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 3aed643ee8065..e8d5fc870137f 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -549,7 +549,8 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM, const Function *F = L.getHeader()->getParent(); OptimizationRemarkEmitter ORE(F); - LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, &LAR.AC); + LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, &LAR.AC, + LAR.MSSA); if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 04b05627fa769..0ffabdda21bdf 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -465,6 +465,8 @@ class LoopVectorizationPlanner { PredicatedScalarEvolution &PSE; + LoopAccessInfoManager *LAIs; + const LoopVectorizeHints &Hints; OptimizationRemarkEmitter *ORE; @@ -498,10 +500,10 @@ class LoopVectorizationPlanner { Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo &TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI, - PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, - OptimizationRemarkEmitter *ORE) + PredicatedScalarEvolution &PSE, LoopAccessInfoManager *LAIs, + const LoopVectorizeHints &Hints, OptimizationRemarkEmitter *ORE) : OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), - IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} + IAI(IAI), PSE(PSE), LAIs(LAIs), Hints(Hints), ORE(ORE) {} /// Build VPlans for the specified \p UserVF and \p UserIC if they are /// non-zero or all applicable candidate VFs otherwise. If vectorization and @@ -628,6 +630,8 @@ class LoopVectorizationPlanner { VPRecipeBuilder &RecipeBuilder, ElementCount MinVF); + void adjustScalarIVPromotions(VPlanPtr &Plan); + /// Attach the runtime checks of \p RTChecks to \p Plan. 
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 89893af5c1140..c683e1df8664b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4092,6 +4092,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPEVLBasedIVPHISC: case VPDef::VPPredInstPHISC: case VPDef::VPBranchOnMaskSC: + case VPDef::VPScalarIVPromotionRecipeSC: continue; case VPDef::VPReductionSC: case VPDef::VPActiveLaneMaskPHISC: @@ -7523,6 +7524,14 @@ BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() { OriginalScalarPH->setName("vec.epilog.iter.check"); VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH); VPBasicBlock *OldEntry = Plan.getEntry(); + + for (VPRecipeBase &R : make_early_inc_range(*OldEntry)) + // Move hoisted loads to split PreHeader + if (auto RepR = dyn_cast(&R)) { + RepR->removeFromParent(); + VectorPHVPBB->appendRecipe(RepR); + } + for (auto &R : make_early_inc_range(*OldEntry)) { // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by // defining. @@ -7532,6 +7541,7 @@ BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() { } VPBlockUtils::reassociateBlocks(OldEntry, NewEntry); + Plan.setEntry(NewEntry); // OldEntry is now dead and will be cleaned up when the plan gets destroyed. @@ -8302,7 +8312,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // candidates built later for specific VF ranges. auto VPlan0 = VPlanTransforms::buildVPlan0( OrigLoop, *LI, Legal->getWidestInductionType(), - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, LAIs); auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { @@ -8324,6 +8334,23 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, } } +void LoopVectorizationPlanner::adjustScalarIVPromotions(VPlanPtr &Plan) { + VPScalarIVPromotionRecipe *Recipe = nullptr; + + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_deep(Plan->getVectorLoopRegion()))) + for (VPRecipeBase &R : *VPBB) + if (auto *ScalarIV = dyn_cast(&R)) { + assert(!Recipe && "Only one FFLoad is supported"); + Recipe = ScalarIV; + } + + if (!Recipe) + return; + + Recipe->setVFxUF(&Plan->getVFxUF()); +} + VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) { @@ -8434,11 +8461,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // latter are added above for masking. // FIXME: Migrate code relying on the underlying instruction from VPlan0 // to construct recipes below to not use the underlying instruction. - if (isa( - &R) || + if (isa(&R) || (isa(&R) && !UnderlyingValue)) continue; - assert(isa(&R) && UnderlyingValue && "unsupported recipe"); + assert((isa(&R) && UnderlyingValue && + "unsupported recipe")); // TODO: Gradually replace uses of underlying instruction by analyses on // VPlan. @@ -8514,6 +8542,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Adjust the recipes for any inloop reductions. 
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + adjustScalarIVPromotions(Plan); + // Apply mandatory transformation to handle FP maxnum/minnum reduction with // NaNs if possible, bail out otherwise. if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions, @@ -8583,7 +8613,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { auto Plan = VPlanTransforms::buildVPlan0( OrigLoop, *LI, Legal->getWidestInductionType(), - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, LAIs); VPlanTransforms::handleEarlyExits(*Plan, /*HasUncountableExit*/ false); VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true, @@ -9114,7 +9144,7 @@ static bool processLoopInVPlanNativePath( TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, - LoopVectorizationRequirements &Requirements) { + LoopAccessInfoManager *LAIs, LoopVectorizationRequirements &Requirements) { if (isa(PSE.getBackedgeTakenCount())) { LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); @@ -9132,8 +9162,8 @@ static bool processLoopInVPlanNativePath( // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, - ORE); + LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, LAIs, + Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -9846,7 +9876,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // pipeline. if (!L->isInnermost()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, - ORE, BFI, PSI, Hints, Requirements); + ORE, BFI, PSI, Hints, LAIs, + Requirements); assert(L->isInnermost() && "Inner loop expected."); @@ -9951,8 +9982,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI, PSI, BFI); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, - ORE); + LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, LAIs, + Hints, ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 092114db95e9c..e650dd9fdf65e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -554,6 +554,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: case VPRecipeBase::VPPartialReductionSC: + case VPRecipeBase::VPScalarIVPromotionRecipeSC: return true; case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveEVLSC: @@ -580,10 +581,12 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { /// Returns the underlying instruction. Instruction *getUnderlyingInstr() { - return cast(getUnderlyingValue()); + return getUnderlyingValue() ? dyn_cast(getUnderlyingValue()) + : nullptr; } const Instruction *getUnderlyingInstr() const { - return cast(getUnderlyingValue()); + return getUnderlyingValue() ? 
dyn_cast(getUnderlyingValue()) + : nullptr; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2312,7 +2315,8 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { VPFirstOrderRecurrencePHIRecipe *clone() override { return new VPFirstOrderRecurrencePHIRecipe( - cast(getUnderlyingInstr()), *getOperand(0)); + getUnderlyingInstr() ? cast(getUnderlyingInstr()) : nullptr, + *getOperand(0)); } void execute(VPTransformState &State) override; @@ -3475,6 +3479,54 @@ class VPExpandSCEVRecipe : public VPSingleDefRecipe { const SCEV *getSCEV() const { return Expr; } }; +struct LLVM_ABI_FOR_TEST VPScalarIVPromotionRecipe : public VPSingleDefRecipe { + VPScalarIVPromotionRecipe(std::initializer_list Operands, + DebugLoc DL = DebugLoc::getUnknown()) + : VPSingleDefRecipe(VPDef::VPScalarIVPromotionRecipeSC, Operands, DL) {} + + VP_CLASSOF_IMPL(VPDef::VPScalarIVPromotionRecipeSC) + + bool isSingleScalar() const { return true; } + + VPScalarIVPromotionRecipe *clone() override { + assert(getNumOperands() == 3 || getNumOperands() == 4); + if (getNumOperands() == 3) + return new VPScalarIVPromotionRecipe( + {getOperand(0), getOperand(1), getOperand(2)}, getDebugLoc()); + return new VPScalarIVPromotionRecipe( + {getOperand(0), getOperand(1), getOperand(2), getOperand(3)}, + getDebugLoc()); + } + + VPValue *getVFxUF() { return getOperand(3); } + void setVFxUF(VPValue *V) { + if (getNumOperands() == 3) { + addOperand(V); + } else { + setOperand(3, V); + } + } + + void execute(VPTransformState &State) override; + + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override { + return 0; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } +}; + /// Canonical scalar induction phi of the vector loop. Starting at the specified /// start value (either 0 or the resume value when vectorizing the epilogue /// loop). VPWidenCanonicalIVRecipe represents the vector version of the diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 80a2e4bc3f754..8ec552b9145c5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -136,6 +136,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return inferScalarType(R->getOperand(0)); case VPInstruction::BranchOnCond: case VPInstruction::BranchOnCount: + case Instruction::Store: return Type::getVoidTy(Ctx); default: break; @@ -289,9 +290,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { .Case([this](const VPRecipeBase *R) { - return inferScalarType(R->getOperand(0)); - }) + VPPartialReductionRecipe, VPScalarIVPromotionRecipe>( + [this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) // VPInstructionWithType must be handled before VPInstruction. 
.Case( diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 4ffd5577d31a4..0d8f9afdeb1ec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -29,6 +29,15 @@ using namespace llvm; using namespace VPlanPatternMatch; namespace { + +struct ScalarPromotionInfo { + LoadInst *Load; + StoreInst *Store; + const SCEV *Step = nullptr; + + SmallVector Instructions; +}; + // Class that is used to build the plain CFG for the incoming IR. class PlainCFGBuilder { // The outermost loop of the input loop nest considered for vectorization. @@ -40,6 +49,10 @@ class PlainCFGBuilder { // Vectorization plan that we are working on. std::unique_ptr Plan; + PredicatedScalarEvolution *PSE; + + LoopAccessInfoManager *LAIs; + // Builder of the VPlan instruction-level representation. VPBuilder VPIRBuilder; @@ -54,6 +67,8 @@ class PlainCFGBuilder { // Hold phi node's that need to be fixed once the plain CFG has been built. SmallVector PhisToFix; + SmallVector ScalarPromotions; + // Utility functions. void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB); void fixHeaderPhis(); @@ -64,9 +79,13 @@ class PlainCFGBuilder { VPValue *getOrCreateVPOperand(Value *IRVal); void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB); + void analyzeScalarPromotion(VPBasicBlock *VPBB, BasicBlock *BB); + public: - PlainCFGBuilder(Loop *Lp, LoopInfo *LI) - : TheLoop(Lp), LI(LI), Plan(std::make_unique(Lp)) {} + PlainCFGBuilder(Loop *Lp, LoopInfo *LI, PredicatedScalarEvolution *PSE, + LoopAccessInfoManager *LAIs) + : TheLoop(Lp), LI(LI), Plan(std::make_unique(Lp)), PSE(PSE), + LAIs(LAIs) {} /// Build plain CFG for TheLoop and connect it to Plan's entry. std::unique_ptr buildPlainCFG(); @@ -166,11 +185,36 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) { return NewVPVal; } +void PlainCFGBuilder::analyzeScalarPromotion(VPBasicBlock *VPBB, + BasicBlock *BB) { + for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) { + Instruction *Inst = &InstRef; + + if (auto *Load = dyn_cast(Inst)) { + auto Loop = LI->getLoopFor(Inst->getParent()); + auto &LAI = LAIs->getInfo(*Loop); + StoreInst *Store = nullptr; + const SCEV *Step = nullptr; + + if (Loop->isLoopInvariant(Load->getPointerOperand())) { + SmallVector Is; + if (LAI.getDepChecker().isInvariantLoadHoistable(Load, *PSE->getSE(), + &Store, &Step, &Is)) { + ScalarPromotions.push_back(ScalarPromotionInfo{Load, Store, Step, {}}); + ScalarPromotions.back().Instructions.insert( + ScalarPromotions.back().Instructions.end(), Is.begin(), Is.end()); + } + } + } + } +} + // Create new VPInstructions in a VPBasicBlock, given its BasicBlock // counterpart. This function must be invoked in RPO so that the operands of a // VPInstruction in \p BB have been visited before (except for Phi nodes). void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB) { + DenseSet SkipInsts; VPIRBuilder.setInsertPoint(VPBB); // TODO: Model and preserve debug intrinsics in VPlan. 
for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) { @@ -228,6 +272,46 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, VPPredToIncomingValue.lookup(Pred->getExitingBasicBlock())); } } else { + auto skip = false; + for (auto &SP : ScalarPromotions) { + if (Inst == SP.Load) { + VPBasicBlock *PreheaderVPBB = + Plan->getVectorPreheader(); // vector preheader, not the IR loop + // preheader + if (!PreheaderVPBB) + PreheaderVPBB = Plan->getEntry(); + + SmallVector VPOperands; + for (Value *Op : SP.Load->operands()) { + VPOperands.push_back(getOrCreateVPOperand(Op)); + } + auto *Load = new VPReplicateRecipe(SP.Load, VPOperands, + /*IsSingleScalar=*/true); + auto SCEVRecipe = new VPExpandSCEVRecipe(SP.Step); + PreheaderVPBB->appendRecipe(SCEVRecipe); + PreheaderVPBB->appendRecipe(Load); + + auto StepValue = SCEVRecipe->getVPSingleValue(); + + NewR = new VPScalarIVPromotionRecipe( + {Load, StepValue, + getOrCreateVPOperand(SP.Store->getPointerOperand())}, + SP.Load->getDebugLoc()); + VPBB->appendRecipe(NewR); + skip = true; + break; + } else if (Inst == SP.Store) { + skip = true; + break; + } else if (Inst == SP.Instructions[0]) { + skip = true; + break; + } + } + + if (skip) + continue; + // Translate LLVM-IR operands into VPValue operands and set them in the // new VPInstruction. SmallVector VPOperands; @@ -274,6 +358,8 @@ std::unique_ptr PlainCFGBuilder::buildPlainCFG() { IRDef2VPValue[&I] = Plan->getOrAddLiveIn(&I); } + // dbgs() << "ECHO 9.1 "; Plan->dump(); + LoopBlocksRPO RPO(TheLoop); RPO.perform(LI); @@ -283,6 +369,8 @@ std::unique_ptr PlainCFGBuilder::buildPlainCFG() { // Set VPBB predecessors in the same order as they are in the incoming BB. setVPBBPredsFromBB(VPBB, BB); + analyzeScalarPromotion(VPBB, BB); + // Create VPInstructions for BB. createVPInstructionsForVPBB(VPBB, BB); @@ -537,8 +625,9 @@ static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL, std::unique_ptr VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, - DebugLoc IVDL, PredicatedScalarEvolution &PSE) { - PlainCFGBuilder Builder(TheLoop, &LI); + DebugLoc IVDL, PredicatedScalarEvolution &PSE, + LoopAccessInfoManager *LAIs) { + PlainCFGBuilder Builder(TheLoop, &LI, &PSE, LAIs); std::unique_ptr VPlan0 = Builder.buildPlainCFG(); addInitialSkeleton(*VPlan0, InductionTy, IVDL, PSE, TheLoop); return VPlan0; @@ -726,8 +815,11 @@ void VPlanTransforms::addMinimumIterationCheck( // Don't execute the vector loop if (UMax - n) < (VF * UF). // FIXME: Should only check VF * UF, but currently checks Step=max(VF*UF, // minProfitableTripCount). - TripCountCheck = Builder.createICmp(ICmpInst::ICMP_ULT, DistanceToMax, - Builder.createExpandSCEV(Step), DL); + TripCountCheck = + Builder.createICmp(ICmpInst::ICMP_ULT, DistanceToMax, + VPBuilder(EntryVPBB, EntryVPBB->getFirstNonPhi()) + .createExpandSCEV(Step), + DL); } else { // TripCountCheck = false, folding tail implies positive vector trip // count. @@ -745,7 +837,9 @@ void VPlanTransforms::addMinimumIterationCheck( TripCount, Step)) { // Generate the minimum iteration check only if we cannot prove the // check is known to be true, or known to be false. - VPValue *MinTripCountVPV = Builder.createExpandSCEV(Step); + VPValue *MinTripCountVPV = + VPBuilder(EntryVPBB, EntryVPBB->getFirstNonPhi()) + .createExpandSCEV(Step); TripCountCheck = Builder.createICmp( CmpPred, TripCountVPV, MinTripCountVPV, DL, "min.iters.check"); } // else step known to be < trip count, use TripCountCheck preset to false. 
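As a usage sketch of the pieces introduced so far (illustrative, not part of the patch; Load, Loop, LAI and PSE stand for the candidate load and the analysis handles used elsewhere in this change), the plain-CFG builder above drives the new query roughly like this, and the three output parameters feed the recipes it creates:

// Illustrative usage, mirroring analyzeScalarPromotion:
StoreInst *Store = nullptr;       // the matching store to the same address
const SCEV *Step = nullptr;       // loop-invariant increment per iteration
SmallVector<Instruction *> Slice; // in-loop instrs made dead by promotion
if (Loop->isLoopInvariant(Load->getPointerOperand()) &&
    LAI.getDepChecker().isInvariantLoadHoistable(Load, *PSE->getSE(), &Store,
                                                 &Step, &Slice)) {
  // Hoist the load into the preheader, expand Step there, and emit a
  // VPScalarIVPromotionRecipe in the loop body in place of the load, the
  // store and the sliced arithmetic.
}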
@@ -767,8 +861,11 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck( // Add the minimum iteration check for the epilogue vector loop. VPValue *TC = Plan.getOrAddLiveIn(TripCount); VPBuilder Builder(cast(Plan.getEntry())); - VPValue *VFxUF = Builder.createExpandSCEV(SE.getElementCount( - TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW)); + VPValue *VFxUF = + VPBuilder(cast(Plan.getEntry()), + cast(Plan.getEntry())->getFirstNonPhi()) + .createExpandSCEV(SE.getElementCount( + TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW)); VPValue *Count = Builder.createNaryOp( Instruction::Sub, {TC, Plan.getOrAddLiveIn(VectorTripCount)}, DebugLoc::getUnknown(), "n.vec.remaining"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 47d8cc260511e..6f1029d545307 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -92,6 +92,17 @@ struct deferredval_ty { /// whichever value m_VPValue(X) populated. inline deferredval_ty m_Deferred(VPValue *const &V) { return V; } +template struct RecipeBindValue { + SubT Sub; + VPValue *&Out; + bool match(const VPValue *V) const { + if (!Sub.match(V)) + return false; + Out = const_cast(V); + return true; + } +}; + /// Match an integer constant or vector of constants if Pred::isValue returns /// true for the APInt. \p BitWidth optionally specifies the bitwidth the /// matched constant must have. If it is 0, the matched constant can have any @@ -306,6 +317,10 @@ struct Recipe_match { }); } + auto bind(VPValue *&Out) const & { + return RecipeBindValue{*this, Out}; + } + private: template static bool matchRecipeAndOpcode(const VPRecipeBase *R) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f5528ab7b2bbe..2abfe8843c470 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -37,6 +37,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include using namespace llvm; @@ -672,6 +673,12 @@ Value *VPInstruction::generate(VPTransformState &State) { case Instruction::PHI: { llvm_unreachable("should be handled by VPPhi::execute"); } + case Instruction::Store: { + assert(vputils::onlyFirstLaneUsed(this) && "Should be scalar store"); + Value *V = State.get(getOperand(0), true); + Value *P = State.get(getOperand(1), true); + return Builder.CreateStore(V, P); + } case Instruction::Select: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); Value *Cond = @@ -1293,7 +1300,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode())) + if (Instruction::isBinaryOp(getOpcode()) || + Instruction::isCast(getOpcode()) || getOpcode() == Instruction::Store) return vputils::onlyFirstLaneUsed(this); switch (getOpcode()) { @@ -1546,7 +1554,52 @@ void VPPhi::execute(VPTransformState &State) { State.set(this, NewPhi, VPLane(0)); } +void VPScalarIVPromotionRecipe::execute(VPTransformState &State) { + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + + Value *VL = State.get(getVFxUF(), 
VPLane(0)); + Type *Ty = State.get(getOperand(0), VPLane(0))->getType(); + VL = Builder.CreateZExtOrTrunc(VL, Ty); + + auto PhiInsertPoint = + State.CFG.VPBB2IRBB[getParent()->getExitingBasicBlock()] + ->getFirstNonPHIIt(); + auto DefaultInsertPoint = State.Builder.GetInsertPoint(); + + State.Builder.SetInsertPoint(PhiInsertPoint); + auto Phi = Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, ""); + State.Builder.SetInsertPoint(DefaultInsertPoint); + auto EntryValue = State.get(getOperand(0), VPLane(0)); + VPBlockBase *Pred = getParent()->getPredecessors()[0]; + auto *PredVPBB = Pred->getExitingBasicBlock(); + Phi->addIncoming(EntryValue, State.CFG.VPBB2IRBB[PredVPBB]); + + auto SCEVStep = State.get(getOperand(1), VPLane(0)); + SCEVStep = Builder.CreateZExtOrTrunc(SCEVStep, Ty); + + auto Mul = Builder.CreateNAryOp(Instruction::Mul, {SCEVStep, VL}); + auto Add = Builder.CreateNAryOp(Instruction::Add, {Phi, Mul}); + + auto Pointer = State.get(getOperand(2), VPLane(0)); + Builder.CreateStore(Add, Pointer); + + Phi->addIncoming(Add, dyn_cast(Add)->getParent()); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPScalarIVPromotionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; + printAsOperand(O, SlotTracker); + O << " = Scalar Promotion IV "; + printOperands(O, SlotTracker); + + if (auto DL = getDebugLoc()) { + O << ", !dbg "; + DL.print(O); + } +} void VPPhi::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 382521d090f4e..bb4481ba9ec71 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2714,14 +2714,16 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { return match(U, m_c_Add(m_Specific(LoopRegion->getCanonicalIV()), m_Specific(&Plan.getVFxUF()))) || - isa(U); + isa(U) || + isa(U); }) && "Only users of VFxUF should be VPWidenPointerInductionRecipe and the " "increment of the canonical induction."); Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) { // Only replace uses in VPWidenPointerInductionRecipe; The increment of the // canonical induction must not be updated. - return isa(U); + return isa(U) || + isa(U); }); // Defer erasing recipes till the end so that we don't invalidate the @@ -2946,9 +2948,10 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { VPBasicBlock *HeaderVPBB = EVLPhi->getParent(); VPValue *EVLIncrement = EVLPhi->getBackedgeValue(); VPValue *AVL; + VPValue *EVL; [[maybe_unused]] bool FoundAVL = - match(EVLIncrement, - m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi))); + match(EVLIncrement, m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL)).bind(EVL)), + m_Specific(EVLPhi))); assert(FoundAVL && "Didn't find AVL?"); // The AVL may be capped to a safe distance. 
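To make concrete what VPScalarIVPromotionRecipe::execute above emits, here is a small C++ model of the update performed per vector iteration (illustrative only; count, step and vf_x_uf stand in for the recipe's pointer, step and VFxUF operands):

// Illustrative model, not part of the patch: the promoted IV advances by
// Step * VFxUF each vector iteration, and the value is stored back so the
// memory cell stays current for the scalar epilogue and exit users.
void promoted_iv_model(int *count, int step, int vf_x_uf, int vec_trip_count) {
  int c = *count;                        // hoisted load, vector preheader
  for (int i = 0; i < vec_trip_count; i += vf_x_uf) {
    c += step * vf_x_uf;                 // phi + (Step * VFxUF)
    *count = c;                          // store emitted inside the loop
  }
}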
@@ -3004,6 +3007,40 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { Plan.getConstantInt(AVLTy, 0)); Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp); LatchExitingBr->eraseFromParent(); + + SmallVector RecipesToErase; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getEntry()))) { + for (VPRecipeBase &R : *VPBB) + if (auto *ScalarIV = dyn_cast(&R)) { + auto ScalarTy = + VPTypeAnalysis(Plan).inferScalarType(ScalarIV->getOperand(1)); + auto EVLTy = VPTypeAnalysis(Plan).inferScalarType(EVL); + auto CompEVL = VPBuilder(ScalarIV).createScalarZExtOrTrunc( + EVL, ScalarTy, EVLTy, ScalarIV->getDebugLoc()); + + auto Phi = VPBuilder(VPBB, VPBB->getFirstNonPhi()) + .createScalarPhi({ScalarIV->getOperand(0)}, + ScalarIV->getDebugLoc()); + + auto Mul = VPBuilder(ScalarIV).createNaryOp( + Instruction::Mul, {ScalarIV->getOperand(1), CompEVL}); + auto Add = + VPBuilder(ScalarIV).createNaryOp(Instruction::Add, {Phi, Mul}); + + VPBuilder(ScalarIV).createNaryOp(Instruction::Store, + {Add, ScalarIV->getOperand(2)}); + + Phi->addOperand(Add); + + ScalarIV->replaceAllUsesWith(Add); + RecipesToErase.push_back(ScalarIV); + } + } + + for (auto &Recipe : RecipesToErase) { + Recipe->eraseFromParent(); + } } void VPlanTransforms::replaceSymbolicStrides( diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index e3bde8a47dcbc..f3824e9cd2d5f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -99,7 +99,7 @@ struct VPlanTransforms { /// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks. LLVM_ABI_FOR_TEST static std::unique_ptr buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, - PredicatedScalarEvolution &PSE); + PredicatedScalarEvolution &PSE, LoopAccessInfoManager *LAIs); /// Update \p Plan to account for all early exits. 
LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan, diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index d4b8b72beb942..c632afde5fc03 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -344,7 +344,8 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { auto *VPBB = cast(VPB); auto InsertPtForPhi = VPBB->getFirstNonPhi(); for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - if (ToSkip.contains(&R) || isa(&R)) + if (ToSkip.contains(&R) || isa(&R) || + isa(&R)) continue; // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index e22c5dfdb9f38..8228a7cbd434f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -42,7 +42,10 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { if (U && !isa(U->getValue())) return Plan.getOrAddLiveIn(U->getValue()); auto *Expanded = new VPExpandSCEVRecipe(Expr); - Plan.getEntry()->appendRecipe(Expanded); + auto Iterator = Plan.getEntry()->begin(); + while (Iterator != Plan.getEntry()->end() && Iterator->isPhi()) + ++Iterator; + Plan.getEntry()->insert(Expanded->getDefiningRecipe(), Iterator); return Expanded; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 63eacd3d75721..9396d46860ccf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -367,6 +367,7 @@ class VPDef { VPWidenSelectSC, VPBlendSC, VPHistogramSC, + VPScalarIVPromotionRecipeSC, // START: Phi-like recipes. Need to be kept together. 
VPWidenPHISC, VPPredInstPHISC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 34754a1ea3992..6082e5f305898 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -157,6 +157,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { .Case([&](const VPWidenIntrinsicRecipe *S) { return VerifyEVLUse(*S, S->getNumOperands() - 1); }) + .Case( + [&](const VPScalarIVPromotionRecipe *S) { + return VerifyEVLUse(*S, S->getNumOperands() - 1); + }) .Case( [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); }) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/invalidation.ll b/llvm/test/Analysis/LoopAccessAnalysis/invalidation.ll index fb3af609dd2c6..d40e238df3a42 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/invalidation.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/invalidation.ll @@ -11,6 +11,7 @@ ; CHECK-AA-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-AA: Running pass: InvalidateAnalysisPass ; CHECK-AA-NEXT: Invalidating analysis: AAManager on foo +; CHECK-AA-NEXT: Invalidating analysis: MemorySSAAnalysis on foo ; CHECK-AA-NEXT: Invalidating analysis: LoopAccessAnalysis on foo ; CHECK-AA-NEXT: Running pass: LoopAccessInfoPrinterPass on foo ; CHECK-AA-NEXT: Running analysis: LoopAccessAnalysis on foo diff --git a/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll b/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll index c0b044aef0d62..3d6aac7da5c0f 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll @@ -25,7 +25,7 @@ loop: store i32 %l, ptr %gep %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -55,7 +55,7 @@ loop: store i32 %l, ptr %a %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -90,7 +90,7 @@ loop: store i32 %l, ptr %gep %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -125,7 +125,7 @@ loop: store i32 %l, ptr %a %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -156,7 +156,7 @@ loop: store i8 %t, ptr %gep %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -192,7 +192,7 @@ loop: store i32 %t, ptr %gep %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -222,7 +222,7 @@ loop: store i32 %l, ptr %gep %iv.next = sub i32 %iv, 1 %ec = icmp eq i32 %iv.next, -100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -252,7 +252,7 @@ loop: store i32 %l, ptr %a %iv.next = sub i32 %iv, 1 %ec = icmp eq i32 %iv.next, -100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -288,7 +288,7 @@ loop: store i32 %l, ptr %gep %iv.next = sub i32 %iv, 1 %ec = icmp eq i32 %iv.next, -100 - br i1 %ec, label 
%exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -323,7 +323,7 @@ loop: store i32 %l, ptr %a %iv.next = sub i32 %iv, 1 %ec = icmp eq i32 %iv.next, -100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -352,7 +352,7 @@ loop: store i32 %l, ptr %gep.off %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -381,7 +381,7 @@ loop: store i32 %l, ptr %a %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -415,7 +415,7 @@ loop: store i32 %l, ptr %gep.off %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -449,7 +449,7 @@ loop: store i32 %l, ptr %a %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -485,7 +485,7 @@ loop: store i32 %l, ptr %gep %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -521,7 +521,7 @@ loop: store i32 %l, ptr %a %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -556,7 +556,7 @@ loop: store i32 %l, ptr %gep %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -592,7 +592,7 @@ loop: store i32 %l, ptr %a %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -623,7 +623,7 @@ loop: store i32 %l, ptr %a %iv.next = add i32 %iv, %off %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -654,7 +654,7 @@ loop: store i32 %l, ptr %gep %iv.next = add i32 %iv, %off %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -684,7 +684,7 @@ loop: store i32 %l, ptr %gep %iv.next = add i32 %iv, %off %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -714,7 +714,7 @@ loop: store i32 %l, ptr %a %iv.next = add i32 %iv, %off %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -744,7 +744,7 @@ loop: store i32 0, ptr %gep %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -779,7 +779,7 @@ loop: store i32 0, ptr %gep %iv.next = add i32 %iv, 1 %ec = icmp eq i32 %iv.next, 100 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void @@ -814,8 +814,11 @@ loop: %iv.2.next = add i32 %iv.2, 1 %iv.3.next = add i32 %iv.3, 1 %ec = icmp eq i32 %iv.3, 200 - br i1 %ec, label %exit, label %loop + br i1 %ec, label %exit, label %loop, !llvm.loop !0 exit: ret void } + +!0 = distinct !{!0, !4} +!4 = 
!{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index cb4bd793013b1..fc72d16099fcb 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -1183,10 +1183,10 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { ; DEFAULT-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 ; DEFAULT-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 ; DEFAULT-NEXT: [[C1:%.*]] = ptrtoint ptr [[C]] to i64 -; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 ; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 ; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 8) +; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]] ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; DEFAULT: [[VECTOR_MEMCHECK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index ab593f6f8bb6b..3b1f8101ff0c5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -2176,9 +2176,9 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] ; CHECK-MAXBW: for.body.preheader: -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll index 46ec858d7455c..3aa2ef89e7791 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll @@ -940,9 +940,9 @@ define i32 @add_of_zext_outside_loop(i32 %a, ptr noalias %b, i8 %c, i32 %d) #0 { ; CHECK-MAXBW-SAME: i32 [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]] ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[TMP1]], 4 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]] ; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], [[TMP2]] ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: @@ -1061,9 +1061,9 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-MAXBW-LABEL: define i32 @add_of_loop_invariant_zext( ; CHECK-MAXBW-SAME: i32 [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: 
-; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]] ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[TMP1]], 4 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]] ; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], [[TMP2]] ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll index 44ae1757ce6e6..1a83f97a6aaa3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll @@ -49,9 +49,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) ; VSCALEFORTUNING2-LABEL: define i32 @chained_recurrences( ; VSCALEFORTUNING2-SAME: i32 [[X:%.*]], i64 [[Y:%.*]], ptr [[SRC_1:%.*]], i32 [[Z:%.*]], ptr [[SRC_2:%.*]]) #[[ATTR0:[0-9]+]] { ; VSCALEFORTUNING2-NEXT: [[ENTRY:.*]]: -; VSCALEFORTUNING2-NEXT: [[TMP0:%.*]] = add i64 [[Y]], 1 ; VSCALEFORTUNING2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; VSCALEFORTUNING2-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3 +; VSCALEFORTUNING2-NEXT: [[TMP0:%.*]] = add i64 [[Y]], 1 ; VSCALEFORTUNING2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] ; VSCALEFORTUNING2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VSCALEFORTUNING2: [[VECTOR_PH]]: @@ -287,9 +287,9 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; DEFAULT-LABEL: define i16 @reduce_udiv( ; DEFAULT-SAME: ptr [[SRC:%.*]], i16 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; DEFAULT-NEXT: [[ENTRY:.*]]: -; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 ; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3 +; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll index e338b828d2520..151c0ad058849 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll @@ -11,11 +11,11 @@ target triple = "aarch64-unknown-linux-gnu" define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0 { ; CHECK-LABEL: @test_no_scalarization( ; CHECK-NEXT: L.entry: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 1 ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[IDX:%.*]], 1 ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[IDX]] -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[TMP3]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index d84463430179d..e0eca8abdd650 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -440,11 +440,11 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-UNORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 ; CHECK-UNORDERED-NEXT: [[A1:%.*]] = load float, ptr [[A]], align 4 ; CHECK-UNORDERED-NEXT: [[A2:%.*]] = load float, ptr [[ARRAYIDXA]], align 4 +; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N]], -2 ; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1 -; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-UNORDERED: vector.ph: @@ -508,11 +508,11 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 ; CHECK-ORDERED-NEXT: [[A1:%.*]] = load float, ptr [[A]], align 4 ; CHECK-ORDERED-NEXT: [[A2:%.*]] = load float, ptr [[ARRAYIDXA]], align 4 +; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N]], -2 ; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1 -; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-ORDERED: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index e90f8d09fc7ab..bbf07dcb62269 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -494,11 +494,11 @@ for.body: ; preds = %for.body, %entry define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i64 %N) #1 { ; CHECK-LABEL: @even_load_dynamic_tc( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2) ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT_NOT:%.*]] = icmp samesign ult i64 [[TMP1]], [[TMP3]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT_NOT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -789,9 +789,9 @@ for.body: ; preds = %for.body, %entry define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 { ; CHECK-LABEL: 
@PR27626_0( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: @@ -860,9 +860,9 @@ for.end: define i32 @PR27626_1(ptr %p, i64 %n) #1 { ; CHECK-LABEL: @PR27626_1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: @@ -936,9 +936,9 @@ for.end: define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-LABEL: @PR27626_2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: @@ -1008,9 +1008,9 @@ for.end: define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-LABEL: @PR27626_3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: @@ -1090,12 +1090,12 @@ for.end: define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-LABEL: @PR27626_4( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2) ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -1166,12 +1166,12 @@ for.end: define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-LABEL: @PR27626_5( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5) ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = 
add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -1248,10 +1248,10 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-LABEL: @PR34743( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, ptr [[A:%.*]], align 2 -; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll index 9312306ce519a..438c1bcaa9e18 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll @@ -6,12 +6,12 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[START_22:%.*]] = ptrtoint ptr [[START_2:%.*]] to i64 ; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START_22]] ; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll index ed49dc5a7573f..e2795ba1413cf 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll @@ -13,10 +13,10 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64 ; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64 -; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999) -; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 3 +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999) +; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], [[TMP2]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: @@ -83,10 +83,10 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64 
; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64 -; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999) -; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 3 +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999) +; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], [[TMP2]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll index 4c7f70ad4d15e..bb65505e132ef 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll @@ -12,10 +12,10 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr ; CHECK-NEXT: [[SRC_13:%.*]] = ptrtoint ptr [[SRC_1:%.*]] to i64 ; CHECK-NEXT: [[DST_12:%.*]] = ptrtoint ptr [[DST_1:%.*]] to i64 ; CHECK-NEXT: [[DST_21:%.*]] = ptrtoint ptr [[DST_2:%.*]] to i64 -; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 20) +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP2]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll index f223786a07cdf..19453f2985e00 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -98,9 +98,9 @@ exit: ; preds = %loop.body define void @pointer_induction(ptr noalias %start, i64 %N) { ; CHECK-LABEL: @pointer_induction( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index 3b2b0b5c33aa9..b619710cbab96 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -217,9 +217,9 @@ for.cond.cleanup: ; preds = %for.body define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 { ; CHECK-LABEL: @pointer_iv_mixed( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult 
i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll index de70da6d2558b..e3f5daa029ce3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -12,9 +12,9 @@ target triple = "aarch64-unknown-linux-gnu" define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features" = "+sve" { ; NONE-LABEL: @simple_memset_tailfold( ; NONE-NEXT: entry: -; NONE-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) ; NONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; NONE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; NONE-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) ; NONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP1]] ; NONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NONE: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll index b63e03dccdc18..9f0cb02ab6b26 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll @@ -173,11 +173,11 @@ define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst) ; CHECK-LABEL: define void @test_masked_interleave_group( ; CHECK-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8) +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index b5662b0bd8d3b..2705d972052c0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -630,11 +630,11 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; ; NO-VP-LABEL: @load_factor_4_reverse( ; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 ; NO-VP-NEXT: [[TMP0:%.*]] = add nsw i64 [[N:%.*]], -1 ; NO-VP-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[TMP0]], i64 0) ; NO-VP-NEXT: [[TMP1:%.*]] = sub i64 [[N]], [[SMIN]] -; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; NO-VP-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 ; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] ; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; NO-VP: vector.ph: diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll index 63f9a1310d15a..7ef5cf0bf3bb0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -238,9 +238,24 @@ loopexit: define void @uniform_rw(ptr align(4) %addr) { ; CHECK-LABEL: @uniform_rw( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADDR:%.*]], align 4 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[FOR_BODY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2]] = add i32 [[TMP1]], 16 +; CHECK-NEXT: store i32 [[TMP2]], ptr [[ADDR]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH:%.*]] +; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 4096, [[SCALAR_PH]] ] ; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[ADDR:%.*]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add i32 [[LOAD]], 1 ; CHECK-NEXT: store i32 [[INC]], ptr [[ADDR]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll index 4068498dc68db..f377aa9db667e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll @@ -7,65 +7,20 @@ target triple = "x86_64-apple-macosx13.0.0" define void @test_pr59090(ptr %l_out, ptr noalias %b) #0 { ; CHECK-LABEL: @test_pr59090( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[B:%.*]], align 1, !llvm.access.group [[ACC_GRP0:![0-9]+]] ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 10000) ; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[B:%.*]], align 1, !llvm.access.group [[ACC_GRP0:![0-9]+]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: br label 
[[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; CHECK: pred.store.if1: -; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.continue2: -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: -; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] -; CHECK: pred.store.if5: -; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.continue6: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; CHECK: pred.store.if7: -; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] -; CHECK: pred.store.continue8: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; CHECK: pred.store.if9: -; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] -; CHECK: pred.store.continue10: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] -; CHECK: pred.store.if11: -; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] -; CHECK: pred.store.continue12: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 -; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]] -; CHECK: pred.store.if13: -; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] -; CHECK: pred.store.continue14: +; CHECK-NEXT: [[TMP4]] = add i8 [[TMP3]], 0 +; CHECK-NEXT: store i8 [[TMP4]], ptr [[B]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[L_OUT:%.*]], i64 [[TMP2]] ; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <48 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = and <48 x i1> [[INTERLEAVED_MASK]], diff --git a/llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll b/llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll new file mode 100644 index 0000000000000..465682a1e0873 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=loop-vectorize -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s --check-prefix=AARCH64 +; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s --check-prefix=X86_64 + +; Testcase 
extracted from ElemAttribute.cpp +; Focuses on the while.body loop that copies i16 elements + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" + +define void @test_copy_loop(ptr %theFirst, ptr %theLast, ptr %dest_base, ptr %m_size_ptr) { +; AARCH64-LABEL: define void @test_copy_loop( +; AARCH64-SAME: ptr [[THEFIRST:%.*]], ptr [[THELAST:%.*]], ptr [[DEST_BASE:%.*]], ptr [[M_SIZE_PTR:%.*]]) { +; AARCH64: vector.body: +; AARCH64: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; AARCH64: [[WIDE_LOAD0:%.*]] = load <8 x i16>, ptr +; AARCH64: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr +; AARCH64: store <8 x i16> [[WIDE_LOAD0]], ptr +; AARCH64: store <8 x i16> [[WIDE_LOAD1]], ptr +; AARCH64: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; AARCH64: br i1 {{.*}}, label %middle.block, label %vector.body +; +; X86_64-LABEL: define void @test_copy_loop( +; X86_64-SAME: ptr [[THEFIRST:%.*]], ptr [[THELAST:%.*]], ptr [[DEST_BASE:%.*]], ptr [[M_SIZE_PTR:%.*]]) { +; X86_64: vector.body: +; X86_64: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; X86_64: [[WIDE_LOAD0:%.*]] = load <2 x i16>, ptr +; X86_64: [[WIDE_LOAD1:%.*]] = load <2 x i16>, ptr +; X86_64: store <2 x i16> [[WIDE_LOAD0]], ptr +; X86_64: store <2 x i16> [[WIDE_LOAD1]], ptr +; X86_64: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; X86_64: br i1 {{.*}}, label %middle.block, label %vector.body + +entry: + %0 = load i64, ptr %m_size_ptr, align 8 + %add.ptr.i = getelementptr inbounds nuw i16, ptr %dest_base, i64 %0 + %cmp.not = icmp eq ptr %theFirst, %theLast + br i1 %cmp.not, label %cleanup, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %theFirst.addr.0112 = phi ptr [ %incdec.ptr9, %while.body ], [ %theFirst, %while.body.preheader ] + %thePointer.0111 = phi ptr [ %incdec.ptr, %while.body ], [ %add.ptr.i, %while.body.preheader ] + %1 = load i16, ptr %theFirst.addr.0112, align 2 + store i16 %1, ptr %thePointer.0111, align 2 + %incdec.ptr = getelementptr inbounds nuw i8, ptr %thePointer.0111, i64 2 + %2 = load i64, ptr %m_size_ptr, align 8 + %inc = add i64 %2, 1 + store i64 %inc, ptr %m_size_ptr, align 8 + %incdec.ptr9 = getelementptr inbounds nuw i8, ptr %theFirst.addr.0112, i64 2 + %cmp7.not = icmp eq ptr %incdec.ptr9, %theLast + br i1 %cmp7.not, label %cleanup.loopexit, label %while.body + +cleanup.loopexit: + br label %cleanup + +cleanup: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll index 5c62ca3ff3d01..1138cb7dcf4c9 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -438,8 +438,8 @@ define i64 @ivopt_widen_ptr_indvar_1(ptr noalias %a, i64 %stride, i64 %n) { ; ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_1( ; STRIDED-NEXT: entry: -; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3 +; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: @@ -523,8 +523,8 @@ define i64 @ivopt_widen_ptr_indvar_2(ptr noalias %a, i64 %stride, i64 %n) { ; ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_2( ; STRIDED-NEXT: entry: -; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3 +; STRIDED-NEXT: 
[[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: @@ -630,8 +630,8 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) { ; ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_3( ; STRIDED-NEXT: entry: -; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3 +; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index faca86a41b023..4445b0ea79209 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -206,10 +206,15 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog ; CHECK-NEXT: br i1 [[INVAR_C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1]] ; CHECK: [[LOOP_2_PREHEADER]]: ; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ] +; CHECK-NEXT: [[TMP16:%.*]] = sub i32 2, [[STEP]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP16]] +; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0) +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR]], -1 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IV_1_LCSSA]], [[STEP]] ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STEP]], -2 -; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SMAX]], [[TMP4]] ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 1) @@ -218,11 +223,6 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1) ; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP16:%.*]] = sub i32 2, [[STEP]] -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP16]] -; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0) -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP15]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] ; CHECK: [[VECTOR_SCEVCHECK]]: diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll index 1216bc1dc33cc..17d54c931645e 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll @@ -16,11 +16,11 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) { ; CHECK-VF4UF1-NEXT: br label %[[FOR_PREHEADER:.*]] ; CHECK-VF4UF1: [[FOR_PREHEADER]]: ; CHECK-VF4UF1-NEXT: [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-VF4UF1-NEXT: 
[[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF4UF1-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 ; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 ; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-VF4UF1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF4UF1-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 ; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-VF4UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK-VF4UF1: [[VECTOR_MEMCHECK]]: @@ -79,11 +79,11 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) { ; CHECK-VF4UF2-NEXT: br label %[[FOR_PREHEADER:.*]] ; CHECK-VF4UF2: [[FOR_PREHEADER]]: ; CHECK-VF4UF2-NEXT: [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-VF4UF2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF4UF2-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 3 ; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 ; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-VF4UF2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF4UF2-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 3 ; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-VF4UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK-VF4UF2: [[VECTOR_MEMCHECK]]: @@ -183,11 +183,11 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF1: [[FOR_PREHEADER]]: ; CHECK-VF4UF1-NEXT: [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1 ; CHECK-VF4UF1-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4 +; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 ; CHECK-VF4UF1-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1 ; CHECK-VF4UF1-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 ; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1 -; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 ; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] ; CHECK-VF4UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK-VF4UF1: [[VECTOR_PH]]: @@ -242,11 +242,11 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF2: [[FOR_PREHEADER]]: ; CHECK-VF4UF2-NEXT: [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1 ; CHECK-VF4UF2-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4 +; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3 ; CHECK-VF4UF2-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1 ; CHECK-VF4UF2-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 ; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1 -; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3 ; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] ; CHECK-VF4UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK-VF4UF2: [[VECTOR_PH]]: @@ -352,10 +352,10 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f ; 
CHECK-VF4UF1-NEXT: [[CMP25:%.*]] = icmp sgt i32 [[N]], 1 ; CHECK-VF4UF1-NEXT: br i1 [[CMP25]], label %[[FOR_PREHEADER:.*]], [[FOR_END:label %.*]] ; CHECK-VF4UF1: [[FOR_PREHEADER]]: -; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1 -; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-VF4UF1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF4UF1-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 +; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1 +; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-VF4UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK-VF4UF1: [[VECTOR_MEMCHECK]]: @@ -428,10 +428,10 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f ; CHECK-VF4UF2-NEXT: [[CMP25:%.*]] = icmp sgt i32 [[N]], 1 ; CHECK-VF4UF2-NEXT: br i1 [[CMP25]], label %[[FOR_PREHEADER:.*]], [[FOR_END:label %.*]] ; CHECK-VF4UF2: [[FOR_PREHEADER]]: -; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1 -; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-VF4UF2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF4UF2-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 3 +; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1 +; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-VF4UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK-VF4UF2: [[VECTOR_MEMCHECK]]: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 0ba7789ffba94..be0e74e99e698 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -443,10 +443,10 @@ define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<[[EXP_SCEV:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060)) +; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060)) /u (1 + (%y /u 492802768830814060)))) ; CHECK-NEXT: IR %div = udiv i64 %y, 492802768830814060 ; CHECK-NEXT: IR %inc = add i64 %div, 1 -; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060)) /u (1 + (%y /u 492802768830814060)))) -; CHECK-NEXT: EMIT vp<[[EXP_SCEV:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060)) ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp index 7471355603640..cdbfbee59743a 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp @@ -27,6 +27,8 @@ class VPlanSlpTest : public VPlanTestIRBase { std::unique_ptr LAI; std::unique_ptr PSE; std::unique_ptr IAI; + std::unique_ptr TTI; + std::unique_ptr MSSA; VPlanSlpTest() : DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-" @@ -41,8 +43,10 @@ class VPlanSlpTest : public VPlanTestIRBase { AARes.reset(new AAResults(*TLI)); AARes->addAAResult(*BasicAA); PSE.reset(new PredicatedScalarEvolution(*SE, *L)); - LAI.reset( - new LoopAccessInfo(L, &*SE, nullptr, &*TLI, &*AARes, &*DT, &*LI, &*AC)); + TTI = std::make_unique(DL); + MSSA.reset(new MemorySSA(F, &*AARes, 
&*DT)); + LAI.reset(new LoopAccessInfo(L, &*SE, nullptr, &*TLI, &*AARes, &*DT, &*LI, + &*AC, &*MSSA)); IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI)); IAI->analyzeInterleaving(false); return {Plan, *IAI}; diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h index ed6e13b4add3d..7c6c2b4cdc54b 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h +++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h @@ -18,7 +18,10 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/TargetTransformInfoImpl.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Verifier.h" @@ -27,6 +30,11 @@ namespace llvm { +struct TargetTransformInfoImpl : TargetTransformInfoImplBase { + TargetTransformInfoImpl(const DataLayout &DL) + : TargetTransformInfoImplBase(DL) {} +}; + /// Helper class to create a module from an assembly string and VPlans for a /// given loop entry block. class VPlanTestIRBase : public testing::Test { @@ -41,6 +49,11 @@ class VPlanTestIRBase : public testing::Test { std::unique_ptr SE; std::unique_ptr TLII; std::unique_ptr TLI; + std::unique_ptr TTII; + std::unique_ptr TTI; + std::unique_ptr AA; + std::unique_ptr MSSA; + std::unique_ptr LAIs; VPlanTestIRBase() : DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-" @@ -54,6 +67,7 @@ class VPlanTestIRBase : public testing::Test { EXPECT_TRUE(M); TLII = std::make_unique(M->getTargetTriple()); TLI = std::make_unique(*TLII); + TTI = std::make_unique(DL); return *M; } @@ -62,6 +76,10 @@ class VPlanTestIRBase : public testing::Test { LI.reset(new LoopInfo(*DT)); AC.reset(new AssumptionCache(F)); SE.reset(new ScalarEvolution(F, *TLI, *AC, *DT, *LI)); + AA.reset(new AAResults(*TLI)); + MSSA.reset(new MemorySSA(F, &*AA, &*DT)); + LAIs.reset(new LoopAccessInfoManager(*SE, *AA, *DT, *LI, &*TTI, &*TLI, &*AC, + &*MSSA)); } /// Build the VPlan for the loop starting from \p LoopHeader. @@ -73,7 +91,7 @@ class VPlanTestIRBase : public testing::Test { Loop *L = LI->getLoopFor(LoopHeader); PredicatedScalarEvolution PSE(*SE, *L); auto Plan = VPlanTransforms::buildVPlan0(L, *LI, IntegerType::get(*Ctx, 64), - {}, PSE); + {}, PSE, LAIs.get()); VPlanTransforms::handleEarlyExits(*Plan, HasUncountableExit); VPlanTransforms::addMiddleCheck(*Plan, true, false);