From 5113b4f4fb27a6bc3362ca91442b2f34c41a1a9c Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Wed, 15 Nov 2023 13:23:13 -0800 Subject: [PATCH 1/2] Add setBranchWeigths convenience function Add `setBranchWeights` convenience function to ProfDataUtils.h and use it where appropriate. --- llvm/include/llvm/IR/ProfDataUtils.h | 4 ++++ llvm/lib/IR/ProfDataUtils.cpp | 7 +++++++ llvm/lib/Transforms/IPO/SampleProfile.cpp | 14 +++++++------- .../ControlHeightReduction.cpp | 3 +-- .../Instrumentation/IndirectCallPromotion.cpp | 6 ++---- .../Instrumentation/PGOInstrumentation.cpp | 8 +++----- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 18 +++++++----------- .../Scalar/LowerExpectIntrinsic.cpp | 6 ++---- llvm/lib/Transforms/Utils/Local.cpp | 4 +--- llvm/lib/Transforms/Utils/LoopPeel.cpp | 17 ++++------------- .../Transforms/Utils/LoopRotationUtils.cpp | 19 ++++++++++--------- 11 files changed, 48 insertions(+), 58 deletions(-) diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h index b61199372de0d..255fa2ff1c790 100644 --- a/llvm/include/llvm/IR/ProfDataUtils.h +++ b/llvm/include/llvm/IR/ProfDataUtils.h @@ -104,5 +104,9 @@ bool extractProfTotalWeight(const MDNode *ProfileData, uint64_t &TotalWeights); /// metadata was found. bool extractProfTotalWeight(const Instruction &I, uint64_t &TotalWeights); +/// Create a new `branch_weights` metadata node and add or overwrite +/// a `prof` metadata reference to instruction `I`. +void setBranchWeights(Instruction &I, ArrayRef Weights); + } // namespace llvm #endif diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index 77b3c1cb95d68..29536b0b090cd 100644 --- a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" @@ -183,4 +184,10 @@ bool extractProfTotalWeight(const Instruction &I, uint64_t &TotalVal) { return extractProfTotalWeight(I.getMetadata(LLVMContext::MD_prof), TotalVal); } +void setBranchWeights(Instruction &I, ArrayRef Weights) { + MDBuilder MDB(I.getContext()); + MDNode *BranchWeights = MDB.createBranchWeights(Weights); + I.setMetadata(LLVMContext::MD_prof, BranchWeights); +} + } // namespace llvm diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 063f7b42022ff..6c6f0a0eca72a 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -56,6 +56,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/PseudoProbe.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/ProfileData/InstrProf.h" @@ -1710,9 +1711,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { else if (OverwriteExistingWeights) I.setMetadata(LLVMContext::MD_prof, nullptr); } else if (!isa(&I)) { - I.setMetadata(LLVMContext::MD_prof, - MDB.createBranchWeights( - {static_cast(BlockWeights[BB])})); + setBranchWeights(I, {static_cast(BlockWeights[BB])}); } } } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) { @@ -1720,10 +1719,11 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { // clear it for cold code. for (auto &I : *BB) { if (isa(I) || isa(I)) { - if (cast(I).isIndirectCall()) + if (cast(I).isIndirectCall()) { I.setMetadata(LLVMContext::MD_prof, nullptr); - else - I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(0)); + } else { + setBranchWeights(I, {uint32_t(0)}); + } } } } @@ -1803,7 +1803,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { if (MaxWeight > 0 && (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) { LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); - TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); + setBranchWeights(*TI, Weights); ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst) << "most popular destination for conditional branches at " diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 97cf583510f93..0a3d8d6000cf4 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -1878,8 +1878,7 @@ void CHR::fixupBranchesAndSelects(CHRScope *Scope, static_cast(CHRBranchBias.scale(1000)), static_cast(CHRBranchBias.getCompl().scale(1000)), }; - MDBuilder MDB(F.getContext()); - MergedBR->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); + setBranchWeights(*MergedBR, Weights); CHR_DEBUG(dbgs() << "CHR branch bias " << Weights[0] << ":" << Weights[1] << "\n"); } diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 5c9799235017a..7344fea175171 100644 --- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Value.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" @@ -256,10 +257,7 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee, promoteCallWithIfThenElse(CB, DirectCallee, BranchWeights); if (AttachProfToDirectCall) { - MDBuilder MDB(NewInst.getContext()); - NewInst.setMetadata( - LLVMContext::MD_prof, - MDB.createBranchWeights({static_cast(Count)})); + setBranchWeights(NewInst, {static_cast(Count)}); } using namespace ore; diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index aea0f2f7cae77..4a5a0b25bebba 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1437,12 +1437,11 @@ void PGOUseFunc::populateCoverage(IndexedInstrProfReader *PGOReader) { // If A is uncovered, set weight=1. // This setup will allow BFI to give nonzero profile counts to only covered // blocks. - SmallVector Weights; + SmallVector Weights; for (auto *Succ : successors(&BB)) Weights.push_back((Coverage[Succ] || !Coverage[&BB]) ? 1 : 0); if (Weights.size() >= 2) - BB.getTerminator()->setMetadata(LLVMContext::MD_prof, - MDB.createBranchWeights(Weights)); + llvm::setBranchWeights(*BB.getTerminator(), Weights); } unsigned NumCorruptCoverage = 0; @@ -2205,7 +2204,6 @@ static std::string getSimpleNodeName(const BasicBlock *Node) { void llvm::setProfMetadata(Module *M, Instruction *TI, ArrayRef EdgeCounts, uint64_t MaxCount) { - MDBuilder MDB(M->getContext()); assert(MaxCount > 0 && "Bad max count"); uint64_t Scale = calculateCountScale(MaxCount); SmallVector Weights; @@ -2219,7 +2217,7 @@ void llvm::setProfMetadata(Module *M, Instruction *TI, misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false); - TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); + setBranchWeights(*TI, Weights); if (EmitBranchProbability) { std::string BrCondStr = getBranchCondString(TI); if (BrCondStr.empty()) diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 7a8128c5b6c09..2d899f100f815 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -228,17 +228,15 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { if (BP >= BranchProbability(50, 100)) continue; - SmallVector Weights; + uint32_t Weights[2]; if (PredBr->getSuccessor(0) == PredOutEdge.second) { - Weights.push_back(BP.getNumerator()); - Weights.push_back(BP.getCompl().getNumerator()); + Weights[0] = BP.getNumerator(); + Weights[1] = BP.getCompl().getNumerator(); } else { - Weights.push_back(BP.getCompl().getNumerator()); - Weights.push_back(BP.getNumerator()); + Weights[0] = BP.getCompl().getNumerator(); + Weights[1] = BP.getNumerator(); } - PredBr->setMetadata(LLVMContext::MD_prof, - MDBuilder(PredBr->getParent()->getContext()) - .createBranchWeights(Weights)); + setBranchWeights(*PredBr, Weights); } } @@ -2574,9 +2572,7 @@ void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB, Weights.push_back(Prob.getNumerator()); auto TI = BB->getTerminator(); - TI->setMetadata( - LLVMContext::MD_prof, - MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights)); + setBranchWeights(*TI, Weights); } } diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index ac87ee736c0d1..6f87e4d91d2c7 100644 --- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/MisExpect.h" @@ -101,10 +102,7 @@ static bool handleSwitchExpect(SwitchInst &SI) { misexpect::checkExpectAnnotations(SI, Weights, /*IsFrontend=*/true); SI.setCondition(ArgValue); - - SI.setMetadata(LLVMContext::MD_prof, - MDBuilder(CI->getContext()).createBranchWeights(Weights)); - + setBranchWeights(SI, Weights); return true; } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index aacf66bfe38eb..c8c8d02cdcbb2 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -227,9 +227,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, // Remove weight for this case. std::swap(Weights[Idx + 1], Weights.back()); Weights.pop_back(); - SI->setMetadata(LLVMContext::MD_prof, - MDBuilder(BB->getContext()). - createBranchWeights(Weights)); + setBranchWeights(*SI, Weights); } // Remove this entry. BasicBlock *ParentBB = SI->getParent(); diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index 2881444206b0b..7566f70661baf 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -631,9 +631,7 @@ struct WeightInfo { /// To avoid dealing with division rounding we can just multiple both part /// of weights to E and use weight as (F - I * E, E). static void updateBranchWeights(Instruction *Term, WeightInfo &Info) { - MDBuilder MDB(Term->getContext()); - Term->setMetadata(LLVMContext::MD_prof, - MDB.createBranchWeights(Info.Weights)); + setBranchWeights(*Term, Info.Weights); for (auto [Idx, SubWeight] : enumerate(Info.SubWeights)) if (SubWeight != 0) // Don't set the probability of taking the edge from latch to loop header @@ -690,14 +688,6 @@ static void initBranchWeights(DenseMap &WeightInfos, } } -/// Update the weights of original exiting block after peeling off all -/// iterations. -static void fixupBranchWeights(Instruction *Term, const WeightInfo &Info) { - MDBuilder MDB(Term->getContext()); - Term->setMetadata(LLVMContext::MD_prof, - MDB.createBranchWeights(Info.Weights)); -} - /// Clones the body of the loop L, putting it between \p InsertTop and \p /// InsertBot. /// \param IterNumber The serial number of the iteration currently being @@ -1033,8 +1023,9 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, PHI->setIncomingValueForBlock(NewPreHeader, NewVal); } - for (const auto &[Term, Info] : Weights) - fixupBranchWeights(Term, Info); + for (const auto &[Term, Info] : Weights) { + setBranchWeights(*Term, Info.Weights); + } // Update Metadata for count of peeled off iterations. unsigned AlreadyPeeled = 0; diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 012aa5dbb9ca0..ae155ac082d81 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -352,16 +352,17 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, LoopBackWeight = 0; } - MDBuilder MDB(LoopBI.getContext()); - MDNode *LoopWeightMD = - MDB.createBranchWeights(SuccsSwapped ? LoopBackWeight : ExitWeight1, - SuccsSwapped ? ExitWeight1 : LoopBackWeight); - LoopBI.setMetadata(LLVMContext::MD_prof, LoopWeightMD); + const uint32_t LoopBIWeights[] = { + SuccsSwapped ? LoopBackWeight : ExitWeight1, + SuccsSwapped ? ExitWeight1 : LoopBackWeight, + }; + setBranchWeights(LoopBI, LoopBIWeights); if (HasConditionalPreHeader) { - MDNode *PreHeaderWeightMD = - MDB.createBranchWeights(SuccsSwapped ? EnterWeight : ExitWeight0, - SuccsSwapped ? ExitWeight0 : EnterWeight); - PreHeaderBI.setMetadata(LLVMContext::MD_prof, PreHeaderWeightMD); + const uint32_t PreHeaderBIWeights[] = { + SuccsSwapped ? EnterWeight : ExitWeight0, + SuccsSwapped ? ExitWeight0 : EnterWeight, + }; + setBranchWeights(PreHeaderBI, PreHeaderBIWeights); } } From 7428c050fd25c91ac36ee8c2e6b8cebdf4431ca0 Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Thu, 24 Aug 2023 11:15:49 -0700 Subject: [PATCH 2/2] LoopVectorize: Set branch_weight for conditional branches Consistently add `branch_weights` metadata in any condition branch created from `LoopVectorize.cpp`: - Will only add metadata if the original loop-latch branch had metadata assigned. - Most checks should rarely trigger so I am using a 127:1 ratio. - For the middle block we assume an equal distribution of modulo results. --- .../Transforms/Vectorize/LoopVectorize.cpp | 79 +++++++++++++----- .../LoopVectorize/branch-weights.ll | 81 +++++++++++++++++++ .../LoopVectorize/first-order-recurrence.ll | 60 +++++++------- 3 files changed, 171 insertions(+), 49 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/branch-weights.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e9d0315d114f6..0a10f0d647176 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -112,10 +112,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -396,6 +398,19 @@ static cl::opt UseWiderVFIfCallVariantsPresent( cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants")); +// Likelyhood of bypassing the vectorized loop because assumptions about SCEV +// variables not overflowing do not hold. See `emitSCEVChecks`. +static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; +// Likelyhood of bypassing the vectorized loop because pointers overlap. See +// `emitMemRuntimeChecks`. +static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; +// Likelyhood of bypassing the vectorized loop because there are zero trips left +// after prolog. See `emitIterationCountCheck`. +static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; +// Likelyhood of bypassing the vectorized loop because of zero trips necessary. +// See `emitMinimumVectorEpilogueIterCountCheck`. +static constexpr uint32_t EpilogueMinItersBypassWeights[] = {1, 127}; + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type. @@ -1962,12 +1977,14 @@ class GeneratedRTChecks { SCEVExpander MemCheckExp; bool CostTooHigh = false; + const bool AddBranchWeights; public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, - TargetTransformInfo *TTI, const DataLayout &DL) + TargetTransformInfo *TTI, const DataLayout &DL, + bool AddBranchWeights) : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), - MemCheckExp(SE, DL, "scev.check") {} + MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are @@ -2160,8 +2177,10 @@ class GeneratedRTChecks { DT->addNewBlock(SCEVCheckBlock, Pred); DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); - ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); + BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); + if (AddBranchWeights) + setBranchWeights(BI, SCEVCheckBypassWeights); + ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); return SCEVCheckBlock; } @@ -2185,9 +2204,12 @@ class GeneratedRTChecks { if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) PL->addBasicBlockToLoop(MemCheckBlock, *LI); - ReplaceInstWithInst( - MemCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); + if (AddBranchWeights) { + setBranchWeights(BI, MemCheckBypassWeights); + } + ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); MemCheckBlock->getTerminator()->setDebugLoc( Pred->getTerminator()->getDebugLoc()); @@ -2900,9 +2922,11 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // dominator of the exit blocks. DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); - ReplaceInstWithInst( - TCCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) + setBranchWeights(BI, MinItersBypassWeights); + ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); LoopBypassBlocks.push_back(TCCheckBlock); } @@ -3133,7 +3157,16 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { IRBuilder<> B(LoopMiddleBlock->getTerminator()); B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc()); Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n"); - cast(LoopMiddleBlock->getTerminator())->setCondition(CmpN); + BranchInst &BI = *cast(LoopMiddleBlock->getTerminator()); + BI.setCondition(CmpN); + if (hasBranchWeightMD(*ScalarLatchTerm)) { + // Assume that `Count % VectorTripCount` is equally distributed. + unsigned TripCount = UF * VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + MDBuilder MDB(ScalarLatchTerm->getContext()); + MDNode *BranchWeights = MDB.createBranchWeights(1, TripCount - 1); + BI.setMetadata(LLVMContext::MD_prof, BranchWeights); + } } #ifdef EXPENSIVE_CHECKS @@ -7896,9 +7929,11 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, EPI.TripCount = Count; } - ReplaceInstWithInst( - TCCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) + setBranchWeights(BI, MinItersBypassWeights); + ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); return TCCheckBlock; } @@ -8042,9 +8077,11 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( EPI.EpilogueVF, EPI.EpilogueUF), "min.epilog.iters.check"); - ReplaceInstWithInst( - Insert->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + BranchInst &BI = + *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); + if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) + setBranchWeights(BI, EpilogueMinItersBypassWeights); + ReplaceInstWithInst(Insert->getTerminator(), &BI); LoopBypassBlocks.push_back(Insert); return Insert; @@ -9731,8 +9768,10 @@ static bool processLoopInVPlanNativePath( VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); { + bool AddBranchWeights = + hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getParent()->getDataLayout()); + F->getParent()->getDataLayout(), AddBranchWeights); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" @@ -10077,8 +10116,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; + bool AddBranchWeights = + hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getParent()->getDataLayout()); + F->getParent()->getDataLayout(), AddBranchWeights); if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. diff --git a/llvm/test/Transforms/LoopVectorize/branch-weights.ll b/llvm/test/Transforms/LoopVectorize/branch-weights.ll new file mode 100644 index 0000000000000..36ff8e83b0ed1 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/branch-weights.ll @@ -0,0 +1,81 @@ +; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 | FileCheck %s + +; CHECK-LABEL: @f0( +; +; CHECK: entry: +; CHECK: br i1 %cmp.entry, label %iter.check, label %exit, !prof [[PROF_F0_ENTRY:![0-9]+]] +; +; CHECK: iter.check: +; CHECK: br i1 %min.iters.check, label %vec.epilog.scalar.ph, label %vector.scevcheck, !prof [[PROF_F0_UNLIKELY:![0-9]+]] +; +; CHECK: vector.scevcheck: +; CHECK: br i1 %4, label %vec.epilog.scalar.ph, label %vector.main.loop.iter.check, !prof [[PROF_F0_UNLIKELY]] +; +; CHECK: vector.main.loop.iter.check: +; CHECK: br i1 %min.iters.check1, label %vec.epilog.ph, label %vector.ph, !prof [[PROF_F0_UNLIKELY]] +; +; CHECK: vector.ph: +; CHECK: br label %vector.body +; +; CHECK: vector.body: +; CHECK: br i1 %8, label %middle.block, label %vector.body, !prof [[PROF_F0_VECTOR_BODY:![0-9]+]] +; +; CHECK: middle.block: +; CHECK: br i1 %cmp.n, label %exit.loopexit, label %vec.epilog.iter.check, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]] +; +; CHECK: vec.epilog.iter.check: +; CHECK: br i1 %min.epilog.iters.check, label %vec.epilog.scalar.ph, label %vec.epilog.ph, !prof [[PROF_F0_UNLIKELY]] +; +; CHECK: vec.epilog.ph: +; CHECK: br label %vec.epilog.vector.body +; +; CHECK: vec.epilog.vector.body: +; CHECK: br i1 %12, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !prof [[PROF_F0_VEC_EPILOG_VECTOR_BODY:![0-9]+]] +; +; CHECK: vec.epilog.middle.block: +; CHECK: br i1 %cmp.n7, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]] +; +; CHECK: vec.epilog.scalar.ph: +; CHECK: br label %loop +; +; CHECK: loop: +; CHECK: br i1 %cmp.loop, label %loop, label %exit.loopexit, !prof [[PROF_F0_LOOP:![0-9]+]] +; +; CHECK: exit.loopexit: +; CHECK: br label %exit +; +; CHECK: exit: +; CHECK: ret void + +define void @f0(i8 %n, i32 %len, ptr %p) !prof !0 { +entry: + %cmp.entry = icmp sgt i32 %len, 0 + br i1 %cmp.entry, label %loop, label %exit, !prof !1 + +loop: + %i8 = phi i8 [0, %entry], [%i8.inc, %loop] + %i32 = phi i32 [0, %entry], [%i32.inc, %loop] + + %ptr = getelementptr inbounds i32, ptr %p, i8 %i8 + store i32 %i32, ptr %ptr + + %i8.inc = add i8 %i8, 1 + %i32.inc = add i32 %i32, 1 + + %cmp.loop = icmp ult i32 %i32, %len + br i1 %cmp.loop, label %loop, label %exit, !prof !2 + +exit: + ret void +} + +!0 = !{!"function_entry_count", i64 13} +!1 = !{!"branch_weights", i32 12, i32 1} +!2 = !{!"branch_weights", i32 1234, i32 1} + +; CHECK: [[PROF_F0_ENTRY]] = !{!"branch_weights", i32 12, i32 1} +; CHECK: [[PROF_F0_UNLIKELY]] = !{!"branch_weights", i32 1, i32 127} +; CEHCK: [[PROF_F0_VECTOR_BODY]] = !{!"branch_weights", i32 1, i32 307} +; CHECK: [[PROF_F0_MIDDLE_BLOCKS]] = !{!"branch_weights", i32 1, i32 3} +; CHECK: [[PROF_F0_VEC_EPILOG_VECTOR_BODY]] = !{!"branch_weights", i32 0, i32 0} +; CEHCK: [[PROF_F0_LOOP]] = !{!"branch_weights", i32 2, i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 7b242f565824b..506b66276121f 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -2758,7 +2758,7 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NO-IC-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26:![0-9]+]] ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7 ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8 @@ -2860,7 +2860,7 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27:![0-9]+]], !llvm.loop [[LOOP28:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]] ; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) @@ -2882,14 +2882,14 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-IC-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; UNROLL-NO-IC-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-IC-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28:![0-9]+]], !llvm.loop [[LOOP29:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; ; UNROLL-NO-VF-LABEL: @sink_into_replication_region( ; UNROLL-NO-VF-NEXT: bb: ; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NO-VF-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26:![0-9]+]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1 ; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 @@ -2927,7 +2927,7 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP11]], i32 [[VEC_PHI1]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27:![0-9]+]], !llvm.loop [[LOOP28:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]] ; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] @@ -2947,14 +2947,14 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-VF-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; UNROLL-NO-VF-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-VF-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28:![0-9]+]], !llvm.loop [[LOOP29:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; ; SINK-AFTER-LABEL: @sink_into_replication_region( ; SINK-AFTER-NEXT: bb: ; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; SINK-AFTER-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; SINK-AFTER-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26:![0-9]+]] ; SINK-AFTER: vector.ph: ; SINK-AFTER-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3 ; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 @@ -3014,7 +3014,7 @@ define i32 @sink_into_replication_region(i32 %y) { ; SINK-AFTER-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27:![0-9]+]], !llvm.loop [[LOOP28:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3 @@ -3035,7 +3035,7 @@ define i32 @sink_into_replication_region(i32 %y) { ; SINK-AFTER-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; SINK-AFTER-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; SINK-AFTER-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28:![0-9]+]], !llvm.loop [[LOOP29:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; bb: br label %bb2 @@ -3063,7 +3063,7 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NO-IC-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26]] ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7 ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8 @@ -3228,7 +3228,7 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[TMP74:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26]], !llvm.loop [[LOOP30:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP31:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP73]], [[TMP72]] ; UNROLL-NO-IC-NEXT: [[TMP75:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) @@ -3255,14 +3255,14 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; UNROLL-NO-IC-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28]], !llvm.loop [[LOOP31:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29]], !llvm.loop [[LOOP32:![0-9]+]] ; ; UNROLL-NO-VF-LABEL: @sink_into_replication_region_multiple( ; UNROLL-NO-VF-NEXT: bb: ; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NO-VF-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26]] ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1 ; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 @@ -3314,7 +3314,7 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = select i1 [[TMP5]], i32 [[TMP11]], i32 [[VEC_PHI2]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26]], !llvm.loop [[LOOP30:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP31:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP17]], [[TMP16]] ; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] @@ -3339,14 +3339,14 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-VF-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; UNROLL-NO-VF-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28]], !llvm.loop [[LOOP31:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29]], !llvm.loop [[LOOP32:![0-9]+]] ; ; SINK-AFTER-LABEL: @sink_into_replication_region_multiple( ; SINK-AFTER-NEXT: bb: ; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; SINK-AFTER-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; SINK-AFTER-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF26]] ; SINK-AFTER: vector.ph: ; SINK-AFTER-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3 ; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 @@ -3437,7 +3437,7 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; SINK-AFTER-NEXT: [[TMP38:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SINK-AFTER-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26]], !llvm.loop [[LOOP30:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP31:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP37]]) ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3 @@ -3463,7 +3463,7 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; SINK-AFTER-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; SINK-AFTER-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; SINK-AFTER-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28]], !llvm.loop [[LOOP31:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29]], !llvm.loop [[LOOP32:![0-9]+]] ; bb: br label %bb2 @@ -3520,7 +3520,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 @@ -3542,7 +3542,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-IC-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32 ; UNROLL-NO-IC-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]] ; UNROLL-NO-IC-NEXT: store i32 0, ptr [[A_GEP]], align 4 -; UNROLL-NO-IC-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] ; UNROLL-NO-IC: for.end: ; UNROLL-NO-IC-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[FOR_LCSSA]] @@ -3570,7 +3570,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-VF-NEXT: store i32 0, ptr [[TMP9]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; UNROLL-NO-VF-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -3590,7 +3590,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-VF-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32 ; UNROLL-NO-VF-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]] ; UNROLL-NO-VF-NEXT: store i32 0, ptr [[A_GEP]], align 4 -; UNROLL-NO-VF-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: ret i32 [[FOR_LCSSA]] @@ -3616,7 +3616,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], ; SINK-AFTER-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; SINK-AFTER-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 @@ -3638,7 +3638,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; SINK-AFTER-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32 ; SINK-AFTER-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]] ; SINK-AFTER-NEXT: store i32 0, ptr [[A_GEP]], align 4 -; SINK-AFTER-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] ; SINK-AFTER: for.end: ; SINK-AFTER-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: ret i32 [[FOR_LCSSA]] @@ -3688,7 +3688,7 @@ define void @unused_recurrence(ptr %a) { ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -3703,7 +3703,7 @@ define void @unused_recurrence(ptr %a) { ; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; UNROLL-NO-IC-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 ; UNROLL-NO-IC-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000 -; UNROLL-NO-IC-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP35:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]] ; UNROLL-NO-IC: for.end: ; UNROLL-NO-IC-NEXT: ret void ; @@ -3725,7 +3725,7 @@ define void @unused_recurrence(ptr %a) { ; UNROLL-NO-VF-NEXT: [[TMP5]] = add i16 [[TMP3]], 5 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028 -; UNROLL-NO-VF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -3739,7 +3739,7 @@ define void @unused_recurrence(ptr %a) { ; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; UNROLL-NO-VF-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 ; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000 -; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP35:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: ret void ; @@ -3758,7 +3758,7 @@ define void @unused_recurrence(ptr %a) { ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], ; SINK-AFTER-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028 -; SINK-AFTER-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 ; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -3773,7 +3773,7 @@ define void @unused_recurrence(ptr %a) { ; SINK-AFTER-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; SINK-AFTER-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 ; SINK-AFTER-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000 -; SINK-AFTER-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP35:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]] ; SINK-AFTER: for.end: ; SINK-AFTER-NEXT: ret void ;