diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f829ef52d50bb7..772b276df124a9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -569,6 +569,11 @@ class InnerLoopVectorizer { Value *CountRoundDown, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader); + /// Introduce a conditional branch (on true, condition to be set later) at the + /// end of the header (which thereby becomes the latch), connecting it to + /// itself (across the backedge) and to the exit block of \p L. + void createHeaderBranch(Loop *L); + /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); @@ -626,8 +631,8 @@ class InnerLoopVectorizer { /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, /// vector loop preheader, middle block and scalar preheader. Also - /// allocate a loop object for the new vector loop. - void createVectorLoopSkeleton(StringRef Prefix); + /// allocate a loop object for the new vector loop and return it. + Loop *createVectorLoopSkeleton(StringRef Prefix); /// Create new phi nodes for the induction variables to resume iteration count /// in the scalar epilogue, from where the vectorized loop left off. @@ -2828,6 +2833,23 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, PredicatedInstructions.push_back(Cloned); } +void InnerLoopVectorizer::createHeaderBranch(Loop *L) { + BasicBlock *Header = L->getHeader(); + assert(!L->getLoopLatch() && "loop should not have a latch at this point"); + + IRBuilder<> B(Header->getTerminator()); + Instruction *OldInst = + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); + setDebugLocFromInst(OldInst, &B); + + // Branch to the exit block when the condition holds and back to the header + // otherwise; the old terminator is erased below. + B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header); + + // Now we have two terminators. 
Remove the old one from the block. + Header->getTerminator()->eraseFromParent(); +} + Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { if (TripCount) return TripCount; @@ -3070,7 +3092,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { return MemCheckBlock; } -void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { +Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); @@ -3128,6 +3150,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LI->addTopLevelLoop(Lp); } Lp->addBasicBlockToLoop(LoopVectorBody, *LI); + return Lp; } void InnerLoopVectorizer::createInductionResumeValues( @@ -3293,7 +3316,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { // Create an empty vector loop, and prepare basic blocks for the runtime // checks. - createVectorLoopSkeleton(""); + Loop *Lp = createVectorLoopSkeleton(""); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. This check also covers the case where the @@ -3311,6 +3334,8 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { // faster. emitMemRuntimeChecks(LoopScalarPreHeader); + createHeaderBranch(Lp); + // Emit phis for the new starting index of the scalar loop. createInductionResumeValues(); @@ -7597,7 +7622,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; Value *CanonicalIVStartValue; - std::tie(State.CFG.VectorPreHeader, CanonicalIVStartValue) = + std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = ILV.createVectorizedLoopSkeleton(); ILV.collectPoisonGeneratingRecipes(State); @@ -7714,7 +7739,7 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } std::pair EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); - createVectorLoopSkeleton(""); + Loop *Lp = createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector // epilogue (see below). @@ -7743,6 +7768,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { // Generate the induction variable. Value *CountRoundDown = getOrCreateVectorTripCount(LoopVectorPreHeader); EPI.VectorTripCount = CountRoundDown; + createHeaderBranch(Lp); // Skip induction resume value creation here because they will be created in // the second pass. If we created them here, they wouldn't be used anyway, @@ -7834,7 +7860,7 @@ EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(BasicBlock *Bypass, std::pair EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); - createVectorLoopSkeleton("vec.epilog."); + Loop *Lp = createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to // execute the vectorized epilogue skip to the scalar part. @@ -7915,6 +7941,9 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), EPI.MainLoopIterationCountCheck); + // Create the header branch of the epilogue vector loop. + createHeaderBranch(Lp); + // Generate induction resume values. These variables save the new starting // indexes for the scalar loop. 
They are used to test if there are any tail // iterations left once the vector loop has completed. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 60ca5d36e65ac5..3a2cb7f161bcfd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -857,7 +857,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, // Check if the backedge taken count is needed, and if so build it. if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { - IRBuilder<> Builder(State.CFG.VectorPreHeader->getTerminator()); + IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); auto *TCMO = Builder.CreateSub(TripCountV, ConstantInt::get(TripCountV->getType(), 1), "trip.count.minus.1"); @@ -898,16 +898,17 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, /// LoopVectorBody basic-block was created for this. Introduce additional /// basic-blocks as needed, and fill them all. void VPlan::execute(VPTransformState *State) { - // Set the reverse mapping from VPValues to Values for code generation. + // 0. Set the reverse mapping from VPValues to Values for code generation. for (auto &Entry : Value2VPValue) State->VPValue2Value[Entry.second] = Entry.first; - // Initialize CFG state. - State->CFG.PrevVPBB = nullptr; - BasicBlock *VectorHeaderBB = State->CFG.VectorPreHeader->getSingleSuccessor(); - State->CFG.PrevBB = VectorHeaderBB; - State->CFG.ExitBB = VectorHeaderBB->getSingleSuccessor(); + BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; + State->CFG.VectorPreHeader = VectorPreHeaderBB; + BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); + assert(VectorHeaderBB && "Loop preheader does not have a single successor."); + State->CurrentVectorLoop = State->LI->getLoopFor(VectorHeaderBB); + State->CFG.ExitBB = State->CurrentVectorLoop->getExitBlock(); // Remove the edge between Header and Latch to allow other connections. 
// Temporarily terminate with unreachable until CFG is rewired. @@ -919,6 +920,9 @@ void VPlan::execute(VPTransformState *State) { State->Builder.SetInsertPoint(Terminator); // Generate code in loop body. + State->CFG.PrevVPBB = nullptr; + State->CFG.PrevBB = VectorHeaderBB; + for (VPBlockBase *Block : depth_first(Entry)) Block->execute(State);