diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 741392247c0d6..e13b8d96b29e0 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -620,14 +620,15 @@ class LoopVectorizationPlanner { /// legal to vectorize the loop. This method creates VPlans using VPRecipes. void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); - // Adjust the recipes for reductions. For in-loop reductions the chain of - // instructions leading from the loop exit instr to the phi need to be - // converted to reductions, with one operand being vector and the other being - // the scalar reduction chain. For other reductions, a select is introduced - // between the phi and users outside the vector region when folding the tail. - void adjustRecipesForReductions(VPlanPtr &Plan, - VPRecipeBuilder &RecipeBuilder, - ElementCount MinVF); + /// Introduce recipes to compute the final reduction result + /// (ComputeFindIVResult, ComputeAnyOfResult, ComputeReductionResult depending + /// on the reduction) in the middle block. Selects are introduced for regular + /// reductions between the phi and users outside the vector region when + /// folding the tail. + /// + void introduceReductionResultComputation(VPlanPtr &Plan, + VPRecipeBuilder &RecipeBuilder, + ElementCount MinVF); /// Attach the runtime checks of \p RTChecks to \p Plan. void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 36982aaf717ac..78c7fdfd008f1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1412,6 +1412,11 @@ class LoopVectorizationCostModel { return InLoopReductions.contains(Phi); } + /// Returns the set of in-loop reduction PHIs. + const SmallPtrSetImpl &getInLoopReductions() const { + return InLoopReductions; + } + /// Returns true if the predicated reduction select should be used to set the /// incoming value for the reduction phi. bool usePredicatedReductionSelect() const { @@ -7627,62 +7632,6 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI, Consecutive, Reverse, *VPI, VPI->getDebugLoc()); } -/// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will -/// also insert a recipe to expand the step for the induction recipe. -static VPWidenIntOrFpInductionRecipe * -createWidenInductionRecipes(VPInstruction *PhiR, - const InductionDescriptor &IndDesc, VPlan &Plan, - ScalarEvolution &SE, Loop &OrigLoop) { - assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && - "step must be loop invariant"); - - VPValue *Start = PhiR->getOperand(0); - assert(Plan.getLiveIn(IndDesc.getStartValue()) == Start && - "Start VPValue must match IndDesc's start value"); - - // It is always safe to copy over the NoWrap and FastMath flags. In - // particular, when folding tail by masking, the masked-off lanes are never - // used, so it is safe. - VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc); - VPValue *Step = - vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); - - // Update wide induction increments to use the same step as the corresponding - // wide induction. This enables detecting induction increments directly in - // VPlan and removes redundant splats. 
- using namespace llvm::VPlanPatternMatch; - if (match(PhiR->getOperand(1), m_Add(m_Specific(PhiR), m_VPValue()))) - PhiR->getOperand(1)->getDefiningRecipe()->setOperand(1, Step); - - PHINode *Phi = cast(PhiR->getUnderlyingInstr()); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc, Flags, PhiR->getDebugLoc()); -} - -VPHeaderPHIRecipe * -VPRecipeBuilder::tryToOptimizeInductionPHI(VPInstruction *VPI, VFRange &Range) { - auto *Phi = cast(VPI->getUnderlyingInstr()); - - // Check if this is an integer or fp induction. If so, build the recipe that - // produces its scalar and vector values. - if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) - return createWidenInductionRecipes(VPI, *II, Plan, *PSE.getSE(), *OrigLoop); - - // Check if this is pointer induction. If so, build the recipe for it. - if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { - VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep()); - return new VPWidenPointerInductionRecipe( - Phi, VPI->getOperand(0), Step, &Plan.getVFxUF(), *II, - LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { - return CM.isScalarAfterVectorization(Phi, VF); - }, - Range), - VPI->getDebugLoc()); - } - return nullptr; -} - VPWidenIntOrFpInductionRecipe * VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI, VFRange &Range) { @@ -8166,45 +8115,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, // First, check for specific widening recipes that deal with inductions, Phi // nodes, calls and memory operations. VPRecipeBase *Recipe; - if (auto *PhiR = dyn_cast(R)) { - VPBasicBlock *Parent = PhiR->getParent(); - [[maybe_unused]] VPRegionBlock *LoopRegionOf = - Parent->getEnclosingLoopRegion(); - assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent && - "Non-header phis should have been handled during predication"); - auto *Phi = cast(R->getUnderlyingInstr()); - assert(R->getNumOperands() == 2 && "Must have 2 operands for header phis"); - if ((Recipe = tryToOptimizeInductionPHI(PhiR, Range))) - return Recipe; - - VPHeaderPHIRecipe *PhiRecipe = nullptr; - assert((Legal->isReductionVariable(Phi) || - Legal->isFixedOrderRecurrence(Phi)) && - "can only widen reductions and fixed-order recurrences here"); - VPValue *StartV = R->getOperand(0); - if (Legal->isReductionVariable(Phi)) { - const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi); - assert(RdxDesc.getRecurrenceStartValue() == - Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - - // If the PHI is used by a partial reduction, set the scale factor. - unsigned ScaleFactor = - getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1); - PhiRecipe = new VPReductionPHIRecipe( - Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi), - CM.useOrderedReductions(RdxDesc), ScaleFactor); - } else { - // TODO: Currently fixed-order recurrences are modeled as chains of - // first-order recurrences. If there are no users of the intermediate - // recurrences in the chain, the fixed order recurrence should be modeled - // directly, enabling more efficient codegen. - PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); - } - // Add backedge value. 
- PhiRecipe->addOperand(R->getOperand(1)); - return PhiRecipe; - } - assert(!R->isPhi() && "only VPPhi nodes expected at this point"); + assert(!R->isPhi() && "phis must be handled earlier"); auto *VPI = cast(R); Instruction *Instr = R->getUnderlyingInstr(); @@ -8264,6 +8175,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, if (isa(BinOp) || isa(BinOp)) std::swap(BinOp, Accumulator); + if (auto *RedPhiR = dyn_cast(Accumulator)) + RedPhiR->setVFScaleFactor(ScaleFactor); + assert(ScaleFactor == vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) && "all accumulators in chain must have same scale factor"); @@ -8311,6 +8225,12 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, OrigLoop, *LI, Legal->getWidestInductionType(), getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer); + // Create recipes for header phis. + VPlanTransforms::createHeaderPhiRecipes( + *VPlan0, *PSE.getSE(), *OrigLoop, Legal->getInductionVars(), + Legal->getReductionVars(), Legal->getFixedOrderRecurrences(), + CM.getInLoopReductions(), Hints.allowReordering()); + auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; @@ -8431,25 +8351,27 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Mapping from VPValues in the initial plan to their widened VPValues. Needed // temporarily to update created block masks. DenseMap Old2New; + + // Collect blocks that need predication for in-loop reduction recipes. + DenseSet BlocksNeedingPredication; + for (BasicBlock *BB : OrigLoop->blocks()) + if (CM.blockNeedsPredicationForAnyReason(BB)) + BlocksNeedingPredication.insert(BB); + + VPlanTransforms::createVPReductionRecipesForInLoopReductions( + *Plan, BlockMaskCache, BlocksNeedingPredication, Range.Start); + + // Now process all other blocks and instructions. for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { // Convert input VPInstructions to widened recipes. for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - auto *SingleDef = cast(&R); - auto *UnderlyingValue = SingleDef->getUnderlyingValue(); - // Skip recipes that do not need transforming, including canonical IV, - // wide canonical IV and VPInstructions without underlying values. The - // latter are added above for masking. - // FIXME: Migrate code relying on the underlying instruction from VPlan0 - // to construct recipes below to not use the underlying instruction. - if (isa( - &R) || - (isa(&R) && !UnderlyingValue)) + auto *SingleDef = dyn_cast(&R); + if (!SingleDef || !SingleDef->getUnderlyingValue()) continue; - assert(isa(&R) && UnderlyingValue && "unsupported recipe"); // TODO: Gradually replace uses of underlying instruction by analyses on // VPlan. - Instruction *Instr = cast(UnderlyingValue); + Instruction *Instr = cast(SingleDef->getUnderlyingValue()); Builder.setInsertPoint(SingleDef); // The stores with invariant address inside the loop will be deleted, and @@ -8519,8 +8441,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // bring the VPlan to its final state. // --------------------------------------------------------------------------- - // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + introduceReductionResultComputation(Plan, RecipeBuilder, Range.Start); // Apply mandatory transformation to handle FP maxnum/minnum reduction with // NaNs if possible, bail out otherwise. 
@@ -8632,177 +8553,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { return Plan; } -// Adjust the recipes for reductions. For in-loop reductions the chain of -// instructions leading from the loop exit instr to the phi need to be converted -// to reductions, with one operand being vector and the other being the scalar -// reduction chain. For other reductions, a select is introduced between the phi -// and users outside the vector region when folding the tail. -// -// A ComputeReductionResult recipe is added to the middle block, also for -// in-loop reductions which compute their result in-loop, because generating -// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. -// -// Adjust AnyOf reductions; replace the reduction phi for the selected value -// with a boolean reduction phi node to check if the condition is true in any -// iteration. The final value is selected by the final ComputeReductionResult. -void LoopVectorizationPlanner::adjustRecipesForReductions( +void LoopVectorizationPlanner::introduceReductionResultComputation( VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { using namespace VPlanPatternMatch; - VPTypeAnalysis TypeInfo(*Plan); VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); - VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock(); SmallVector ToDelete; - for (VPRecipeBase &R : Header->phis()) { - auto *PhiR = dyn_cast(&R); - if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) - continue; - - RecurKind Kind = PhiR->getRecurrenceKind(); - assert( - !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && - !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && - "AnyOf and FindIV reductions are not allowed for in-loop reductions"); - - bool IsFPRecurrence = - RecurrenceDescriptor::isFloatingPointRecurrenceKind(Kind); - FastMathFlags FMFs = - IsFPRecurrence ? FastMathFlags::getFast() : FastMathFlags(); - - // Collect the chain of "link" recipes for the reduction starting at PhiR. - SetVector Worklist; - Worklist.insert(PhiR); - for (unsigned I = 0; I != Worklist.size(); ++I) { - VPSingleDefRecipe *Cur = Worklist[I]; - for (VPUser *U : Cur->users()) { - auto *UserRecipe = cast(U); - if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { - assert((UserRecipe->getParent() == MiddleVPBB || - UserRecipe->getParent() == Plan->getScalarPreheader()) && - "U must be either in the loop region, the middle block or the " - "scalar preheader."); - continue; - } - Worklist.insert(UserRecipe); - } - } - - // Visit operation "Links" along the reduction chain top-down starting from - // the phi until LoopExitValue. We keep track of the previous item - // (PreviousLink) to tell which of the two operands of a Link will remain - // scalar and which will be reduced. For minmax by select(cmp), Link will be - // the select instructions. Blend recipes of in-loop reduction phi's will - // get folded to their non-phi operand, as the reduction recipe handles the - // condition directly. - VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. 
- for (VPSingleDefRecipe *CurrentLink : drop_begin(Worklist)) { - if (auto *Blend = dyn_cast(CurrentLink)) { - assert(Blend->getNumIncomingValues() == 2 && - "Blend must have 2 incoming values"); - if (Blend->getIncomingValue(0) == PhiR) { - Blend->replaceAllUsesWith(Blend->getIncomingValue(1)); - } else { - assert(Blend->getIncomingValue(1) == PhiR && - "PhiR must be an operand of the blend"); - Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); - } - continue; - } - - if (IsFPRecurrence) { - FastMathFlags CurFMF = - cast(CurrentLink)->getFastMathFlags(); - if (match(CurrentLink, m_Select(m_VPValue(), m_VPValue(), m_VPValue()))) - CurFMF |= cast(CurrentLink->getOperand(0)) - ->getFastMathFlags(); - FMFs &= CurFMF; - } - - Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); - - // Index of the first operand which holds a non-mask vector operand. - unsigned IndexOfFirstOperand; - // Recognize a call to the llvm.fmuladd intrinsic. - bool IsFMulAdd = (Kind == RecurKind::FMulAdd); - VPValue *VecOp; - VPBasicBlock *LinkVPBB = CurrentLink->getParent(); - if (IsFMulAdd) { - assert( - RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && - "Expected instruction to be a call to the llvm.fmuladd intrinsic"); - assert(((MinVF.isScalar() && isa(CurrentLink)) || - isa(CurrentLink)) && - CurrentLink->getOperand(2) == PreviousLink && - "expected a call where the previous link is the added operand"); - - // If the instruction is a call to the llvm.fmuladd intrinsic then we - // need to create an fmul recipe (multiplying the first two operands of - // the fmuladd together) to use as the vector operand for the fadd - // reduction. - VPInstruction *FMulRecipe = new VPInstruction( - Instruction::FMul, - {CurrentLink->getOperand(0), CurrentLink->getOperand(1)}, - CurrentLinkI->getFastMathFlags()); - LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); - VecOp = FMulRecipe; - } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs && - match(CurrentLink, m_Sub(m_VPValue(), m_VPValue()))) { - Type *PhiTy = TypeInfo.inferScalarType(PhiR); - auto *Zero = Plan->getConstantInt(PhiTy, 0); - VPWidenRecipe *Sub = new VPWidenRecipe( - Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {}, - VPIRMetadata(), CurrentLinkI->getDebugLoc()); - Sub->setUnderlyingValue(CurrentLinkI); - LinkVPBB->insert(Sub, CurrentLink->getIterator()); - VecOp = Sub; - } else { - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - if (match(CurrentLink, m_Cmp(m_VPValue(), m_VPValue()))) - continue; - assert(isa(CurrentLink) && - "must be a select recipe"); - IndexOfFirstOperand = 1; - } else { - assert((MinVF.isScalar() || isa(CurrentLink)) && - "Expected to replace a VPWidenSC"); - IndexOfFirstOperand = 0; - } - // Note that for non-commutable operands (cmp-selects), the semantics of - // the cmp-select are captured in the recurrence kind. - unsigned VecOpId = - CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink - ? 
IndexOfFirstOperand + 1 - : IndexOfFirstOperand; - VecOp = CurrentLink->getOperand(VecOpId); - assert(VecOp != PreviousLink && - CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - - (VecOpId - IndexOfFirstOperand)) == - PreviousLink && - "PreviousLink must be the operand other than VecOp"); - } - - VPValue *CondOp = nullptr; - if (CM.blockNeedsPredicationForAnyReason(CurrentLinkI->getParent())) - CondOp = RecipeBuilder.getBlockInMask(CurrentLink->getParent()); - - auto *RedRecipe = new VPReductionRecipe( - Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp, - PhiR->isOrdered(), CurrentLinkI->getDebugLoc()); - // Append the recipe to the end of the VPBasicBlock because we need to - // ensure that it comes after all of it's inputs, including CondOp. - // Delete CurrentLink as it will be invalid if its operand is replaced - // with a reduction defined at the bottom of the block in the next link. - if (LinkVPBB->getNumSuccessors() == 0) - RedRecipe->insertBefore(&*std::prev(std::prev(LinkVPBB->end()))); - else - LinkVPBB->appendRecipe(RedRecipe); - - CurrentLink->replaceAllUsesWith(RedRecipe); - ToDelete.push_back(CurrentLink); - PreviousLink = RedRecipe; - } - } + VPTypeAnalysis TypeInfo(*Plan); VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); Builder.setInsertPoint(&*std::prev(std::prev(LatchVPBB->end()))); VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 87280b83fc0e5..fb96d0c213f3b 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -94,11 +94,6 @@ class VPRecipeBuilder { /// recipe that takes an additional VPInstruction for the mask. VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range); - /// Check if an induction recipe should be constructed for \p VPI. If so build - /// and return it. If not, return null. - VPHeaderPHIRecipe *tryToOptimizeInductionPHI(VPInstruction *VPI, - VFRange &Range); - /// Optimize the special case where the operand of \p VPI is a constant /// integer induction variable. VPWidenIntOrFpInductionRecipe * diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 13582f8bd2d62..e502e382d3dc7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1134,7 +1134,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, OpcodeTy Opcode; /// An optional name that can be used for the generated IR instruction. - const std::string Name; + std::string Name; /// Returns true if we can generate a scalar for the first lane only if /// needed. @@ -1225,6 +1225,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, /// Returns the symbolic name assigned to the VPInstruction. StringRef getName() const { return Name; } + void setName(StringRef NewName) { Name = NewName.str(); } + protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the VPInstruction to \p O. @@ -2281,19 +2283,15 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe, }; class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe { - bool IsScalarAfterVectorization; - public: /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value \p /// Start and the number of elements unrolled \p NumUnrolledElems, typically /// VF*UF. 
VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step, VPValue *NumUnrolledElems, - const InductionDescriptor &IndDesc, - bool IsScalarAfterVectorization, DebugLoc DL) + const InductionDescriptor &IndDesc, DebugLoc DL) : VPWidenInductionRecipe(VPDef::VPWidenPointerInductionSC, Phi, Start, - Step, IndDesc, DL), - IsScalarAfterVectorization(IsScalarAfterVectorization) { + Step, IndDesc, DL) { addOperand(NumUnrolledElems); } @@ -2302,8 +2300,7 @@ class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe { VPWidenPointerInductionRecipe *clone() override { return new VPWidenPointerInductionRecipe( cast(getUnderlyingInstr()), getOperand(0), getOperand(1), - getOperand(2), getInductionDescriptor(), IsScalarAfterVectorization, - getDebugLoc()); + getOperand(2), getInductionDescriptor(), getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC) @@ -2380,8 +2377,10 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { VP_CLASSOF_IMPL(VPDef::VPFirstOrderRecurrencePHISC) VPFirstOrderRecurrencePHIRecipe *clone() override { - return new VPFirstOrderRecurrencePHIRecipe( + auto *R = new VPFirstOrderRecurrencePHIRecipe( cast(getUnderlyingInstr()), *getOperand(0)); + R->addOperand(getOperand(1)); + return R; } void execute(VPTransformState &State) override; @@ -2451,6 +2450,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// Get the factor that the VF of this recipe's output should be scaled by. unsigned getVFScaleFactor() const { return VFScaleFactor; } + void setVFScaleFactor(unsigned ScaleFactor) { VFScaleFactor = ScaleFactor; } + /// Returns the number of incoming values, also number of incoming blocks. /// Note that at the moment, VPWidenPointerInductionRecipe only has a single /// incoming value, its start value. diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index dbbde1cafa9f2..754dee4bb728c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -13,6 +13,7 @@ #include "LoopVectorizationPlanner.h" #include "VPlan.h" +#include "VPlanAnalysis.h" #include "VPlanCFG.h" #include "VPlanDominatorTree.h" #include "VPlanPatternMatch.h" @@ -554,6 +555,15 @@ static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL, Plan.getEntry()->swapSuccessors(); createExtractsForLiveOuts(Plan, MiddleVPBB); + + VPBuilder ScalarPHBuilder(ScalarPH); + for (const auto &[PhiR, ScalarPhiR] : zip_equal( + drop_begin(HeaderVPBB->phis()), Plan.getScalarHeader()->phis())) { + auto *VectorPhiR = cast(&PhiR); + auto *ResumePhiR = ScalarPHBuilder.createScalarPhi( + {VectorPhiR, VectorPhiR->getOperand(0)}, VectorPhiR->getDebugLoc()); + cast(&ScalarPhiR)->addOperand(ResumePhiR); + } } std::unique_ptr @@ -566,6 +576,264 @@ VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, return VPlan0; } +/// Creates a VPWidenIntOrFpInductionRecipe or VPWidenPointerInductionRecipe +/// for \p Phi based on \p IndDesc. 
+static VPHeaderPHIRecipe * +createWidenInductionRecipe(PHINode *Phi, VPPhi *PhiR, + const InductionDescriptor &IndDesc, VPlan &Plan, + ScalarEvolution &SE, Loop &OrigLoop) { + assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && + "step must be loop invariant"); + + VPValue *Start = PhiR->getOperand(0); + assert(Plan.getLiveIn(IndDesc.getStartValue()) == Start && + "Start VPValue must match IndDesc's start value"); + VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); + + if (IndDesc.getKind() == InductionDescriptor::IK_PtrInduction) + return new VPWidenPointerInductionRecipe(Phi, Start, Step, &Plan.getVFxUF(), + IndDesc, PhiR->getDebugLoc()); + + // It is always safe to copy over the NoWrap and FastMath flags. In + // particular, when folding tail by masking, the masked-off lanes are never + // used, so it is safe. + VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc); + + // Update wide induction increments to use the same step as the corresponding + // wide induction. This enables detecting induction increments directly in + // VPlan and removes redundant splats. + using namespace llvm::VPlanPatternMatch; + if (match(PhiR->getOperand(1), m_Add(m_Specific(PhiR), m_VPValue()))) + PhiR->getOperand(1)->getDefiningRecipe()->setOperand(1, Step); + + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), + IndDesc, Flags, PhiR->getDebugLoc()); +} + +void VPlanTransforms::createHeaderPhiRecipes( + VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, + const MapVector &Inductions, + const MapVector &Reductions, + const SmallPtrSetImpl &FixedOrderRecurrences, + const SmallPtrSetImpl &InLoopReductions, bool AllowReordering) { + + VPBasicBlock *HeaderVPBB = cast( + Plan.getEntry()->getSuccessors()[1]->getSingleSuccessor()); + + for (VPRecipeBase &R : make_early_inc_range(*HeaderVPBB)) { + if (isa(&R)) + continue; + auto *PhiR = dyn_cast(&R); + if (!PhiR) + break; + + // TODO: Gradually replace uses of underlying instruction by analyses on + // VPlan. + auto *Phi = cast(PhiR->getUnderlyingInstr()); + assert(PhiR->getNumOperands() == 2 && + "Must have 2 operands for header phis"); + + VPHeaderPHIRecipe *HeaderPhiR = nullptr; + auto InductionIt = Inductions.find(Phi); + if (InductionIt != Inductions.end()) { + HeaderPhiR = createWidenInductionRecipe(Phi, PhiR, InductionIt->second, Plan, + SE, OrigLoop); + } else { + VPValue *Start = PhiR->getOperand(0); + auto ReductionIt = Reductions.find(Phi); + if (ReductionIt != Reductions.end()) { + const RecurrenceDescriptor &RdxDesc = ReductionIt->second; + assert(RdxDesc.getRecurrenceStartValue() == + Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); + + bool UseOrderedReductions = !AllowReordering && RdxDesc.isOrdered(); + HeaderPhiR = new VPReductionPHIRecipe(Phi, RdxDesc.getRecurrenceKind(), + *Start, InLoopReductions.contains(Phi), + UseOrderedReductions); + } else { + assert(FixedOrderRecurrences.contains(Phi) && + "can only widen reductions and fixed-order recurrences here"); + // TODO: Currently fixed-order recurrences are modeled as chains of + // first-order recurrences. If there are no users of the intermediate + // recurrences in the chain, the fixed order recurrence should be + // modeled directly, enabling more efficient codegen. 
+ HeaderPhiR = new VPFirstOrderRecurrencePHIRecipe(Phi, *Start); + } + HeaderPhiR->addOperand(PhiR->getOperand(1)); + } + HeaderPhiR->insertBefore(PhiR); + PhiR->replaceAllUsesWith(HeaderPhiR); + PhiR->eraseFromParent(); + } +} + +void VPlanTransforms::createVPReductionRecipesForInLoopReductions( + VPlan &Plan, const DenseMap &BlockMaskCache, + const DenseSet &BlocksNeedingPredication, + ElementCount MinVF) { + VPTypeAnalysis TypeInfo(Plan); + VPRegionBlock *VectorLoopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); + VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock(); + SmallVector ToDelete; + + for (VPRecipeBase &R : Header->phis()) { + auto *PhiR = dyn_cast(&R); + if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) + continue; + + RecurKind Kind = PhiR->getRecurrenceKind(); + assert( + !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && + !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && + "AnyOf and FindIV reductions are not allowed for in-loop reductions"); + + bool IsFPRecurrence = + RecurrenceDescriptor::isFloatingPointRecurrenceKind(Kind); + FastMathFlags FMFs = + IsFPRecurrence ? FastMathFlags::getFast() : FastMathFlags(); + + // Collect the chain of "link" recipes for the reduction starting at PhiR. + SetVector Worklist; + Worklist.insert(PhiR); + for (unsigned I = 0; I != Worklist.size(); ++I) { + VPSingleDefRecipe *Cur = Worklist[I]; + for (VPUser *U : Cur->users()) { + auto *UserRecipe = cast(U); + if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { + assert((UserRecipe->getParent() == MiddleVPBB || + UserRecipe->getParent() == Plan.getScalarPreheader()) && + "U must be either in the loop region, the middle block or the " + "scalar preheader."); + continue; + } + + // Stores using instructions will be sunk later. + if (match(UserRecipe, m_VPInstruction())) + continue; + Worklist.insert(UserRecipe); + } + } + + // Visit operation "Links" along the reduction chain top-down starting from + // the phi until LoopExitValue. We keep track of the previous item + // (PreviousLink) to tell which of the two operands of a Link will remain + // scalar and which will be reduced. For minmax by select(cmp), Link will be + // the select instructions. Blend recipes of in-loop reduction phi's will + // get folded to their non-phi operand, as the reduction recipe handles the + // condition directly. + VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. + for (VPSingleDefRecipe *CurrentLink : drop_begin(Worklist)) { + if (auto *Blend = dyn_cast(CurrentLink)) { + assert(Blend->getNumIncomingValues() == 2 && + "Blend must have 2 incoming values"); + if (Blend->getIncomingValue(0) == PhiR) { + Blend->replaceAllUsesWith(Blend->getIncomingValue(1)); + } else { + assert(Blend->getIncomingValue(1) == PhiR && + "PhiR must be an operand of the blend"); + Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); + } + continue; + } + + if (IsFPRecurrence) { + FastMathFlags CurFMF = + cast(CurrentLink)->getFastMathFlags(); + if (match(CurrentLink, m_Select(m_VPValue(), m_VPValue(), m_VPValue()))) + CurFMF |= cast(CurrentLink->getOperand(0)) + ->getFastMathFlags(); + FMFs &= CurFMF; + } + + Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); + + // Index of the first operand which holds a non-mask vector operand. + unsigned IndexOfFirstOperand; + // Recognize a call to the llvm.fmuladd intrinsic. 
+ bool IsFMulAdd = Kind == RecurKind::FMulAdd && + RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI); + VPValue *VecOp; + VPBasicBlock *LinkVPBB = CurrentLink->getParent(); + if (IsFMulAdd) { + assert(CurrentLink->getOperand(2) == PreviousLink && + "expected a call where the previous link is the added operand"); + + // If the instruction is a call to the llvm.fmuladd intrinsic then we + // need to create an fmul recipe (multiplying the first two operands of + // the fmuladd together) to use as the vector operand for the fadd + // reduction. + VPInstruction *FMulRecipe = new VPInstruction( + Instruction::FMul, + {CurrentLink->getOperand(0), CurrentLink->getOperand(1)}, + CurrentLinkI->getFastMathFlags()); + LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); + VecOp = FMulRecipe; + } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs && + match(CurrentLink, m_Sub(m_VPValue(), m_VPValue()))) { + Type *PhiTy = TypeInfo.inferScalarType(PhiR); + auto *Zero = Plan.getConstantInt(PhiTy, 0); + VPWidenRecipe *Sub = new VPWidenRecipe( + Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {}, + VPIRMetadata(), CurrentLinkI->getDebugLoc()); + Sub->setUnderlyingValue(CurrentLinkI); + LinkVPBB->insert(Sub, CurrentLink->getIterator()); + VecOp = Sub; + } else { + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { + if (match(CurrentLink, m_Cmp(m_VPValue(), m_VPValue()))) + continue; + assert(match(CurrentLink, + m_Select(m_VPValue(), m_VPValue(), m_VPValue())) && + "must be a select recipe"); + IndexOfFirstOperand = 1; + } else { + IndexOfFirstOperand = 0; + } + // Note that for non-commutable operands (cmp-selects), the semantics of + // the cmp-select are captured in the recurrence kind. + unsigned VecOpId = + CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink + ? IndexOfFirstOperand + 1 + : IndexOfFirstOperand; + VecOp = CurrentLink->getOperand(VecOpId); + assert(VecOp != PreviousLink && + CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - + (VecOpId - IndexOfFirstOperand)) == + PreviousLink && + "PreviousLink must be the operand other than VecOp"); + } + + // Get block mask from BlockMaskCache if the block needs predication. + VPValue *CondOp = nullptr; + if (BlocksNeedingPredication.contains(CurrentLinkI->getParent())) { + auto MaskIt = BlockMaskCache.find(LinkVPBB); + if (MaskIt != BlockMaskCache.end()) + CondOp = MaskIt->second; + } + + auto *RedRecipe = new VPReductionRecipe( + Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp, + PhiR->isOrdered(), CurrentLinkI->getDebugLoc()); + // Append the recipe to the end of the VPBasicBlock because we need to + // ensure that it comes after all of it's inputs, including CondOp. + // Delete CurrentLink as it will be invalid if its operand is replaced + // with a reduction defined at the bottom of the block in the next link. 
+ if (LinkVPBB->getNumSuccessors() == 0) + RedRecipe->insertBefore(&*std::prev(std::prev(LinkVPBB->end()))); + else + LinkVPBB->appendRecipe(RedRecipe); + + CurrentLink->replaceAllUsesWith(RedRecipe); + ToDelete.push_back(CurrentLink); + PreviousLink = RedRecipe; + } + } + + for (VPRecipeBase *R : ToDelete) + R->eraseFromParent(); +} + void VPlanTransforms::handleEarlyExits(VPlan &Plan, bool HasUncountableEarlyExit) { auto *MiddleVPBB = cast( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7c9302860a3b5..e8e1a8d4eedde 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -4327,7 +4327,7 @@ void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, #endif bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { - return IsScalarAfterVectorization && + return vputils::onlyScalarValuesUsed(this) && (!IsScalable || vputils::onlyFirstLaneUsed(this)); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 25557f1d5d651..916e8b1cea28e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -4527,9 +4527,10 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator( /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute /// the end value of the induction. -static VPInstruction *addResumePhiRecipeForInduction( - VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, - VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) { +static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, + VPBuilder &VectorPHBuilder, + VPTypeAnalysis &TypeInfo, + VPValue *VectorTC) { auto *WideIntOrFp = dyn_cast(WideIV); // Truncated wide inductions resume from the last lane of their vector value // in the last vector iteration which is handled elsewhere. @@ -4555,9 +4556,7 @@ static VPInstruction *addResumePhiRecipeForInduction( WideIV->getDebugLoc()); } - auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi( - {EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val"); - return ResumePhiRecipe; + return EndValue; } void VPlanTransforms::addScalarResumePhis( @@ -4570,21 +4569,18 @@ void VPlanTransforms::addScalarResumePhis( VPBuilder VectorPHBuilder( cast(VectorRegion->getSinglePredecessor())); VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); - VPBuilder ScalarPHBuilder(ScalarPH); - for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) { - auto *ScalarPhiIRI = cast(&ScalarPhiR); + for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) { + auto *ResumePhiR = cast(&PhiR); // TODO: Extract final value from induction recipe initially, optimize to // pre-computed end value together in optimizeInductionExitUsers. 
- auto *VectorPhiR = - cast(Builder.getRecipe(&ScalarPhiIRI->getIRPhi())); + auto *VectorPhiR = cast(ResumePhiR->getOperand(0)); if (auto *WideIVR = dyn_cast(VectorPhiR)) { - if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction( - WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo, - &Plan.getVectorTripCount())) { - assert(isa(ResumePhi) && "Expected a phi"); - IVEndValues[WideIVR] = ResumePhi->getOperand(0); - ScalarPhiIRI->addOperand(ResumePhi); + if (VPValue *ResumeV = addResumePhiRecipeForInduction( + WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) { + IVEndValues[WideIVR] = ResumeV; + ResumePhiR->setOperand(0, ResumeV); + ResumePhiR->setName("bc.resume.val"); continue; } // TODO: Also handle truncated inductions here. Computing end-values @@ -4606,10 +4602,8 @@ void VPlanTransforms::addScalarResumePhis( ResumeFromVectorLoop = MiddleBuilder.createNaryOp( VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {}, "vector.recur.extract"); - StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx"; - auto *ResumePhiR = ScalarPHBuilder.createScalarPhi( - {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name); - ScalarPhiIRI->addOperand(ResumePhiR); + ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx"); + ResumePhiR->setOperand(0, ResumeFromVectorLoop); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 708ea4185e1cb..faba58112e7fa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -102,6 +102,26 @@ struct VPlanTransforms { buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, PredicatedScalarEvolution &PSE, LoopVersioning *LVer = nullptr); + /// Replace VPPhi recipes in \p Plan's header with corresponding + /// VPHeaderPHIRecipe subclasses for inductions, reductions, and + /// fixed-order recurrences. This processes all header phis and creates + /// the appropriate widened recipe for each one. + static void createHeaderPhiRecipes( + VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, + const MapVector &Inductions, + const MapVector &Reductions, + const SmallPtrSetImpl &FixedOrderRecurrences, + const SmallPtrSetImpl &InLoopReductions, bool AllowReordering); + + /// Create VPReductionRecipes for in-loop reductions. This processes chains + /// of operations contributing to in-loop reductions and creates appropriate + /// VPReductionRecipe instances. Block masks from \p BlockMaskCache are used + /// to add predication for blocks in \p BlocksNeedingPredication. + static void createVPReductionRecipesForInLoopReductions( + VPlan &Plan, const DenseMap &BlockMaskCache, + const DenseSet &BlocksNeedingPredication, + ElementCount MinVF); + /// Update \p Plan to account for all early exits. 
LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan, bool HasUncountableExit); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll index 3010a9d75d039..3904d4bce9b4a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll @@ -11,21 +11,14 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) { ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR_START_1:%.*]], i64 10000 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_START_1:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[NEXT_GEP1]], i32 1 -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP2]] -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP2]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 1 +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PTR_START_1]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <2 x i64> +; CHECK-NEXT: [[NEXT_GEP:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> splat (i64 2) ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <2 x ptr> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 @@ -40,6 +33,7 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) { ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[NEXT_GEP]], align 1 ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP15]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 4 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: middle.block: @@ -49,16 +43,14 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) { ; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR_START_1]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000 ; CHECK-NEXT: 
br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX6]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX6]], 1 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP17]] -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP7]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP8]], i32 1 +; CHECK-NEXT: [[POINTER_PHI2:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[PTR_IND5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[POINTER_PHI2]], <2 x i64> +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = extractelement <2 x ptr> [[TMP20]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0 ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1 @@ -66,6 +58,7 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) { ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]]) ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[NEXT_GEP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2 +; CHECK-NEXT: [[PTR_IND5]] = getelementptr i8, ptr [[POINTER_PHI2]], i64 2 ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT9]], 10000 ; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: vec.epilog.middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll index f223786a07cdf..456c03824106a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -112,15 +112,11 @@ define void @pointer_induction(ptr noalias %start, i64 %N) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[VECTOR_GEP]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX2]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP15]], align 1 ; CHECK-NEXT: [[TMP17:%.*]] = add [[WIDE_LOAD]], splat (i8 1) ; CHECK-NEXT: store [[TMP17]], ptr [[TMP15]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX2]], [[TMP4]] -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll index 3a07bcca523ce..13da121fe2dc2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/interleave-opaque-pointers.ll @@ -25,17 +25,14 @@ define void @test_pr55375_interleave_opaque_ptr(ptr %start, ptr %end) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> [[TMP9]], ptr [[TMP8]], i32 1 +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <2 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x ptr> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x ptr> zeroinitializer, <2 x ptr> [[TMP10]], <4 x i32> ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x ptr> [[TMP12]], <4 x ptr> poison, <4 x i32> ; CHECK-NEXT: store <4 x ptr> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 32 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll index cebf90ae9f8ce..94392f856c386 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll @@ -15,16 +15,20 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[SRC]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[DOTCAST]], 8 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[OFFSET_IDX]], 16 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 24 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] 
= getelementptr i8, ptr [[SRC]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[SRC]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i32 [[TMP17]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP2]] @@ -36,7 +40,6 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s ; CHECK-NEXT: store ptr [[TMP13]], ptr [[TMP9]], align 8 ; CHECK-NEXT: store ptr [[TMP14]], ptr [[TMP10]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 32 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 5742df2aa3c53..127e830b1bc14 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -113,12 +113,13 @@ compound=true N0 -> N2 [ label="F"] N1 [label = "scalar.ph:\l" + + " EMIT-SCALAR vp\<%6\> = phi [ ir\<%indvars.iv\>, middle.block ], [ ir\<0\>, ir-bb\ ]\l" + "Successor(s): ir-bb\\l" ] N1 -> N3 [ label=""] N3 [label = "ir-bb\:\l" + - " IR %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\l" + + " IR %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] (extra operand: vp\<%6\> from scalar.ph)\l" + " IR %arr.idx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv\l" + " IR %l1 = load i32, ptr %arr.idx, align 4\l" + " IR %res = add i32 %l1, 10\l" + @@ -282,12 +283,13 @@ compound=true N0 -> N2 [ label="F"] N1 [label = "scalar.ph:\l" + + " EMIT-SCALAR vp\<%6\> = phi [ ir\<%iv\>, middle.block ], [ ir\<0\>, ir-bb\ ]\l" + "Successor(s): ir-bb\\l" ] N1 -> N3 [ label=""] N3 [label = "ir-bb\:\l" + - " IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]\l" + + " IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp\<%6\> from scalar.ph)\l" + " IR %arr.idx = getelementptr inbounds i32, ptr %A, i64 %iv\l" + " IR %l1 = load i32, ptr %arr.idx, align 4\l" + " IR %c = icmp eq i32 %l1, 0\l" + diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 1b0cd1074c1b5..ec8f06a1eb755 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -346,6 +346,8 @@ TEST_F(VPIRVerifierTest, testVerifyIRPhiInScalarHeaderVPIRBB) { Function *F = M.getFunction("f"); BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); auto Plan = buildVPlan(LoopHeader); + VPValue *Zero = Plan->getConstantInt(32, 0); + Plan->getScalarHeader()->front().addOperand(Zero); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -387,8 +389,6 @@ TEST_F(VPIRVerifierTest, testVerifyIRPhiInExitVPIRBB) { {HeaderBlock->front().getVPSingleValue()}); DefI->insertBefore(Plan->getMiddleBlock()->getTerminator()); Plan->getExitBlocks()[0]->front().addOperand(DefI); - VPValue *Zero = Plan->getConstantInt(32, 0); - Plan->getScalarHeader()->front().addOperand(Zero); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr();
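
Editorial note (not part of the patch): with this change, header-phi widening and in-loop reduction lowering move out of VPRecipeBuilder into standalone VPlanTransforms entry points, and the old adjustRecipesForReductions shrinks to introduceReductionResultComputation. Below is a condensed sketch of the resulting call order in the planner, assembled from the call sites added above. The angle-bracketed template arguments did not survive in this patch text, so the usual types are assumed (e.g. MapVector<PHINode *, InductionDescriptor> for the induction map, MapVector<PHINode *, RecurrenceDescriptor> for the reduction map, SmallPtrSetImpl<PHINode *> for the in-loop reduction set); BlockMaskCache and BlocksNeedingPredication are the local values built in tryToBuildVPlanWithVPRecipes.

  // Once per loop (buildVPlansWithVPRecipes): build VPlan0, then replace the
  // header VPPhis with widened induction / reduction / first-order-recurrence
  // phi recipes, independently of the VF.
  auto VPlan0 = VPlanTransforms::buildVPlan0(
      OrigLoop, *LI, Legal->getWidestInductionType(),
      getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer);
  VPlanTransforms::createHeaderPhiRecipes(
      *VPlan0, *PSE.getSE(), *OrigLoop, Legal->getInductionVars(),
      Legal->getReductionVars(), Legal->getFixedOrderRecurrences(),
      CM.getInLoopReductions(), Hints.allowReordering());

  // Once per VF range (tryToBuildVPlanWithVPRecipes), after block masks have
  // been created: convert in-loop reduction chains to VPReductionRecipes, then
  // add the middle-block result computation and tail-folding selects.
  VPlanTransforms::createVPReductionRecipesForInLoopReductions(
      *Plan, BlockMaskCache, BlocksNeedingPredication, Range.Start);
  introduceReductionResultComputation(Plan, RecipeBuilder, Range.Start);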
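
On the resume-phi side, addInitialSkeleton now eagerly creates one scalar-preheader phi per vector header phi (wired to the header phi and its start value), and addScalarResumePhis later only substitutes the computed resume value and assigns the name via the newly added VPInstruction::setName. A minimal sketch of the two steps, condensed from the hunks above; the cast targets (VPPhi, VPIRPhi) are assumptions, since the template arguments are missing from this patch text.

  // addInitialSkeleton: placeholder resume phis in scalar.ph, one per header
  // phi, also recorded as the extra incoming operand of the scalar-header phi.
  VPBuilder ScalarPHBuilder(ScalarPH);
  for (const auto &[PhiR, ScalarPhiR] : zip_equal(
           drop_begin(HeaderVPBB->phis()), Plan.getScalarHeader()->phis())) {
    auto *VectorPhiR = cast<VPPhi>(&PhiR);               // assumed cast target
    auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
        {VectorPhiR, VectorPhiR->getOperand(0)}, VectorPhiR->getDebugLoc());
    cast<VPIRPhi>(&ScalarPhiR)->addOperand(ResumePhiR);  // assumed cast target
  }

  // addScalarResumePhis, later: patch in the real resume value and the name.
  ResumePhiR->setOperand(0, ResumeFromVectorLoop);
  ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");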