diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index f9e6da6d0846a..afa175704a7b1 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -70,6 +70,9 @@ enum class RecurKind { FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of ///< (x,y) is increasing loop induction, and both x and y ///< are integer type, producing a UMax reduction. + FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y + ///< can be any scalar type, one is the current recurrence + ///< value, and the other is an arbitrary value. // clang-format on // TODO: Any_of and FindLast reduction need not be restricted to integer type // only. @@ -183,6 +186,12 @@ class RecurrenceDescriptor { PHINode *OrigPhi, Instruction *I, ScalarEvolution &SE); + /// Returns a struct describing whether the instruction is of the form + /// Select(Cmp(A, B), X, Y) + /// where one of (X, Y) is the Phi value and the other is an arbitrary value. + LLVM_ABI static InstDesc isFindLastPattern(Instruction *I, PHINode *Phi, + Loop *TheLoop); + /// Returns a struct describing if the instruction is a /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. LLVM_ABI static InstDesc isConditionalRdxPattern(Instruction *I); @@ -299,6 +308,12 @@ class RecurrenceDescriptor { isFindLastIVRecurrenceKind(Kind); } + /// Returns true if the recurrence kind is of the form + /// select(cmp(),x,y) where one of (x,y) is an arbitrary value. + static bool isFindLastRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::FindLast; + } + /// Returns the type of the recurrence. This type can be narrower than the /// actual type of the Phi if the recurrence has been type-promoted. Type *getRecurrenceType() const { return RecurrenceType; } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index b8c540ce4b99d..6e57e4511c966 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -56,6 +56,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + // TODO: Make type-agnostic. + case RecurKind::FindLast: return true; } return false; @@ -426,6 +428,8 @@ bool RecurrenceDescriptor::AddReductionVar( ++NumCmpSelectPatternInst; if (isAnyOfRecurrenceKind(Kind) && IsASelect) ++NumCmpSelectPatternInst; + if (isFindLastRecurrenceKind(Kind) && IsASelect) + ++NumCmpSelectPatternInst; // Check whether we found a reduction operator. FoundReduxOp |= !IsAPhi && Cur != Start; @@ -789,6 +793,38 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, return InstDesc(false, I); } +RecurrenceDescriptor::InstDesc +RecurrenceDescriptor::isFindLastPattern(Instruction *I, PHINode *Phi, + Loop *TheLoop) { + // Must be a scalar. + Type *Type = Phi->getType(); + if (!Type->isIntegerTy() && !Type->isFloatingPointTy() && + !Type->isPointerTy()) + return InstDesc(false, I); + + SelectInst *Select = dyn_cast(I); + if (!Select) + return InstDesc(false, I); + + // FIXME: Support more complex patterns, including multiple selects. + // Phi or Select must be used only outside the loop, + // except for each other. 
+ auto IsOnlyUsedOutsideLoop = [&](Value *V, Value *Ignore) { + return all_of(V->users(), [Ignore, TheLoop](User *U) { + if (U == Ignore) + return true; + if (auto *I = dyn_cast(U)) + return !TheLoop->contains(I); + return false; + }); + }; + if (!IsOnlyUsedOutsideLoop(Phi, Select) || + !IsOnlyUsedOutsideLoop(Select, Phi)) + return InstDesc(false, I); + + return InstDesc(I, RecurKind::FindLast); +} + RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind, const InstDesc &Prev) { @@ -927,6 +963,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( return isConditionalRdxPattern(I); if (isFindIVRecurrenceKind(Kind) && SE) return isFindIVPattern(Kind, L, OrigPhi, I, *SE); + if (isFindLastRecurrenceKind(Kind)) + return isFindLastPattern(I, OrigPhi, L); [[fallthrough]]; case Instruction::FCmp: case Instruction::ICmp: @@ -1123,7 +1161,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, << "\n"); return true; } - + if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, + DT, SE)) { + LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n"); + return true; + } // Not a reduction of known type. return false; } @@ -1245,6 +1287,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { case RecurKind::SMin: case RecurKind::UMax: case RecurKind::UMin: + case RecurKind::FindLast: return Instruction::ICmp; case RecurKind::FMax: case RecurKind::FMin: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 92321a76dbd80..6595c6e770be0 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1004,6 +1004,13 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::experimental_vector_extract_last_active: + if (ST->isSVEAvailable()) { + auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]); + // This should turn into chained clastb instructions. + return LegalCost; + } + break; default: break; } @@ -5325,6 +5332,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction( case RecurKind::FMax: case RecurKind::FMulAdd: case RecurKind::AnyOf: + case RecurKind::FindLast: return true; default: return false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b4acda80cfb93..6badc403fb78f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4047,6 +4047,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPWidenIntrinsicSC: case VPDef::VPWidenSC: case VPDef::VPWidenSelectSC: + case VPDef::VPWidenSelectVectorSC: case VPDef::VPBlendSC: case VPDef::VPFirstOrderRecurrencePHISC: case VPDef::VPHistogramSC: @@ -4546,6 +4547,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), IsaPred); + // FIXME: implement interleaving for FindLast transform correctly. + for (auto &[_, RdxDesc] : Legal->getReductionVars()) + if (RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind())) + return 1; + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. 
if (LoopCost == 0) { @@ -8687,6 +8694,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( *Plan, Builder)) return nullptr; + // Create whole-vector selects for find-last recurrences. + VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences, *Plan, + RecipeBuilder, Legal); + if (useActiveLaneMask(Style)) { // TODO: Move checks to VPlanTransforms::addActiveLaneMask once // TailFoldingStyle is visible there. @@ -8779,6 +8790,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( RecurKind Kind = PhiR->getRecurrenceKind(); assert( + !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) && !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && "AnyOf and FindIV reductions are not allowed for in-loop reductions"); @@ -8987,6 +8999,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( FinalReductionResult = Builder.createNaryOp(VPInstruction::ComputeAnyOfResult, {PhiR, Start, NewExitingVPV}, ExitDL); + } else if (RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + FinalReductionResult = Builder.createNaryOp( + VPInstruction::ExtractLastActive, {NewExitingVPV}, ExitDL); } else { VPIRFlags Flags = RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind) @@ -9076,7 +9092,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( RecurKind RK = RdxDesc.getRecurrenceKind(); if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) && !RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && - !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) { + !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) && + !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) { VPBuilder PHBuilder(Plan->getVectorPreheader()); VPValue *Iden = Plan->getOrAddLiveIn( getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags())); @@ -10069,6 +10086,22 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Override IC if user provided an interleave count. IC = UserIC > 0 ? UserIC : IC; + // FIXME: Enable interleaving for last_active reductions. + if (any_of(LVL.getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind()); + })) { + LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due " + << "to conditional scalar assignments.\n"); + IntDiagMsg = { + "ConditionalAssignmentPreventsScalarInterleaving", + "Unable to interleave without vectorization due to conditional " + "assignments"}; + InterleaveLoop = false; + IC = 1; + } + // Emit diagnostic messages, if any. 
const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 75cace77ec534..7b25731af19d8 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -24868,6 +24868,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: @@ -25009,6 +25010,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: @@ -25115,6 +25117,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 53291a931530f..2ffe68fedee05 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -548,6 +548,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenIntrinsicSC: case VPRecipeBase::VPWidenSC: case VPRecipeBase::VPWidenSelectSC: + case VPRecipeBase::VPWidenSelectVectorSC: case VPRecipeBase::VPBlendSC: case VPRecipeBase::VPPredInstPHISC: case VPRecipeBase::VPCanonicalIVPHISC: @@ -1059,6 +1060,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, ResumeForEpilogue, /// Returns the value for vscale. VScale, + // Extracts the last active lane based on a predicate vector operand. + ExtractLastActive, }; private: @@ -1749,6 +1752,47 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, unsigned getOpcode() const { return Instruction::Select; } + VPValue *getCond() const { return getOperand(0); } + + bool isInvariantCond() const { + return getCond()->isDefinedOutsideLoopRegions(); + } + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getCond() && isInvariantCond(); + } +}; + +/// A recipe for selecting whole vector values. +struct VPWidenSelectVectorRecipe : public VPRecipeWithIRFlags { + VPWidenSelectVectorRecipe(ArrayRef Operands) + : VPRecipeWithIRFlags(VPDef::VPWidenSelectVectorSC, Operands) {} + + ~VPWidenSelectVectorRecipe() override = default; + + VPWidenSelectVectorRecipe *clone() override { + SmallVector Operands(operands()); + return new VPWidenSelectVectorRecipe(Operands); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenSelectVectorSC) + + /// Produce a widened version of the select instruction. + void execute(VPTransformState &State) override; + + /// Return the cost of this VPWidenSelectVectorRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+ void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + VPValue *getCond() const { return getOperand(0); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index d400ceff7797c..a299ab8593a2f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -115,7 +115,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::FirstActiveLane: return Type::getIntNTy(Ctx, 64); case VPInstruction::ExtractLastElement: - case VPInstruction::ExtractPenultimateElement: { + case VPInstruction::ExtractPenultimateElement: + case VPInstruction::ExtractLastActive: { Type *BaseTy = inferScalarType(R->getOperand(0)); if (auto *VecTy = dyn_cast(BaseTy)) return VecTy->getElementType(); @@ -308,7 +309,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { }) .Case([this](const auto *R) { return inferScalarType(R->getOperandOfResultType()); - }); + }) + .Case( + [this](const VPWidenSelectVectorRecipe *R) { + return inferScalarType(R->getOperand(1)); + }); assert(ResultTy && "could not infer type for the given VPValue"); CachedTypes[V] = ResultTy; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index bf51489543098..57f8b584b41dd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -86,7 +86,8 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenLoadSC: case VPWidenPHISC: case VPWidenSC: - case VPWidenSelectSC: { + case VPWidenSelectSC: + case VPWidenSelectVectorSC: { const Instruction *I = dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); (void)I; @@ -134,7 +135,8 @@ bool VPRecipeBase::mayReadFromMemory() const { case VPWidenIntOrFpInductionSC: case VPWidenPHISC: case VPWidenSC: - case VPWidenSelectSC: { + case VPWidenSelectSC: + case VPWidenSelectVectorSC: { const Instruction *I = dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); (void)I; @@ -177,7 +179,8 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPWidenPHISC: case VPWidenPointerInductionSC: case VPWidenSC: - case VPWidenSelectSC: { + case VPWidenSelectSC: + case VPWidenSelectVectorSC: { const Instruction *I = dyn_cast_or_null(getVPSingleValue()->getUnderlyingValue()); (void)I; @@ -522,6 +525,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::ActiveLaneMask: case VPInstruction::ComputeAnyOfResult: case VPInstruction::ReductionStartVector: + case VPInstruction::ExtractLastActive: return 3; case VPInstruction::ComputeFindIVResult: return 4; @@ -983,6 +987,17 @@ Value *VPInstruction::generate(VPTransformState &State) { } case VPInstruction::ResumeForEpilogue: return State.get(getOperand(0), true); + case VPInstruction::ExtractLastActive: { + Value *Data = State.get(getOperand(0)); + Value *Mask = State.get(getOperand(1)); + Value *Default = State.get(getOperand(2), /*IsScalar=*/true); + Type *VTy = Data->getType(); + + Module *M = State.Builder.GetInsertBlock()->getModule(); + Function *ExtractLast = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_vector_extract_last_active, {VTy}); + return Builder.CreateCall(ExtractLast, {Data, Mask, Default}); + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -1119,6 +1134,15 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, {PredTy, 
Type::getInt1Ty(Ctx.LLVMCtx)}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); } + case VPInstruction::ExtractLastActive: { + Type *ScalarTy = Ctx.Types.inferScalarType(this); + Type *VecTy = toVectorTy(ScalarTy, VF); + Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF); + IntrinsicCostAttributes ICA( + Intrinsic::experimental_vector_extract_last_active, ScalarTy, + {VecTy, MaskTy, ScalarTy}); + return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind); + } case VPInstruction::FirstOrderRecurrenceSplice: { assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?"); SmallVector Mask(VF.getKnownMinValue()); @@ -1174,6 +1198,7 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == VPInstruction::FirstActiveLane || getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ComputeFindIVResult || + getOpcode() == VPInstruction::ExtractLastActive || getOpcode() == VPInstruction::ComputeReductionResult || getOpcode() == VPInstruction::AnyOf; } @@ -1243,6 +1268,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExtractLastElement: case VPInstruction::ExtractPenultimateElement: case VPInstruction::FirstActiveLane: + case VPInstruction::ExtractLastActive: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::Not: @@ -1414,6 +1440,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ResumeForEpilogue: O << "resume-for-epilogue"; break; + case VPInstruction::ExtractLastActive: + O << "extract-last-active"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -1927,7 +1956,9 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, Mask->printAsOperand(O, SlotTracker); } } +#endif +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-SELECT "; @@ -2002,6 +2033,42 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF, {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenSelectVectorRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN-SELECT-VECTOR "; + printAsOperand(O, SlotTracker); + O << " = select "; + printFlags(O); + getOperand(0)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(1)->printAsOperand(O, SlotTracker); + O << ", "; + getOperand(2)->printAsOperand(O, SlotTracker); + O << (isInvariantCond() ? " (condition is loop invariant)" : ""); +} +#endif + +void VPWidenSelectVectorRecipe::execute(VPTransformState &State) { + State.setDebugLocFrom(getDebugLoc()); + + Value *Cond = State.get(getCond(), /*IsScalar=*/true); + Value *Op0 = State.get(getOperand(1)); + Value *Op1 = State.get(getOperand(2)); + Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); + State.set(this, Sel); +} + +InstructionCost +VPWidenSelectVectorRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + // Return a number for now; can tune later. This is likely to be expensive + // on some targets, as whole-vector selects may require branching to + // accomplish. + // FIXME: Check current cost model, specifically getCmpSelInstrCost. 
+ return 5; +} + VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { AllowReassoc = FMF.allowReassoc(); NoNaNs = FMF.noNaNs(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 2cac5557daeee..a4b398d171806 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -36,6 +36,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" using namespace llvm; using namespace VPlanPatternMatch; @@ -4138,3 +4139,69 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator( MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false); MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights); } + +/// Change CSA reductions to save the appropriate state. +void VPlanTransforms::convertFindLastRecurrences( + VPlan &Plan, VPRecipeBuilder &RecipeBuilder, + LoopVectorizationLegality *Legal) { + assert(Legal && "Need valid LoopVecLegality"); + + // May need to do something better than this? + if (Plan.hasScalarVFOnly()) + return; + + for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) { + if (RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + VPRecipeBase *PhiR = RecipeBuilder.getRecipe(Phi); + VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); + + // Add mask phi... + VPValue *False = + Plan.getOrAddLiveIn(ConstantInt::getFalse(Phi->getContext())); + // FIXME: Either come up with a new phi recipe or make an existing one + // more generic. There's only supposed to be one ALM PHI. + VPActiveLaneMaskPHIRecipe *MaskPHI = + new VPActiveLaneMaskPHIRecipe(False, DebugLoc()); + Builder.insert(MaskPHI); + + SelectInst *Select = cast(RdxDesc.getLoopExitInstr()); + auto *SR = cast(RecipeBuilder.getRecipe(Select)); + + // Add select for mask... + VPValue *Cond = SR->getCond(); + Builder.setInsertPoint(SR); + VPInstruction *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond}); + auto *MaskSelect = new VPWidenSelectVectorRecipe({AnyOf, Cond, MaskPHI}); + Builder.insert(MaskSelect); + MaskPHI->addOperand(MaskSelect); + + // Create new select for data... + auto *DataSelect = new VPWidenSelectVectorRecipe( + {AnyOf, SR->getOperand(1), SR->getOperand(2)}); + Builder.insert(DataSelect); + SR->replaceAllUsesWith(DataSelect); + SR->eraseFromParent(); + + // FIXME: I initially tried to replace the reduction calculation in + // the middle block here. However, it appears that the Def-Use + // graph is somewhat broken. Calling replaceAllUsesWith on the + // extract-from-end replaced the one use it knew about in + // ir-bb, but not a second reference in ir-bb + // Both references were pending extra operands to phi nodes. + + // I could either just replace the compute-reduction-result node alone, + // or (as I have done) plant the extract-last-active instead. But we + // now need to add the other operands to extract-last-active. 
+ VPValue *Default = RecipeBuilder.getVPValueOrAddLiveIn( + RdxDesc.getRecurrenceStartValue()); + for (VPUser *U : DataSelect->users()) { + VPInstruction *I = dyn_cast(U); + if (I && I->getOpcode() == VPInstruction::ExtractLastActive) { + I->addOperand(MaskSelect); + I->addOperand(Default); + } + } + } + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 1957428fab799..d16cdf9351cde 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -23,6 +23,7 @@ namespace llvm { class InductionDescriptor; class Instruction; +class LoopVectorizationLegality; class PHINode; class ScalarEvolution; class PredicatedScalarEvolution; @@ -356,6 +357,12 @@ struct VPlanTransforms { static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional VScaleForTuning); + + /// Change FindLast reductions to save the appropriate state using selects + /// for entire vectors for both the data and the mask.. + static void convertFindLastRecurrences(VPlan &Plan, + VPRecipeBuilder &RecipeBuilder, + LoopVectorizationLegality *Legal); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 85c6c2c8d7965..e025cfc0a67c9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -355,6 +355,7 @@ class VPDef { VPWidenStoreSC, VPWidenSC, VPWidenSelectSC, + VPWidenSelectVectorSC, VPBlendSC, VPHistogramSC, // START: Phi-like recipes. Need to be kept together. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll new file mode 100644 index 0000000000000..2694554838bb1 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-vectorize,instcombine -S < %s 2>&1 | FileCheck %s --check-prefix=NEON +; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE + +target triple = "aarch64-linux-gnu" + +define i32 @simple_csa_int_select(i32 %N, ptr %data, i32 %a) { +; NEON-LABEL: define i32 @simple_csa_int_select( +; NEON-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: [[A_FR:%.*]] = freeze i32 [[A]] +; NEON-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0 +; NEON-NEXT: br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]] +; NEON: [[LOOP_PREHEADER]]: +; NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NEON: [[VECTOR_PH]]: +; NEON-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483644 +; NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A_FR]], i64 0 +; NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; NEON-NEXT: br label %[[VECTOR_BODY:.*]] +; NEON: [[VECTOR_BODY]]: +; NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; 
NEON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; NEON-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]] +; NEON-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; NEON-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze <4 x i32> [[WIDE_LOAD]] +; NEON-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD_FR]] +; NEON-NEXT: [[TMP2:%.*]] = bitcast <4 x i1> [[TMP1]] to i4 +; NEON-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP2]], 0 +; NEON-NEXT: [[TMP3]] = select i1 [[DOTNOT]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]] +; NEON-NEXT: [[TMP4]] = select i1 [[DOTNOT]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD_FR]] +; NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NEON-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NEON: [[MIDDLE_BLOCK]]: +; NEON-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP4]], <4 x i1> [[TMP3]], i32 -1) +; NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] +; NEON-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; NEON: [[SCALAR_PH]]: +; NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ -1, %[[LOOP_PREHEADER]] ] +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[EXIT_LOOPEXIT]]: +; NEON-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] +; NEON-NEXT: br label %[[EXIT]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; NEON-NEXT: ret i32 [[T_0_LCSSA]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[T_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT]], %[[LOOP]] ] +; NEON-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NEON-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A_FR]], [[TMP7]] +; NEON-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP7]], i32 [[T_010]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; NEON-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; +; SVE-LABEL: define i32 @simple_csa_int_select( +; SVE-SAME: i32 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: [[A_FR:%.*]] = freeze i32 [[A]] +; SVE-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0 +; SVE-NEXT: br i1 [[CMP9]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]] +; SVE: [[LOOP_PREHEADER]]: +; SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[WIDE_TRIP_COUNT]] +; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SVE: [[VECTOR_PH]]: +; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; SVE-NEXT: [[N_MOD_VF:%.*]] = 
urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; SVE-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement zeroinitializer, i32 [[A_FR]], i64 0 +; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; SVE: [[VECTOR_BODY]]: +; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[VEC_PHI:%.*]] = phi [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]] +; SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; SVE-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze [[WIDE_LOAD]] +; SVE-NEXT: [[TMP5:%.*]] = icmp slt [[BROADCAST_SPLAT]], [[WIDE_LOAD_FR]] +; SVE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP5]]) +; SVE-NEXT: [[TMP7]] = select i1 [[TMP9]], [[TMP5]], [[ACTIVE_LANE_MASK]] +; SVE-NEXT: [[TMP8]] = select i1 [[TMP9]], [[WIDE_LOAD_FR]], [[VEC_PHI]] +; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; SVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SVE: [[MIDDLE_BLOCK]]: +; SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( [[TMP8]], [[TMP7]], i32 -1) +; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; SVE: [[SCALAR_PH]]: +; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1, %[[LOOP_PREHEADER]] ] +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[EXIT_LOOPEXIT]]: +; SVE-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; SVE-NEXT: br label %[[EXIT]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; SVE-NEXT: ret i32 [[T_0_LCSSA]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[T_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT]], %[[LOOP]] ] +; SVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; SVE-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A_FR]], [[TMP13]] +; SVE-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[T_010]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; SVE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; +entry: + %cmp9 = icmp sgt i32 %N, 0 + br i1 %cmp9, label %loop.preheader, label %exit + +loop.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %loop + +exit: ; preds = %loop, %entry + %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %loop ] + ret i32 %t.0.lcssa + +loop: ; preds = %loop.preheader, %loop + %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ] + %t.010 = phi i32 [ -1, %loop.preheader ], [ 
%spec.select, %loop ] + %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %a, %0 + %spec.select = select i1 %cmp1, i32 %0, i32 %t.010 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count + br i1 %exitcond.not, label %exit, label %loop +} +;. +; NEON: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; NEON: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; NEON: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; NEON: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. +; SVE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; SVE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; SVE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; SVE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll new file mode 100644 index 0000000000000..08ebfb7ec2c30 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll @@ -0,0 +1,138 @@ +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -scalable-vectorization=on -force-target-supports-scalable-vectors \ +; RUN: -disable-output 2>&1 < %s | FileCheck %s + + +; This function is generated from the following C/C++ program: +; int simple_csa_int_select(int N, int *data, int a) { +; int t = -1; +; for (int i = 0; i < N; i++) { +; if (a < data[i]) +; t = data[i]; +; } +; return t; // use t +; } +define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) { +entry: + %cmp9 = icmp sgt i32 %N, 0 + br i1 %cmp9, label %loop.preheader, label %exit + +loop.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %loop + +exit: ; preds = %loop, %entry + %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %loop ] + ret i32 %t.0.lcssa + +loop: ; preds = %loop.preheader, %loop + %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ] + %t.010 = phi i32 [ -1, %loop.preheader ], [ %spec.select, %loop ] + %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %1 = sext i32 %0 to i64 + %cmp1 = icmp slt i64 %a, %1 + %spec.select = select i1 %cmp1, i32 %0, i32 %t.010 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count + br i1 %exitcond.not, label %exit, label %loop +} + +; CHECK: VPlan 'Initial VPlan for VF={vscale x 1},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %wide.trip.count = zext i32 %N to i64 +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (zext i32 %N to i64) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%t.010> = phi ir<-1>, vp<%10> +; CHECK-NEXT: ACTIVE-LANE-MASK-PHI vp<%5> = phi ir, vp<%9> +; CHECK-NEXT: vp<%6> = SCALAR-STEPS vp<%4>, ir<1>, vp<%0> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%data>, vp<%6> +; CHECK-NEXT: vp<%7> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%0> = load 
vp<%7> +; CHECK-NEXT: WIDEN-CAST ir<%1> = sext ir<%0> to i64 +; CHECK-NEXT: WIDEN ir<%cmp1> = icmp slt ir<%a>, ir<%1> +; CHECK-NEXT: EMIT vp<%8> = any-of ir<%cmp1> +; CHECK-NEXT: WIDEN-SELECT-VECTOR vp<%9> = select vp<%8>, ir<%cmp1>, vp<%5> +; CHECK-NEXT: WIDEN-SELECT-VECTOR vp<%10> = select vp<%8>, ir<%0>, ir<%t.010> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%4>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%12> = extract-last-active vp<%10>, vp<%9>, ir<-1> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %spec.select.lcssa = phi i32 [ %spec.select, %loop ] (extra operand: vp<%12> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%12>, middle.block ], [ ir<-1>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %t.010 = phi i32 [ -1, %loop.preheader ], [ %spec.select, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv +; CHECK-NEXT: IR %0 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: IR %1 = sext i32 %0 to i64 +; CHECK-NEXT: IR %cmp1 = icmp slt i64 %a, %1 +; CHECK-NEXT: IR %spec.select = select i1 %cmp1, i32 %0, i32 %t.010 +; CHECK-NEXT: IR %iv.next = add nuw nsw i64 %iv, 1 +; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count +; CHECK-NEXT: No successors +; CHECK-NEXT: } + + +; CHECK: Cost of 1 for VF vscale x 1: induction instruction %iv.next = add nuw nsw i64 %iv, 1 +; CHECK-NEXT: Cost of 1 for VF vscale x 1: induction instruction %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ] +; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<%t.010> = phi ir<-1>, vp<%10> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: ACTIVE-LANE-MASK-PHI vp<%5> = phi ir, vp<%9> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%6> = SCALAR-STEPS vp<%4>, ir<1>, vp<%0> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<%arrayidx> = getelementptr inbounds ir<%data>, vp<%6> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%7> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%0> = load vp<%7> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: WIDEN-CAST ir<%1> = sext ir<%0> to i64 +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%cmp1> = icmp slt ir<%a>, ir<%1> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%8> = any-of ir<%cmp1> +; CHECK-NEXT: Cost of 5 for VF vscale x 1: WIDEN-SELECT-VECTOR vp<%9> = select vp<%8>, ir<%cmp1>, vp<%5> +; CHECK-NEXT: Cost of 5 for VF vscale x 1: WIDEN-SELECT-VECTOR vp<%10> = select vp<%8>, ir<%0>, ir<%t.010> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%index.next> = add nuw vp<%4>, vp<%1> +; CHECK-NEXT: 
Cost of 0 for VF vscale x 1: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: vector loop backedge +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %wide.trip.count = zext i32 %N to i64 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%3> = EXPAND SCEV (zext i32 %N to i64) +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%12>, middle.block ], [ ir<-1>, ir-bb ] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %t.010 = phi i32 [ -1, %loop.preheader ], [ %spec.select, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %0 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %1 = sext i32 %0 to i64 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %cmp1 = icmp slt i64 %a, %1 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %spec.select = select i1 %cmp1, i32 %0, i32 %t.010 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv.next = add nuw nsw i64 %iv, 1 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%12> = extract-last-active vp<%10>, vp<%9>, ir<-1> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %spec.select.lcssa = phi i32 [ %spec.select, %loop ] (extra operand: vp<%12> from middle.block) diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll index 615f50124b41d..12e62b019c67b 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll @@ -1121,25 +1121,141 @@ exit: ; preds = %loop ; In this test, %iv's range will include both signed and unsigned ; maximum (sentinel) values. 
define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) { -; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 -; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 -; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 -; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; IC1VF4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( +; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC1VF4-NEXT: [[ENTRY:.*]]: +; IC1VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1VF4: [[VECTOR_PH]]: +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1VF4: [[VECTOR_BODY]]: +; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC1VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; IC1VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; IC1VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 +; IC1VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]] +; IC1VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]] +; IC1VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 
x i1> [[TMP9]]) +; IC1VF4-NEXT: [[TMP11]] = select i1 [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]] +; IC1VF4-NEXT: [[TMP12]] = select i1 [[TMP10]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]] +; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4 +; IC1VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IC1VF4: [[MIDDLE_BLOCK]]: +; IC1VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC1VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]]) +; IC1VF4-NEXT: br label %[[SCALAR_PH]] +; IC1VF4: [[SCALAR_PH]]: +; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC1VF4-NEXT: br label %[[LOOP:.*]] +; IC1VF4: [[LOOP]]: +; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 +; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; IC1VF4: [[EXIT]]: +; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC1VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( +; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC4VF4-NEXT: [[ENTRY:.*]]: +; IC4VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC4VF4: [[VECTOR_PH]]: +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC4VF4: [[VECTOR_BODY]]: +; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC4VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; IC4VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; IC4VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; IC4VF4-NEXT: [[TMP4:%.*]] = 
getelementptr inbounds i8, ptr [[TMP3]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 +; IC4VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]] +; IC4VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]] +; IC4VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]]) +; IC4VF4-NEXT: [[TMP11]] = select i1 [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]] +; IC4VF4-NEXT: [[TMP12]] = select i1 [[TMP10]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]] +; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC4VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4 +; IC4VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IC4VF4: [[MIDDLE_BLOCK]]: +; IC4VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC4VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]]) +; IC4VF4-NEXT: br label %[[SCALAR_PH]] +; IC4VF4: [[SCALAR_PH]]: +; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC4VF4-NEXT: br label %[[LOOP:.*]] +; IC4VF4: [[LOOP]]: +; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 +; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; IC4VF4: [[EXIT]]: +; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF1-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( +; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: br label %[[LOOP:.*]] +; IC4VF1: [[LOOP]]: +; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: 
[[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 +; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; IC4VF1: [[EXIT]]: +; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF1-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %loop @@ -1162,25 +1278,163 @@ exit: } define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { -; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 -; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 -; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +; IC1VF4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start( +; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC1VF4-NEXT: [[ENTRY:.*]]: +; IC1VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; IC1VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1) +; IC1VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]] +; IC1VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; IC1VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1VF4: [[VECTOR_PH]]: +; IC1VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; IC1VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC1VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]] +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1VF4: [[VECTOR_BODY]]: +; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], 
%[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC1VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 +; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; IC1VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 +; IC1VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; IC1VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]] +; IC1VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]] +; IC1VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) +; IC1VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]] +; IC1VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]] +; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; IC1VF4: [[MIDDLE_BLOCK]]: +; IC1VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC1VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]]) +; IC1VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC1VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1VF4: [[SCALAR_PH]]: +; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ] +; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC1VF4-NEXT: br label %[[LOOP:.*]] +; IC1VF4: [[LOOP]]: +; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC1VF4-NEXT: br i1 [[EXIT_COND]], 
label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; IC1VF4: [[EXIT]]: +; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ] +; IC1VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF4-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start( +; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC4VF4-NEXT: [[ENTRY:.*]]: +; IC4VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; IC4VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1) +; IC4VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]] +; IC4VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; IC4VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC4VF4: [[VECTOR_PH]]: +; IC4VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; IC4VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC4VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]] +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC4VF4: [[VECTOR_BODY]]: +; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC4VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 +; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 +; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; IC4VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]] +; IC4VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]] +; IC4VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) +; IC4VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]] +; IC4VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]] +; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; 
IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC4VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC4VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; IC4VF4: [[MIDDLE_BLOCK]]: +; IC4VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]]) +; IC4VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC4VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC4VF4: [[SCALAR_PH]]: +; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ] +; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC4VF4-NEXT: br label %[[LOOP:.*]] +; IC4VF4: [[LOOP]]: +; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; IC4VF4: [[EXIT]]: +; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ] +; IC4VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF1-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start( +; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: br label %[[LOOP:.*]] +; IC4VF1: [[LOOP]]: +; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]] +; IC4VF1: [[EXIT]]: +; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF1-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %loop @@ -1201,3 +1455,5 @@ loop: ; preds = %entry, %loop exit: ; preds = %loop ret i64 %cond } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll index c958ea7b9b88e..06005bfb36662 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll @@ -145,10 +145,44 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-LABEL: define i64 @select_icmp_nuw( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ] +; 
CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -157,9 +191,9 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[INC]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -186,10 +220,44 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-LABEL: define i64 @select_icmp_noflag( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> 
[[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -198,9 +266,9 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[INC]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -229,4 +297,8 @@ exit: ; preds = %for.body ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll index 72ed6537ef640..67d53d9bbcecc 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll @@ -1,32 +1,156 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %iv_start ,i64 %n) { -; CHECK-LABEL: define i64 @select_non_const_iv_start_signed_guard( -; CHECK-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] -; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 -; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i64 [[IDX_0_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]] +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 
[[TMP0]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], 
align 4 +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[IDX_0_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; 
CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[IDX_0_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], 
label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[IDX_0_LCSSA]] ; entry: %guard = icmp slt i64 %iv_start, %n @@ -49,32 +173,162 @@ exit: } define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, i32 %iv_start ,i32 %n) { -; CHECK-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( -; CHECK-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] -; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[IDX_0_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]] +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: 
[[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3 +; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label 
%[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[IDX_0_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3 +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[IDX_0_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[IDX_0_LCSSA]] ; entry: %guard = icmp slt i32 %iv_start, %n @@ -101,3 +355,18 @@ exit: %idx.0.lcssa = phi i32 [ %rdx_start, %entry ], [ %cond, %for.body ] ret i32 %idx.0.lcssa } +;. +; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. +; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll index 80c5bb359cb4a..1e1ffc484e2f7 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll @@ -814,20 +814,49 @@ define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(ptr %a, ; CHECK-VF4IC1-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 ; CHECK-VF4IC1-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 
x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] @@ -839,20 +868,49 @@ define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(ptr %a, ; CHECK-VF4IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 ; CHECK-VF4IC4-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; 
CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq 
i64 [[INC]], [[N]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] @@ -919,20 +977,49 @@ define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(pt ; CHECK-VF4IC1-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: 
[[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] @@ -945,20 +1032,49 @@ define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(pt ; CHECK-VF4IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 
@llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] @@ -1025,40 +1141,88 @@ exit: ; preds = %for.body, %entry define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) { ; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC1-NEXT: br 
i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = add i64 4294967294, [[INDEX]] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]] +; CHECK-VF4IC1: [[SCALAR_PH]]: ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ 4294967294, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[CONV:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], 
%[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = add i64 4294967294, [[INDEX]] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]] +; CHECK-VF4IC4: [[SCALAR_PH]]: ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ 4294967294, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[CONV:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] 
= add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806 -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( @@ -1106,44 +1270,112 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> 
[[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] -; 
CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[COND_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( @@ -1198,18 +1430,43 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) { ; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1) +; CHECK-VF4IC1-NEXT: br label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483648, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; 
CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] @@ -1217,18 +1474,43 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) { ; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1) +; CHECK-VF4IC4-NEXT: br label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483648, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] 
= phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] @@ -1282,22 +1564,56 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC1-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: 
[[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16 ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], 
%[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] @@ -1310,22 +1626,56 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC4-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], 
%[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16 ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll index fcaff55ba368f..4f60debd78a5b 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll @@ -1957,16 +1957,52 @@ exit: ; preds = %for.body } ; The sentinel value for increasing-IV vectorization is -LONG_MAX, and since -; the IV hits this value, it is impossible to vectorize this case. +; the IV hits this value, it is vectorized as a generic last-active reduction. 
define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { ; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; 
CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] ; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] @@ -1976,19 +2012,55 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx. ; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 ; CHECK-VF4IC1-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; 
CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] ; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] @@ -1998,9 +2070,9 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx. 
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1
; CHECK-VF4IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound(
@@ -2051,10 +2123,50 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b,
; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value(
; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) {
; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]]
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
+; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> 
[[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -2063,18 +2175,58 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> 
zeroinitializer
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0
+; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[ACTIVE_LANE_MASK]]
+; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]])
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-VF4IC4: [[FOR_BODY]]:
-; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] 
= getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -2083,9 +2235,9 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll index e4922d3e4f627..8f4743b3829e9 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll @@ -1128,27 +1128,124 @@ exit: ; preds = %loop ret float %sel } -; We don't support selecting loop-variant values. define i32 @select_variant_i32_from_icmp(ptr %v1, ptr %v2, i64 %n) { -; CHECK-LABEL: define i32 @select_variant_i32_from_icmp( -; CHECK-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 -; CHECK-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 -; CHECK-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 -; CHECK-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ] -; CHECK-NEXT: ret i32 [[SEL_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF4IC1-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 3), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 
[[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[ACTIVE_LANE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 3) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF4IC1-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF4IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ splat (i32 3), %[[VECTOR_PH]] ], [ [[TMP27:%.*]], 
%[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD9]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) +; CHECK-VF4IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], <4 x i1> [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK6]] +; CHECK-VF4IC4-NEXT: [[TMP27]] = select i1 [[TMP19]], <4 x i32> [[VEC_PHI3]], <4 x i32> [[WIDE_LOAD13]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP27]], <4 x i1> [[TMP23]], i32 3) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC4: [[LOOP]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF4IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF4IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF1IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC4: [[LOOP]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: 
[[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF1IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]] ; entry: br label %loop @@ -1220,6 +1317,8 @@ exit: ; preds = %loop ; CHECK-VF4IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ; CHECK-VF4IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} ; CHECK-VF4IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} ;. ; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -1235,6 +1334,8 @@ exit: ; preds = %loop ; CHECK-VF4IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ; CHECK-VF4IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} ; CHECK-VF4IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} ;. ; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}