diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 12fb46da8e71a..ec47e697d4f0e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6750,46 +6750,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     }
   }
 
-  /// Compute the cost of all exiting conditions of the loop using the legacy
-  /// cost model. This is to match the legacy behavior, which adds the cost of
-  /// all exit conditions. Note that this over-estimates the cost, as there will
-  /// be a single condition to control the vector loop.
-  SmallVector<BasicBlock *> Exiting;
-  CM.TheLoop->getExitingBlocks(Exiting);
-  SetVector<Instruction *> ExitInstrs;
-  // Collect all exit conditions.
-  for (BasicBlock *EB : Exiting) {
-    auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
-    if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
-      continue;
-    if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
-      ExitInstrs.insert(CondI);
-    }
-  }
-  // Compute the cost of all instructions only feeding the exit conditions.
-  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
-    Instruction *CondI = ExitInstrs[I];
-    if (!OrigLoop->contains(CondI) ||
-        !CostCtx.SkipCostComputation.insert(CondI).second)
-      continue;
-    InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
-    LLVM_DEBUG({
-      dbgs() << "Cost of " << CondICost << " for VF " << VF
-             << ": exit condition instruction " << *CondI << "\n";
-    });
-    Cost += CondICost;
-    for (Value *Op : CondI->operands()) {
-      auto *OpI = dyn_cast<Instruction>(Op);
-      if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
-          any_of(OpI->users(), [&ExitInstrs, this](User *U) {
-            return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
-                   !ExitInstrs.contains(cast<Instruction>(U));
-          }))
-        continue;
-      ExitInstrs.insert(OpI);
-    }
-  }
-
   // Pre-compute the costs for branches except for the backedge, as the number
   // of replicate regions in a VPlan may not directly match the number of
   // branches, which would lead to different decisions.
@@ -6969,6 +6929,35 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
         });
   });
 }
+
+static bool planContainsDifferentCompares(VPlan &Plan, VPCostContext &CostCtx,
+                                          Loop *TheLoop, ElementCount VF) {
+  // Count how many compare instructions there are in the legacy cost model.
+  unsigned NumLegacyCompares = 0;
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    for (auto &I : *BB) {
+      if (isa<CmpInst>(I)) {
+        NumLegacyCompares += 1;
+      }
+    }
+  }
+
+  // Count how many compare instructions there are in the VPlan.
+  unsigned NumVPlanCompares = 0;
+  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+    for (VPRecipeBase &R : *VPBB) {
+      using namespace VPlanPatternMatch;
+      if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+          match(&R, m_Cmp(m_VPValue(), m_VPValue())))
+        NumVPlanCompares += 1;
+    }
+  }
+
+  // If the two counts differ, the legacy cost model and the VPlan cost model
+  // will disagree.
+  return NumLegacyCompares != NumVPlanCompares;
+}
 #endif
 
 VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
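To make the new check concrete, here is a reduced, hypothetical example (illustrative only, not taken from this patch): for the scalar loop below the legacy model costs two compares, while the VPlan built for it may keep only the single branch-on-count of the canonical IV. The counts (2 vs. 1) then differ, planContainsDifferentCompares returns true, and the legacy/VPlan agreement assert below is relaxed instead of firing on a known, benign mismatch.

define void @two_exit_compares(ptr %p) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %gep = getelementptr inbounds i64, ptr %p, i64 %iv
  store i64 %iv, ptr %gep, align 8
  %iv.next = add nuw nsw i64 %iv, 1
  %c1 = icmp eq i64 %iv.next, 256    ; exit compare, costed by the legacy model
  %c2 = icmp eq i64 %iv.next, 1024   ; second exit compare, also costed
  %done = or i1 %c1, %c2
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

This is the same situation exercised by the multiple_exit_conditions test updated further down.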
@@ -7078,7 +7067,9 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
               CostCtx, OrigLoop, BestFactor.Width) ||
           planContainsAdditionalSimplifications(
-              getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
+              getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width) ||
+          planContainsDifferentCompares(BestPlan, CostCtx, OrigLoop,
+                                        BestFactor.Width)) &&
          " VPlan cost model and legacy cost model disagreed");
   assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
          "when vectorizing, the scalar cost must be computed.");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3a55710d59b08..b2b5595549a87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1143,6 +1143,32 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
     return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,
                                                     VecTy, Ctx.CostKind, 0);
   }
+  case VPInstruction::BranchOnCount: {
+    // If TC <= VF then this is just a branch.
+    // FIXME: Removing the branch happens in simplifyBranchConditionForVFAndUF,
+    // where it checks TC <= VF * UF, but we don't know UF yet. This means in
+    // some cases we get a cost that's too high due to counting a cmp that
+    // later gets removed.
+    // FIXME: The compare could also be removed if TC = M * vscale,
+    // VF = N * vscale, and M <= N. Detecting that would require having the
+    // trip count as a SCEV, though.
+    Value *TC = getParent()->getPlan()->getTripCount()->getUnderlyingValue();
+    ConstantInt *TCConst = dyn_cast_if_present<ConstantInt>(TC);
+    if (TCConst && TCConst->getValue().ule(VF.getKnownMinValue()))
+      return 0;
+    // Otherwise BranchOnCount generates an ICmpEQ followed by a branch.
+    Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));
+    return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy,
+                                      CmpInst::makeCmpResultType(ValTy),
+                                      CmpInst::ICMP_EQ, Ctx.CostKind);
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp: {
+    Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));
+    return Ctx.TTI.getCmpSelInstrCost(getOpcode(), ValTy,
+                                      CmpInst::makeCmpResultType(ValTy),
+                                      getPredicate(), Ctx.CostKind);
+  }
   case VPInstruction::ExtractPenultimateElement:
     if (VF == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
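For context on the two cases above, this sketch shows the latch that a BranchOnCount recipe is lowered to once the plan executes (assumptions: VF x UF = 8, %n.vec stands for the computed vector trip count, and all names are illustrative): a single scalar icmp eq on the canonical IV plus the backedge branch, which is why the recipe's cost is modelled as one compare via getCmpSelInstrCost.

define void @branch_on_count_latch(i64 %n.vec) {
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; ... widened loop body ...
  %index.next = add nuw i64 %index, 8           ; step = VF x UF
  %exit.cond = icmp eq i64 %index.next, %n.vec  ; the compare being costed
  br i1 %exit.cond, label %middle.block, label %vector.body

middle.block:
  ret void
}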
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index e4ee6776ae24c..0f84e1866256b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -533,25 +533,47 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
 ; DEFAULT-LABEL: define void @multiple_exit_conditions(
 ; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] {
 ; DEFAULT-NEXT: [[ENTRY:.*:]]
-; DEFAULT-NEXT: br label %[[VECTOR_PH:.*]]
+; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP0]], 3
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP4]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; DEFAULT: [[VECTOR_PH]]:
-; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048
-; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
-; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]]
+; DEFAULT-NEXT: [[INDEX:%.*]] = sub i64 257, [[N_MOD_VF]]
 ; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
 ; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
+; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 2
+; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
+; DEFAULT: [[VECTOR_BODY]]:
+; DEFAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX1]], 8
+; DEFAULT-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX1]]
 ; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
-; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
-; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[NEXT_GEP]], align 8
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; DEFAULT-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP1]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP11:%.*]] = or <vscale x 2 x i16>
[[BROADCAST_SPLAT]], splat (i16 1) +; DEFAULT-NEXT: [[TMP15:%.*]] = uitofp [[TMP11]] to +; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1 +; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[NEXT_GEP1]], i64 [[TMP17]] +; DEFAULT-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 2 +; DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[NEXT_GEP1]], i64 [[TMP20]] +; DEFAULT-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 6 +; DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[NEXT_GEP1]], i64 [[TMP23]] +; DEFAULT-NEXT: store [[TMP15]], ptr [[NEXT_GEP1]], align 8 +; DEFAULT-NEXT: store [[TMP15]], ptr [[TMP18]], align 8 +; DEFAULT-NEXT: store [[TMP15]], ptr [[TMP21]], align 8 +; DEFAULT-NEXT: store [[TMP15]], ptr [[TMP24]], align 8 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP3]] +; DEFAULT-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[INDEX]] +; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: -; DEFAULT-NEXT: br label %[[SCALAR_PH:.*]] +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[INDEX]] +; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] ; DEFAULT: [[SCALAR_PH]]: ; ; PRED-LABEL: define void @multiple_exit_conditions( @@ -609,7 +631,6 @@ exit: } define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { -; ; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store( ; COMMON-SAME: ptr [[DST:%.*]]) { ; COMMON-NEXT: [[ENTRY:.*:]] @@ -659,15 +680,15 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1 ; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; COMMON: [[PRED_STORE_CONTINUE12]]: -; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]] +; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT1:.*]] ; COMMON: [[PRED_STORE_IF13]]: ; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7 ; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1 -; COMMON-NEXT: br label %[[EXIT]] -; COMMON: [[EXIT]]: +; COMMON-NEXT: br label %[[EXIT1]] +; COMMON: [[EXIT1]]: ; COMMON-NEXT: br label %[[SCALAR_PH:.*]] ; COMMON: [[SCALAR_PH]]: -; COMMON-NEXT: br [[EXIT1:label %.*]] +; COMMON-NEXT: br [[EXIT:label %.*]] ; COMMON: [[SCALAR_PH1:.*:]] ; entry: @@ -1300,7 +1321,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { ; PRED-NEXT: br label %[[VECTOR_MEMCHECK:.*]] ; PRED: [[VECTOR_MEMCHECK]]: ; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16 +; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 ; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[C1]], [[A2]] ; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] ; PRED-NEXT: [[TMP4:%.*]] = sub i64 [[C1]], [[B3]] @@ -1309,42 +1330,42 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { ; PRED-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: ; PRED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16 +; PRED-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4 ; PRED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 4 +; PRED-NEXT: 
[[TMP8:%.*]] = shl nuw i64 [[TMP7]], 2 ; PRED-NEXT: [[TMP9:%.*]] = sub i64 [[TMP0]], [[TMP8]] ; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], [[TMP8]] ; PRED-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[Y]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]]) +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[Y]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; PRED-NEXT: [[TMP13:%.*]] = uitofp [[WIDE_MASKED_LOAD]] to +; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; PRED-NEXT: [[TMP13:%.*]] = uitofp [[WIDE_MASKED_LOAD]] to ; PRED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; PRED-NEXT: [[TMP15:%.*]] = icmp ne [[WIDE_MASKED_LOAD5]], zeroinitializer -; PRED-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer -; PRED-NEXT: [[TMP17:%.*]] = xor [[WIDE_MASKED_LOAD]], splat (i8 1) -; PRED-NEXT: [[TMP18:%.*]] = select [[TMP16]], [[BROADCAST_SPLAT]], splat (i8 1) -; PRED-NEXT: [[TMP19:%.*]] = udiv [[TMP17]], [[TMP18]] -; PRED-NEXT: [[TMP20:%.*]] = icmp ugt [[TMP19]], splat (i8 1) -; PRED-NEXT: [[TMP21:%.*]] = select [[TMP20]], zeroinitializer, splat (i32 255) -; PRED-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP21]], zeroinitializer -; PRED-NEXT: [[TMP22:%.*]] = zext [[WIDE_MASKED_LOAD]] to -; PRED-NEXT: [[TMP23:%.*]] = sub [[PREDPHI]], [[TMP22]] -; PRED-NEXT: [[TMP24:%.*]] = sitofp [[TMP23]] to -; PRED-NEXT: [[TMP25:%.*]] = call @llvm.fmuladd.nxv16f32( [[TMP24]], splat (float 3.000000e+00), [[TMP13]]) -; PRED-NEXT: [[TMP26:%.*]] = fptoui [[TMP25]] to +; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; PRED-NEXT: [[TMP15:%.*]] = icmp ne [[WIDE_MASKED_LOAD5]], zeroinitializer +; PRED-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; PRED-NEXT: [[TMP17:%.*]] = xor [[WIDE_MASKED_LOAD]], splat (i8 1) +; PRED-NEXT: [[TMP18:%.*]] = select [[TMP16]], [[BROADCAST_SPLAT]], splat (i8 1) +; PRED-NEXT: [[TMP19:%.*]] = udiv [[TMP17]], [[TMP18]] +; PRED-NEXT: [[TMP20:%.*]] = icmp ugt [[TMP19]], splat (i8 1) +; PRED-NEXT: [[TMP21:%.*]] = select [[TMP20]], zeroinitializer, splat (i32 255) +; PRED-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP21]], 
zeroinitializer +; PRED-NEXT: [[TMP22:%.*]] = zext [[WIDE_MASKED_LOAD]] to +; PRED-NEXT: [[TMP23:%.*]] = sub [[PREDPHI]], [[TMP22]] +; PRED-NEXT: [[TMP24:%.*]] = sitofp [[TMP23]] to +; PRED-NEXT: [[TMP25:%.*]] = call @llvm.fmuladd.nxv4f32( [[TMP24]], splat (float 3.000000e+00), [[TMP13]]) +; PRED-NEXT: [[TMP26:%.*]] = fptoui [[TMP25]] to ; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[C]], i64 [[INDEX]] -; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP26]], ptr [[TMP27]], i32 1, [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP26]], ptr [[TMP27]], i32 1, [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]]) -; PRED-NEXT: [[TMP28:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; PRED-NEXT: [[TMP28:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP29:%.*]] = xor i1 [[TMP28]], true ; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index c3b0bc8c00a74..fa807c9938661 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -9,11 +9,12 @@ define i64 @test(ptr %a, ptr %b) #0 { ; CHECK-LABEL: LV: Checking a loop in 'test' ; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] -; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 30 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 16: 56 ; CHECK: LV: Selecting VF: 16 entry: @@ -43,12 +44,13 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 { ; CHECK-LABEL: LV: Checking a loop in 'test_external_iv_user' ; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] -; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 30 ; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 0 for VF 16: EMIT 
branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 16: 57 ; CHECK: LV: Selecting VF: vscale x 2 entry: @@ -80,12 +82,13 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] -; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 27 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 16: 48 ; CHECK: LV: Selecting VF: 16 entry: @@ -116,11 +119,14 @@ define i1 @test_extra_cmp_user(ptr nocapture noundef %dst, ptr nocapture noundef ; CHECK-LABEL: LV: Checking a loop in 'test_extra_cmp_user' ; CHECK: Cost of 4 for VF 8: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: Cost of 4 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %indvars.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 12 +; CHECK: Cost of 4 for VF 8: WIDEN ir<%exitcond.not> = icmp eq ir<%indvars.iv.next>, ir<16> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost for VF 8: 13 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost of 0 for VF 16: WIDEN ir<%exitcond.not> = icmp eq ir<%indvars.iv.next>, ir<16> +; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 16: 4 ; CHECK: LV: Selecting VF: 16 entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index fd6e275d098ca..db465ae02d64f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -127,40 +127,40 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; PRED-NEXT: br label %[[VECTOR_MEMCHECK:.*]] ; PRED: [[VECTOR_MEMCHECK]]: ; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16 +; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 8 ; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]] ; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] ; PRED-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: ; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP5:%.*]] = 
mul nuw i64 [[TMP4]], 16 -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 4 -; PRED-NEXT: [[TMP8:%.*]] = sub i64 [[TMP0]], [[TMP7]] -; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP0]], [[TMP7]] -; PRED-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[TMP11:%.*]] = trunc [[BROADCAST_SPLAT]] to +; PRED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; PRED-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 3 +; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]] +; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], [[TMP12]] +; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]]) +; PRED-NEXT: [[TMP19:%.*]] = trunc [[BROADCAST_SPLAT]] to ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; PRED-NEXT: [[TMP13:%.*]] = zext [[WIDE_MASKED_LOAD]] to -; PRED-NEXT: [[TMP14:%.*]] = mul [[TMP13]], [[TMP11]] -; PRED-NEXT: [[TMP15:%.*]] = or [[TMP14]], [[TMP13]] -; PRED-NEXT: [[TMP16:%.*]] = lshr [[TMP15]], splat (i16 1) -; PRED-NEXT: [[TMP17:%.*]] = trunc [[TMP16]] to -; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] -; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP17]], ptr [[TMP18]], i32 1, [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]] +; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8i8.p0(ptr [[TMP18]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; PRED-NEXT: [[TMP20:%.*]] = zext [[WIDE_MASKED_LOAD]] to +; PRED-NEXT: [[TMP21:%.*]] = mul [[TMP20]], [[TMP19]] +; PRED-NEXT: [[TMP16:%.*]] = or [[TMP21]], [[TMP20]] +; PRED-NEXT: [[TMP17:%.*]] = lshr [[TMP16]], splat (i16 1) +; PRED-NEXT: [[TMP23:%.*]] = trunc [[TMP17]] to +; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] +; PRED-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP23]], ptr [[TMP26]], i32 1, [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]] -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP10]]) -; PRED-NEXT: [[TMP19:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true -; PRED-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], 
label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]]) +; PRED-NEXT: [[TMP25:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; PRED-NEXT: [[TMP27:%.*]] = xor i1 [[TMP25]], true +; PRED-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: @@ -460,61 +460,43 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]] ; PRED-NEXT: br i1 [[TMP13]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4 +; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 2 +; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 2 ; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]]) +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ] +; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ] +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE3]] ] +; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE3]] ] ; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64> -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 +; PRED-NEXT: [[TMP17:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] +; PRED-NEXT: [[TMP18:%.*]] = zext <2 x i32> [[TMP17]] to <2 x i64> +; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 ; PRED-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0 +; PRED-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP18]], i32 0 ; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]] ; PRED-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0 ; PRED-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4 ; 
PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] ; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]] +; PRED-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 +; PRED-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]] ; PRED: [[PRED_STORE_IF2]]: -; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1 +; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP18]], i32 1 ; PRED-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]] ; PRED-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1 ; PRED-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE3]] ; PRED: [[PRED_STORE_CONTINUE3]]: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] -; PRED: [[PRED_STORE_IF4]]: -; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2 -; PRED-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]] -; PRED-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP30]], ptr [[TMP29]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE5]] -; PRED: [[PRED_STORE_CONTINUE5]]: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] -; PRED: [[PRED_STORE_IF6]]: -; PRED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3 -; PRED-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]] -; PRED-NEXT: [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE7]] -; PRED: [[PRED_STORE_CONTINUE7]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]]) -; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP16]]) +; PRED-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] @@ -649,61 +631,43 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]] ; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4 +; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2 +; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2 ; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = 
insertelement <4 x i32> poison, i32 [[ADD]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]]) +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[ADD]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] +; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ] +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ] +; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ] ; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP16:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP17:%.*]] = zext <4 x i32> [[TMP16]] to <4 x i64> -; PRED-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 +; PRED-NEXT: [[TMP16:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] +; PRED-NEXT: [[TMP17:%.*]] = zext <2 x i32> [[TMP16]] to <2 x i64> +; PRED-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 ; PRED-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 +; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP17]], i32 0 ; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] ; PRED-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0 ; PRED-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] ; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] +; PRED-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 +; PRED-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] ; PRED: [[PRED_STORE_IF1]]: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 +; PRED-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP17]], i32 1 ; PRED-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]] ; PRED-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1 ; PRED-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; PRED: [[PRED_STORE_CONTINUE2]]: -; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; PRED: [[PRED_STORE_IF3]]: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; PRED-NEXT: [[TMP28:%.*]] = 
getelementptr i32, ptr [[DST]], i64 [[TMP27]] -; PRED-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE4]] -; PRED: [[PRED_STORE_CONTINUE4]]: -; PRED-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] -; PRED: [[PRED_STORE_IF5]]: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 -; PRED-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]] -; PRED-NEXT: [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; PRED: [[PRED_STORE_CONTINUE6]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]]) +; PRED-NEXT: [[TMP34:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP35:%.*]] = xor i1 [[TMP34]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] @@ -811,38 +775,54 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] ; PRED-NEXT: br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1 -; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 3 +; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; PRED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1 -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE5:.*]] ] -; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer -; PRED-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT3]], -; PRED-NEXT: [[TMP6:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] -; PRED-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[PRED_STORE_CONTINUE9:.*]] ] +; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] 
= insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer +; PRED-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT3]], +; PRED-NEXT: [[TMP8:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; PRED-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 ; PRED-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; PRED-NEXT: [[TMP9:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP8]], i32 2 +; PRED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; PRED-NEXT: [[TMP9:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP10]], i32 2 ; PRED-NEXT: store i32 0, ptr [[TMP9]], align 8 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] ; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 -; PRED-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5]] +; PRED-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1 +; PRED-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] ; PRED: [[PRED_STORE_IF4]]: -; PRED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 -; PRED-NEXT: [[TMP12:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP11]], i32 2 +; PRED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 +; PRED-NEXT: [[TMP12:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP13]], i32 2 ; PRED-NEXT: store i32 0, ptr [[TMP12]], align 8 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE5]] ; PRED: [[PRED_STORE_CONTINUE5]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; PRED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; PRED-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 +; PRED-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]] +; PRED: [[PRED_STORE_IF6]]: +; PRED-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], 2 +; PRED-NEXT: [[TMP15:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[INDEX_NEXT]], i32 2 +; PRED-NEXT: store i32 0, ptr [[TMP15]], align 8 +; PRED-NEXT: br label %[[PRED_STORE_CONTINUE7]] +; PRED: [[PRED_STORE_CONTINUE7]]: +; PRED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 +; PRED-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9]] +; PRED: [[PRED_STORE_IF8]]: +; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 3 +; PRED-NEXT: [[TMP18:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP17]], i32 2 +; PRED-NEXT: store i32 0, ptr [[TMP18]], align 8 +; PRED-NEXT: br label %[[PRED_STORE_CONTINUE9]] +; PRED: [[PRED_STORE_CONTINUE9]]: +; PRED-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX]], 4 +; PRED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]] +; PRED-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll index bdf832f32964f..13a686251daef 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll +++ 
b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll @@ -21,7 +21,6 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: LV: Scalar loop costs: 5. ; CHECK: Cost of 1 for VF 2: induction instruction %inc = add nuw nsw i32 %i.016, 1 ; CHECK: Cost of 0 for VF 2: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] -; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n ; CHECK: Cost of 0 for VF 2: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1> ; CHECK: Cost of 0 for VF 2: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> @@ -34,11 +33,10 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 2: vp<{{.+}}> = vector-pointer ir<%arrayidx7> ; CHECK: Cost of 16 for VF 2: WIDEN store vp<{{.+}}>, ir<%conv6>, ir<%cmp2> ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<{{.+}}>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 2: 86 (Estimated cost per lane: 43. ; CHECK: Cost of 1 for VF 4: induction instruction %inc = add nuw nsw i32 %i.016, 1 ; CHECK: Cost of 0 for VF 4: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] -; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n ; CHECK: Cost of 0 for VF 4: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1> ; CHECK: Cost of 0 for VF 4: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> @@ -51,11 +49,10 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 4: vp<{{.+}}> = vector-pointer ir<%arrayidx7> ; CHECK: Cost of 2 for VF 4: WIDEN store vp<{{.+}}>, ir<%conv6>, ir<%cmp2> ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<{{.+}}>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 4: 10 (Estimated cost per lane: 2. ; CHECK: Cost of 1 for VF 8: induction instruction %inc = add nuw nsw i32 %i.016, 1 ; CHECK: Cost of 0 for VF 8: induction instruction %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] -; CHECK: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i32 %inc, %n ; CHECK: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = SCALAR-STEPS vp<{{.+}}>, ir<1> ; CHECK: Cost of 0 for VF 8: CLONE ir<%arrayidx> = getelementptr inbounds ir<%s>, vp<{{.+}}> @@ -68,7 +65,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: Cost of 0 for VF 8: vp<{{.+}}> = vector-pointer ir<%arrayidx7> ; CHECK: Cost of 2 for VF 8: WIDEN store vp<{{.+}}>, ir<%conv6>, ir<%cmp2> ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<{{.+}}>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 46 (Estimated cost per lane: 5. ; CHECK: LV: Selecting VF: 4. 
define void @expensive_icmp(ptr noalias nocapture %d, ptr nocapture readonly %s, i32 %n, i16 zeroext %m) #0 { @@ -134,7 +131,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 2: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 2: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 2: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 2: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 2: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 2: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 2: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -156,7 +152,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 2: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 18 for VF 2: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 2: 130 (Estimated cost per lane: 65. ; CHECK: Cost of 1 for VF 4: induction instruction %dec = add i32 %blkCnt.012, -1 ; CHECK: Cost of 0 for VF 4: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ] @@ -166,7 +162,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 4: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 4: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 4: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 4: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 4: EMIT vp<[[CAN_IV:%.]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 4: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 4: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -188,7 +183,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 4: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 4: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 4: 14 (Estimated cost per lane: 3. 
; CHECK: Cost of 1 for VF 8: induction instruction %dec = add i32 %blkCnt.012, -1 ; CHECK: Cost of 0 for VF 8: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ] @@ -198,7 +193,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 8: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 8: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 8: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 8: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 8: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 8: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 8: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -220,7 +214,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 8: vp<[[VEC_PTR3:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 8: WIDEN store vp<[[VEC_PTR3]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 8: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}} -; CHECK: Cost of 0 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 8: 26 (Estimated cost per lane: 3. ; CHECK: Cost of 1 for VF 16: induction instruction %dec = add i32 %blkCnt.012, -1 ; CHECK: Cost of 0 for VF 16: induction instruction %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ] @@ -230,7 +224,6 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 16: induction instruction %pDst.addr.010 = phi ptr [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ] ; CHECK: Cost of 0 for VF 16: induction instruction %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.09, i32 1 ; CHECK: Cost of 0 for VF 16: induction instruction %pSrcB.addr.09 = phi ptr [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ] -; CHECK: Cost of 1 for VF 16: exit condition instruction %cmp.not = icmp eq i32 %dec, 0 ; CHECK: Cost of 0 for VF 16: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 16: vp<[[STEPS1:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK: Cost of 0 for VF 16: EMIT vp<%next.gep> = ptradd ir<%pSrcA>, vp<[[STEPS1]]> @@ -252,7 +245,7 @@ for.inc: ; preds = %for.body, %if.then ; CHECK: Cost of 0 for VF 16: vp<[[VEC_PTR2:%.+]]> = vector-pointer vp<%next.gep>.1 ; CHECK: Cost of 2 for VF 16: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv4> ; CHECK: Cost of 0 for VF 16: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}> -; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> +; CHECK: Cost of 1 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}> ; CHECK: Cost for VF 16: 50 ; CHECK: LV: Selecting VF: 16. 
define void @cheap_icmp(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture %pDst, i32 %blockSize) #0 { diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 53907fadf8187..1c8e792aaa8dd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -125,12 +125,12 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 8, i32 4, i1 true) -; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP9:%.*]], splat (i1 true), i32 [[TMP5]]) -; CHECK-NEXT: [[TMP6:%.*]] = shl [[VP_OP_LOAD]], splat (i8 1) -; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP12:%.*]], splat (i1 true), i32 [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], [[VP_OP_LOAD1]] -; CHECK-NEXT: call void @llvm.vp.store.nxv4i8.p0( [[TMP7]], ptr align 1 [[TMP12]], splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 8, i32 8, i1 true) +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP9:%.*]], splat (i1 true), i32 [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = shl [[VP_OP_LOAD]], splat (i8 1) +; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP12:%.*]], splat (i1 true), i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = add [[TMP1]], [[VP_OP_LOAD1]] +; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0( [[TMP2]], ptr align 1 [[TMP12]], splat (i1 true), i32 [[TMP0]]) ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll index bb85b88f181f7..cff8b93008212 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll @@ -9,10 +9,9 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK-LABEL: 'wide_or_replaced_with_add_vpinstruction' ; CHECK: Cost of 1 for VF 2: induction instruction %iv.next = add nuw nsw i64 %iv, 1 ; CHECK: Cost of 0 for VF 2: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] -; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 ; CHECK: Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost of 0 for VF 2: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0> -; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> +; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> ; CHECK: Cost of 0 for VF 2: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4> ; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%g.src> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%l> = load vp<%5> @@ -23,14 +22,21 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK: Cost of 0 for VF 2: vp<%6> = vector-pointer ir<%g.dst> ; CHECK: Cost of 1 for VF 2: WIDEN store vp<%6>, ir<%iv.4>, ir<%c> ; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> -; CHECK: Cost of 0 for VF 2: EMIT 
+; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<%2>
; CHECK: Cost of 0 for VF 2: vector loop backedge
+; CHECK: Cost of 0 for VF 2: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK: Cost of 0 for VF 2: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK: Cost of 0 for VF 2: IR %g.src = getelementptr inbounds i64, ptr %src, i64 %iv
+; CHECK: Cost of 0 for VF 2: IR %l = load i64, ptr %g.src, align 8
+; CHECK: Cost of 0 for VF 2: IR %iv.4 = add nuw nsw i64 %iv, 4
+; CHECK: Cost of 0 for VF 2: IR %c = icmp ule i64 %l, 128
+; CHECK: Cost of 1 for VF 2: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<%2>
+; CHECK: Cost of 0 for VF 2: EMIT branch-on-cond vp<%cmp.n>
; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1
; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32
; CHECK: Cost of 0 for VF 4: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 4: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0>
-; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0>
; CHECK: Cost of 0 for VF 4: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4>
; CHECK: Cost of 0 for VF 4: vp<%5> = vector-pointer ir<%g.src>
; CHECK: Cost of 1 for VF 4: WIDEN ir<%l> = load vp<%5>
@@ -41,11 +47,18 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst)
; CHECK: Cost of 0 for VF 4: vp<%6> = vector-pointer ir<%g.dst>
; CHECK: Cost of 1 for VF 4: WIDEN store vp<%6>, ir<%iv.4>, ir<%c>
; CHECK: Cost of 0 for VF 4: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
-; CHECK: Cost of 0 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK: Cost of 1 for VF 4: EMIT branch-on-count vp<%index.next>, vp<%2>
; CHECK: Cost of 0 for VF 4: vector loop backedge
+; CHECK: Cost of 0 for VF 4: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK: Cost of 0 for VF 4: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK: Cost of 0 for VF 4: IR %g.src = getelementptr inbounds i64, ptr %src, i64 %iv
+; CHECK: Cost of 0 for VF 4: IR %l = load i64, ptr %g.src, align 8
+; CHECK: Cost of 0 for VF 4: IR %iv.4 = add nuw nsw i64 %iv, 4
+; CHECK: Cost of 0 for VF 4: IR %c = icmp ule i64 %l, 128
+; CHECK: Cost of 1 for VF 4: EMIT vp<%cmp.n> = icmp eq ir<32>, vp<%2>
+; CHECK: Cost of 0 for VF 4: EMIT branch-on-cond vp<%cmp.n>
; CHECK: Cost of 1 for VF 4: induction instruction %iv.next = add nuw nsw i64 %iv, 1
; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
-; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32
;
entry:
br label %loop.header
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index e11b1ad7f09dc..13e8b86e02a73 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -166,7 +166,6 @@ attributes #0 = { "target-cpu"="knl" }
; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0
; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, ptr {{%.*}}, align 1
; CHECK: LV: Found not uniform due to requiring predication: {{%.*}} = load i32, ptr {{%.*}}, align 1
-; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], ptr @a, i32 0, i32 {{%.*}}
;
;
@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1
@@ -175,16 +174,34 @@ attributes #0 = { "target-cpu"="knl" }
define void @PR40816() #1 {
; CHECK-LABEL: define void @PR40816(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: store i32 [[TMP0]], ptr @b, align 1
-; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 2
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[TMP0]], 1
-; CHECK-NEXT: br i1 [[CMP2]], label %[[RETURN:.*]], label %[[FOR_BODY]]
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: store i32 0, ptr @b, align 1
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; CHECK: [[PRED_STORE_IF1]]:
+; CHECK-NEXT: store i32 1, ptr @b, align 1
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; CHECK: [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[RETURN:.*]]
+; CHECK: [[PRED_STORE_IF3]]:
+; CHECK-NEXT: store i32 2, ptr @b, align 1
+; CHECK-NEXT: br label %[[RETURN]]
; CHECK: [[RETURN]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br [[RETURN1:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
; FORCE-LABEL: define void @PR40816(
; FORCE-SAME: ) #[[ATTR1:[0-9]+]] {
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 0ba885d7811b2..9b77bad2dc4e6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -334,7 +334,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
; CHECK-NEXT: [[UMIN7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN7]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 28
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 14
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1)
@@ -853,54 +853,54 @@ define i32 @g(i64 %n) {
; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[N]], 4294967295
; CHECK-NEXT: br i1 [[TMP2]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP1]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP1]], 32
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 32
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
-; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[STEP_ADD_2]] to <4 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i32> [[STEP_ADD_3]] to <4 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 2)
-; CHECK-NEXT: [[TMP15]] = or <4 x i32> [[TMP11]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP16]] = or <4 x i32> [[TMP12]], [[VEC_PHI2]]
-; CHECK-NEXT: [[TMP17]] = or <4 x i32> [[TMP13]], [[VEC_PHI3]]
-; CHECK-NEXT: [[TMP18]] = or <4 x i32> [[TMP14]], [[VEC_PHI4]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i32> [[VEC_IND]], splat (i32 8)
+; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <8 x i32> [[STEP_ADD]], splat (i32 8)
+; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <8 x i32> [[STEP_ADD_2]], splat (i32 8)
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[VEC_IND]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[STEP_ADD]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[STEP_ADD_2]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i32> [[STEP_ADD_3]] to <8 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP9]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> zeroinitializer, <8 x i32> splat (i32 2)
+; CHECK-NEXT: [[TMP15]] = or <8 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP16]] = or <8 x i32> [[TMP12]], [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP17]] = or <8 x i32> [[TMP13]], [[VEC_PHI3]]
+; CHECK-NEXT: [[TMP18]] = or <8 x i32> [[TMP14]], [[VEC_PHI4]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[STEP_ADD_3]], splat (i32 8)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[BIN_RDX5:%.*]] = or <4 x i32> [[TMP17]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX6:%.*]] = or <4 x i32> [[TMP18]], [[BIN_RDX5]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[BIN_RDX6]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[BIN_RDX5:%.*]] = or <8 x i32> [[TMP17]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = or <8 x i32> [[TMP18]], [[BIN_RDX5]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[BIN_RDX6]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF26:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -924,7 +924,7 @@ define i32 @g(i64 %n) {
; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i32 [[INDEX9]], 4
; CHECK-NEXT: [[VEC_IND_NEXT11]] = add <4 x i32> [[VEC_IND10]], splat (i32 4)
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT15]], [[N_VEC8]]
-; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP25]])
; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC8]]
@@ -941,7 +941,7 @@ define i32 @g(i64 %n) {
; CHECK-NEXT: [[SELECT_I:%.*]] = select i1 [[EXITCOND]], i32 0, i32 2
; CHECK-NEXT: [[SELECT_NEXT]] = or i32 [[SELECT_I]], [[SELECT]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: [[SELECT_NEXT_LCSSA:%.*]] = phi i32 [ [[SELECT_NEXT]], [[LOOP]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[SELECT_NEXT_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
index de6418066dea0..3c75133f69c90 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
@@ -27,23 +27,30 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP10]], 1
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -7
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP28]], align 1
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i8> [[WIDE_LOAD]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP29]] = xor <8 x i8> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -3
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP26]], i32 -4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 -3
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD4]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP16]] = xor <4 x i8> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP17]] = xor <4 x i8> [[REVERSE5]], [[VEC_PHI3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP10]], 8
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP31:%.*]] = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> [[TMP29]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <4 x i8> [[TMP17]], [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> [[BIN_RDX]])
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP31]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
index 3922796a1a4b8..0ffddc2429727 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
@@ -24,7 +24,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 8)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 9)
; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
@@ -54,7 +54,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
; CHECK-NEXT: br label [[LOOP_LATCH]]
; CHECK: loop.latch:
; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
-; CHECK-NEXT: [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90
+; CHECK-NEXT: [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 89
; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER]], !prof [[PROF6:![0-9]+]]
; CHECK: bb6:
; CHECK-NEXT: ret void
@@ -76,7 +76,7 @@ bb18: ; preds = %loop.header
loop.latch: ; preds = %bb18, %loop.header
%iv.next = add nsw i64 %iv, -1
- %icmp22 = icmp eq i64 %iv.next, 90
+ %icmp22 = icmp eq i64 %iv.next, 89
br i1 %icmp22, label %bb6, label %loop.header, !prof !22
bb6:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
index 2cda2533e80e0..d905d925dc834 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -30,8 +30,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: br
; CHECK: Cost of 1 for VF 2: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; CHECK: Cost of 1 for VF 2: induction instruction %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i32 %lftr.wideiv, %n
-; CHECK: Cost of 0 for VF 2: exit condition instruction %lftr.wideiv = trunc i64 %indvars.iv.next to i32
; CHECK: Cost of 0 for VF 2: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 1 for VF 2: WIDEN-REDUCTION-PHI ir<%sum.013> = phi vp<{{.+}}>, vp<[[EXT:%.+]]>
; CHECK: Cost of 0 for VF 2: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
@@ -49,7 +47,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<[[TRUNC:%.+]]> = trunc ir<%add5> to i8
; CHECK: Cost of 0 for VF 2: WIDEN-CAST vp<[[EXT]]> = zext vp<[[TRUNC]]> to i32
; CHECK: Cost of 0 for VF 2: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<{{.+}}>
-; CHECK: Cost of 0 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
+; CHECK: Cost of 1 for VF 2: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
;
define i8 @reduction_i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) {
entry: