diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4edc004f161a1..c07663ad9670c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9329,13 +9329,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
   // one exists.
   TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
 
-  // If the expected trip count is less than the VF, the vector loop will only
-  // execute a single iteration. Then the middle block is executed the same
-  // number of times as the vector region.
-  // TODO: Extend logic to always account for the cost of the middle block.
-  auto ExpectedTC = getSmallBestKnownTC(PSE, L);
-  if (ExpectedTC && ElementCount::isKnownLE(*ExpectedTC, VF.Width))
-    TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
+  TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
 
   // When interleaving only scalar and vector cost will be equal, which in turn
   // would lead to a divide by 0. Fall back to hard threshold.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 320baeb454d46..4ad098d748568 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1385,12 +1385,16 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
-  // Look through ExtractLastLane (BuildVector ....).
-  if (match(Def, m_ExtractLastLane(m_BuildVector()))) {
-    auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
-    Def->replaceAllUsesWith(
-        BuildVector->getOperand(BuildVector->getNumOperands() - 1));
-    return;
+  // Look through ExtractLastLane.
+  if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
+    if (match(A, m_BuildVector())) {
+      auto *BuildVector = cast<VPInstruction>(A);
+      Def->replaceAllUsesWith(
+          BuildVector->getOperand(BuildVector->getNumOperands() - 1));
+      return;
+    }
+    if (Plan->hasScalarVFOnly())
+      return Def->replaceAllUsesWith(A);
   }
 
   // Look through ExtractPenultimateElement (BuildVector ....).
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 6fb706ea7d64b..7b4c524712d9a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -371,10 +371,9 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
       continue;
     }
 
-    if (match(&R, m_ExtractLastLaneOfLastPart(m_VPValue(Op0))) ||
-        match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
-      addUniformForAllParts(cast<VPInstruction>(&R));
-      if (Plan.hasScalarVFOnly()) {
+    if (Plan.hasScalarVFOnly()) {
+      if (match(&R, m_ExtractLastPart(m_VPValue(Op0))) ||
+          match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
         auto *I = cast<VPInstruction>(&R);
         bool IsPenultimatePart =
             I->getOpcode() == VPInstruction::ExtractPenultimateElement;
@@ -383,7 +382,10 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
         I->replaceAllUsesWith(getValueForPart(Op0, PartIdx));
         continue;
       }
-      // For vector VF, always extract from the last part.
+    }
+    if (match(&R, m_ExtractLastLaneOfLastPart(m_VPValue(Op0))) ||
+        match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
+      addUniformForAllParts(cast<VPInstruction>(&R));
       R.setOperand(0, getValueForPart(Op0, UF - 1));
       continue;
     }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
index 7ae50a5e4a075..de5870e269b67 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
@@ -96,7 +96,7 @@ define i64 @vectorization_not_profitable_due_to_trunc(ptr dereferenceable(800) %
 ; CHECK-NEXT: Calculating cost of work in exit block vector.early.exit:
 ; CHECK-NEXT: Cost of 1 for VF 1: EMIT vp<%first.active.lane> = first-active-lane ir<%t>
 ; CHECK-NEXT: Cost of 0 for VF 1: EMIT vp<%early.exit.value> = extract-lane vp<%first.active.lane>, ir<%l>
-; CHECK-NEXT: LV: Vectorization is possible but not beneficial.
+; CHECK: LV: Vectorization is possible but not beneficial.
 entry:
   br label %loop.header
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 7b42e565e127d..40db6a53b49e4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -94,7 +94,7 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
 ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
index 611b980999bfe..df1c639911cb0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -8,7 +8,7 @@ define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef reado
 ; CHECK: Calculating cost of runtime checks:
 ; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop.
 ; CHECK: Total cost of runtime checks: 4
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %inner.loop
 
@@ -34,7 +34,7 @@ define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonl
 ; CHECK: Calculating cost of runtime checks:
 ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
 ; CHECK: Total cost of runtime checks: 3
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
 
@@ -71,7 +71,7 @@ define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef rea
 ; CHECK: Calculating cost of runtime checks:
 ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
 ; CHECK: Total cost of runtime checks: 2
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
 
@@ -108,7 +108,7 @@ define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef re
 ; CHECK: Calculating cost of runtime checks:
 ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
 ; CHECK: Total cost of runtime checks: 1
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
 
@@ -145,7 +145,7 @@ define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonl
 ; CHECK: Calculating cost of runtime checks:
 ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
 ; CHECK: Total cost of runtime checks: 2
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
 
@@ -182,7 +182,7 @@ define void @outer_pgo_minus1(ptr nocapture noundef %a, ptr nocapture noundef re
 ; CHECK: Calculating cost of runtime checks:
 ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
 ; CHECK: Total cost of runtime checks: 1
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
 
@@ -219,7 +219,7 @@ define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr n
 ; CHECK: Calculating cost of runtime checks:
 ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
 ; CHECK: Total cost of runtime checks: 2
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:4
 entry:
   br label %outer.loop
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index e338b828d2520..dd6f0fe5f1292 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -16,7 +16,8 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[IDX]]
 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP3]], i32 6)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[UMAX]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index bbd596a772c53..c77afa870e2c1 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -220,7 +220,6 @@ exit:
 ; DBG-EMPTY:
 ; DBG-NEXT: middle.block:
 ; DBG-NEXT: EMIT vp<[[RESUME_1_PART:%.+]]> = extract-last-part vp<[[SCALAR_STEPS]]>
-; DBG-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-last-lane vp<[[RESUME_1_PART]]>
 ; DBG-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
 ; DBG-NEXT: EMIT branch-on-cond vp<[[CMP]]>
 ; DBG-NEXT: Successor(s): ir-bb, scalar.ph
@@ -230,7 +229,7 @@ exit:
 ; DBG-EMPTY:
 ; DBG-NEXT: scalar.ph:
 ; DBG-NEXT: EMIT-SCALAR vp<[[RESUME_IV:%.+]]> = phi [ vp<[[VTC]]>, middle.block ], [ ir<0>, ir-bb ]
-; DBG-NEXT: EMIT-SCALAR vp<[[RESUME_P:%.*]]> = phi [ vp<[[RESUME_1]]>, middle.block ], [ ir<0>, ir-bb ]
+; DBG-NEXT: EMIT-SCALAR vp<[[RESUME_P:%.*]]> = phi [ vp<[[RESUME_1_PART]]>, middle.block ], [ ir<0>, ir-bb ]
 ; DBG-NEXT: Successor(s): ir-bb
 ; DBG-EMPTY:
 ; DBG-NEXT: ir-bb: