From b7d6dc27a266c69a20ac833679c818aab69460b7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 7 Dec 2025 22:24:00 +0000 Subject: [PATCH] [LV] Always include middle block cost in isOutsideLoopWorkProfitable. Always include the cost of the middle block in isOutsideLoopWorkProfitable. This addresses the TODO from https://github.com/llvm/llvm-project/pull/168949 and removes the temporary restriction. isOutsideLoopWorkProfitable already scales the cost outside loops according to the expected trip counts. In practice this increases the minimum iteration threshold in a few cases. On a large IR corpus based on C/C++ workloads, ~50 out of 179450 vector loops have their thresholds increased slightly. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 +------- .../LoopVectorize/AArch64/early_exit_costs.ll | 2 +- .../LoopVectorize/AArch64/induction-costs.ll | 2 +- .../AArch64/low_trip_memcheck_cost.ll | 14 +++++++------- .../AArch64/scalable-avoid-scalarization.ll | 3 ++- 5 files changed, 12 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 15d0fa41bd902..e6241d97e4794 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9357,13 +9357,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, // one exists. TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width); - // If the expected trip count is less than the VF, the vector loop will only - // execute a single iteration. Then the middle block is executed the same - // number of times as the vector region. - // TODO: Extend logic to always account for the cost of the middle block. 
- auto ExpectedTC = getSmallBestKnownTC(PSE, L); - if (ExpectedTC && ElementCount::isKnownLE(*ExpectedTC, VF.Width)) - TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx); + TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx); // When interleaving only scalar and vector cost will be equal, which in turn // would lead to a divide by 0. Fall back to hard threshold. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll index 791ef734ec48b..77b584655187c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll @@ -96,7 +96,7 @@ define i64 @vectorization_not_profitable_due_to_trunc(ptr dereferenceable(800) % ; CHECK-NEXT: Calculating cost of work in exit block vector.early.exit: ; CHECK-NEXT: Cost of 1 for VF 1: EMIT vp<%first.active.lane> = first-active-lane ir<%t> ; CHECK-NEXT: Cost of 0 for VF 1: EMIT vp<%early.exit.value> = extract-lane vp<%first.active.lane>, ir<%l> -; CHECK-NEXT: LV: Vectorization is possible but not beneficial. +; CHECK: LV: Vectorization is possible but not beneficial. 
entry: br label %loop.header diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index 7b42e565e127d..40db6a53b49e4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -94,7 +94,7 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) { ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]] ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll index 611b980999bfe..df1c639911cb0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll @@ -8,7 +8,7 @@ define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef reado ; CHECK: Calculating cost of runtime checks: ; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop. ; CHECK: Total cost of runtime checks: 4 -; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +; CHECK: LV: Minimum required TC for runtime checks to be profitable:16 entry: br label %inner.loop @@ -34,7 +34,7 @@ define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonl ; CHECK: Calculating cost of runtime checks: ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 3 ; CHECK: Total cost of runtime checks: 3 -; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +; CHECK: LV: Minimum required TC for runtime checks to be profitable:16 entry: br label %outer.loop @@ -71,7 +71,7 @@ define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef rea ; CHECK: Calculating cost of runtime checks: ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2 ; CHECK: Total cost of runtime checks: 2 -; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +; CHECK: LV: Minimum required TC for runtime checks to be profitable:16 entry: br label %outer.loop @@ -108,7 +108,7 @@ define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef re ; CHECK: Calculating cost of runtime checks: ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1 ; CHECK: Total cost of runtime checks: 1 -; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +; CHECK: LV: Minimum required TC for runtime checks to be profitable:16 entry: br label %outer.loop @@ -145,7 +145,7 @@ define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonl ; CHECK: Calculating cost of runtime checks: ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2 ; CHECK: Total cost of runtime checks: 2 -; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +; CHECK: LV: Minimum required TC for runtime checks to be profitable:16 entry: br label %outer.loop @@ -182,7 +182,7 @@ define void @outer_pgo_minus1(ptr nocapture noundef %a, ptr nocapture noundef re ; CHECK: Calculating cost of runtime checks: ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. 
Cost reduced from 6 to 1 ; CHECK: Total cost of runtime checks: 1 -; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 +; CHECK: LV: Minimum required TC for runtime checks to be profitable:16 entry: br label %outer.loop @@ -219,7 +219,7 @@ define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr n ; CHECK: Calculating cost of runtime checks: ; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2 ; CHECK: Total cost of runtime checks: 2 -; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4 +; CHECK: LV: Minimum required TC for runtime checks to be profitable:4 entry: br label %outer.loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll index e338b828d2520..dd6f0fe5f1292 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll @@ -16,7 +16,8 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[IDX]] ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP3]], i32 6) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[UMAX]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()