From 1e5f44eb7ee71c285bf803106307a05cdd2b4b48 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 14 Nov 2025 00:06:41 +0800 Subject: [PATCH 1/7] [VPlan] Fix LastActiveLane assertion on scalar VF For a scalar-only VPlan with tail folding, if it has a phi live-out then legalizeAndOptimizeInductions will scalarize the widened canonical IV feeding into the header mask: vector loop: { vector.body: EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next> vp<%5> = SCALAR-STEPS vp<%4>, ir<1>, vp<%0> EMIT vp<%6> = icmp ule vp<%5>, vp<%3> EMIT vp<%index.next> = add nuw vp<%4>, vp<%1> EMIT branch-on-count vp<%index.next>, vp<%2> No successors } Successor(s): middle.block middle.block: EMIT vp<%8> = last-active-lane vp<%6> EMIT vp<%9> = extract-lane vp<%8>, vp<%5> Successor(s): ir-bb The verifier complains about this pattern, but it should still generate the correct last active lane, so this patch fixes the assert by handling this case in isHeaderMask. A similar pattern is already handled there for ActiveLaneMask, which also expects a VPScalarIVSteps recipe. 
Fixes #167813 --- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 7 +++ .../tail-folding-live-out-scalar-vf.ll | 60 +++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index e22c5dfdb9f38..c9de9b82bca7c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -66,6 +66,13 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { m_One(), m_Specific(&Plan.getVF()))) || IsWideCanonicalIV(A)); + if (match(V, + m_ICmp(m_ScalarIVSteps( + m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()), + m_One(), m_Specific(&Plan.getVF())), + m_Specific(Plan.getBackedgeTakenCount())))) + return true; + return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) && B == Plan.getBackedgeTakenCount(); } diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll new file mode 100644 index 0000000000000..5964cf45fb6be --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -p loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-width=1 -force-vector-interleave=2 -S %s | FileCheck %s + +define i64 @live_out_scalar_vf(i64 %n) { +; CHECK-LABEL: define i64 @live_out_scalar_vf( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 
[[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP1]], [[TRIP_COUNT_MINUS_1]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP2]], [[TRIP_COUNT_MINUS_1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i1 [[TMP4]], false +; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 1, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i1 [[TMP3]], false +; CHECK-NEXT: [[TMP10:%.*]] = zext i1 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 0, [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 [[TMP8]] +; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP13]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[LAST_ACTIVE_LANE]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = icmp uge i64 [[LAST_ACTIVE_LANE]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP2]], i64 [[TMP1]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i64 [[TMP16]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] + br label %latch + +latch: + ; Need to use a phi otherwise the header mask will use a + ; VPWidenCanonicalIVRecipe instead of a VPScalarIVStepsRecipe. 
+ %exitval = phi i64 [ %iv, %loop ] + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %n + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %exitval +} + From 3863112c91c94b8da52037124676882c44d26e2a Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 14 Nov 2025 00:39:22 +0800 Subject: [PATCH 2/7] Remove trivial branch --- .../tail-folding-live-out-scalar-vf.ll | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll index 5964cf45fb6be..21b26c1ddec3d 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll @@ -15,8 +15,9 @@ define i64 @live_out_scalar_vf(i64 %n) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP1]], [[TRIP_COUNT_MINUS_1]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP2]], [[TRIP_COUNT_MINUS_1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 @@ -33,23 +34,23 @@ define i64 @live_out_scalar_vf(i64 %n) { ; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 [[TMP8]] ; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP13]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[LAST_ACTIVE_LANE]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = icmp uge i64 [[LAST_ACTIVE_LANE]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[TMP14]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = icmp uge i64 [[TMP14]], 1 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP2]], i64 [[TMP1]] +; 
CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[LAST_ACTIVE_LANE]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[VECTOR_RECUR]], i64 [[TMP16]] ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: ret i64 [[TMP16]] +; CHECK-NEXT: ret i64 [[TMP19]] ; entry: br label %loop loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] - br label %latch - -latch: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; Need to use a phi otherwise the header mask will use a ; VPWidenCanonicalIVRecipe instead of a VPScalarIVStepsRecipe. - %exitval = phi i64 [ %iv, %loop ] + %exitval = phi i64 [ 0, %entry ], [ %iv, %loop ] %iv.next = add i64 %iv, 1 %ec = icmp eq i64 %iv, %n br i1 %ec, label %exit, label %loop From 23bce03a6741a1ae1a597c678fd7dff6df378fec Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 14 Nov 2025 14:48:15 +0800 Subject: [PATCH 3/7] Add comment and assert --- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index c9de9b82bca7c..476b51d4d0951 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -66,12 +66,16 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { m_One(), m_Specific(&Plan.getVF()))) || IsWideCanonicalIV(A)); + // For scalar plans, the header mask uses the scalar steps. 
if (match(V, m_ICmp(m_ScalarIVSteps( m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()), m_One(), m_Specific(&Plan.getVF())), - m_Specific(Plan.getBackedgeTakenCount())))) + m_Specific(Plan.getBackedgeTakenCount())))) { + assert(Plan.hasScalarVFOnly() && + "Non-scalar VF using scalar IV steps for header mask?"); return true; + } return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) && B == Plan.getBackedgeTakenCount(); From 67bb40220ac5c4046755a0edf7da481ff4ca4bf4 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 14 Nov 2025 14:59:23 +0800 Subject: [PATCH 4/7] Update test after merging --- .../tail-folding-live-out-scalar-vf.ll | 48 ++++++++----------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll index 21b26c1ddec3d..ea1383946102e 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll @@ -4,43 +4,35 @@ define i64 @live_out_scalar_vf(i64 %n) { ; CHECK-LABEL: define i64 @live_out_scalar_vf( ; CHECK-SAME: i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP2]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP1]], [[TRIP_COUNT_MINUS_1]] -; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP2]], [[TRIP_COUNT_MINUS_1]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i1 [[TMP4]], false -; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP6]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 1, [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i1 [[TMP3]], false -; CHECK-NEXT: [[TMP10:%.*]] = zext i1 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 0, [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], 1 -; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i64 [[TMP11]], i64 [[TMP8]] -; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP13]], 1 -; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[LAST_ACTIVE_LANE]], 1 -; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[TMP14]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = icmp uge i64 [[TMP14]], 1 -; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP2]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[LAST_ACTIVE_LANE]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[VECTOR_RECUR]], i64 [[TMP16]] -; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
%[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[EXITVAL:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[IV]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: +; CHECK-NEXT: [[TMP19:%.*]] = phi i64 [ [[EXITVAL]], %[[LOOP]] ], [ [[INDEX]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[TMP19]] ; entry: From efca5e5a1abe71378f0864896a59ab62c2673bb7 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 17 Nov 2025 11:48:18 +0800 Subject: [PATCH 5/7] Add m_CanonicalScalarIVSteps helper --- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 976b094fd4ddb..084e1bc12d5b9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -46,6 +46,12 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { return Expanded; } +static inline auto m_CanonicalScalarIVSteps(const VPlan &Plan) { + return m_ScalarIVSteps( + m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()), m_One(), + m_Specific(&Plan.getVF())); +} + bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { if (isa(V)) return true; @@ -60,18 +66,11 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One()))) return B == Plan.getTripCount() && - (match(A, - m_ScalarIVSteps( - 
m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()), - m_One(), m_Specific(&Plan.getVF()))) || - IsWideCanonicalIV(A)); + (match(A, m_CanonicalScalarIVSteps(Plan)) || IsWideCanonicalIV(A)); // For scalar plans, the header mask uses the scalar steps. - if (match(V, - m_ICmp(m_ScalarIVSteps( - m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()), - m_One(), m_Specific(&Plan.getVF())), - m_Specific(Plan.getBackedgeTakenCount())))) { + if (match(V, m_ICmp(m_CanonicalScalarIVSteps(Plan), + m_Specific(Plan.getBackedgeTakenCount())))) { assert(Plan.hasScalarVFOnly() && "Non-scalar VF using scalar IV steps for header mask?"); return true; From c0a70672c2a9f64b4efcc71ee786eb2234f9261e Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 17 Nov 2025 16:53:03 +0800 Subject: [PATCH 6/7] Move to variable --- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 084e1bc12d5b9..3bc2dfd623777 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -46,12 +46,6 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { return Expanded; } -static inline auto m_CanonicalScalarIVSteps(const VPlan &Plan) { - return m_ScalarIVSteps( - m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()), m_One(), - m_Specific(&Plan.getVF())); -} - bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { if (isa(V)) return true; @@ -64,12 +58,16 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { VPValue *A, *B; + auto m_CanonicalScalarIVSteps = + m_ScalarIVSteps(m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()), + m_One(), m_Specific(&Plan.getVF())); + if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One()))) return B == Plan.getTripCount() && - (match(A, m_CanonicalScalarIVSteps(Plan)) || 
IsWideCanonicalIV(A)); + (match(A, m_CanonicalScalarIVSteps) || IsWideCanonicalIV(A)); // For scalar plans, the header mask uses the scalar steps. - if (match(V, m_ICmp(m_CanonicalScalarIVSteps(Plan), + if (match(V, m_ICmp(m_CanonicalScalarIVSteps, m_Specific(Plan.getBackedgeTakenCount())))) { assert(Plan.hasScalarVFOnly() && "Non-scalar VF using scalar IV steps for header mask?"); From c29428f4759cd6d34c31b0bb0fb552cc74b1d038 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 17 Nov 2025 18:29:55 +0800 Subject: [PATCH 7/7] Move test to tail-folding-vectorization-factor-1.ll Needed to add -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue flag to run lines. Looks like the existing tests weren't necessarily tail folded? --- .../tail-folding-live-out-scalar-vf.ll | 53 ---------------- .../tail-folding-vectorization-factor-1.ll | 60 ++++++++++++++++++- 2 files changed, 58 insertions(+), 55 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll deleted file mode 100644 index ea1383946102e..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-live-out-scalar-vf.ll +++ /dev/null @@ -1,53 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 -; RUN: opt -p loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-width=1 -force-vector-interleave=2 -S %s | FileCheck %s - -define i64 @live_out_scalar_vf(i64 %n) { -; CHECK-LABEL: define i64 @live_out_scalar_vf( -; CHECK-SAME: i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: 
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[EXITVAL:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[IV]], %[[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[TMP19:%.*]] = phi i64 [ [[EXITVAL]], %[[LOOP]] ], [ [[INDEX]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[TMP19]] -; -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - ; Need to use a phi otherwise the header mask will use a - ; VPWidenCanonicalIVRecipe instead of a VPScalarIVStepsRecipe. 
- %exitval = phi i64 [ 0, %entry ], [ %iv, %loop ] - %iv.next = add i64 %iv, 1 - %ec = icmp eq i64 %iv, %n - br i1 %ec, label %exit, label %loop - -exit: - ret i64 %exitval -} - diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll index 3bc5da155b351..277be4666243b 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -pass-remarks='loop-vectorize' -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS -; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -pass-remarks='loop-vectorize' -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s ; These tests are to check that fold-tail procedure produces correct scalar code when ; loop-vectorization is only unrolling but not vectorizing. 
@@ -141,5 +141,61 @@ for.body: %cond = icmp eq ptr %ptr, %ptr2 br i1 %cond, label %for.cond.cleanup, label %for.body } + +define i64 @live_out_scalar_vf(i64 %n) { +; CHECK-LABEL: @live_out_scalar_vf( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[STEP_ADD_3]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[STEP_ADD_3]], i32 2 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; 
CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[EXITVAL:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[TMP19:%.*]] = phi i64 [ [[EXITVAL]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[TMP19]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + ; Need to use a phi otherwise the header mask will use a + ; VPWidenCanonicalIVRecipe instead of a VPScalarIVStepsRecipe. + %exitval = phi i64 [ 0, %entry ], [ %iv, %loop ] + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %n + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %exitval +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-REMARKS: {{.*}}