25 changes: 24 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -249,6 +249,10 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
"Use predicated EVL instructions for tail folding. If EVL "
"is unsupported, fallback to data-without-lane-mask.")));

cl::opt<bool> llvm::EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));

static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1286,6 +1290,11 @@ class LoopVectorizationCostModel {
return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
}

bool preferPredicatedEpilogue() const {
return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
}

/// Returns the TailFoldingStyle that is best for the current loop.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
if (!ChosenTailFoldingStyle)
@@ -1346,6 +1355,15 @@
return getTailFoldingStyle() != TailFoldingStyle::None;
}

bool useWideActiveLaneMask() const {
if (!EnableWideActiveLaneMask)
return false;

TailFoldingStyle TF = getTailFoldingStyle();
return TF == TailFoldingStyle::DataAndControlFlow ||
TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}

/// Return maximum safe number of elements to be processed per vector
/// iteration, which do not prevent store-load forwarding and are safe with
/// regard to the memory dependencies. Required for EVL-based VPlans to
@@ -4518,7 +4536,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.

if (!CM.isScalarEpilogueAllowed())
// Only interleave tail-folded loops if wide lane masks are requested, as
// the overhead of the extra instructions needed to compute the predicates
// is otherwise unlikely to be worthwhile. If a scalar epilogue is not
// allowed for any other reason, do not interleave.
if (!CM.isScalarEpilogueAllowed() &&
!(CM.preferPredicatedEpilogue() && CM.useWideActiveLaneMask()))
return 1;

if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
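Note: the interleaving change above pairs with the wide-mask pattern checked in the test below. With the flag enabled, an interleaved tail-folded loop computes a single wide @llvm.get.active.lane.mask per iteration and splits it with @llvm.vector.extract, rather than emitting one mask intrinsic per unrolled part. A minimal IR sketch of the pattern for VF = vscale x 16 and UF = 2 follows (the function and value names are illustrative, not from the patch):

define void @wide_mask_sketch(i64 %index, i64 %trip_count) {
entry:
  ; One wide mask covers both interleaved parts.
  %wide.mask = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %trip_count)
  ; Each part takes one half of the wide mask.
  %part0 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %wide.mask, i64 0)
  %part1 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %wide.mask, i64 16)
  ret void
}

declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64)
declare <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1>, i64)

The extracts can lower to cheap predicate-register moves, and targets with SVE2.1 (the test requires +sve2p1) can presumably materialize the wide mask in a single instruction, which is what makes interleaving worthwhile here.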
4 changes: 0 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,10 +40,6 @@
using namespace llvm;
using namespace VPlanPatternMatch;

static cl::opt<bool> EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));

bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
VPlan &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -32,6 +32,7 @@ class VPRecipeBuilder;
struct VFRange;

extern cl::opt<bool> VerifyEachVPlan;
extern cl::opt<bool> EnableWideActiveLaneMask;

struct VPlanTransforms {
/// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra
104 changes: 92 additions & 12 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^middle.block:" --version 4
; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=1 < %s | FileCheck %s -check-prefix CHECK-UF1
; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4
; RUN: opt -S --passes=loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck %s -check-prefix CHECK-TF
; RUN: opt -S --passes=forceattrs,loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-attribute=optsize < %s | FileCheck %s -check-prefix CHECK-UF1

target triple = "aarch64-unknown-linux"

@@ -101,6 +103,49 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src,
; CHECK-UF4-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-UF4: middle.block:
;
; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask(
; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-TF-NEXT: entry:
; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-TF: vector.ph:
; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5
; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]])
; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16)
; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-TF: vector.body:
; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 4
; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]]
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP9]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 16 x i8> poison)
; CHECK-TF-NEXT: [[TMP13:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
; CHECK-TF-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3)
; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 4
; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[TMP17]]
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP15]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP14]], ptr align 1 [[TMP18]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]])
; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP6]])
; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16)
; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[TMP20]], i32 0
; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true
; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-TF: middle.block:
;
entry:
br label %for.body

@@ -222,6 +267,52 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl
; CHECK-UF4-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-UF4: middle.block:
;
; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask_double(
; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-TF-NEXT: entry:
; CHECK-TF-NEXT: [[CMP6:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-TF-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK-TF: for.body.preheader:
; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-TF: vector.ph:
; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-TF: vector.body:
; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 2 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP9]], i64 [[TMP11]]
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 2 x double> poison)
; CHECK-TF-NEXT: [[TMP13:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
; CHECK-TF-NEXT: [[TMP14:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD2]], splat (double 3.000000e+00)
; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1
; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP17]]
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP13]], ptr align 8 [[TMP15]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr align 8 [[TMP18]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]])
; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP20]], i32 0
; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true
; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-TF: middle.block:
;
entry:
%cmp6 = icmp sgt i64 %n, 0
br i1 %cmp6, label %for.body, label %for.end
@@ -243,14 +334,3 @@ for.end:

attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" }

;.
; CHECK-UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK-UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
;.
; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
;.