25 changes: 24 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -249,6 +249,10 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
"Use predicated EVL instructions for tail folding. If EVL "
"is unsupported, fallback to data-without-lane-mask.")));

cl::opt<bool> llvm::EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));

static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1286,6 +1290,11 @@ class LoopVectorizationCostModel {
return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
}

bool preferPredicatedEpilogue() const {
return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
}

/// Returns the TailFoldingStyle that is best for the current loop.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
if (!ChosenTailFoldingStyle)
@@ -1346,6 +1355,15 @@
return getTailFoldingStyle() != TailFoldingStyle::None;
}

bool useWideActiveLaneMask() const {
if (!EnableWideActiveLaneMask)
return false;

TailFoldingStyle TF = getTailFoldingStyle();
return TF == TailFoldingStyle::DataAndControlFlow ||
TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}

/// Return maximum safe number of elements to be processed per vector
/// iteration, which do not prevent store-load forwarding and are safe with
/// regard to the memory dependencies. Required for EVL-based VPlans to
@@ -4518,7 +4536,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.

if (!CM.isScalarEpilogueAllowed())
// Only interleave tail-folded loops if wide lane masks are requested, as
// the overhead of the extra instructions needed to compute the predicates
// is otherwise unlikely to be worthwhile. If a scalar epilogue is not
// allowed for any other reason, do not interleave.
if (!CM.isScalarEpilogueAllowed() &&
!(CM.preferPredicatedEpilogue() && CM.useWideActiveLaneMask()))
return 1;

if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
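Note: the interleaving change above pairs with the wide-mask pattern checked in the test below. With the flag enabled, an interleaved tail-folded loop computes a single wide @llvm.get.active.lane.mask per iteration and splits it with @llvm.vector.extract, rather than emitting one mask intrinsic per unrolled part. A minimal IR sketch of the pattern for VF = vscale x 16 and UF = 2 follows (the function and value names are illustrative, not from the patch):

define void @wide_mask_sketch(i64 %index, i64 %trip_count) {
entry:
  ; One wide mask covers both interleaved parts.
  %wide.mask = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %trip_count)
  ; Each part takes one half of the wide mask.
  %part0 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %wide.mask, i64 0)
  %part1 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %wide.mask, i64 16)
  ret void
}

declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64)
declare <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1>, i64)

The extracts can lower to cheap predicate-register moves, and targets with SVE2.1 (the test requires +sve2p1) can presumably materialize the wide mask in a single instruction, which is what makes interleaving worthwhile here.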
4 changes: 0 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,10 +40,6 @@
using namespace llvm;
using namespace VPlanPatternMatch;

static cl::opt<bool> EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));

bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
VPlan &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -32,6 +32,7 @@ class VPRecipeBuilder;
struct VFRange;

extern cl::opt<bool> VerifyEachVPlan;
extern cl::opt<bool> EnableWideActiveLaneMask;

struct VPlanTransforms {
/// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra
104 changes: 92 additions & 12 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^middle.block:" --version 4
; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=1 < %s | FileCheck %s -check-prefix CHECK-UF1
; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4
; RUN: opt -S --passes=loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck %s -check-prefix CHECK-TF
; RUN: opt -S --passes=forceattrs,loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-attribute=optsize < %s | FileCheck %s -check-prefix CHECK-UF1

target triple = "aarch64-unknown-linux"

@@ -101,6 +103,49 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src,
; CHECK-UF4-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-UF4: middle.block:
;
; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask(
; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-TF-NEXT: entry:
; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-TF: vector.ph:
; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5
; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]])
; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16)
; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-TF: vector.body:
; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 4
; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]]
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP9]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 16 x i8> poison)
; CHECK-TF-NEXT: [[TMP13:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
; CHECK-TF-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3)
; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 4
; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[TMP17]]
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP15]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP14]], ptr align 1 [[TMP18]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]])
; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP6]])
; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16)
; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[TMP20]], i32 0
; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true
; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-TF: middle.block:
;
entry:
br label %for.body

@@ -222,6 +267,52 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl
; CHECK-UF4-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-UF4: middle.block:
;
; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask_double(
; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-TF-NEXT: entry:
; CHECK-TF-NEXT: [[CMP6:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-TF-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK-TF: for.body.preheader:
; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-TF: vector.ph:
; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-TF: vector.body:
; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 2 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP9]], i64 [[TMP11]]
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 2 x double> poison)
; CHECK-TF-NEXT: [[TMP13:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
; CHECK-TF-NEXT: [[TMP14:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD2]], splat (double 3.000000e+00)
; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1
; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP17]]
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP13]], ptr align 8 [[TMP15]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr align 8 [[TMP18]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]])
; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP20]], i32 0
; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true
; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-TF: middle.block:
;
entry:
%cmp6 = icmp sgt i64 %n, 0
br i1 %cmp6, label %for.body, label %for.end
@@ -243,14 +334,3 @@ for.end:

attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" }

;.
; CHECK-UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK-UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
;.
; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
;.