diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5d3b233ed6b6a..14c6125bb1dbf 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -209,9 +209,10 @@ struct TailFoldingInfo {
   TargetLibraryInfo *TLI;
   LoopVectorizationLegality *LVL;
   InterleavedAccessInfo *IAI;
+  bool UseWideLaneMask;
   TailFoldingInfo(TargetLibraryInfo *TLI, LoopVectorizationLegality *LVL,
-                  InterleavedAccessInfo *IAI)
-      : TLI(TLI), LVL(LVL), IAI(IAI) {}
+                  InterleavedAccessInfo *IAI, bool UseWideLaneMask = false)
+      : TLI(TLI), LVL(LVL), IAI(IAI), UseWideLaneMask(UseWideLaneMask) {}
 };
 
 class TargetTransformInfo;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 479e34515fc8a..742327635c178 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -957,10 +957,18 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     return TyL.first + ExtraCost;
   }
   case Intrinsic::get_active_lane_mask: {
-    auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
-    if (RetTy) {
-      EVT RetVT = getTLI()->getValueType(DL, RetTy);
-      EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    auto RetTy = cast<VectorType>(ICA.getReturnType());
+    EVT RetVT = getTLI()->getValueType(DL, RetTy);
+    EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    if (RetTy->isScalableTy()) {
+      if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) ||
+          (!ST->hasSVE2p1() && !ST->hasSME2()) ||
+          TLI->getTypeAction(RetTy->getContext(), RetVT) !=
+              TargetLowering::TypeSplitVector)
+        break;
+      auto LT = getTypeLegalizationCost(RetTy);
+      return LT.first / 2;
+    } else {
       if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
           !getTLI()->isTypeLegal(RetVT)) {
         // We don't have enough context at this point to determine if the mask
@@ -972,7 +980,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
         // NOTE: getScalarizationOverhead returns a cost that's far too
         // pessimistic for the actual generated codegen. In reality there are
         // two instructions generated per lane.
-        return RetTy->getNumElements() * 2;
+        return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
       }
     }
     break;
@@ -6146,8 +6154,11 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
   if (Required == TailFoldingOpts::Disabled)
     Required |= TailFoldingOpts::Simple;
 
-  if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
-                                      Required))
+  TailFoldingOpts DefaultOpts = ST->getSVETailFoldingDefaultOpts();
+  if (TFI->UseWideLaneMask)
+    DefaultOpts |= TailFoldingOpts::Simple;
+
+  if (!TailFoldingOptionLoc.satisfies(DefaultOpts, Required))
     return false;
 
   // Don't tail-fold for tight loops where we would be better off interleaving
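A quick way to see the new scalable costing in isolation is the hypothetical reproducer below (the function name and the suggested invocation are mine, mirroring the cost-model RUN lines further down; it is not part of the patch). An <vscale x 32 x i1> mask is legalized by splitting into two <vscale x 16 x i1> parts; with +sve2p1 or +sme2 that pair can presumably come from a single predicate-pair WHILE instruction, so the hook returns LT.first / 2 = 1, matching the new CHECK-SVE2p1-OR-SME2 lines, while plain +sve keeps the old cost of 2.

; Hypothetical reproducer, check with e.g.:
;   opt -passes="print<cost-model>" -cost-kind=all -disable-output \
;       -mtriple=aarch64--linux-gnu -mattr=+sve2p1 wide-mask.ll
define <vscale x 32 x i1> @wide_mask_cost(i64 %i, i64 %n) {
  %mask = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %i, i64 %n)
  ret <vscale x 32 x i1> %mask
}
declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64)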
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b96d29e635465..5b487586169e6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -249,6 +249,10 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
         "Use predicated EVL instructions for tail folding. If EVL "
         "is unsupported, fallback to data-without-lane-mask.")));
 
+cl::opt<bool> llvm::EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide get active lane mask instructions"));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1346,6 +1350,15 @@ class LoopVectorizationCostModel {
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
+  bool useWideActiveLaneMask() const {
+    if (!EnableWideActiveLaneMask)
+      return false;
+
+    TailFoldingStyle TF = getTailFoldingStyle();
+    return TF == TailFoldingStyle::DataAndControlFlow ||
+           TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+  }
+
   /// Return maximum safe number of elements to be processed per vector
   /// iteration, which do not prevent store-load forwarding and are safe with
   /// regard to the memory dependencies. Required for EVL-based VPlans to
@@ -4518,7 +4531,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
 
   // 3. We don't interleave if we think that we will spill registers to memory
   // due to the increased register pressure.
-  if (!CM.isScalarEpilogueAllowed())
+  if (!CM.isScalarEpilogueAllowed() && !CM.useWideActiveLaneMask())
     return 1;
 
   if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
@@ -8995,7 +9008,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
   };
 
   // 4) if the TTI hook indicates this is profitable, request predication.
-  TailFoldingInfo TFI(TLI, &LVL, IAI);
+  TailFoldingInfo TFI(TLI, &LVL, IAI, EnableWideActiveLaneMask);
   if (TTI->preferPredicateOverEpilogue(&TFI))
     return CM_ScalarEpilogueNotNeededUsePredicate;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 40b7e8df7aec9..d8a20958b8718 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,10 +40,6 @@
 using namespace llvm;
 using namespace VPlanPatternMatch;
 
-static cl::opt<bool> EnableWideActiveLaneMask(
-    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
-    cl::desc("Enable use of wide get active lane mask instructions"));
-
 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
     VPlan &Plan,
     function_ref<const InductionDescriptor *(PHINode *)>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5a8a2bbc2975e..50eefbdcee14a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -32,6 +32,7 @@ class VPRecipeBuilder;
 struct VFRange;
 
 extern cl::opt<bool> VerifyEachVPlan;
+extern cl::opt<bool> EnableWideActiveLaneMask;
 
 struct VPlanTransforms {
   /// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra
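For context, the kind of input the flag targets is a simple tail-folded loop like the sketch below (illustrative only; the function name is invented and the body mirrors the new test further down). Fed through the vectorizer as in the new RUN lines, e.g. opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4, the interleaved parts share one wide get.active.lane.mask that is split with llvm.vector.extract instead of computing one mask per part.

; Illustrative input (not part of the patch).
target triple = "aarch64-unknown-linux"

define void @mul_by_3(ptr noalias %dst, ptr readonly %src, i64 %n) #0 {
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv
  %val = load i8, ptr %gep.src, align 1
  %mul = mul i8 %val, 3
  %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv
  store i8 %mul, ptr %gep.dst, align 1
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" }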
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 0976a108cfb2c..cb21789ab432d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-1
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-2 --check-prefix=CHECK-SVE
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve2p1 | FileCheck %s --check-prefix=CHECK-VSCALE-2 --check-prefix=CHECK-SVE2p1-OR-SME2
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sme2 | FileCheck %s --check-prefix=CHECK-VSCALE-2 --check-prefix=CHECK-SVE2p1-OR-SME2
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -intrinsic-cost-strategy=type-based-intrinsic-cost -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=TYPE_BASED_ONLY
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -920,6 +922,7 @@ define void @get_lane_mask() #0 {
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
@@ -934,28 +937,53 @@ define void @get_lane_mask() #0 {
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
-; CHECK-VSCALE-2-LABEL: 'get_lane_mask'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-SVE-LABEL: 'get_lane_mask'
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-SVE2p1-OR-SME2-LABEL: 'get_lane_mask'
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 2 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison)
+; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'get_lane_mask'
 ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison)
@@ -966,6 +994,7 @@ define void @get_lane_mask() #0 {
 ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison)
 ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
 ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
 ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
 ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
 ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison)
@@ -990,6 +1019,7 @@ define void @get_lane_mask() #0 {
   %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison)
   %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison)
 
+  %mask_nxv64i1_i64 = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison)
   %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison)
   %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison)
 
@@ -1416,6 +1446,7 @@ declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32, i32)
 declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32, i32)
 declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32)
 declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32, i32)
+declare <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64, i64)
 declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64)
 declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16, i16)
 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64, i64)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
index 75acbea978410..a6846b93578ee 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4
 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=1 < %s | FileCheck %s -check-prefix CHECK-UF1
 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4
+; RUN: opt -S --passes=loop-vectorize -enable-wide-lane-mask -sve-tail-folding-insn-threshold=0 < %s | FileCheck %s -check-prefix CHECK-TF-0
 
 target triple = "aarch64-unknown-linux"
 
@@ -101,6 +102,49 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src,
 ; CHECK-UF4-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-UF4: middle.block:
 ;
+; CHECK-TF-0-LABEL: define void @scalable_wide_active_lane_mask(
+; CHECK-TF-0-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-TF-0-NEXT: entry:
+; CHECK-TF-0-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK-TF-0: vector.ph:
+; CHECK-TF-0-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-0-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
+; CHECK-TF-0-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-0-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5
+; CHECK-TF-0-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-TF-0-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
+; CHECK-TF-0-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
+; CHECK-TF-0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-0-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16)
+; CHECK-TF-0-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-TF-0-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-TF-0: vector.body:
+; CHECK-TF-0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-0-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-0-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 16 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-0-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-TF-0-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-0-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 4
+; CHECK-TF-0-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]]
+; CHECK-TF-0-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP9]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 16 x i8> poison)
+; CHECK-TF-0-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 16 x i8> poison)
+; CHECK-TF-0-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3)
+; CHECK-TF-0-NEXT: [[TMP23:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD3]], splat (i8 3)
+; CHECK-TF-0-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-TF-0-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-0-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 4
+; CHECK-TF-0-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[TMP17]]
+; CHECK-TF-0-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP14]], ptr [[TMP15]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-TF-0-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-TF-0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-TF-0-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP6]])
+; CHECK-TF-0-NEXT: [[TMP20]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16)
+; CHECK-TF-0-NEXT: [[TMP19]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-TF-0-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[TMP19]], i32 0
+; CHECK-TF-0-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true
+; CHECK-TF-0-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-TF-0: middle.block:
+;
 entry:
   br label %for.body
 
@@ -222,6 +266,52 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl
 ; CHECK-UF4-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-UF4: middle.block:
 ;
+; CHECK-TF-0-LABEL: define void @scalable_wide_active_lane_mask_double(
+; CHECK-TF-0-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-TF-0-NEXT: entry:
+; CHECK-TF-0-NEXT: [[CMP6:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-TF-0-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK-TF-0: for.body.preheader:
+; CHECK-TF-0-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK-TF-0: vector.ph:
+; CHECK-TF-0-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-0-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-TF-0-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-0-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; CHECK-TF-0-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-TF-0-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
+; CHECK-TF-0-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
+; CHECK-TF-0-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-0-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-TF-0-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-TF-0-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-TF-0: vector.body:
+; CHECK-TF-0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-0-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 2 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-0-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-0-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-TF-0-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-0-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
+; CHECK-TF-0-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP9]], i64 [[TMP11]]
+; CHECK-TF-0-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 2 x double> poison)
+; CHECK-TF-0-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison)
+; CHECK-TF-0-NEXT: [[TMP14:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD2]], splat (double 3.000000e+00)
+; CHECK-TF-0-NEXT: [[TMP23:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD3]], splat (double 3.000000e+00)
+; CHECK-TF-0-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-TF-0-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-0-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1
+; CHECK-TF-0-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP17]]
+; CHECK-TF-0-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-TF-0-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP23]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
+; CHECK-TF-0-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-TF-0-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
+; CHECK-TF-0-NEXT: [[TMP20]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-TF-0-NEXT: [[TMP19]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-TF-0-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP19]], i32 0
+; CHECK-TF-0-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true
+; CHECK-TF-0-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-TF-0: middle.block:
+;
 entry:
   %cmp6 = icmp sgt i64 %n, 0
   br i1 %cmp6, label %for.body, label %for.end
@@ -254,3 +344,8 @@ attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" }
 ; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ;.
+; CHECK-TF-0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-TF-0: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-TF-0: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-TF-0: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
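Condensing the CHECK-TF-0 lines above, the essential shape of the tail-folded vector body with this patch is one wide lane mask per iteration, split into per-part predicates with llvm.vector.extract rather than one get.active.lane.mask per part (value names below are simplified from the generated output):

; Condensed illustration only, not a test:
  %wide.mask = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %limit)
  %mask.part0 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %wide.mask, i64 0)
  %mask.part1 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> %wide.mask, i64 16)
; each extracted part then predicates its own masked load/store, as checked above.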