From 939ecd21ed2c977d872f27f76e86b65b4c9bb64b Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Thu, 16 Oct 2025 10:12:49 +0000 Subject: [PATCH 1/3] [AArch64][CostModel] Reduce cost of wider than legal get.active.lane.mask getIntrinsicInstrCost should halve the cost returned by getTypeLegalizationCost when the return type requires splitting, but we know that the whilelo (predicate pair) instruction can be used. --- .../AArch64/AArch64TargetTransformInfo.cpp | 24 ++++-- .../CostModel/AArch64/sve-intrinsics.ll | 77 +++++++++++++------ 2 files changed, 73 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 479e34515fc8a..c0e9b24486b6f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -957,10 +957,24 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return TyL.first + ExtraCost; } case Intrinsic::get_active_lane_mask: { - auto *RetTy = dyn_cast(ICA.getReturnType()); - if (RetTy) { - EVT RetVT = getTLI()->getValueType(DL, RetTy); - EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); + auto RetTy = cast(ICA.getReturnType()); + EVT RetVT = getTLI()->getValueType(DL, RetTy); + EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); + if (RetTy->isScalableTy()) { + // When SVE2p1 or SME2 is available, get_active_lane_mask will lower + // to the sve_whilelo_x2 intrinsic which returns a predicate pair. + // This means we can halve getTypeLegalizationCost, since the + // predicate pair intrinsic will split the result, e.g. + // nxv32i1 = get_active_lane_mask(base, idx) -> + // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx) + if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) || + (!ST->hasSVE2p1() && !ST->hasSME2()) || + TLI->getTypeAction(RetTy->getContext(), RetVT) != + TargetLowering::TypeSplitVector) + break; + auto LT = getTypeLegalizationCost(RetTy); + return LT.first / 2; + } else { if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) && !getTLI()->isTypeLegal(RetVT)) { // We don't have enough context at this point to determine if the mask @@ -972,7 +986,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, // NOTE: getScalarizationOverhead returns a cost that's far too // pessimistic for the actual generated codegen. In reality there are // two instructions generated per lane. - return RetTy->getNumElements() * 2; + return cast(RetTy)->getNumElements() * 2; } } break; diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll index 0976a108cfb2c..a3992b017f3c4 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -passes="print" -cost-kind=all 2>&1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-1 -; RUN: opt < %s -passes="print" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-VSCALE-2 +; RUN: opt < %s -passes="print" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefixes=CHECK-VSCALE-2,CHECK-SVE +; RUN: opt < %s -passes="print" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve2p1 | FileCheck %s --check-prefixes=CHECK-VSCALE-2,CHECK-SVE2p1-OR-SME2 +; RUN: opt < %s -passes="print" -cost-kind=all 2>&1 -mcpu=neoverse-v1 -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sme2 | FileCheck %s --check-prefixes=CHECK-VSCALE-2,CHECK-SVE2p1-OR-SME2 ; RUN: opt < %s -passes="print" -cost-kind=all 2>&1 -intrinsic-cost-strategy=type-based-intrinsic-cost -disable-output -S -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s --check-prefix=TYPE_BASED_ONLY target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -920,6 +922,7 @@ define void @get_lane_mask() #0 { ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison) @@ -934,28 +937,53 @@ define void @get_lane_mask() #0 { ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; -; CHECK-VSCALE-2-LABEL: 'get_lane_mask' -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; CHECK-SVE-LABEL: 'get_lane_mask' +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; +; CHECK-SVE2p1-OR-SME2-LABEL: 'get_lane_mask' +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i64 = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i64 = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i64 = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i32 = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 poison, i32 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 2 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 poison, i32 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 poison, i32 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 poison, i32 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 poison, i32 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 poison, i16 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; TYPE_BASED_ONLY-LABEL: 'get_lane_mask' ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i64 = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 poison, i64 poison) @@ -966,6 +994,7 @@ define void @get_lane_mask() #0 { ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison) @@ -990,6 +1019,7 @@ define void @get_lane_mask() #0 { %mask_nxv4i1_i32 = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison) %mask_nxv2i1_i32 = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison) + %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) %mask_nxv16i1_i16 = call @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison) @@ -1416,6 +1446,7 @@ declare @llvm.get.active.lane.mask.nxv16i1.i32(i32, i32) declare @llvm.get.active.lane.mask.nxv8i1.i32(i32, i32) declare @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32) declare @llvm.get.active.lane.mask.nxv2i1.i32(i32, i32) +declare @llvm.get.active.lane.mask.nxv64i1.i64(i64, i64) declare @llvm.get.active.lane.mask.nxv32i1.i64(i64, i64) declare @llvm.get.active.lane.mask.nxv16i1.i16(i16, i16) declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64, i64) From 1ebafe4d4efca8f8475f314545c053577645d18a Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Tue, 21 Oct 2025 12:57:46 +0000 Subject: [PATCH 2/3] - Add variable for shouldExpandGetActiveLaneMask - Include cost of saturating add if whilelo requires splitting more than once --- .../AArch64/AArch64TargetTransformInfo.cpp | 60 +++++++++++-------- .../CostModel/AArch64/sve-intrinsics.ll | 14 ++--- 2 files changed, 43 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index c0e9b24486b6f..81d3153911ab4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -960,34 +960,46 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, auto RetTy = cast(ICA.getReturnType()); EVT RetVT = getTLI()->getValueType(DL, RetTy); EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); + bool ShouldExpand = getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT); if (RetTy->isScalableTy()) { - // When SVE2p1 or SME2 is available, get_active_lane_mask will lower - // to the sve_whilelo_x2 intrinsic which returns a predicate pair. - // This means we can halve getTypeLegalizationCost, since the - // predicate pair intrinsic will split the result, e.g. - // nxv32i1 = get_active_lane_mask(base, idx) -> - // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx) - if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) || - (!ST->hasSVE2p1() && !ST->hasSME2()) || - TLI->getTypeAction(RetTy->getContext(), RetVT) != - TargetLowering::TypeSplitVector) + if (TLI->getTypeAction(RetTy->getContext(), RetVT) != + TargetLowering::TypeSplitVector) break; + auto LT = getTypeLegalizationCost(RetTy); - return LT.first / 2; - } else { - if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) && - !getTLI()->isTypeLegal(RetVT)) { - // We don't have enough context at this point to determine if the mask - // is going to be kept live after the block, which will force the vXi1 - // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. - // For now, we just assume the vectorizer created this intrinsic and - // the result will be the input for a PHI. In this case the cost will - // be extremely high for fixed-width vectors. - // NOTE: getScalarizationOverhead returns a cost that's far too - // pessimistic for the actual generated codegen. In reality there are - // two instructions generated per lane. - return cast(RetTy)->getNumElements() * 2; + InstructionCost Cost = LT.first; + + // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost + // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g. + // nxv32i1 = get_active_lane_mask(base, idx) -> + // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx) + if (!ShouldExpand && (ST->hasSVE2p1() || ST->hasSME2())) { + Cost /= 2; + if (Cost == 1) + return Cost; } + + // If more than one whilelo intrinsic is required, include the extra cost + // required by the saturating add & select required to increment the + // start value after the first intrinsic call. + Type *OpTy = ICA.getArgTypes()[0]; + IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy}); + InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind); + Type *CondTy = OpTy->getWithNewBitWidth(1); + SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy, + CmpInst::ICMP_UGT, CostKind); + return Cost + (SplitCost * (Cost - 1)); + } else if (!ShouldExpand && !getTLI()->isTypeLegal(RetVT)) { + // We don't have enough context at this point to determine if the mask + // is going to be kept live after the block, which will force the vXi1 + // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. + // For now, we just assume the vectorizer created this intrinsic and + // the result will be the input for a PHI. In this case the cost will + // be extremely high for fixed-width vectors. + // NOTE: getScalarizationOverhead returns a cost that's far too + // pessimistic for the actual generated codegen. In reality there are + // two instructions generated per lane. + return cast(RetTy)->getNumElements() * 2; } break; } diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll index a3992b017f3c4..b692cfbcacf4d 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -922,8 +922,8 @@ define void @get_lane_mask() #0 { ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 13 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 5 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison) @@ -946,8 +946,8 @@ define void @get_lane_mask() #0 { ; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison) ; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison) ; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison) -; CHECK-SVE-NEXT: Cost Model: Found costs of 4 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) -; CHECK-SVE-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 13 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) +; CHECK-SVE-NEXT: Cost Model: Found costs of 5 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) ; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison) ; CHECK-SVE-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison) ; CHECK-SVE-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison) @@ -970,7 +970,7 @@ define void @get_lane_mask() #0 { ; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison) ; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison) ; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison) -; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 2 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) +; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 5 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) ; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) ; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison) ; CHECK-SVE2p1-OR-SME2-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison) @@ -994,8 +994,8 @@ define void @get_lane_mask() #0 { ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv8i1_i32 = call @llvm.get.active.lane.mask.nxv8i1.i32(i32 poison, i32 poison) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv4i1_i32 = call @llvm.get.active.lane.mask.nxv4i1.i32(i32 poison, i32 poison) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv2i1_i32 = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 poison, i32 poison) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 2 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 13 for: %mask_nxv64i1_i64 = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 poison, i64 poison) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 5 for: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 poison, i64 poison) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %mask_nxv16i1_i16 = call @llvm.get.active.lane.mask.nxv16i1.i16(i16 poison, i16 poison) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 32 for: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 poison, i64 poison) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 poison, i64 poison) From 77a5519450845eda91671bef89e6b86e89e5d5fa Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Thu, 23 Oct 2025 09:38:22 +0000 Subject: [PATCH 3/3] - Check shouldExpandGetActiveLaneMask earlier --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 81d3153911ab4..fce8f12612855 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -960,7 +960,9 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, auto RetTy = cast(ICA.getReturnType()); EVT RetVT = getTLI()->getValueType(DL, RetTy); EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); - bool ShouldExpand = getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT); + if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT)) + break; + if (RetTy->isScalableTy()) { if (TLI->getTypeAction(RetTy->getContext(), RetVT) != TargetLowering::TypeSplitVector) @@ -968,12 +970,11 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, auto LT = getTypeLegalizationCost(RetTy); InstructionCost Cost = LT.first; - // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g. // nxv32i1 = get_active_lane_mask(base, idx) -> // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx) - if (!ShouldExpand && (ST->hasSVE2p1() || ST->hasSME2())) { + if (ST->hasSVE2p1() || ST->hasSME2()) { Cost /= 2; if (Cost == 1) return Cost; @@ -989,7 +990,7 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy, CmpInst::ICMP_UGT, CostKind); return Cost + (SplitCost * (Cost - 1)); - } else if (!ShouldExpand && !getTLI()->isTypeLegal(RetVT)) { + } else if (!getTLI()->isTypeLegal(RetVT)) { // We don't have enough context at this point to determine if the mask // is going to be kept live after the block, which will force the vXi1 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.