From eab11c8a3c2e25ac9d24d507fc5bec1e0d68ad6f Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 11 Nov 2025 16:25:57 +0000 Subject: [PATCH 1/2] [Analysis][AArch64] Add cost model for loop.dependence.{war/raw}.mask This PR adds the cost model for the loop dependence mask intrinsics, both for cases where they must be expanded and when they can be lowered for AArch64. --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 47 ++++++++ .../AArch64/AArch64TargetTransformInfo.cpp | 34 ++++++ .../CostModel/AArch64/loop_dependence_mask.ll | 104 ++++++++++++++++++ 3 files changed, 185 insertions(+) create mode 100644 llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 1c167af4b0478..c1b0dc5db9607 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2190,6 +2190,53 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Otherwise, fallback to default scalarization cost. break; } + case Intrinsic::loop_dependence_raw_mask: + case Intrinsic::loop_dependence_war_mask: { + InstructionCost Cost = 0; + Type *PtrTy = ICA.getArgTypes()[0]; + bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask; + + Cost += + thisT()->getArithmeticInstrCost(Instruction::Sub, PtrTy, CostKind); + if (IsReadAfterWrite) { + IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, PtrTy, {PtrTy}, {}); + Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind); + } + + Cost += + thisT()->getArithmeticInstrCost(Instruction::SDiv, PtrTy, CostKind); + Type *CmpTy = + getTLI() + ->getSetCCResultType( + thisT()->getDataLayout(), RetTy->getContext(), + getTLI()->getValueType(thisT()->getDataLayout(), PtrTy)) + .getTypeForEVT(RetTy->getContext()); + Cost += thisT()->getCmpSelInstrCost( + BinaryOperator::ICmp, CmpTy, PtrTy, + IsReadAfterWrite ? 
CmpInst::ICMP_EQ : CmpInst::ICMP_SLE, CostKind); + + // The deconstructed active lane mask + VectorType *RetTyVec = cast(RetTy); + VectorType *SplatTy = cast(RetTyVec->getWithNewType(PtrTy)); + Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SplatTy, SplatTy, {}, + CostKind, 0, nullptr); + IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, SplatTy, {}, + FMF); + Cost += thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SplatTy, + SplatTy, CmpInst::ICMP_ULT, CostKind); + + Cost += + thisT()->getCastInstrCost(Instruction::CastOps::ZExt, RetTy, SplatTy, + TTI::CastContextHint::None, CostKind); + Cost += thisT()->getCastInstrCost(Instruction::CastOps::ZExt, + RetTyVec->getElementType(), CmpTy, + TTI::CastContextHint::None, CostKind); + Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, RetTyVec, RetTyVec, {}, + CostKind, 0, nullptr); + Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); + return Cost; + } } // Assume that we need to scalarize this intrinsic.) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 197aae6e03cb1..c4bd3c7803c1a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1032,6 +1032,40 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::loop_dependence_raw_mask: + case Intrinsic::loop_dependence_war_mask: { + auto *EltSize = cast(ICA.getArgs()[2]); + EVT VecVT = getTLI()->getValueType(DL, RetTy); + // An invalid element size and return type combination must be expanded. 
+ bool MustBeExpanded = false; + switch (EltSize->getSExtValue()) { + case 1: + if (VecVT != MVT::v16i1 && VecVT != MVT::nxv16i1) + MustBeExpanded = true; + break; + case 2: + if (VecVT != MVT::v8i1 && VecVT != MVT::nxv8i1) + MustBeExpanded = true; + break; + case 4: + if (VecVT != MVT::v4i1 && VecVT != MVT::nxv4i1) + MustBeExpanded = true; + break; + case 8: + if (VecVT != MVT::v2i1 && VecVT != MVT::nxv2i1) + MustBeExpanded = true; + break; + default: + MustBeExpanded = true; + // Other element sizes are incompatible with whilewr/rw, so expand instead + break; + } + + // The whilewr/rw instructions require SVE2 or SME + if (MustBeExpanded || (!ST->hasSVE2() && !ST->hasSME())) + break; + return 1; + } case Intrinsic::experimental_vector_extract_last_active: if (ST->isSVEorStreamingSVEAvailable()) { auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]); diff --git a/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll new file mode 100644 index 0000000000000..1074d41d994d9 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-EXPANDED +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s --check-prefix=CHECK +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sme | FileCheck %s --check-prefix=CHECK + +; loop.dependence.{war,raw}.mask can be lowered to while{wr,rw} if SVE2 or SME is enabled. 
+define void @loop_dependence_war_mask(ptr %a, ptr %b) { +; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask' +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-LABEL: 'loop_dependence_war_mask' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +entry: + %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) + %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) + %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) + %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) + ret void +} + +define void 
@loop_dependence_raw_mask(ptr %a, ptr %b) { +; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask' +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-LABEL: 'loop_dependence_raw_mask' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +entry: + %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) + %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) + %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) + %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) + ret void +} + +; Invalid element size and return type 
combinations must be expanded, even with sve2/sme +define void @loop_dependence_war_mask_invalid(ptr %a, ptr %b) { +; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask_invalid' +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-LABEL: 'loop_dependence_war_mask_invalid' +; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret 
void +; +entry: + %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) + %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) + %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2) + %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1) + %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10) + ret void +} + +define void @loop_dependence_raw_mask_invalid(ptr %a, ptr %b) { +; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask_invalid' +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-LABEL: 'loop_dependence_raw_mask_invalid' +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr 
%b, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +entry: + %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8) + %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4) + %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2) + %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1) + %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10) + ret void +} From 0744053d9bd1135aa790a293f2cfd6f59e5e2a98 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 13 Nov 2025 15:46:05 +0000 Subject: [PATCH 2/2] Address review --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 16 +++++++++ .../SelectionDAG/LegalizeVectorOps.cpp | 16 +++++++++ .../AArch64/AArch64TargetTransformInfo.cpp | 35 +++++-------------- 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index c1b0dc5db9607..56bf0d8e4a48b 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2192,6 +2192,22 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } case Intrinsic::loop_dependence_raw_mask: case Intrinsic::loop_dependence_war_mask: { + // Compute the cost of the expanded version of these intrinsics: + // ; Figure out if there's overlap between the pointers. + // diff = (ptrB - ptrA) / eltSize ; read-after-write will use the + // absolute difference + // cmp = diff <= 0 ; read-after-write will check for equality + // with 0 + // ; Create a mask with each lane < diff active. 
This is essentially + // an active lane mask between 0 and diff. + // diff_splat = splat diff to + // steps = stepvector + // diff_mask = steps <= diff_splat + // ; OR that diff mask with the comparison result, so that each lane is + // active if it's less than diff or there was no overlap in the + // first place. Otherwise the lane is inactive. + // cmp_splat = splat cmp to + // result = or cmp_splat diff_mask InstructionCost Cost = 0; Type *PtrTy = ICA.getArgTypes()[0]; bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index f5a54497c8a98..38c934e27d40f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1816,6 +1816,22 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) { } SDValue VectorLegalizer::ExpandLOOP_DEPENDENCE_MASK(SDNode *N) { + // Expand these intrinsics: + // ; Figure out if there's overlap between the pointers. + // diff = (ptrB - ptrA) / eltSize ; read-after-write will use the absolute + // difference + // cmp = diff <= 0 ; read-after-write will check for equality + // with 0 + // ; Create a mask with each lane < diff active. This is essentially an + // active lane mask between 0 and diff. + // diff_splat = splat diff to + // steps = stepvector + // diff_mask = steps <= diff_splat + // ; OR that diff mask with the comparison result, so that each lane is + // active if it's less than diff or there was no overlap in the + // first place. Otherwise the lane is inactive. 
+ // cmp_splat = splat cmp to + // result = or cmp_splat diff_mask SDLoc DL(N); SDValue SourceValue = N->getOperand(0); SDValue SinkValue = N->getOperand(1); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index c4bd3c7803c1a..0e769b3faf490 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1034,37 +1034,18 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } case Intrinsic::loop_dependence_raw_mask: case Intrinsic::loop_dependence_war_mask: { - auto *EltSize = cast(ICA.getArgs()[2]); + unsigned EltSizeInBytes = + cast(ICA.getArgs()[2])->getZExtValue(); EVT VecVT = getTLI()->getValueType(DL, RetTy); // An invalid element size and return type combination must be expanded. - bool MustBeExpanded = false; - switch (EltSize->getSExtValue()) { - case 1: - if (VecVT != MVT::v16i1 && VecVT != MVT::nxv16i1) - MustBeExpanded = true; - break; - case 2: - if (VecVT != MVT::v8i1 && VecVT != MVT::nxv8i1) - MustBeExpanded = true; - break; - case 4: - if (VecVT != MVT::v4i1 && VecVT != MVT::nxv4i1) - MustBeExpanded = true; - break; - case 8: - if (VecVT != MVT::v2i1 && VecVT != MVT::nxv2i1) - MustBeExpanded = true; - break; - default: - MustBeExpanded = true; - // Other element sizes are incompatible with whilewr/rw, so expand instead - break; - } + bool MustBeExpanded = + VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes) || + !isPowerOf2_32(EltSizeInBytes) || EltSizeInBytes > 8; // The whilewr/rw instructions require SVE2 or SME - if (MustBeExpanded || (!ST->hasSVE2() && !ST->hasSME())) - break; - return 1; + if (!MustBeExpanded && (ST->hasSVE2() || ST->hasSME())) + return 1; + break; } case Intrinsic::experimental_vector_extract_last_active: if (ST->isSVEorStreamingSVEAvailable()) {