From eab11c8a3c2e25ac9d24d507fc5bec1e0d68ad6f Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 11 Nov 2025 16:25:57 +0000 Subject: [PATCH 1/2] [Analysis][AArch64] Add cost model for loop.dependence.{war/raw}.mask This PR adds the cost model for the loop dependence mask intrinsics, both for cases where they must be expanded and when they can be lowered for AArch64. --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 47 ++++++++ .../AArch64/AArch64TargetTransformInfo.cpp | 34 ++++++ .../CostModel/AArch64/loop_dependence_mask.ll | 104 ++++++++++++++++++ 3 files changed, 185 insertions(+) create mode 100644 llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 1c167af4b0478..c1b0dc5db9607 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2190,6 +2190,53 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Otherwise, fallback to default scalarization cost. break; } + case Intrinsic::loop_dependence_raw_mask: + case Intrinsic::loop_dependence_war_mask: { + InstructionCost Cost = 0; + Type *PtrTy = ICA.getArgTypes()[0]; + bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask; + + Cost += + thisT()->getArithmeticInstrCost(Instruction::Sub, PtrTy, CostKind); + if (IsReadAfterWrite) { + IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, PtrTy, {PtrTy}, {}); + Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind); + } + + Cost += + thisT()->getArithmeticInstrCost(Instruction::SDiv, PtrTy, CostKind); + Type *CmpTy = + getTLI() + ->getSetCCResultType( + thisT()->getDataLayout(), RetTy->getContext(), + getTLI()->getValueType(thisT()->getDataLayout(), PtrTy)) + .getTypeForEVT(RetTy->getContext()); + Cost += thisT()->getCmpSelInstrCost( + BinaryOperator::ICmp, CmpTy, PtrTy, + IsReadAfterWrite ? 
CmpInst::ICMP_EQ : CmpInst::ICMP_SLE, CostKind); + + // The deconstructed active lane mask + VectorType *RetTyVec = cast(RetTy); + VectorType *SplatTy = cast(RetTyVec->getWithNewType(PtrTy)); + Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SplatTy, SplatTy, {}, + CostKind, 0, nullptr); + IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, SplatTy, {}, + FMF); + Cost += thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SplatTy, + SplatTy, CmpInst::ICMP_ULT, CostKind); + + Cost += + thisT()->getCastInstrCost(Instruction::CastOps::ZExt, RetTy, SplatTy, + TTI::CastContextHint::None, CostKind); + Cost += thisT()->getCastInstrCost(Instruction::CastOps::ZExt, + RetTyVec->getElementType(), CmpTy, + TTI::CastContextHint::None, CostKind); + Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, RetTyVec, RetTyVec, {}, + CostKind, 0, nullptr); + Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); + return Cost; + } } // Assume that we need to scalarize this intrinsic.) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 197aae6e03cb1..c4bd3c7803c1a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1032,6 +1032,40 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::loop_dependence_raw_mask: + case Intrinsic::loop_dependence_war_mask: { + auto *EltSize = cast(ICA.getArgs()[2]); + EVT VecVT = getTLI()->getValueType(DL, RetTy); + // An invalid element size and return type combination must be expanded. 
+ bool MustBeExpanded = false; + switch (EltSize->getSExtValue()) { + case 1: + if (VecVT != MVT::v16i1 && VecVT != MVT::nxv16i1) + MustBeExpanded = true; + break; + case 2: + if (VecVT != MVT::v8i1 && VecVT != MVT::nxv8i1) + MustBeExpanded = true; + break; + case 4: + if (VecVT != MVT::v4i1 && VecVT != MVT::nxv4i1) + MustBeExpanded = true; + break; + case 8: + if (VecVT != MVT::v2i1 && VecVT != MVT::nxv2i1) + MustBeExpanded = true; + break; + default: + MustBeExpanded = true; + // Other element sizes are incompatible with whilewr/rw, so expand instead + break; + } + + // The whilewr/rw instructions require SVE2 or SME + if (MustBeExpanded || (!ST->hasSVE2() && !ST->hasSME())) + break; + return 1; + } case Intrinsic::experimental_vector_extract_last_active: if (ST->isSVEorStreamingSVEAvailable()) { auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]); diff --git a/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll new file mode 100644 index 0000000000000..1074d41d994d9 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-EXPANDED +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s --check-prefix=CHECK +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sme | FileCheck %s --check-prefix=CHECK + +; loop.dependence.{war,raw}.mask can be lowered to while{wr,rw} if SVE2 or SME is enabled. 
+define void @loop_dependence_war_mask(ptr %a, ptr %b) { +; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask' +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-LABEL: 'loop_dependence_war_mask' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +entry: + %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) + %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) + %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) + %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) + ret void +} + +define void 
@loop_dependence_raw_mask(ptr %a, ptr %b) { +; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask' +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-LABEL: 'loop_dependence_raw_mask' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +entry: + %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) + %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) + %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) + %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) + ret void +} + +; Invalid element size and return type 
combinations must be expanded, even with sve2/sme +define void @loop_dependence_war_mask_invalid(ptr %a, ptr %b) { +; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask_invalid' +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-LABEL: 'loop_dependence_war_mask_invalid' +; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret 
void +; +entry: + %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) + %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) + %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2) + %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1) + %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10) + ret void +} + +define void @loop_dependence_raw_mask_invalid(ptr %a, ptr %b) { +; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask_invalid' +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10) +; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-LABEL: 'loop_dependence_raw_mask_invalid' +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr 
%b, i64 2) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +entry: + %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8) + %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4) + %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2) + %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1) + %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10) + ret void +} From 0744053d9bd1135aa790a293f2cfd6f59e5e2a98 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 13 Nov 2025 15:46:05 +0000 Subject: [PATCH 2/2] Address review --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 16 +++++++++ .../SelectionDAG/LegalizeVectorOps.cpp | 16 +++++++++ .../AArch64/AArch64TargetTransformInfo.cpp | 35 +++++-------------- 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index c1b0dc5db9607..56bf0d8e4a48b 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2192,6 +2192,22 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } case Intrinsic::loop_dependence_raw_mask: case Intrinsic::loop_dependence_war_mask: { + // Compute the cost of the expanded version of these intrinsics: + // ; Figure out if there's overlap between the pointers. + // diff = (ptrB - ptrA) / eltSize ; read-after-write will use the + // absolute difference + // cmp = diff <= 0 ; read-after-write will check for equality + // with 0 + // ; Create a mask with each lane < diff active. 
This is essentially + // an active lane mask between 0 and diff. + // diff_splat = splat diff to + // steps = stepvector + // diff_mask = steps <= diff_splat + // ; OR that diff mask with the comparison result, so that each lane is + // active if it's less than diff or there was no overlap in the + // first place. Otherwise the lane is inactive. + // cmp_splat = splat cmp to + // result = or cmp_splat diff_mask InstructionCost Cost = 0; Type *PtrTy = ICA.getArgTypes()[0]; bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index f5a54497c8a98..38c934e27d40f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1816,6 +1816,22 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) { } SDValue VectorLegalizer::ExpandLOOP_DEPENDENCE_MASK(SDNode *N) { + // Expand these intrinsics: + // ; Figure out if there's overlap between the pointers. + // diff = (ptrB - ptrA) / eltSize ; read-after-write will use the absolute + // difference + // cmp = diff <= 0 ; read-after-write will check for equality + // with 0 + // ; Create a mask with each lane < diff active. This is essentially an + // active lane mask between 0 and diff. + // diff_splat = splat diff to + // steps = stepvector + // diff_mask = steps <= diff_splat + // ; OR that diff mask with the comparison result, so that each lane is + // active if it's less than diff or there was no overlap in the + // first place. Otherwise the lane is inactive. 
+ // cmp_splat = splat cmp to + // result = or cmp_splat diff_mask SDLoc DL(N); SDValue SourceValue = N->getOperand(0); SDValue SinkValue = N->getOperand(1); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index c4bd3c7803c1a..0e769b3faf490 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1034,37 +1034,18 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } case Intrinsic::loop_dependence_raw_mask: case Intrinsic::loop_dependence_war_mask: { - auto *EltSize = cast(ICA.getArgs()[2]); + unsigned EltSizeInBytes = + cast(ICA.getArgs()[2])->getZExtValue(); EVT VecVT = getTLI()->getValueType(DL, RetTy); // An invalid element size and return type combination must be expanded. - bool MustBeExpanded = false; - switch (EltSize->getSExtValue()) { - case 1: - if (VecVT != MVT::v16i1 && VecVT != MVT::nxv16i1) - MustBeExpanded = true; - break; - case 2: - if (VecVT != MVT::v8i1 && VecVT != MVT::nxv8i1) - MustBeExpanded = true; - break; - case 4: - if (VecVT != MVT::v4i1 && VecVT != MVT::nxv4i1) - MustBeExpanded = true; - break; - case 8: - if (VecVT != MVT::v2i1 && VecVT != MVT::nxv2i1) - MustBeExpanded = true; - break; - default: - MustBeExpanded = true; - // Other element sizes are incompatible with whilewr/rw, so expand instead - break; - } + bool MustBeExpanded = + VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes) || + !isPowerOf2_32(EltSizeInBytes) || EltSizeInBytes > 8; // The whilewr/rw instructions require SVE2 or SME - if (MustBeExpanded || (!ST->hasSVE2() && !ST->hasSME())) - break; - return 1; + if (!MustBeExpanded && (ST->hasSVE2() || ST->hasSME())) + return 1; + break; } case Intrinsic::experimental_vector_extract_last_active: if (ST->isSVEorStreamingSVEAvailable()) {