llvm · MacDue · Nov 20, 2025 · Nov 19, 2025 · Nov 19, 2025 · Nov 19, 2025
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19443,20 +19443,37 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
   return CSNeg;
 }
 
-static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
+static bool IsSVECntIntrinsic(SDValue S) {
   switch(getIntrinsicID(S.getNode())) {
   default:
     break;
   case Intrinsic::aarch64_sve_cntb:
-    return 8;
   case Intrinsic::aarch64_sve_cnth:
-    return 16;
   case Intrinsic::aarch64_sve_cntw:
-    return 32;
   case Intrinsic::aarch64_sve_cntd:
-    return 64;
+    return true;
+  }
+  return false;
+}
+
+// Returns the maximum (scalable) value that can be returned by an SVE count
+// intrinsic. Returns std::nullopt if \p Op is not aarch64_sve_cnt*.
+static std::optional<ElementCount> getMaxValueForSVECntIntrinsic(SDValue Op) {
+  Intrinsic::ID IID = getIntrinsicID(Op.getNode());
+  if (IID == Intrinsic::aarch64_sve_cntp)
+    return Op.getOperand(1).getValueType().getVectorElementCount();
+  switch (IID) {
+  case Intrinsic::aarch64_sve_cntd:
+    return ElementCount::getScalable(2);
+  case Intrinsic::aarch64_sve_cntw:
+    return ElementCount::getScalable(4);
+  case Intrinsic::aarch64_sve_cnth:
+    return ElementCount::getScalable(8);
+  case Intrinsic::aarch64_sve_cntb:
+    return ElementCount::getScalable(16);
+  default:
+    return std::nullopt;
   }
-  return {};
 }
 
 /// Calculates what the pre-extend type is, based on the extension
@@ -31666,22 +31683,24 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
     return false;
   }
   case ISD::INTRINSIC_WO_CHAIN: {
-    if (auto ElementSize = IsSVECntIntrinsic(Op)) {
-      unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
-      if (!MaxSVEVectorSizeInBits)
-        MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
-      unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
-      // The SVE count intrinsics don't support the multiplier immediate so we
-      // don't have to account for that here. The value returned may be slightly
-      // over the true required bits, as this is based on the "ALL" pattern. The
-      // other patterns are also exposed by these intrinsics, but they all
-      // return a value that's strictly less than "ALL".
-      unsigned RequiredBits = llvm::bit_width(MaxElements);
-      unsigned BitWidth = Known.Zero.getBitWidth();
-      if (RequiredBits < BitWidth)
-        Known.Zero.setHighBits(BitWidth - RequiredBits);
+    std::optional<ElementCount> MaxCount = getMaxValueForSVECntIntrinsic(Op);
+    if (!MaxCount)
       return false;
-    }
+    unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
+    if (!MaxSVEVectorSizeInBits)
+      MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
+    unsigned VscaleMax = MaxSVEVectorSizeInBits / 128;
+    unsigned MaxValue = MaxCount->getKnownMinValue() * VscaleMax;
+    // The SVE count intrinsics don't support the multiplier immediate so we
+    // don't have to account for that here. The value returned may be slightly
+    // over the true required bits, as this is based on the "ALL" pattern. The
+    // other patterns are also exposed by these intrinsics, but they all
+    // return a value that's strictly less than "ALL".
+    unsigned RequiredBits = llvm::bit_width(MaxValue);
+    unsigned BitWidth = Known.Zero.getBitWidth();
+    if (RequiredBits < BitWidth)
+      Known.Zero.setHighBits(BitWidth - RequiredBits);
+    return false;
   }
   }
 

diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -143,20 +143,19 @@ define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    punpklo p1.h, p0.b
 ; CHECK-NEXT:    cnth x9
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p2.s
 ; CHECK-NEXT:    sub x9, x9, #1
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
-; CHECK-NEXT:    compact z0.s, p2, z0.s
-; CHECK-NEXT:    cntp x8, p1, p2.s
+; CHECK-NEXT:    compact z0.s, p1, z0.s
+; CHECK-NEXT:    cntp x8, p2, p1.s
 ; CHECK-NEXT:    compact z1.s, p0, z1.s
 ; CHECK-NEXT:    str z0, [sp]
-; CHECK-NEXT:    mov w8, w8
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    st1w { z1.s }, p1, [x9, x8, lsl #2]
+; CHECK-NEXT:    st1w { z1.s }, p2, [x9, x8, lsl #2]
 ; CHECK-NEXT:    ldr z0, [sp]
 ; CHECK-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #2

diff --git a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
@@ -80,6 +80,62 @@ define i64 @cntd_and_elimination() {
   ret i64 %result
 }
 
+define i64 @cntp_nxv16i1_and_elimination(<vscale x 16 x i1> %p) {
+; CHECK-LABEL: cntp_nxv16i1_and_elimination:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.b
+; CHECK-NEXT:    and x9, x8, #0x1fc
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+  %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv16i1(<vscale x 16 x i1> %p, <vscale x 16 x i1> %p)
+  %and_redundant = and i64 %cntp, 511
+  %and_required = and i64 %cntp, 17179869180
+  %result = add i64 %and_redundant, %and_required
+  ret i64 %result
+}
+
+define i64 @cntp_nxv8i1_and_elimination(<vscale x 8 x i1> %p) {
+; CHECK-LABEL: cntp_nxv8i1_and_elimination:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.h
+; CHECK-NEXT:    and x9, x8, #0xfc
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+  %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv8i1(<vscale x 8 x i1> %p, <vscale x 8 x i1> %p)
+  %and_redundant = and i64 %cntp, 1023
+  %and_required = and i64 %cntp, 17179869180
+  %result = add i64 %and_redundant, %and_required
+  ret i64 %result
+}
+
+define i64 @cntp_nxv4i1_and_elimination(<vscale x 4 x i1> %p) {
+; CHECK-LABEL: cntp_nxv4i1_and_elimination:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.s
+; CHECK-NEXT:    and x9, x8, #0x7c
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+  %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> %p, <vscale x 4 x i1> %p)
+  %and_redundant = and i64 %cntp, 127
+  %and_required = and i64 %cntp, 17179869180
+  %result = add i64 %and_redundant, %and_required
+  ret i64 %result
+}
+
+define i64 @cntp_nxv2i1_and_elimination(<vscale x 2 x i1> %p) {
+; CHECK-LABEL: cntp_nxv2i1_and_elimination:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.d
+; CHECK-NEXT:    and x9, x8, #0x3c
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+  %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> %p, <vscale x 2 x i1> %p)
+  %and_redundant = and i64 %cntp, 63
+  %and_required = and i64 %cntp, 17179869180
+  %result = add i64 %and_redundant, %and_required
+  ret i64 %result
+}
+
 define i64 @vscale_trunc_zext() vscale_range(1,16) {
 ; CHECK-LABEL: vscale_trunc_zext:
 ; CHECK:       // %bb.0: