[AArch64][SVE] Implement demanded bits for @llvm.aarch64.sve.cntp #168714
Conversation
This allows DemandedBits to see that the SVE CNTP intrinsic will only ever produce small positive integers. The maximum value you could get here is 256, which is CNTP on an nxv16i1 predicate on a machine with a 2048-bit vector size (the maximum for SVE): 2048 bits / 8-bit elements = 256. Using this, various redundant operations (zexts, sexts, ands, ors, etc.) can be eliminated.
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Full diff: https://github.com/llvm/llvm-project/pull/168714.diff — 3 Files Affected:
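To make the 256 bound concrete, here is a small standalone C++ sketch (illustrative only, not part of the patch; all names are made up) that reproduces the arithmetic and shows why a mask covering the low 9 bits, such as AND with 511, becomes redundant:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t MaxSVEBits = 2048; // architectural maximum SVE vector size
  const uint64_t ElemBits = 8;      // an nxv16i1 predicate covers 8-bit lanes
  // cntp counts active predicate lanes, so its result can never exceed the
  // lane count of the widest possible vector.
  const uint64_t MaxCntp = MaxSVEBits / ElemBits; // 256, fits in 9 bits
  // Any AND whose mask covers the low 9 bits cannot change the result.
  const uint64_t Mask = 511;
  printf("max cntp = %llu, mask is redundant: %d\n",
         (unsigned long long)MaxCntp, (int)((MaxCntp & Mask) == MaxCntp));
  return 0;
}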
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8f41f230b5521..809c2af499958 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19459,6 +19459,32 @@ static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
return {};
}
+// Returns the element size associated with an SVE cnt[bhwdp] intrinsic. For
+// cntp (predicate), the element size corresponds to the legal (packed) SVE
+// vector type associated with the predicate. E.g. nxv4i1 returns 32.
+static std::optional<unsigned> GetSVECntElementSize(SDValue Op) {
+ if (auto ElementSize = IsSVECntIntrinsic(Op))
+ return ElementSize;
+ Intrinsic::ID IID = getIntrinsicID(Op.getNode());
+ if (IID != Intrinsic::aarch64_sve_cntp)
+ return {};
+ EVT PredVT = Op.getOperand(Op.getNumOperands() - 1).getValueType();
+ switch (PredVT.getSimpleVT().SimpleTy) {
+ case MVT::nxv1i1:
+ return 128;
+ case MVT::nxv2i1:
+ return 64;
+ case MVT::nxv4i1:
+ return 32;
+ case MVT::nxv8i1:
+ return 16;
+ case MVT::nxv16i1:
+ return 8;
+ default:
+ llvm_unreachable("unexpected predicate type");
+ }
+}
+
/// Calculates what the pre-extend type is, based on the extension
/// operation node provided by \p Extend.
///
@@ -31666,7 +31692,7 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
return false;
}
case ISD::INTRINSIC_WO_CHAIN: {
- if (auto ElementSize = IsSVECntIntrinsic(Op)) {
+ if (auto ElementSize = GetSVECntElementSize(Op)) {
unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
if (!MaxSVEVectorSizeInBits)
MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
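For context, the code that follows this hunk (past the diff context shown above) derives the known bits of the result from the element size. A minimal sketch of that computation, reconstructed from the visible names — the exact statements are an assumption, not part of this diff:

// With 2048-bit vectors and 8-bit elements (nxv16i1), the result of cntp is
// at most 2048 / 8 = 256, which needs only 9 bits to represent.
unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
unsigned RequiredBits = llvm::bit_width(MaxElements);
unsigned BitWidth = Known.getBitWidth();
if (RequiredBits < BitWidth)
  Known.Zero.setHighBits(BitWidth - RequiredBits); // high bits are known zero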
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
index cc3a3734a9721..f700dee0fb2e4 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -143,20 +143,19 @@ define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: cnth x9
-; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ptrue p2.s
; CHECK-NEXT: sub x9, x9, #1
; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: compact z0.s, p2, z0.s
-; CHECK-NEXT: cntp x8, p1, p2.s
+; CHECK-NEXT: compact z0.s, p1, z0.s
+; CHECK-NEXT: cntp x8, p2, p1.s
; CHECK-NEXT: compact z1.s, p0, z1.s
; CHECK-NEXT: str z0, [sp]
-; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: st1w { z1.s }, p1, [x9, x8, lsl #2]
+; CHECK-NEXT: st1w { z1.s }, p2, [x9, x8, lsl #2]
; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #2
diff --git a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
index 9572778484f8d..568abe718ad9b 100644
--- a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
+++ b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
@@ -80,6 +80,62 @@ define i64 @cntd_and_elimination() {
ret i64 %result
}
+define i64 @cntp_nxv16i1_and_elimination(<vscale x 16 x i1> %p) {
+; CHECK-LABEL: cntp_nxv16i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.b
+; CHECK-NEXT: and x9, x8, #0x1fc
+; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv16i1(<vscale x 16 x i1> %p, <vscale x 16 x i1> %p)
+ %and_redundant = and i64 %cntp, 511
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
+define i64 @cntp_nxv8i1_and_elimination(<vscale x 8 x i1> %p) {
+; CHECK-LABEL: cntp_nxv8i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.h
+; CHECK-NEXT: and x9, x8, #0xfc
+; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv8i1(<vscale x 8 x i1> %p, <vscale x 8 x i1> %p)
+ %and_redundant = and i64 %cntp, 1023
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
+define i64 @cntp_nxv4i1_and_elimination(<vscale x 4 x i1> %p) {
+; CHECK-LABEL: cntp_nxv4i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: and x9, x8, #0x7c
+; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> %p, <vscale x 4 x i1> %p)
+ %and_redundant = and i64 %cntp, 127
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
+define i64 @cntp_nxv2i1_and_elimination(<vscale x 2 x i1> %p) {
+; CHECK-LABEL: cntp_nxv2i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: and x9, x8, #0x3c
+; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> %p, <vscale x 2 x i1> %p)
+ %and_redundant = and i64 %cntp, 63
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
define i64 @vscale_trunc_zext() vscale_range(1,16) {
; CHECK-LABEL: vscale_trunc_zext:
; CHECK: // %bb.0:
paulwalker-arm
left a comment
Perhaps the original code is backwards in asking for the element size?
I wonder if it should be more literal and ask for the maximum result as an element count, or the maximum unscaled result as an unsigned (not really sure which is better). Then have the caller multiply the result by the maximum value of vscale (i.e. (MaxSVEVectorSizeInBits or SVEMaxBitsPerVector) / 128).
What do you think?
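A hedged sketch of what that alternative could look like; the helper name and signature below are hypothetical, not from the patch or the review:

// Hypothetical: return the maximum result of the intrinsic for vscale == 1,
// e.g. cntb -> 16, cntp on nxv16i1 -> 16, cntp on nxv2i1 -> 2.
static std::optional<unsigned> GetSVEMaxUnscaledCount(SDValue Op);

// The caller then scales by the maximum value of vscale:
unsigned MaxVScale = MaxSVEVectorSizeInBits / 128; // at most 2048 / 128 = 16
unsigned MaxResult = *GetSVEMaxUnscaledCount(Op) * MaxVScale;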
I think that's a little more intuitive 👍
✅ With the latest revision this PR passed the C/C++ code formatter. |
Note: I slightly tweaked the patch to not add |
paulwalker-arm
left a comment
The new changes look good to me.