-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LLVM][CodeGen][AArch64] Remove bogus lowering of sve_while{gt/ge/hi/hs} intrinsics. #88126
Conversation
…hs} intrinsics. When fed constant operands we try to lower WHILE intrinsics to PTRUE using a fixed length pattern. This is not valid for the decrementing variants of WHILE because they construct their result predicate vector by traversing from high->low lanes whereas the incrementing variants and PTRUE traverse from low->high lanes. Whilst we can still utilise PTRUE by reversing its result I figure replacing a single WHILE with multiple SVE instructions is likely counterproductive.
@llvm/pr-subscribers-backend-aarch64 Author: Paul Walker (paulwalker-arm) ChangesWhen fed constant operands we try to lower WHILE intrinsics to PTRUE using a fixed length pattern. This is not valid for the decrementing variants of WHILE because they construct their result predicate vector by traversing from high->low lanes whereas the incrementing variants and PTRUE traverse from low->high lanes. Whilst we can still utilise PTRUE by reversing its result I figure replacing a single WHILE with multiple SVE instructions is likely counterproductive. Full diff: https://github.com/llvm/llvm-project/pull/88126.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 819e8ccd5c33f0..dc46dfc82b8c65 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5036,7 +5036,7 @@ static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
}
static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned,
- bool IsLess, bool IsEqual) {
+ bool IsEqual) {
if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
!isa<ConstantSDNode>(Op.getOperand(2)))
return SDValue();
@@ -5044,12 +5044,9 @@ static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned,
SDLoc dl(Op);
APInt X = Op.getConstantOperandAPInt(1);
APInt Y = Op.getConstantOperandAPInt(2);
- APInt NumActiveElems;
bool Overflow;
- if (IsLess)
- NumActiveElems = IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
- else
- NumActiveElems = IsSigned ? X.ssub_ov(Y, Overflow) : X.usub_ov(Y, Overflow);
+ APInt NumActiveElems =
+ IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
if (Overflow)
return SDValue();
@@ -5396,29 +5393,13 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue();
}
case Intrinsic::aarch64_sve_whilelo:
- return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
- /*IsEqual=*/false);
+ return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsEqual=*/false);
case Intrinsic::aarch64_sve_whilelt:
- return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
- /*IsEqual=*/false);
+ return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsEqual=*/false);
case Intrinsic::aarch64_sve_whilels:
- return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
- /*IsEqual=*/true);
+ return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsEqual=*/true);
case Intrinsic::aarch64_sve_whilele:
- return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
- /*IsEqual=*/true);
- case Intrinsic::aarch64_sve_whilege:
- return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
- /*IsEqual=*/true);
- case Intrinsic::aarch64_sve_whilegt:
- return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
- /*IsEqual=*/false);
- case Intrinsic::aarch64_sve_whilehs:
- return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
- /*IsEqual=*/true);
- case Intrinsic::aarch64_sve_whilehi:
- return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
- /*IsEqual=*/false);
+ return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsEqual=*/true);
case Intrinsic::aarch64_sve_sunpkhi:
return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
Op.getOperand(1));
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-while.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-while.ll
index 9b82d79cfa9625..57795ea29c5dcd 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-while.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-while.ll
@@ -81,7 +81,7 @@ define <vscale x 2 x i1> @whilege_d_xx(i64 %a, i64 %b) {
define <vscale x 2 x i1> @whilege_d_ii_dont_fold_to_ptrue_larger_than_minvec() {
; CHECK-LABEL: whilege_d_ii_dont_fold_to_ptrue_larger_than_minvec:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #3
+; CHECK-NEXT: mov w8, #3 // =0x3
; CHECK-NEXT: whilege p0.d, x8, xzr
; CHECK-NEXT: ret
%out = call <vscale x 2 x i1> @llvm.aarch64.sve.whilege.nxv2i1.i64(i64 3, i64 0)
@@ -91,7 +91,9 @@ define <vscale x 2 x i1> @whilege_d_ii_dont_fold_to_ptrue_larger_than_minvec() {
define <vscale x 16 x i1> @whilege_b_ii() {
; CHECK-LABEL: whilege_b_ii:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.b, vl6
+; CHECK-NEXT: mov w8, #-2 // =0xfffffffe
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: whilege p0.b, w9, w8
; CHECK-NEXT: ret
entry:
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilege.nxv16i1.i32(i32 3, i32 -2)
@@ -101,7 +103,7 @@ entry:
define <vscale x 16 x i1> @whilege_b_ii_dont_fold_to_ptrue_nonexistent_vl9() {
; CHECK-LABEL: whilege_b_ii_dont_fold_to_ptrue_nonexistent_vl9:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #9
+; CHECK-NEXT: mov w8, #9 // =0x9
; CHECK-NEXT: whilege p0.b, x8, xzr
; CHECK-NEXT: ret
entry:
@@ -112,7 +114,8 @@ entry:
define <vscale x 16 x i1> @whilege_b_ii_vl_maximum() vscale_range(16, 16) {
; CHECK-LABEL: whilege_b_ii_vl_maximum:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: mov w8, #255 // =0xff
+; CHECK-NEXT: whilege p0.b, x8, xzr
; CHECK-NEXT: ret
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilege.nxv16i1.i64(i64 255, i64 0)
ret <vscale x 16 x i1> %out
@@ -121,8 +124,8 @@ define <vscale x 16 x i1> @whilege_b_ii_vl_maximum() vscale_range(16, 16) {
define <vscale x 16 x i1> @whilege_b_ii_dont_fold_to_ptrue_overflow() {
; CHECK-LABEL: whilege_b_ii_dont_fold_to_ptrue_overflow:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #2
-; CHECK-NEXT: mov w9, #2147483647
+; CHECK-NEXT: mov w8, #2 // =0x2
+; CHECK-NEXT: mov w9, #2147483647 // =0x7fffffff
; CHECK-NEXT: movk w8, #32768, lsl #16
; CHECK-NEXT: whilege p0.b, w9, w8
; CHECK-NEXT: ret
@@ -134,8 +137,8 @@ entry:
define <vscale x 16 x i1> @whilege_b_ii_dont_fold_to_ptrue_increment_overflow() {
; CHECK-LABEL: whilege_b_ii_dont_fold_to_ptrue_increment_overflow:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #2147483647
-; CHECK-NEXT: mov w9, #-2147483641
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT: mov w9, #-2147483641 // =0x80000007
; CHECK-NEXT: whilege p0.b, w9, w8
; CHECK-NEXT: ret
entry:
@@ -222,7 +225,7 @@ define <vscale x 2 x i1> @whilehs_d_xx(i64 %a, i64 %b) {
define <vscale x 2 x i1> @whilehs_d_ii_dont_fold_to_ptrue_larger_than_minvec() {
; CHECK-LABEL: whilehs_d_ii_dont_fold_to_ptrue_larger_than_minvec:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #3
+; CHECK-NEXT: mov w8, #3 // =0x3
; CHECK-NEXT: whilehs p0.d, x8, xzr
; CHECK-NEXT: ret
%out = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehs.nxv2i1.i64(i64 3, i64 0)
@@ -232,7 +235,9 @@ define <vscale x 2 x i1> @whilehs_d_ii_dont_fold_to_ptrue_larger_than_minvec() {
define <vscale x 16 x i1> @whilehs_b_ii() {
; CHECK-LABEL: whilehs_b_ii:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.b, vl7
+; CHECK-NEXT: mov w8, #2 // =0x2
+; CHECK-NEXT: mov w9, #8 // =0x8
+; CHECK-NEXT: whilehs p0.b, x9, x8
; CHECK-NEXT: ret
entry:
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehs.nxv16i1.i64(i64 8, i64 2)
@@ -242,7 +247,7 @@ entry:
define <vscale x 16 x i1> @whilehs_b_ii_dont_fold_to_ptrue_nonexistent_vl9() {
; CHECK-LABEL: whilehs_b_ii_dont_fold_to_ptrue_nonexistent_vl9:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #9
+; CHECK-NEXT: mov w8, #9 // =0x9
; CHECK-NEXT: whilehs p0.b, x8, xzr
; CHECK-NEXT: ret
entry:
@@ -253,7 +258,8 @@ entry:
define <vscale x 16 x i1> @whilehs_b_ii_vl_maximum() vscale_range(16, 16) {
; CHECK-LABEL: whilehs_b_ii_vl_maximum:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: mov w8, #255 // =0xff
+; CHECK-NEXT: whilehs p0.b, x8, xzr
; CHECK-NEXT: ret
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehs.nxv16i1.i64(i64 255, i64 0)
ret <vscale x 16 x i1> %out
@@ -262,8 +268,8 @@ define <vscale x 16 x i1> @whilehs_b_ii_vl_maximum() vscale_range(16, 16) {
define <vscale x 16 x i1> @whilehs_b_ii_dont_fold_to_ptrue_overflow() {
; CHECK-LABEL: whilehs_b_ii_dont_fold_to_ptrue_overflow:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #-1
-; CHECK-NEXT: mov w9, #6
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-NEXT: mov w9, #6 // =0x6
; CHECK-NEXT: whilehs p0.b, w9, w8
; CHECK-NEXT: ret
entry:
@@ -274,7 +280,7 @@ entry:
define <vscale x 16 x i1> @whilehs_b_ii_dont_fold_to_ptrue_increment_overflow() {
; CHECK-LABEL: whilehs_b_ii_dont_fold_to_ptrue_increment_overflow:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
; CHECK-NEXT: whilehs p0.b, w8, wzr
; CHECK-NEXT: ret
entry:
@@ -361,7 +367,7 @@ define <vscale x 2 x i1> @whilegt_d_xx(i64 %a, i64 %b) {
define <vscale x 2 x i1> @whilegt_d_ii_dont_fold_to_ptrue_larger_than_minvec() {
; CHECK-LABEL: whilegt_d_ii_dont_fold_to_ptrue_larger_than_minvec:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #3
+; CHECK-NEXT: mov w8, #3 // =0x3
; CHECK-NEXT: whilegt p0.d, x8, xzr
; CHECK-NEXT: ret
%out = call <vscale x 2 x i1> @llvm.aarch64.sve.whilegt.nxv2i1.i64(i64 3, i64 0)
@@ -371,7 +377,9 @@ define <vscale x 2 x i1> @whilegt_d_ii_dont_fold_to_ptrue_larger_than_minvec() {
define <vscale x 16 x i1> @whilegt_b_ii() {
; CHECK-LABEL: whilegt_b_ii:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.b, vl5
+; CHECK-NEXT: mov w8, #-2 // =0xfffffffe
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: whilegt p0.b, w9, w8
; CHECK-NEXT: ret
entry:
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilegt.nxv16i1.i32(i32 3, i32 -2)
@@ -381,7 +389,7 @@ entry:
define <vscale x 16 x i1> @whilegt_b_ii_fold_to_ptrue_nonexistent_vl9() {
; CHECK-LABEL: whilegt_b_ii_fold_to_ptrue_nonexistent_vl9:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #9
+; CHECK-NEXT: mov w8, #9 // =0x9
; CHECK-NEXT: whilegt p0.b, x8, xzr
; CHECK-NEXT: ret
entry:
@@ -392,7 +400,8 @@ entry:
define <vscale x 16 x i1> @whilegt_b_ii_vl_maximum() vscale_range(16, 16) {
; CHECK-LABEL: whilegt_b_ii_vl_maximum:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: mov w8, #256 // =0x100
+; CHECK-NEXT: whilegt p0.b, x8, xzr
; CHECK-NEXT: ret
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilegt.nxv16i1.i64(i64 256, i64 0)
ret <vscale x 16 x i1> %out
@@ -401,8 +410,8 @@ define <vscale x 16 x i1> @whilegt_b_ii_vl_maximum() vscale_range(16, 16) {
define <vscale x 16 x i1> @whilegt_b_ii_dont_fold_to_ptrue_overflow() {
; CHECK-LABEL: whilegt_b_ii_dont_fold_to_ptrue_overflow:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #2147483647
-; CHECK-NEXT: mov w9, #-2147483641
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT: mov w9, #-2147483641 // =0x80000007
; CHECK-NEXT: whilegt p0.b, w9, w8
; CHECK-NEXT: ret
entry:
@@ -489,7 +498,7 @@ define <vscale x 2 x i1> @whilehi_d_xx(i64 %a, i64 %b) {
define <vscale x 2 x i1> @whilehi_d_ii_dont_fold_to_ptrue_larger_than_minvec() {
; CHECK-LABEL: whilehi_d_ii_dont_fold_to_ptrue_larger_than_minvec:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #3
+; CHECK-NEXT: mov w8, #3 // =0x3
; CHECK-NEXT: whilehi p0.d, x8, xzr
; CHECK-NEXT: ret
%out = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehi.nxv2i1.i64(i64 3, i64 0)
@@ -499,7 +508,9 @@ define <vscale x 2 x i1> @whilehi_d_ii_dont_fold_to_ptrue_larger_than_minvec() {
define <vscale x 16 x i1> @whilehi_b_ii() {
; CHECK-LABEL: whilehi_b_ii:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.b, vl6
+; CHECK-NEXT: mov w8, #2 // =0x2
+; CHECK-NEXT: mov w9, #8 // =0x8
+; CHECK-NEXT: whilehi p0.b, x9, x8
; CHECK-NEXT: ret
entry:
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehi.nxv16i1.i64(i64 8, i64 2)
@@ -509,7 +520,7 @@ entry:
define <vscale x 16 x i1> @whilehi_b_ii_dont_fold_to_ptrue_nonexistent_vl9() {
; CHECK-LABEL: whilehi_b_ii_dont_fold_to_ptrue_nonexistent_vl9:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #9
+; CHECK-NEXT: mov w8, #9 // =0x9
; CHECK-NEXT: whilehi p0.b, x8, xzr
; CHECK-NEXT: ret
entry:
@@ -520,7 +531,8 @@ entry:
define <vscale x 16 x i1> @whilehi_b_ii_vl_maximum() vscale_range(16, 16) {
; CHECK-LABEL: whilehi_b_ii_vl_maximum:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl256
+; CHECK-NEXT: mov w8, #256 // =0x100
+; CHECK-NEXT: whilehi p0.b, x8, xzr
; CHECK-NEXT: ret
%out = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehi.nxv16i1.i64(i64 256, i64 0)
ret <vscale x 16 x i1> %out
@@ -529,8 +541,8 @@ define <vscale x 16 x i1> @whilehi_b_ii_vl_maximum() vscale_range(16, 16) {
define <vscale x 16 x i1> @whilelhi_b_ii_dont_fold_to_ptrue_overflow() {
; CHECK-LABEL: whilelhi_b_ii_dont_fold_to_ptrue_overflow:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #-1
-; CHECK-NEXT: mov w9, #7
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-NEXT: mov w9, #7 // =0x7
; CHECK-NEXT: whilehi p0.b, w9, w8
; CHECK-NEXT: ret
entry:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. With this change, the now-unused tests could be removed.
I've kept just a single test to show we don't perform the ptrue transformation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry — while revisiting this review and examining the change once again, I am a bit confused now. If we look at the WHILE{GT/GE/HI/HS} instructions, the spec says:
...
for e = elements-1 downto 0
...
Looks like, it should not be a problem for traversing from high->low lanes?
The fact that the WHILE{GT/GE/HI/HS} instructions traverse from high->low lanes is exactly what makes the replacement with PTRUE incorrect. For example:
whereas:
|
@@ -5036,20 +5036,17 @@ static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, | |||
} | |||
|
|||
static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
minor nit: is it worth renaming this to optimizeIncrementingWhile
to distinguish it further?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds sensible to me. Done.
…hs} intrinsics. (llvm#88126) When fed constant operands we try to lower WHILE intrinsics to PTRUE using a fixed length pattern. This is not valid for the decrementing variants of WHILE because they construct their result predicate vector by traversing from high->low lanes whereas the incrementing variants and PTRUE traverse from low->high lanes. Whilst we can still utilise PTRUE by reversing its result I figure replacing a single WHILE with multiple SVE instructions is likely counterproductive.
…hs} intrinsics. (llvm#88126) When fed constant operands we try to lower WHILE intrinsics to PTRUE using a fixed length pattern. This is not valid for the decrementing variants of WHILE because they construct their result predicate vector by traversing from high->low lanes whereas the incrementing variants and PTRUE traverse from low->high lanes. Whilst we can still utilise PTRUE by reversing its result I figure replacing a single WHILE with multiple SVE instructions is likely counterproductive.
When fed constant operands we try to lower WHILE intrinsics to PTRUE using a fixed length pattern. This is not valid for the decrementing variants of WHILE because they construct their result predicate vector by traversing from high->low lanes whereas the incrementing variants and PTRUE traverse from low->high lanes.
Whilst we can still utilise PTRUE by reversing its result I figure replacing a single WHILE with multiple SVE instructions is likely counterproductive.