-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Optimise MOVI + CMGT to CMGE #74499
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Sjoerd Meijer (sjoerdmeijer) ChangesThis fixes a regression that occurred for a pattern of MOVI + CMGT instructions, which can be optimised to CMGE. I.e., when the signed greater-than compare has -1 as an operand, we can rewrite that as a compare greater than or equal to 0, which is what CMGE does. Fixes #61836 Full diff: https://github.com/llvm/llvm-project/pull/74499.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f36607b03e76f..01c1a9660eb0a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13710,6 +13710,7 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
APInt UndefBits(VT.getSizeInBits(), 0);
bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
bool IsZero = IsCnst && (CnstBits == 0);
+ bool IsMinusOne = IsCnst && CnstBits.isAllOnes();
if (SrcVT.getVectorElementType().isFloatingPoint()) {
switch (CC) {
@@ -13778,6 +13779,8 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
case AArch64CC::GT:
if (IsZero)
return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
+ if (IsMinusOne)
+ return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
case AArch64CC::LE:
if (IsZero)
diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
index d16b5786a9965..09a6e26fe5a40 100644
--- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
@@ -176,12 +176,11 @@ define <4 x i32> @sign_4xi32_multi_use(<4 x i32> %a) {
; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT: cmlt v2.4s, v0.4s, #0
-; CHECK-NEXT: orr v2.4s, #1
-; CHECK-NEXT: cmgt v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: str q2, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: xtn v0.4h, v1.4s
+; CHECK-NEXT: cmlt v1.4s, v0.4s, #0
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: orr v1.4s, #1
+; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
; CHECK-NEXT: bl use_4xi1
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
@@ -227,9 +226,8 @@ define <4 x i32> @not_sign_4xi32_2(<4 x i32> %a) {
define <4 x i32> @not_sign_4xi32_3(<4 x i32> %a) {
; CHECK-LABEL: not_sign_4xi32_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
; CHECK-NEXT: adrp x8, .LCPI18_0
-; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0]
; CHECK-NEXT: bic v1.16b, v1.16b, v0.16b
; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
diff --git a/llvm/test/CodeGen/AArch64/signbit-shift.ll b/llvm/test/CodeGen/AArch64/signbit-shift.ll
index cb758f8a6202b..253ea1cab91fb 100644
--- a/llvm/test/CodeGen/AArch64/signbit-shift.ll
+++ b/llvm/test/CodeGen/AArch64/signbit-shift.ll
@@ -29,10 +29,9 @@ define i32 @add_zext_ifpos(i32 %x) {
define <4 x i32> @add_zext_ifpos_vec_splat(<4 x i32> %x) {
; CHECK-LABEL: add_zext_ifpos_vec_splat:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT: movi v2.4s, #41
-; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: movi v1.4s, #41
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
+; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
; CHECK-NEXT: ret
%c = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%e = zext <4 x i1> %c to <4 x i32>
@@ -43,7 +42,7 @@ define <4 x i32> @add_zext_ifpos_vec_splat(<4 x i32> %x) {
define i32 @sel_ifpos_tval_bigger(i32 %x) {
; CHECK-LABEL: sel_ifpos_tval_bigger:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #41
+; CHECK-NEXT: mov w8, #41 // =0x29
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: cinc w0, w8, ge
; CHECK-NEXT: ret
@@ -78,10 +77,9 @@ define i32 @add_sext_ifpos(i32 %x) {
define <4 x i32> @add_sext_ifpos_vec_splat(<4 x i32> %x) {
; CHECK-LABEL: add_sext_ifpos_vec_splat:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT: movi v2.4s, #42
-; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: movi v1.4s, #42
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%c = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%e = sext <4 x i1> %c to <4 x i32>
@@ -92,7 +90,7 @@ define <4 x i32> @add_sext_ifpos_vec_splat(<4 x i32> %x) {
define i32 @sel_ifpos_fval_bigger(i32 %x) {
; CHECK-LABEL: sel_ifpos_fval_bigger:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #41
+; CHECK-NEXT: mov w8, #41 // =0x29
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: cinc w0, w8, lt
; CHECK-NEXT: ret
@@ -128,7 +126,7 @@ define i32 @add_zext_ifneg(i32 %x) {
define i32 @sel_ifneg_tval_bigger(i32 %x) {
; CHECK-LABEL: sel_ifneg_tval_bigger:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #41
+; CHECK-NEXT: mov w8, #41 // =0x29
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: cinc w0, w8, lt
; CHECK-NEXT: ret
@@ -162,7 +160,7 @@ define i32 @add_sext_ifneg(i32 %x) {
define i32 @sel_ifneg_fval_bigger(i32 %x) {
; CHECK-LABEL: sel_ifneg_fval_bigger:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #41
+; CHECK-NEXT: mov w8, #41 // =0x29
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: cinc w0, w8, ge
; CHECK-NEXT: ret
@@ -199,7 +197,7 @@ define <4 x i32> @add_lshr_not_vec_splat(<4 x i32> %x) {
define i32 @sub_lshr_not(i32 %x) {
; CHECK-LABEL: sub_lshr_not:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #42
+; CHECK-NEXT: mov w8, #42 // =0x2a
; CHECK-NEXT: bfxil w8, w0, #31, #1
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll
index 0ae09ebe91630..b80955665c74f 100644
--- a/llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -543,15 +543,14 @@ entry:
define <16 x i32> @same_zext_used_in_cmp_signed_pred_and_select_can_convert_to_unsigned_pred(<16 x i8> %a) {
; CHECK-LABEL: same_zext_used_in_cmp_signed_pred_and_select_can_convert_to_unsigned_pred:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: movi.2d v1, #0xffffffffffffffff
+; CHECK-NEXT: cmge.16b v1, v0, #0
; CHECK-NEXT: ushll.8h v2, v0, #0
-; CHECK-NEXT: ushll.4s v4, v2, #0
-; CHECK-NEXT: ushll2.4s v2, v2, #0
-; CHECK-NEXT: cmgt.16b v1, v0, v1
; CHECK-NEXT: ushll2.8h v0, v0, #0
; CHECK-NEXT: sshll.8h v3, v1, #0
; CHECK-NEXT: sshll2.8h v1, v1, #0
+; CHECK-NEXT: ushll.4s v4, v2, #0
; CHECK-NEXT: ushll.4s v5, v0, #0
+; CHECK-NEXT: ushll2.4s v2, v2, #0
; CHECK-NEXT: ushll2.4s v6, v0, #0
; CHECK-NEXT: sshll.4s v0, v3, #0
; CHECK-NEXT: sshll.4s v7, v1, #0
@@ -574,52 +573,51 @@ define void @extension_in_loop_v16i8_to_v16i32(ptr %src, ptr %dst) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh2:
; CHECK-NEXT: adrp x8, lCPI24_0@PAGE
-; CHECK-NEXT: movi.2d v0, #0xffffffffffffffff
; CHECK-NEXT: Lloh3:
-; CHECK-NEXT: adrp x9, lCPI24_2@PAGE
+; CHECK-NEXT: adrp x9, lCPI24_1@PAGE
; CHECK-NEXT: Lloh4:
-; CHECK-NEXT: ldr q1, [x8, lCPI24_0@PAGEOFF]
+; CHECK-NEXT: adrp x10, lCPI24_2@PAGE
; CHECK-NEXT: Lloh5:
-; CHECK-NEXT: adrp x8, lCPI24_1@PAGE
+; CHECK-NEXT: ldr q0, [x8, lCPI24_0@PAGEOFF]
; CHECK-NEXT: Lloh6:
-; CHECK-NEXT: adrp x10, lCPI24_3@PAGE
+; CHECK-NEXT: adrp x8, lCPI24_3@PAGE
; CHECK-NEXT: Lloh7:
-; CHECK-NEXT: ldr q2, [x8, lCPI24_1@PAGEOFF]
+; CHECK-NEXT: ldr q1, [x9, lCPI24_1@PAGEOFF]
; CHECK-NEXT: Lloh8:
-; CHECK-NEXT: ldr q3, [x9, lCPI24_2@PAGEOFF]
+; CHECK-NEXT: ldr q2, [x10, lCPI24_2@PAGEOFF]
; CHECK-NEXT: Lloh9:
-; CHECK-NEXT: ldr q4, [x10, lCPI24_3@PAGEOFF]
+; CHECK-NEXT: ldr q3, [x8, lCPI24_3@PAGEOFF]
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: LBB24_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q5, [x0, x8]
+; CHECK-NEXT: ldr q4, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
-; CHECK-NEXT: cmgt.16b v6, v5, v0
-; CHECK-NEXT: tbl.16b v16, { v5 }, v1
-; CHECK-NEXT: tbl.16b v17, { v5 }, v2
-; CHECK-NEXT: tbl.16b v19, { v5 }, v3
-; CHECK-NEXT: tbl.16b v5, { v5 }, v4
-; CHECK-NEXT: sshll2.8h v7, v6, #0
-; CHECK-NEXT: sshll.8h v6, v6, #0
-; CHECK-NEXT: sshll2.4s v18, v7, #0
-; CHECK-NEXT: sshll.4s v7, v7, #0
-; CHECK-NEXT: sshll2.4s v20, v6, #0
+; CHECK-NEXT: cmge.16b v5, v4, #0
+; CHECK-NEXT: tbl.16b v7, { v4 }, v0
+; CHECK-NEXT: tbl.16b v16, { v4 }, v1
+; CHECK-NEXT: tbl.16b v18, { v4 }, v2
+; CHECK-NEXT: tbl.16b v4, { v4 }, v3
+; CHECK-NEXT: sshll2.8h v6, v5, #0
+; CHECK-NEXT: sshll.8h v5, v5, #0
+; CHECK-NEXT: sshll2.4s v17, v6, #0
; CHECK-NEXT: sshll.4s v6, v6, #0
-; CHECK-NEXT: and.16b v16, v16, v18
-; CHECK-NEXT: and.16b v7, v17, v7
-; CHECK-NEXT: and.16b v17, v19, v20
-; CHECK-NEXT: and.16b v5, v5, v6
-; CHECK-NEXT: stp q7, q16, [x1, #32]
-; CHECK-NEXT: stp q5, q17, [x1], #64
+; CHECK-NEXT: sshll2.4s v19, v5, #0
+; CHECK-NEXT: sshll.4s v5, v5, #0
+; CHECK-NEXT: and.16b v7, v7, v17
+; CHECK-NEXT: and.16b v6, v16, v6
+; CHECK-NEXT: and.16b v16, v18, v19
+; CHECK-NEXT: and.16b v4, v4, v5
+; CHECK-NEXT: stp q6, q7, [x1, #32]
+; CHECK-NEXT: stp q4, q16, [x1], #64
; CHECK-NEXT: b.ne LBB24_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh9
-; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh7
-; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh8
-; CHECK-NEXT: .loh AdrpAdrp Lloh2, Lloh5
-; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh4
+; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh8
+; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7
+; CHECK-NEXT: .loh AdrpAdrp Lloh2, Lloh6
+; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh5
entry:
br label %loop
@@ -645,52 +643,51 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(ptr %src, ptr %dst) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh10:
; CHECK-NEXT: adrp x8, lCPI25_0@PAGE
-; CHECK-NEXT: movi.2d v0, #0xffffffffffffffff
; CHECK-NEXT: Lloh11:
-; CHECK-NEXT: adrp x9, lCPI25_2@PAGE
+; CHECK-NEXT: adrp x9, lCPI25_1@PAGE
; CHECK-NEXT: Lloh12:
-; CHECK-NEXT: ldr q1, [x8, lCPI25_0@PAGEOFF]
+; CHECK-NEXT: adrp x10, lCPI25_2@PAGE
; CHECK-NEXT: Lloh13:
-; CHECK-NEXT: adrp x8, lCPI25_1@PAGE
+; CHECK-NEXT: ldr q0, [x8, lCPI25_0@PAGEOFF]
; CHECK-NEXT: Lloh14:
-; CHECK-NEXT: adrp x10, lCPI25_3@PAGE
+; CHECK-NEXT: adrp x8, lCPI25_3@PAGE
; CHECK-NEXT: Lloh15:
-; CHECK-NEXT: ldr q2, [x8, lCPI25_1@PAGEOFF]
+; CHECK-NEXT: ldr q1, [x9, lCPI25_1@PAGEOFF]
; CHECK-NEXT: Lloh16:
-; CHECK-NEXT: ldr q3, [x9, lCPI25_2@PAGEOFF]
+; CHECK-NEXT: ldr q2, [x10, lCPI25_2@PAGEOFF]
; CHECK-NEXT: Lloh17:
-; CHECK-NEXT: ldr q4, [x10, lCPI25_3@PAGEOFF]
+; CHECK-NEXT: ldr q3, [x8, lCPI25_3@PAGEOFF]
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: LBB25_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q5, [x0, x8]
+; CHECK-NEXT: ldr q4, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
-; CHECK-NEXT: cmgt.16b v6, v5, v0
-; CHECK-NEXT: tbl.16b v16, { v5 }, v1
-; CHECK-NEXT: tbl.16b v17, { v5 }, v2
-; CHECK-NEXT: tbl.16b v19, { v5 }, v3
-; CHECK-NEXT: tbl.16b v5, { v5 }, v4
-; CHECK-NEXT: sshll2.8h v7, v6, #0
-; CHECK-NEXT: sshll.8h v6, v6, #0
-; CHECK-NEXT: sshll2.4s v18, v7, #0
-; CHECK-NEXT: sshll.4s v7, v7, #0
-; CHECK-NEXT: sshll2.4s v20, v6, #0
+; CHECK-NEXT: cmge.16b v5, v4, #0
+; CHECK-NEXT: tbl.16b v7, { v4 }, v0
+; CHECK-NEXT: tbl.16b v16, { v4 }, v1
+; CHECK-NEXT: tbl.16b v18, { v4 }, v2
+; CHECK-NEXT: tbl.16b v4, { v4 }, v3
+; CHECK-NEXT: sshll2.8h v6, v5, #0
+; CHECK-NEXT: sshll.8h v5, v5, #0
+; CHECK-NEXT: sshll2.4s v17, v6, #0
; CHECK-NEXT: sshll.4s v6, v6, #0
-; CHECK-NEXT: and.16b v16, v16, v18
-; CHECK-NEXT: and.16b v7, v17, v7
-; CHECK-NEXT: and.16b v17, v19, v20
-; CHECK-NEXT: and.16b v5, v5, v6
-; CHECK-NEXT: stp q7, q16, [x1, #32]
-; CHECK-NEXT: stp q5, q17, [x1], #64
+; CHECK-NEXT: sshll2.4s v19, v5, #0
+; CHECK-NEXT: sshll.4s v5, v5, #0
+; CHECK-NEXT: and.16b v7, v7, v17
+; CHECK-NEXT: and.16b v6, v16, v6
+; CHECK-NEXT: and.16b v16, v18, v19
+; CHECK-NEXT: and.16b v4, v4, v5
+; CHECK-NEXT: stp q6, q7, [x1, #32]
+; CHECK-NEXT: stp q4, q16, [x1], #64
; CHECK-NEXT: b.ne LBB25_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh17
-; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh15
-; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh16
-; CHECK-NEXT: .loh AdrpAdrp Lloh10, Lloh13
-; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh12
+; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh16
+; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15
+; CHECK-NEXT: .loh AdrpAdrp Lloh10, Lloh14
+; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh13
entry:
br label %loop
@@ -717,52 +714,51 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(ptr %src, ptr %dst) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh18:
; CHECK-NEXT: adrp x8, lCPI26_0@PAGE
-; CHECK-NEXT: movi.2d v0, #0xffffffffffffffff
; CHECK-NEXT: Lloh19:
-; CHECK-NEXT: adrp x9, lCPI26_2@PAGE
+; CHECK-NEXT: adrp x9, lCPI26_1@PAGE
; CHECK-NEXT: Lloh20:
-; CHECK-NEXT: ldr q1, [x8, lCPI26_0@PAGEOFF]
+; CHECK-NEXT: adrp x10, lCPI26_2@PAGE
; CHECK-NEXT: Lloh21:
-; CHECK-NEXT: adrp x8, lCPI26_1@PAGE
+; CHECK-NEXT: ldr q0, [x8, lCPI26_0@PAGEOFF]
; CHECK-NEXT: Lloh22:
-; CHECK-NEXT: adrp x10, lCPI26_3@PAGE
+; CHECK-NEXT: adrp x8, lCPI26_3@PAGE
; CHECK-NEXT: Lloh23:
-; CHECK-NEXT: ldr q2, [x8, lCPI26_1@PAGEOFF]
+; CHECK-NEXT: ldr q1, [x9, lCPI26_1@PAGEOFF]
; CHECK-NEXT: Lloh24:
-; CHECK-NEXT: ldr q3, [x9, lCPI26_2@PAGEOFF]
+; CHECK-NEXT: ldr q2, [x10, lCPI26_2@PAGEOFF]
; CHECK-NEXT: Lloh25:
-; CHECK-NEXT: ldr q4, [x10, lCPI26_3@PAGEOFF]
+; CHECK-NEXT: ldr q3, [x8, lCPI26_3@PAGEOFF]
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: LBB26_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q5, [x0, x8]
+; CHECK-NEXT: ldr q4, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
-; CHECK-NEXT: cmgt.16b v6, v5, v0
-; CHECK-NEXT: tbl.16b v16, { v5 }, v1
-; CHECK-NEXT: tbl.16b v17, { v5 }, v2
-; CHECK-NEXT: tbl.16b v19, { v5 }, v3
-; CHECK-NEXT: tbl.16b v5, { v5 }, v4
-; CHECK-NEXT: sshll2.8h v7, v6, #0
-; CHECK-NEXT: sshll.8h v6, v6, #0
-; CHECK-NEXT: sshll2.4s v18, v7, #0
-; CHECK-NEXT: sshll.4s v7, v7, #0
-; CHECK-NEXT: sshll2.4s v20, v6, #0
+; CHECK-NEXT: cmge.16b v5, v4, #0
+; CHECK-NEXT: tbl.16b v7, { v4 }, v0
+; CHECK-NEXT: tbl.16b v16, { v4 }, v1
+; CHECK-NEXT: tbl.16b v18, { v4 }, v2
+; CHECK-NEXT: tbl.16b v4, { v4 }, v3
+; CHECK-NEXT: sshll2.8h v6, v5, #0
+; CHECK-NEXT: sshll.8h v5, v5, #0
+; CHECK-NEXT: sshll2.4s v17, v6, #0
; CHECK-NEXT: sshll.4s v6, v6, #0
-; CHECK-NEXT: and.16b v16, v16, v18
-; CHECK-NEXT: and.16b v7, v17, v7
-; CHECK-NEXT: and.16b v17, v19, v20
-; CHECK-NEXT: and.16b v5, v5, v6
-; CHECK-NEXT: stp q7, q16, [x1, #32]
-; CHECK-NEXT: stp q5, q17, [x1], #64
+; CHECK-NEXT: sshll2.4s v19, v5, #0
+; CHECK-NEXT: sshll.4s v5, v5, #0
+; CHECK-NEXT: and.16b v7, v7, v17
+; CHECK-NEXT: and.16b v6, v16, v6
+; CHECK-NEXT: and.16b v16, v18, v19
+; CHECK-NEXT: and.16b v4, v4, v5
+; CHECK-NEXT: stp q6, q7, [x1, #32]
+; CHECK-NEXT: stp q4, q16, [x1], #64
; CHECK-NEXT: b.ne LBB26_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh22, Lloh25
-; CHECK-NEXT: .loh AdrpLdr Lloh21, Lloh23
-; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh24
-; CHECK-NEXT: .loh AdrpAdrp Lloh18, Lloh21
-; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh20
+; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh24
+; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23
+; CHECK-NEXT: .loh AdrpAdrp Lloh18, Lloh22
+; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh21
entry:
br label %loop
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds great. are you interested in adding slt 1 while you are here?
https://godbolt.org/z/oW8MKnKqf
Yeah, I thought about it.... I was afraid you might say this. ;) |
✅ With the latest revision this PR passed the C/C++ code formatter. |
Thanks @davemgreen for the suggestion, I have added that to the patch. |
This fixes a regression that occurred for a pattern of MOVI + CMGT instructions, which can be optimised to CMGE. I.e., when the signed greater-than compare has -1 as an operand, we can rewrite that as a compare greater than or equal to 0, which is what CMGE does. And similarly, for SLT 1 we will now generate CMLE. Fixes llvm#61836
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks pretty good to me!
Thanks guys! |
This fixes a regression that occurred for a pattern of MOVI + CMGT instructions, which can be optimised to CMGE. I.e., when the signed greater-than compare has -1 as an operand, we can rewrite that as a compare greater than or equal to 0, which is what CMGE does.
Fixes #61836