Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AArch64] Optimise MOVI + CMGT to CMGE #74499

Merged
merged 1 commit into from
Dec 7, 2023
Merged

Conversation

sjoerdmeijer
Copy link
Collaborator

This fixes a regression that occurred for a pattern of MOVI + CMGT instructions, which can be optimised to CMGE. I.e., when the signed greater-than compare has -1 as an operand, we can rewrite that as a compare greater than or equal to 0, which is what CMGE does.

Fixes #61836

@llvmbot
Copy link
Collaborator

llvmbot commented Dec 5, 2023

@llvm/pr-subscribers-backend-aarch64

Author: Sjoerd Meijer (sjoerdmeijer)

Changes

This fixes a regression that occurred for a pattern of MOVI + CMGT instructions, which can be optimised to CMGE. I.e., when the signed greater-than compare has -1 as an operand, we can rewrite that as a compare greater than or equal to 0, which is what CMGE does.

Fixes #61836


Full diff: https://github.com/llvm/llvm-project/pull/74499.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+3)
  • (modified) llvm/test/CodeGen/AArch64/cmp-select-sign.ll (+6-8)
  • (modified) llvm/test/CodeGen/AArch64/signbit-shift.ll (+11-13)
  • (modified) llvm/test/CodeGen/AArch64/vselect-ext.ll (+87-91)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f36607b03e76f..01c1a9660eb0a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13710,6 +13710,7 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
   APInt UndefBits(VT.getSizeInBits(), 0);
   bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
   bool IsZero = IsCnst && (CnstBits == 0);
+  bool IsMinusOne = IsCnst && CnstBits.isAllOnes();
 
   if (SrcVT.getVectorElementType().isFloatingPoint()) {
     switch (CC) {
@@ -13778,6 +13779,8 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
   case AArch64CC::GT:
     if (IsZero)
       return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
+    if (IsMinusOne)
+      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
     return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
   case AArch64CC::LE:
     if (IsZero)
diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
index d16b5786a9965..09a6e26fe5a40 100644
--- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
@@ -176,12 +176,11 @@ define <4 x i32> @sign_4xi32_multi_use(<4 x i32> %a) {
 ; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT:    cmlt v2.4s, v0.4s, #0
-; CHECK-NEXT:    orr v2.4s, #1
-; CHECK-NEXT:    cmgt v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    xtn v0.4h, v1.4s
+; CHECK-NEXT:    cmlt v1.4s, v0.4s, #0
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    orr v1.4s, #1
+; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl use_4xi1
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
@@ -227,9 +226,8 @@ define <4 x i32> @not_sign_4xi32_2(<4 x i32> %a) {
 define <4 x i32> @not_sign_4xi32_3(<4 x i32> %a) {
 ; CHECK-LABEL: not_sign_4xi32_3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
 ; CHECK-NEXT:    adrp x8, .LCPI18_0
-; CHECK-NEXT:    cmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
 ; CHECK-NEXT:    bic v1.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
diff --git a/llvm/test/CodeGen/AArch64/signbit-shift.ll b/llvm/test/CodeGen/AArch64/signbit-shift.ll
index cb758f8a6202b..253ea1cab91fb 100644
--- a/llvm/test/CodeGen/AArch64/signbit-shift.ll
+++ b/llvm/test/CodeGen/AArch64/signbit-shift.ll
@@ -29,10 +29,9 @@ define i32 @add_zext_ifpos(i32 %x) {
 define <4 x i32> @add_zext_ifpos_vec_splat(<4 x i32> %x) {
 ; CHECK-LABEL: add_zext_ifpos_vec_splat:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT:    movi v2.4s, #41
-; CHECK-NEXT:    cmgt v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    sub v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    movi v1.4s, #41
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    ret
   %c = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
   %e = zext <4 x i1> %c to <4 x i32>
@@ -43,7 +42,7 @@ define <4 x i32> @add_zext_ifpos_vec_splat(<4 x i32> %x) {
 define i32 @sel_ifpos_tval_bigger(i32 %x) {
 ; CHECK-LABEL: sel_ifpos_tval_bigger:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #41
+; CHECK-NEXT:    mov w8, #41 // =0x29
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    cinc w0, w8, ge
 ; CHECK-NEXT:    ret
@@ -78,10 +77,9 @@ define i32 @add_sext_ifpos(i32 %x) {
 define <4 x i32> @add_sext_ifpos_vec_splat(<4 x i32> %x) {
 ; CHECK-LABEL: add_sext_ifpos_vec_splat:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT:    movi v2.4s, #42
-; CHECK-NEXT:    cmgt v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    movi v1.4s, #42
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %c = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
   %e = sext <4 x i1> %c to <4 x i32>
@@ -92,7 +90,7 @@ define <4 x i32> @add_sext_ifpos_vec_splat(<4 x i32> %x) {
 define i32 @sel_ifpos_fval_bigger(i32 %x) {
 ; CHECK-LABEL: sel_ifpos_fval_bigger:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #41
+; CHECK-NEXT:    mov w8, #41 // =0x29
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    cinc w0, w8, lt
 ; CHECK-NEXT:    ret
@@ -128,7 +126,7 @@ define i32 @add_zext_ifneg(i32 %x) {
 define i32 @sel_ifneg_tval_bigger(i32 %x) {
 ; CHECK-LABEL: sel_ifneg_tval_bigger:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #41
+; CHECK-NEXT:    mov w8, #41 // =0x29
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    cinc w0, w8, lt
 ; CHECK-NEXT:    ret
@@ -162,7 +160,7 @@ define i32 @add_sext_ifneg(i32 %x) {
 define i32 @sel_ifneg_fval_bigger(i32 %x) {
 ; CHECK-LABEL: sel_ifneg_fval_bigger:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #41
+; CHECK-NEXT:    mov w8, #41 // =0x29
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    cinc w0, w8, ge
 ; CHECK-NEXT:    ret
@@ -199,7 +197,7 @@ define <4 x i32> @add_lshr_not_vec_splat(<4 x i32> %x) {
 define i32 @sub_lshr_not(i32 %x) {
 ; CHECK-LABEL: sub_lshr_not:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    bfxil w8, w0, #31, #1
 ; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll
index 0ae09ebe91630..b80955665c74f 100644
--- a/llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -543,15 +543,14 @@ entry:
 define <16 x i32> @same_zext_used_in_cmp_signed_pred_and_select_can_convert_to_unsigned_pred(<16 x i8> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_signed_pred_and_select_can_convert_to_unsigned_pred:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    movi.2d v1, #0xffffffffffffffff
+; CHECK-NEXT:    cmge.16b v1, v0, #0
 ; CHECK-NEXT:    ushll.8h v2, v0, #0
-; CHECK-NEXT:    ushll.4s v4, v2, #0
-; CHECK-NEXT:    ushll2.4s v2, v2, #0
-; CHECK-NEXT:    cmgt.16b v1, v0, v1
 ; CHECK-NEXT:    ushll2.8h v0, v0, #0
 ; CHECK-NEXT:    sshll.8h v3, v1, #0
 ; CHECK-NEXT:    sshll2.8h v1, v1, #0
+; CHECK-NEXT:    ushll.4s v4, v2, #0
 ; CHECK-NEXT:    ushll.4s v5, v0, #0
+; CHECK-NEXT:    ushll2.4s v2, v2, #0
 ; CHECK-NEXT:    ushll2.4s v6, v0, #0
 ; CHECK-NEXT:    sshll.4s v0, v3, #0
 ; CHECK-NEXT:    sshll.4s v7, v1, #0
@@ -574,52 +573,51 @@ define void @extension_in_loop_v16i8_to_v16i32(ptr %src, ptr %dst) {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh2:
 ; CHECK-NEXT:    adrp x8, lCPI24_0@PAGE
-; CHECK-NEXT:    movi.2d v0, #0xffffffffffffffff
 ; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:    adrp x9, lCPI24_2@PAGE
+; CHECK-NEXT:    adrp x9, lCPI24_1@PAGE
 ; CHECK-NEXT:  Lloh4:
-; CHECK-NEXT:    ldr q1, [x8, lCPI24_0@PAGEOFF]
+; CHECK-NEXT:    adrp x10, lCPI24_2@PAGE
 ; CHECK-NEXT:  Lloh5:
-; CHECK-NEXT:    adrp x8, lCPI24_1@PAGE
+; CHECK-NEXT:    ldr q0, [x8, lCPI24_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh6:
-; CHECK-NEXT:    adrp x10, lCPI24_3@PAGE
+; CHECK-NEXT:    adrp x8, lCPI24_3@PAGE
 ; CHECK-NEXT:  Lloh7:
-; CHECK-NEXT:    ldr q2, [x8, lCPI24_1@PAGEOFF]
+; CHECK-NEXT:    ldr q1, [x9, lCPI24_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh8:
-; CHECK-NEXT:    ldr q3, [x9, lCPI24_2@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x10, lCPI24_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh9:
-; CHECK-NEXT:    ldr q4, [x10, lCPI24_3@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x8, lCPI24_3@PAGEOFF]
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB24_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q5, [x0, x8]
+; CHECK-NEXT:    ldr q4, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    cmgt.16b v6, v5, v0
-; CHECK-NEXT:    tbl.16b v16, { v5 }, v1
-; CHECK-NEXT:    tbl.16b v17, { v5 }, v2
-; CHECK-NEXT:    tbl.16b v19, { v5 }, v3
-; CHECK-NEXT:    tbl.16b v5, { v5 }, v4
-; CHECK-NEXT:    sshll2.8h v7, v6, #0
-; CHECK-NEXT:    sshll.8h v6, v6, #0
-; CHECK-NEXT:    sshll2.4s v18, v7, #0
-; CHECK-NEXT:    sshll.4s v7, v7, #0
-; CHECK-NEXT:    sshll2.4s v20, v6, #0
+; CHECK-NEXT:    cmge.16b v5, v4, #0
+; CHECK-NEXT:    tbl.16b v7, { v4 }, v0
+; CHECK-NEXT:    tbl.16b v16, { v4 }, v1
+; CHECK-NEXT:    tbl.16b v18, { v4 }, v2
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
+; CHECK-NEXT:    sshll2.8h v6, v5, #0
+; CHECK-NEXT:    sshll.8h v5, v5, #0
+; CHECK-NEXT:    sshll2.4s v17, v6, #0
 ; CHECK-NEXT:    sshll.4s v6, v6, #0
-; CHECK-NEXT:    and.16b v16, v16, v18
-; CHECK-NEXT:    and.16b v7, v17, v7
-; CHECK-NEXT:    and.16b v17, v19, v20
-; CHECK-NEXT:    and.16b v5, v5, v6
-; CHECK-NEXT:    stp q7, q16, [x1, #32]
-; CHECK-NEXT:    stp q5, q17, [x1], #64
+; CHECK-NEXT:    sshll2.4s v19, v5, #0
+; CHECK-NEXT:    sshll.4s v5, v5, #0
+; CHECK-NEXT:    and.16b v7, v7, v17
+; CHECK-NEXT:    and.16b v6, v16, v6
+; CHECK-NEXT:    and.16b v16, v18, v19
+; CHECK-NEXT:    and.16b v4, v4, v5
+; CHECK-NEXT:    stp q6, q7, [x1, #32]
+; CHECK-NEXT:    stp q4, q16, [x1], #64
 ; CHECK-NEXT:    b.ne LBB24_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh9
-; CHECK-NEXT:    .loh AdrpLdr Lloh5, Lloh7
-; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh8
-; CHECK-NEXT:    .loh AdrpAdrp Lloh2, Lloh5
-; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh4
+; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh8
+; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh7
+; CHECK-NEXT:    .loh AdrpAdrp Lloh2, Lloh6
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh5
 entry:
   br label %loop
 
@@ -645,52 +643,51 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(ptr %src, ptr %dst) {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh10:
 ; CHECK-NEXT:    adrp x8, lCPI25_0@PAGE
-; CHECK-NEXT:    movi.2d v0, #0xffffffffffffffff
 ; CHECK-NEXT:  Lloh11:
-; CHECK-NEXT:    adrp x9, lCPI25_2@PAGE
+; CHECK-NEXT:    adrp x9, lCPI25_1@PAGE
 ; CHECK-NEXT:  Lloh12:
-; CHECK-NEXT:    ldr q1, [x8, lCPI25_0@PAGEOFF]
+; CHECK-NEXT:    adrp x10, lCPI25_2@PAGE
 ; CHECK-NEXT:  Lloh13:
-; CHECK-NEXT:    adrp x8, lCPI25_1@PAGE
+; CHECK-NEXT:    ldr q0, [x8, lCPI25_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh14:
-; CHECK-NEXT:    adrp x10, lCPI25_3@PAGE
+; CHECK-NEXT:    adrp x8, lCPI25_3@PAGE
 ; CHECK-NEXT:  Lloh15:
-; CHECK-NEXT:    ldr q2, [x8, lCPI25_1@PAGEOFF]
+; CHECK-NEXT:    ldr q1, [x9, lCPI25_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh16:
-; CHECK-NEXT:    ldr q3, [x9, lCPI25_2@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x10, lCPI25_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh17:
-; CHECK-NEXT:    ldr q4, [x10, lCPI25_3@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x8, lCPI25_3@PAGEOFF]
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB25_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q5, [x0, x8]
+; CHECK-NEXT:    ldr q4, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    cmgt.16b v6, v5, v0
-; CHECK-NEXT:    tbl.16b v16, { v5 }, v1
-; CHECK-NEXT:    tbl.16b v17, { v5 }, v2
-; CHECK-NEXT:    tbl.16b v19, { v5 }, v3
-; CHECK-NEXT:    tbl.16b v5, { v5 }, v4
-; CHECK-NEXT:    sshll2.8h v7, v6, #0
-; CHECK-NEXT:    sshll.8h v6, v6, #0
-; CHECK-NEXT:    sshll2.4s v18, v7, #0
-; CHECK-NEXT:    sshll.4s v7, v7, #0
-; CHECK-NEXT:    sshll2.4s v20, v6, #0
+; CHECK-NEXT:    cmge.16b v5, v4, #0
+; CHECK-NEXT:    tbl.16b v7, { v4 }, v0
+; CHECK-NEXT:    tbl.16b v16, { v4 }, v1
+; CHECK-NEXT:    tbl.16b v18, { v4 }, v2
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
+; CHECK-NEXT:    sshll2.8h v6, v5, #0
+; CHECK-NEXT:    sshll.8h v5, v5, #0
+; CHECK-NEXT:    sshll2.4s v17, v6, #0
 ; CHECK-NEXT:    sshll.4s v6, v6, #0
-; CHECK-NEXT:    and.16b v16, v16, v18
-; CHECK-NEXT:    and.16b v7, v17, v7
-; CHECK-NEXT:    and.16b v17, v19, v20
-; CHECK-NEXT:    and.16b v5, v5, v6
-; CHECK-NEXT:    stp q7, q16, [x1, #32]
-; CHECK-NEXT:    stp q5, q17, [x1], #64
+; CHECK-NEXT:    sshll2.4s v19, v5, #0
+; CHECK-NEXT:    sshll.4s v5, v5, #0
+; CHECK-NEXT:    and.16b v7, v7, v17
+; CHECK-NEXT:    and.16b v6, v16, v6
+; CHECK-NEXT:    and.16b v16, v18, v19
+; CHECK-NEXT:    and.16b v4, v4, v5
+; CHECK-NEXT:    stp q6, q7, [x1, #32]
+; CHECK-NEXT:    stp q4, q16, [x1], #64
 ; CHECK-NEXT:    b.ne LBB25_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh17
-; CHECK-NEXT:    .loh AdrpLdr Lloh13, Lloh15
-; CHECK-NEXT:    .loh AdrpLdr Lloh11, Lloh16
-; CHECK-NEXT:    .loh AdrpAdrp Lloh10, Lloh13
-; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh12
+; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh16
+; CHECK-NEXT:    .loh AdrpLdr Lloh11, Lloh15
+; CHECK-NEXT:    .loh AdrpAdrp Lloh10, Lloh14
+; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh13
 entry:
   br label %loop
 
@@ -717,52 +714,51 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(ptr %src, ptr %dst) {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh18:
 ; CHECK-NEXT:    adrp x8, lCPI26_0@PAGE
-; CHECK-NEXT:    movi.2d v0, #0xffffffffffffffff
 ; CHECK-NEXT:  Lloh19:
-; CHECK-NEXT:    adrp x9, lCPI26_2@PAGE
+; CHECK-NEXT:    adrp x9, lCPI26_1@PAGE
 ; CHECK-NEXT:  Lloh20:
-; CHECK-NEXT:    ldr q1, [x8, lCPI26_0@PAGEOFF]
+; CHECK-NEXT:    adrp x10, lCPI26_2@PAGE
 ; CHECK-NEXT:  Lloh21:
-; CHECK-NEXT:    adrp x8, lCPI26_1@PAGE
+; CHECK-NEXT:    ldr q0, [x8, lCPI26_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh22:
-; CHECK-NEXT:    adrp x10, lCPI26_3@PAGE
+; CHECK-NEXT:    adrp x8, lCPI26_3@PAGE
 ; CHECK-NEXT:  Lloh23:
-; CHECK-NEXT:    ldr q2, [x8, lCPI26_1@PAGEOFF]
+; CHECK-NEXT:    ldr q1, [x9, lCPI26_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh24:
-; CHECK-NEXT:    ldr q3, [x9, lCPI26_2@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x10, lCPI26_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh25:
-; CHECK-NEXT:    ldr q4, [x10, lCPI26_3@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x8, lCPI26_3@PAGEOFF]
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB26_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q5, [x0, x8]
+; CHECK-NEXT:    ldr q4, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    cmgt.16b v6, v5, v0
-; CHECK-NEXT:    tbl.16b v16, { v5 }, v1
-; CHECK-NEXT:    tbl.16b v17, { v5 }, v2
-; CHECK-NEXT:    tbl.16b v19, { v5 }, v3
-; CHECK-NEXT:    tbl.16b v5, { v5 }, v4
-; CHECK-NEXT:    sshll2.8h v7, v6, #0
-; CHECK-NEXT:    sshll.8h v6, v6, #0
-; CHECK-NEXT:    sshll2.4s v18, v7, #0
-; CHECK-NEXT:    sshll.4s v7, v7, #0
-; CHECK-NEXT:    sshll2.4s v20, v6, #0
+; CHECK-NEXT:    cmge.16b v5, v4, #0
+; CHECK-NEXT:    tbl.16b v7, { v4 }, v0
+; CHECK-NEXT:    tbl.16b v16, { v4 }, v1
+; CHECK-NEXT:    tbl.16b v18, { v4 }, v2
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
+; CHECK-NEXT:    sshll2.8h v6, v5, #0
+; CHECK-NEXT:    sshll.8h v5, v5, #0
+; CHECK-NEXT:    sshll2.4s v17, v6, #0
 ; CHECK-NEXT:    sshll.4s v6, v6, #0
-; CHECK-NEXT:    and.16b v16, v16, v18
-; CHECK-NEXT:    and.16b v7, v17, v7
-; CHECK-NEXT:    and.16b v17, v19, v20
-; CHECK-NEXT:    and.16b v5, v5, v6
-; CHECK-NEXT:    stp q7, q16, [x1, #32]
-; CHECK-NEXT:    stp q5, q17, [x1], #64
+; CHECK-NEXT:    sshll2.4s v19, v5, #0
+; CHECK-NEXT:    sshll.4s v5, v5, #0
+; CHECK-NEXT:    and.16b v7, v7, v17
+; CHECK-NEXT:    and.16b v6, v16, v6
+; CHECK-NEXT:    and.16b v16, v18, v19
+; CHECK-NEXT:    and.16b v4, v4, v5
+; CHECK-NEXT:    stp q6, q7, [x1, #32]
+; CHECK-NEXT:    stp q4, q16, [x1], #64
 ; CHECK-NEXT:    b.ne LBB26_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh25
-; CHECK-NEXT:    .loh AdrpLdr Lloh21, Lloh23
-; CHECK-NEXT:    .loh AdrpLdr Lloh19, Lloh24
-; CHECK-NEXT:    .loh AdrpAdrp Lloh18, Lloh21
-; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh20
+; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh24
+; CHECK-NEXT:    .loh AdrpLdr Lloh19, Lloh23
+; CHECK-NEXT:    .loh AdrpAdrp Lloh18, Lloh22
+; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh21
 entry:
   br label %loop
 

Copy link
Collaborator

@davemgreen davemgreen left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds great. are you interested in adding slt 1 while you are here?
https://godbolt.org/z/oW8MKnKqf

@sjoerdmeijer
Copy link
Collaborator Author

Sounds great. are you interested in adding slt 1 while you are here?
https://godbolt.org/z/oW8MKnKqf

Yeah, I thought about it.... I was afraid you might say this. ;)
But sure, let me have a look at that too.

Copy link

github-actions bot commented Dec 6, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

@sjoerdmeijer
Copy link
Collaborator Author

Thanks @davemgreen for the suggestion, I have added that to the patch.

This fixes a regression that occurred for a pattern of MOVI + CMGT
instructions, which can be optimised to CMGE. I.e., when the signed
greater-than compare has -1 as an operand, we can rewrite that as a
compare greater than or equal to 0, which is what CMGE does. And
similarly, for SLT 1 we will now generate CMLE.

Fixes llvm#61836
Copy link
Collaborator

@SamTebbs33 SamTebbs33 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks pretty good to me!

@sjoerdmeijer
Copy link
Collaborator Author

Thanks guys!

@sjoerdmeijer sjoerdmeijer merged commit 3acbd38 into llvm:main Dec 7, 2023
4 checks passed
@sjoerdmeijer sjoerdmeijer deleted the cmge branch December 7, 2023 08:32
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

[AArch64] LLVM 16 regression: poor codegen for vcgez intrinsics
4 participants