Skip to content

Commit

Permalink
Recommit [AArch64] Optimize memcmp when the result is tested for [in]equality with 0
Browse files Browse the repository at this point in the history

Fixes the 1st issue of #58061
Fixes the crash of #58675

Reviewed By: dmgreen, efriedma
Differential Revision: https://reviews.llvm.org/D136244
  • Loading branch information
vfdff committed Oct 29, 2022
1 parent c4a8f9a commit 63a4638
Show file tree
Hide file tree
Showing 7 changed files with 246 additions and 117 deletions.
29 changes: 29 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Expand Up @@ -19490,6 +19490,35 @@ static SDValue performSETCCCombine(SDNode *N,
}
}

// Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
// cmp A0, A1; ccmp B0, B1, 0, eq; cset inv(Cond)
if (!DCI.isBeforeLegalize() && VT.isScalarInteger() &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
LHS->getOpcode() == ISD::OR &&
(LHS.getOperand(0)->getOpcode() == ISD::XOR &&
LHS.getOperand(1)->getOpcode() == ISD::XOR) &&
LHS.hasOneUse() && LHS.getOperand(0)->hasOneUse() &&
LHS.getOperand(1)->hasOneUse()) {
SDValue XOR0 = LHS.getOperand(0);
SDValue XOR1 = LHS.getOperand(1);
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC);
EVT TstVT = LHS->getValueType(0);
SDValue Cmp =
DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(TstVT, MVT::i32),
XOR0.getOperand(0), XOR0.getOperand(1));
SDValue Overflow = Cmp.getValue(1);
SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32);
SDValue CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR1.getOperand(0),
XOR1.getOperand(1), NZCVOp, CCVal, Overflow);
// Invert CSEL's operands.
SDValue TVal = DAG.getConstant(1, DL, VT);
SDValue FVal = DAG.getConstant(0, DL, VT);
AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond);
AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC);
return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal,
DAG.getConstant(InvCC, DL, MVT::i32), CCmp);
}

return SDValue();
}

Expand Down
132 changes: 68 additions & 64 deletions llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
Expand Up @@ -216,38 +216,40 @@ define i128 @test_rmw_add_128(i128* %dst) {
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB4_2 Depth 2
; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload
; NOLSE-NEXT: adds x14, x8, #1
; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload
; NOLSE-NEXT: adds x14, x13, #1
; NOLSE-NEXT: cinc x15, x11, hs
; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start
; NOLSE-NEXT: // Parent Loop BB4_1 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
; NOLSE-NEXT: ldaxp x10, x9, [x13]
; NOLSE-NEXT: cmp x10, x8
; NOLSE-NEXT: cset w12, ne
; NOLSE-NEXT: cmp x9, x11
; NOLSE-NEXT: cinc w12, w12, ne
; NOLSE-NEXT: cbnz w12, .LBB4_4
; NOLSE-NEXT: ldaxp x12, x8, [x10]
; NOLSE-NEXT: cmp x12, x13
; NOLSE-NEXT: cset w9, ne
; NOLSE-NEXT: cmp x8, x11
; NOLSE-NEXT: cinc w9, w9, ne
; NOLSE-NEXT: cbnz w9, .LBB4_4
; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
; NOLSE-NEXT: cbnz w12, .LBB4_2
; NOLSE-NEXT: stlxp w9, x14, x15, [x10]
; NOLSE-NEXT: cbnz w9, .LBB4_2
; NOLSE-NEXT: b .LBB4_5
; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
; NOLSE-NEXT: cbnz w12, .LBB4_2
; NOLSE-NEXT: stlxp w9, x12, x8, [x10]
; NOLSE-NEXT: cbnz w9, .LBB4_2
; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1
; NOLSE-NEXT: eor x11, x9, x11
; NOLSE-NEXT: eor x8, x10, x8
; NOLSE-NEXT: orr x8, x8, x11
; NOLSE-NEXT: mov x9, x8
; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
; NOLSE-NEXT: mov x10, x12
; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
; NOLSE-NEXT: subs x12, x12, x13
; NOLSE-NEXT: ccmp x8, x11, #0, eq
; NOLSE-NEXT: cset w8, ne
; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; NOLSE-NEXT: cbnz x8, .LBB4_1
; NOLSE-NEXT: tbnz w8, #0, .LBB4_1
; NOLSE-NEXT: b .LBB4_6
; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
Expand All @@ -267,26 +269,26 @@ define i128 @test_rmw_add_128(i128* %dst) {
; LSE-NEXT: b .LBB4_1
; LSE-NEXT: .LBB4_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
; LSE-NEXT: mov x0, x8
; LSE-NEXT: mov x1, x10
; LSE-NEXT: adds x2, x8, #1
; LSE-NEXT: cinc x11, x10, hs
; LSE-NEXT: mov x0, x11
; LSE-NEXT: mov x1, x8
; LSE-NEXT: adds x2, x11, #1
; LSE-NEXT: cinc x10, x8, hs
; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
; LSE-NEXT: mov x3, x11
; LSE-NEXT: mov x3, x10
; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
; LSE-NEXT: mov x9, x1
; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
; LSE-NEXT: eor x11, x9, x10
; LSE-NEXT: mov x10, x0
; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
; LSE-NEXT: eor x8, x10, x8
; LSE-NEXT: orr x8, x8, x11
; LSE-NEXT: subs x11, x10, x11
; LSE-NEXT: ccmp x9, x8, #0, eq
; LSE-NEXT: cset w8, ne
; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; LSE-NEXT: cbnz x8, .LBB4_1
; LSE-NEXT: tbnz w8, #0, .LBB4_1
; LSE-NEXT: b .LBB4_2
; LSE-NEXT: .LBB4_2: // %atomicrmw.end
; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
Expand Down Expand Up @@ -606,42 +608,44 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB9_2 Depth 2
; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload
; NOLSE-NEXT: mov w9, w8
; NOLSE-NEXT: mvn w10, w9
; NOLSE-NEXT: // implicit-def: $x9
; NOLSE-NEXT: mov w9, w10
; NOLSE-NEXT: orr x14, x9, #0xfffffffffffffffe
; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload
; NOLSE-NEXT: mov w8, w13
; NOLSE-NEXT: mvn w9, w8
; NOLSE-NEXT: // implicit-def: $x8
; NOLSE-NEXT: mov w8, w9
; NOLSE-NEXT: orr x14, x8, #0xfffffffffffffffe
; NOLSE-NEXT: mov x15, #-1
; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start
; NOLSE-NEXT: // Parent Loop BB9_1 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
; NOLSE-NEXT: ldaxp x10, x9, [x13]
; NOLSE-NEXT: cmp x10, x8
; NOLSE-NEXT: cset w12, ne
; NOLSE-NEXT: cmp x9, x11
; NOLSE-NEXT: cinc w12, w12, ne
; NOLSE-NEXT: cbnz w12, .LBB9_4
; NOLSE-NEXT: ldaxp x12, x8, [x10]
; NOLSE-NEXT: cmp x12, x13
; NOLSE-NEXT: cset w9, ne
; NOLSE-NEXT: cmp x8, x11
; NOLSE-NEXT: cinc w9, w9, ne
; NOLSE-NEXT: cbnz w9, .LBB9_4
; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
; NOLSE-NEXT: cbnz w12, .LBB9_2
; NOLSE-NEXT: stlxp w9, x14, x15, [x10]
; NOLSE-NEXT: cbnz w9, .LBB9_2
; NOLSE-NEXT: b .LBB9_5
; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
; NOLSE-NEXT: cbnz w12, .LBB9_2
; NOLSE-NEXT: stlxp w9, x12, x8, [x10]
; NOLSE-NEXT: cbnz w9, .LBB9_2
; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1
; NOLSE-NEXT: eor x11, x9, x11
; NOLSE-NEXT: eor x8, x10, x8
; NOLSE-NEXT: orr x8, x8, x11
; NOLSE-NEXT: mov x9, x8
; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
; NOLSE-NEXT: mov x10, x12
; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
; NOLSE-NEXT: subs x12, x12, x13
; NOLSE-NEXT: ccmp x8, x11, #0, eq
; NOLSE-NEXT: cset w8, ne
; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; NOLSE-NEXT: cbnz x8, .LBB9_1
; NOLSE-NEXT: tbnz w8, #0, .LBB9_1
; NOLSE-NEXT: b .LBB9_6
; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
Expand All @@ -661,30 +665,30 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; LSE-NEXT: b .LBB9_1
; LSE-NEXT: .LBB9_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
; LSE-NEXT: mov x0, x8
; LSE-NEXT: mov x1, x10
; LSE-NEXT: mov w11, w8
; LSE-NEXT: mvn w12, w11
; LSE-NEXT: // implicit-def: $x11
; LSE-NEXT: mov w11, w12
; LSE-NEXT: orr x2, x11, #0xfffffffffffffffe
; LSE-NEXT: mov x11, #-1
; LSE-NEXT: mov x0, x11
; LSE-NEXT: mov x1, x8
; LSE-NEXT: mov w10, w11
; LSE-NEXT: mvn w12, w10
; LSE-NEXT: // implicit-def: $x10
; LSE-NEXT: mov w10, w12
; LSE-NEXT: orr x2, x10, #0xfffffffffffffffe
; LSE-NEXT: mov x10, #-1
; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
; LSE-NEXT: mov x3, x11
; LSE-NEXT: mov x3, x10
; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
; LSE-NEXT: mov x9, x1
; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
; LSE-NEXT: eor x11, x9, x10
; LSE-NEXT: mov x10, x0
; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
; LSE-NEXT: eor x8, x10, x8
; LSE-NEXT: orr x8, x8, x11
; LSE-NEXT: subs x11, x10, x11
; LSE-NEXT: ccmp x9, x8, #0, eq
; LSE-NEXT: cset w8, ne
; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; LSE-NEXT: cbnz x8, .LBB9_1
; LSE-NEXT: tbnz w8, #0, .LBB9_1
; LSE-NEXT: b .LBB9_2
; LSE-NEXT: .LBB9_2: // %atomicrmw.end
; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
Expand Down
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
Expand Up @@ -12,10 +12,8 @@ define i1 @test_b2(i8* %s1, i8* %s2) {
; CHECKN-NEXT: ldr x9, [x1]
; CHECKN-NEXT: ldur x10, [x0, #7]
; CHECKN-NEXT: ldur x11, [x1, #7]
; CHECKN-NEXT: eor x8, x8, x9
; CHECKN-NEXT: eor x9, x10, x11
; CHECKN-NEXT: orr x8, x8, x9
; CHECKN-NEXT: cmp x8, #0
; CHECKN-NEXT: cmp x8, x9
; CHECKN-NEXT: ccmp x10, x11, #0, eq
; CHECKN-NEXT: cset w0, eq
; CHECKN-NEXT: ret
;
Expand Down Expand Up @@ -44,10 +42,8 @@ define i1 @test_b2_align8(i8* align 8 %s1, i8* align 8 %s2) {
; CHECKN-NEXT: ldr x9, [x1]
; CHECKN-NEXT: ldur x10, [x0, #7]
; CHECKN-NEXT: ldur x11, [x1, #7]
; CHECKN-NEXT: eor x8, x8, x9
; CHECKN-NEXT: eor x9, x10, x11
; CHECKN-NEXT: orr x8, x8, x9
; CHECKN-NEXT: cmp x8, #0
; CHECKN-NEXT: cmp x8, x9
; CHECKN-NEXT: ccmp x10, x11, #0, eq
; CHECKN-NEXT: cset w0, eq
; CHECKN-NEXT: ret
;
Expand Down
36 changes: 12 additions & 24 deletions llvm/test/CodeGen/AArch64/bcmp.ll
Expand Up @@ -113,10 +113,8 @@ define i1 @bcmp7(ptr %a, ptr %b) {
; CHECK-NEXT: ldr w9, [x1]
; CHECK-NEXT: ldur w10, [x0, #3]
; CHECK-NEXT: ldur w11, [x1, #3]
; CHECK-NEXT: eor w8, w8, w9
; CHECK-NEXT: eor w9, w10, w11
; CHECK-NEXT: orr w8, w8, w9
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: ccmp w10, w11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 7)
Expand Down Expand Up @@ -182,10 +180,8 @@ define i1 @bcmp11(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #3]
; CHECK-NEXT: ldur x11, [x1, #3]
; CHECK-NEXT: eor x8, x8, x9
; CHECK-NEXT: eor x9, x10, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 11)
Expand Down Expand Up @@ -218,10 +214,8 @@ define i1 @bcmp13(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #5]
; CHECK-NEXT: ldur x11, [x1, #5]
; CHECK-NEXT: eor x8, x8, x9
; CHECK-NEXT: eor x9, x10, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 13)
Expand All @@ -236,10 +230,8 @@ define i1 @bcmp14(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #6]
; CHECK-NEXT: ldur x11, [x1, #6]
; CHECK-NEXT: eor x8, x8, x9
; CHECK-NEXT: eor x9, x10, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 14)
Expand All @@ -254,10 +246,8 @@ define i1 @bcmp15(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #7]
; CHECK-NEXT: ldur x11, [x1, #7]
; CHECK-NEXT: eor x8, x8, x9
; CHECK-NEXT: eor x9, x10, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 15)
Expand All @@ -270,10 +260,8 @@ define i1 @bcmp16(ptr %a, ptr %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp x8, x9, [x0]
; CHECK-NEXT: ldp x10, x11, [x1]
; CHECK-NEXT: eor x8, x8, x10
; CHECK-NEXT: eor x9, x9, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cmp x8, x10
; CHECK-NEXT: ccmp x9, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 16)
Expand Down

0 comments on commit 63a4638

Please sign in to comment.