Skip to content

Commit

Permalink
Recommit [AArch64] Optimize memcmp when the result is tested for [in]equality with 0
Browse files Browse the repository at this point in the history

Fixes the 1st issue of #58061
Fixes the crash of #58675

Reviewed By: dmgreen, efriedma
Differential Revision: https://reviews.llvm.org/D136244
  • Loading branch information
vfdff committed Oct 29, 2022
1 parent c4a8f9a commit 63a4638
Show file tree
Hide file tree
Showing 7 changed files with 246 additions and 117 deletions.
29 changes: 29 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Expand Up @@ -19490,6 +19490,35 @@ static SDValue performSETCCCombine(SDNode *N,
}
}

// Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
// cmp A0, A1; ccmp B0, B1, 0, eq; cset inv(Cond)
if (!DCI.isBeforeLegalize() && VT.isScalarInteger() &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
LHS->getOpcode() == ISD::OR &&
(LHS.getOperand(0)->getOpcode() == ISD::XOR &&
LHS.getOperand(1)->getOpcode() == ISD::XOR) &&
LHS.hasOneUse() && LHS.getOperand(0)->hasOneUse() &&
LHS.getOperand(1)->hasOneUse()) {
SDValue XOR0 = LHS.getOperand(0);
SDValue XOR1 = LHS.getOperand(1);
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC);
EVT TstVT = LHS->getValueType(0);
SDValue Cmp =
DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(TstVT, MVT::i32),
XOR0.getOperand(0), XOR0.getOperand(1));
SDValue Overflow = Cmp.getValue(1);
SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32);
SDValue CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR1.getOperand(0),
XOR1.getOperand(1), NZCVOp, CCVal, Overflow);
// Invert CSEL's operands.
SDValue TVal = DAG.getConstant(1, DL, VT);
SDValue FVal = DAG.getConstant(0, DL, VT);
AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond);
AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC);
return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal,
DAG.getConstant(InvCC, DL, MVT::i32), CCmp);
}

return SDValue();
}

Expand Down
132 changes: 68 additions & 64 deletions llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
Expand Up @@ -216,38 +216,40 @@ define i128 @test_rmw_add_128(i128* %dst) {
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB4_2 Depth 2
; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload
; NOLSE-NEXT: adds x14, x8, #1
; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload
; NOLSE-NEXT: adds x14, x13, #1
; NOLSE-NEXT: cinc x15, x11, hs
; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start
; NOLSE-NEXT: // Parent Loop BB4_1 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
; NOLSE-NEXT: ldaxp x10, x9, [x13]
; NOLSE-NEXT: cmp x10, x8
; NOLSE-NEXT: cset w12, ne
; NOLSE-NEXT: cmp x9, x11
; NOLSE-NEXT: cinc w12, w12, ne
; NOLSE-NEXT: cbnz w12, .LBB4_4
; NOLSE-NEXT: ldaxp x12, x8, [x10]
; NOLSE-NEXT: cmp x12, x13
; NOLSE-NEXT: cset w9, ne
; NOLSE-NEXT: cmp x8, x11
; NOLSE-NEXT: cinc w9, w9, ne
; NOLSE-NEXT: cbnz w9, .LBB4_4
; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
; NOLSE-NEXT: cbnz w12, .LBB4_2
; NOLSE-NEXT: stlxp w9, x14, x15, [x10]
; NOLSE-NEXT: cbnz w9, .LBB4_2
; NOLSE-NEXT: b .LBB4_5
; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
; NOLSE-NEXT: cbnz w12, .LBB4_2
; NOLSE-NEXT: stlxp w9, x12, x8, [x10]
; NOLSE-NEXT: cbnz w9, .LBB4_2
; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1
; NOLSE-NEXT: eor x11, x9, x11
; NOLSE-NEXT: eor x8, x10, x8
; NOLSE-NEXT: orr x8, x8, x11
; NOLSE-NEXT: mov x9, x8
; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
; NOLSE-NEXT: mov x10, x12
; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
; NOLSE-NEXT: subs x12, x12, x13
; NOLSE-NEXT: ccmp x8, x11, #0, eq
; NOLSE-NEXT: cset w8, ne
; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; NOLSE-NEXT: cbnz x8, .LBB4_1
; NOLSE-NEXT: tbnz w8, #0, .LBB4_1
; NOLSE-NEXT: b .LBB4_6
; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
Expand All @@ -267,26 +269,26 @@ define i128 @test_rmw_add_128(i128* %dst) {
; LSE-NEXT: b .LBB4_1
; LSE-NEXT: .LBB4_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
; LSE-NEXT: mov x0, x8
; LSE-NEXT: mov x1, x10
; LSE-NEXT: adds x2, x8, #1
; LSE-NEXT: cinc x11, x10, hs
; LSE-NEXT: mov x0, x11
; LSE-NEXT: mov x1, x8
; LSE-NEXT: adds x2, x11, #1
; LSE-NEXT: cinc x10, x8, hs
; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
; LSE-NEXT: mov x3, x11
; LSE-NEXT: mov x3, x10
; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
; LSE-NEXT: mov x9, x1
; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
; LSE-NEXT: eor x11, x9, x10
; LSE-NEXT: mov x10, x0
; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
; LSE-NEXT: eor x8, x10, x8
; LSE-NEXT: orr x8, x8, x11
; LSE-NEXT: subs x11, x10, x11
; LSE-NEXT: ccmp x9, x8, #0, eq
; LSE-NEXT: cset w8, ne
; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; LSE-NEXT: cbnz x8, .LBB4_1
; LSE-NEXT: tbnz w8, #0, .LBB4_1
; LSE-NEXT: b .LBB4_2
; LSE-NEXT: .LBB4_2: // %atomicrmw.end
; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
Expand Down Expand Up @@ -606,42 +608,44 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB9_2 Depth 2
; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload
; NOLSE-NEXT: mov w9, w8
; NOLSE-NEXT: mvn w10, w9
; NOLSE-NEXT: // implicit-def: $x9
; NOLSE-NEXT: mov w9, w10
; NOLSE-NEXT: orr x14, x9, #0xfffffffffffffffe
; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload
; NOLSE-NEXT: mov w8, w13
; NOLSE-NEXT: mvn w9, w8
; NOLSE-NEXT: // implicit-def: $x8
; NOLSE-NEXT: mov w8, w9
; NOLSE-NEXT: orr x14, x8, #0xfffffffffffffffe
; NOLSE-NEXT: mov x15, #-1
; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start
; NOLSE-NEXT: // Parent Loop BB9_1 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
; NOLSE-NEXT: ldaxp x10, x9, [x13]
; NOLSE-NEXT: cmp x10, x8
; NOLSE-NEXT: cset w12, ne
; NOLSE-NEXT: cmp x9, x11
; NOLSE-NEXT: cinc w12, w12, ne
; NOLSE-NEXT: cbnz w12, .LBB9_4
; NOLSE-NEXT: ldaxp x12, x8, [x10]
; NOLSE-NEXT: cmp x12, x13
; NOLSE-NEXT: cset w9, ne
; NOLSE-NEXT: cmp x8, x11
; NOLSE-NEXT: cinc w9, w9, ne
; NOLSE-NEXT: cbnz w9, .LBB9_4
; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
; NOLSE-NEXT: cbnz w12, .LBB9_2
; NOLSE-NEXT: stlxp w9, x14, x15, [x10]
; NOLSE-NEXT: cbnz w9, .LBB9_2
; NOLSE-NEXT: b .LBB9_5
; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
; NOLSE-NEXT: cbnz w12, .LBB9_2
; NOLSE-NEXT: stlxp w9, x12, x8, [x10]
; NOLSE-NEXT: cbnz w9, .LBB9_2
; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1
; NOLSE-NEXT: eor x11, x9, x11
; NOLSE-NEXT: eor x8, x10, x8
; NOLSE-NEXT: orr x8, x8, x11
; NOLSE-NEXT: mov x9, x8
; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
; NOLSE-NEXT: mov x10, x12
; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
; NOLSE-NEXT: subs x12, x12, x13
; NOLSE-NEXT: ccmp x8, x11, #0, eq
; NOLSE-NEXT: cset w8, ne
; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; NOLSE-NEXT: cbnz x8, .LBB9_1
; NOLSE-NEXT: tbnz w8, #0, .LBB9_1
; NOLSE-NEXT: b .LBB9_6
; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
Expand All @@ -661,30 +665,30 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; LSE-NEXT: b .LBB9_1
; LSE-NEXT: .LBB9_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
; LSE-NEXT: mov x0, x8
; LSE-NEXT: mov x1, x10
; LSE-NEXT: mov w11, w8
; LSE-NEXT: mvn w12, w11
; LSE-NEXT: // implicit-def: $x11
; LSE-NEXT: mov w11, w12
; LSE-NEXT: orr x2, x11, #0xfffffffffffffffe
; LSE-NEXT: mov x11, #-1
; LSE-NEXT: mov x0, x11
; LSE-NEXT: mov x1, x8
; LSE-NEXT: mov w10, w11
; LSE-NEXT: mvn w12, w10
; LSE-NEXT: // implicit-def: $x10
; LSE-NEXT: mov w10, w12
; LSE-NEXT: orr x2, x10, #0xfffffffffffffffe
; LSE-NEXT: mov x10, #-1
; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
; LSE-NEXT: mov x3, x11
; LSE-NEXT: mov x3, x10
; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
; LSE-NEXT: mov x9, x1
; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
; LSE-NEXT: eor x11, x9, x10
; LSE-NEXT: mov x10, x0
; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
; LSE-NEXT: eor x8, x10, x8
; LSE-NEXT: orr x8, x8, x11
; LSE-NEXT: subs x11, x10, x11
; LSE-NEXT: ccmp x9, x8, #0, eq
; LSE-NEXT: cset w8, ne
; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; LSE-NEXT: cbnz x8, .LBB9_1
; LSE-NEXT: tbnz w8, #0, .LBB9_1
; LSE-NEXT: b .LBB9_2
; LSE-NEXT: .LBB9_2: // %atomicrmw.end
; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
Expand Down
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
Expand Up @@ -12,10 +12,8 @@ define i1 @test_b2(i8* %s1, i8* %s2) {
; CHECKN-NEXT: ldr x9, [x1]
; CHECKN-NEXT: ldur x10, [x0, #7]
; CHECKN-NEXT: ldur x11, [x1, #7]
; CHECKN-NEXT: eor x8, x8, x9
; CHECKN-NEXT: eor x9, x10, x11
; CHECKN-NEXT: orr x8, x8, x9
; CHECKN-NEXT: cmp x8, #0
; CHECKN-NEXT: cmp x8, x9
; CHECKN-NEXT: ccmp x10, x11, #0, eq
; CHECKN-NEXT: cset w0, eq
; CHECKN-NEXT: ret
;
Expand Down Expand Up @@ -44,10 +42,8 @@ define i1 @test_b2_align8(i8* align 8 %s1, i8* align 8 %s2) {
; CHECKN-NEXT: ldr x9, [x1]
; CHECKN-NEXT: ldur x10, [x0, #7]
; CHECKN-NEXT: ldur x11, [x1, #7]
; CHECKN-NEXT: eor x8, x8, x9
; CHECKN-NEXT: eor x9, x10, x11
; CHECKN-NEXT: orr x8, x8, x9
; CHECKN-NEXT: cmp x8, #0
; CHECKN-NEXT: cmp x8, x9
; CHECKN-NEXT: ccmp x10, x11, #0, eq
; CHECKN-NEXT: cset w0, eq
; CHECKN-NEXT: ret
;
Expand Down
36 changes: 12 additions & 24 deletions llvm/test/CodeGen/AArch64/bcmp.ll
Expand Up @@ -113,10 +113,8 @@ define i1 @bcmp7(ptr %a, ptr %b) {
; CHECK-NEXT: ldr w9, [x1]
; CHECK-NEXT: ldur w10, [x0, #3]
; CHECK-NEXT: ldur w11, [x1, #3]
; CHECK-NEXT: eor w8, w8, w9
; CHECK-NEXT: eor w9, w10, w11
; CHECK-NEXT: orr w8, w8, w9
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: ccmp w10, w11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 7)
Expand Down Expand Up @@ -182,10 +180,8 @@ define i1 @bcmp11(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #3]
; CHECK-NEXT: ldur x11, [x1, #3]
; CHECK-NEXT: eor x8, x8, x9
; CHECK-NEXT: eor x9, x10, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 11)
Expand Down Expand Up @@ -218,10 +214,8 @@ define i1 @bcmp13(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #5]
; CHECK-NEXT: ldur x11, [x1, #5]
; CHECK-NEXT: eor x8, x8, x9
; CHECK-NEXT: eor x9, x10, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 13)
Expand All @@ -236,10 +230,8 @@ define i1 @bcmp14(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #6]
; CHECK-NEXT: ldur x11, [x1, #6]
; CHECK-NEXT: eor x8, x8, x9
; CHECK-NEXT: eor x9, x10, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 14)
Expand All @@ -254,10 +246,8 @@ define i1 @bcmp15(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #7]
; CHECK-NEXT: ldur x11, [x1, #7]
; CHECK-NEXT: eor x8, x8, x9
; CHECK-NEXT: eor x9, x10, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 15)
Expand All @@ -270,10 +260,8 @@ define i1 @bcmp16(ptr %a, ptr %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp x8, x9, [x0]
; CHECK-NEXT: ldp x10, x11, [x1]
; CHECK-NEXT: eor x8, x8, x10
; CHECK-NEXT: eor x9, x9, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cmp x8, x10
; CHECK-NEXT: ccmp x9, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 16)
Expand Down

0 comments on commit 63a4638

Please sign in to comment.