diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d16b11686e3c1..f7cdfd00d84ec 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22328,6 +22328,37 @@ static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift); } +// Attempt to combine the following patterns: +// SUB x, (CSET LO, (CMP a, b)) -> SBC x, 0, (CMP a, b) +// SUB (SUB x, y), (CSET LO, (CMP a, b)) -> SBC x, y, (CMP a, b) +// The CSET may be preceded by a ZEXT. +static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() != ISD::SUB) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse()) + N1 = N1.getOperand(0); + if (!N1.hasOneUse() || getCSETCondCode(N1) != AArch64CC::LO) + return SDValue(); + + SDValue Flags = N1.getOperand(3); + if (Flags.getOpcode() != AArch64ISD::SUBS) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + if (N0->getOpcode() != ISD::SUB) + return DAG.getNode(AArch64ISD::SBC, DL, VT, N0, DAG.getConstant(0, DL, VT), + Flags); + return DAG.getNode(AArch64ISD::SBC, DL, VT, N0.getOperand(0), + N0.getOperand(1), Flags); +} + static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Try to change sum of two reductions. @@ -22349,6 +22380,8 @@ static SDValue performAddSubCombine(SDNode *N, return Val; if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG)) return Val; + if (SDValue Val = performSubWithBorrowCombine(N, DCI.DAG)) + return Val; if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG)) return Val; diff --git a/llvm/test/CodeGen/AArch64/sbc.ll b/llvm/test/CodeGen/AArch64/sbc.ll new file mode 100644 index 0000000000000..fff63c1709218 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sbc.ll @@ -0,0 +1,392 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck --check-prefixes=CHECK,CHECK-SD %s +; RUN: llc < %s -global-isel | FileCheck --check-prefixes=CHECK,CHECK-GI %s + +target triple = "aarch64-none-linux-gnu" + +define i32 @test_basic_i32(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_basic_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_basic_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i64 @test_basic_i64(i64 %a, i64 %b, i64 %x, i64 %y) { +; CHECK-SD-LABEL: test_basic_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, x1 +; CHECK-SD-NEXT: sbc x0, x2, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_basic_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp x0, x1 +; CHECK-GI-NEXT: sub x9, x2, x3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub x0, x9, x8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i64 %a, %b + %carry = zext i1 %cc to i64 + %sub = sub i64 %x, %y + %res = sub i64 %sub, %carry + ret i64 %res +} + +define i64 @test_mixed_i32_i64(i32 %a, i32 %b, i64 %x, i64 %y) { +; CHECK-SD-LABEL: test_mixed_i32_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: 
cmp w0, w1 +; CHECK-SD-NEXT: sbc x0, x2, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_mixed_i32_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub x9, x2, x3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub x0, x9, x8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i64 + %sub = sub i64 %x, %y + %res = sub i64 %sub, %carry + ret i64 %res +} + +define i32 @test_mixed_i64_i32(i64 %a, i64 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_mixed_i64_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, x1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_mixed_i64_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp x0, x1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i64 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_only_borrow(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_only_borrow: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, wzr +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_only_borrow: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w2, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + ret i32 %res +} + +define i32 @test_sext_add(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_sext_add: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_sext_add: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sbfx w8, w8, #0, #1 +; CHECK-GI-NEXT: add w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = sext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = add i32 %sub, %carry + ret i32 %res +} + +; FIXME: This case could be supported with reversed operands to the CMP. 
+define i32 @test_ugt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_ugt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, hi +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_ugt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ugt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_unsupported_cc_slt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_unsupported_cc_slt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, lt +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsupported_cc_slt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp slt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_unsupported_cc_sgt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_unsupported_cc_sgt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, gt +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsupported_cc_sgt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp sgt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_multiple_setcc_uses(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_multiple_setcc_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: sub w19, w2, w0 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_setcc_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w19, w2 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: mov w0, w20 +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + tail call void @use(i1 %cc) + ret i32 %res +} + +define i32 @test_multiple_carry_uses(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_multiple_carry_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: sub w19, w2, w0 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_carry_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w19, w2 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: mov w0, w20 +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + tail call void @use(i32 %carry) + ret i32 %res +} + +define i32 @test_multiple_sub_uses(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_multiple_sub_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: mov w0, w8 +; CHECK-SD-NEXT: sbc w19, w2, w3 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_sub_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: sub w19, w2, w3 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + tail call void @use(i32 %sub) + ret i32 %res +} + +define i8 @test_i8(i8 %a, i8 %b, i8 %x, i8 %y) { +; CHECK-SD-LABEL: test_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: cmp w8, w1, uxtb +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cmp w8, w1, uxtb +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i8 %a, %b + %carry = zext i1 %cc to i8 + %sub = sub i8 %x, %y + %res = sub i8 %sub, %carry + ret i8 %res +} + +define i16 @test_i16(i16 %a, i16 %b, i16 %x, i16 %y) { +; CHECK-SD-LABEL: test_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: cmp w8, w1, uxth +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xffff +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cmp w8, w1, uxth +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i16 %a, %b + %carry = zext i1 %cc to i16 + %sub = sub i16 %x, %y + %res = sub i16 %sub, %carry + ret i16 %res +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) { +; CHECK-SD-LABEL: test_v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v4.4s, #1 +; CHECK-GI-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sub v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: ret + %cc = icmp ult <4 x i32> %a, %b + %carry = zext <4 x i1> %cc to <4 x i32> + %sub = sub <4 x i32> %x, %y + %res = sub <4 x i32> %sub, %carry + ret <4 x i32> %res +} + +declare void @use() +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}}
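
A minimal source-level illustration (not part of the patch) of the pattern the new performSubWithBorrowCombine targets. The function name below is hypothetical; the expected codegen is taken from the test_basic_i32 CHECK-SD lines above, where SelectionDAG now folds the cmp/cset/sub/sub sequence into cmp + sbc. Note that the combine requires the CSET (and any zero-extend of it) to have a single use, which is why tests such as test_multiple_setcc_uses and test_multiple_carry_uses still keep the explicit cset.

    // Hypothetical example, compiled for aarch64 at -O2.
    // Per the CHECK-SD lines in test_basic_i32, this should now lower to:
    //   cmp w0, w1
    //   sbc w0, w2, w3
    unsigned sub_with_borrow(unsigned a, unsigned b, unsigned x, unsigned y) {
      unsigned borrow = a < b;   // zext i1 to i32; matched as CSET(LO) of a SUBS
      return (x - y) - borrow;   // SUB (SUB x, y), borrow  ->  SBC x, y
    }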