diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4dfc400fbbb6c..410f20edc6281 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57617,10 +57617,10 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
   }
 
   // Fold any similar generic ADD/SUB opcodes to reuse this node.
-  auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
+  auto MatchGeneric = [&](unsigned Opc, SDValue N0, SDValue N1, bool Negate) {
     SDValue Ops[] = {N0, N1};
     SDVTList VTs = DAG.getVTList(N->getValueType(0));
-    if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
+    if (SDNode *GenericAddSub = DAG.getNodeIfExists(Opc, VTs, Ops)) {
       SDValue Op(N, 0);
       if (Negate) {
         // Bail if this is only used by a user of the x86 add/sub.
@@ -57632,8 +57632,25 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
       DCI.CombineTo(GenericAddSub, Op);
     }
   };
-  MatchGeneric(LHS, RHS, false);
-  MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
+  MatchGeneric(GenericOpc, LHS, RHS, false);
+  MatchGeneric(GenericOpc, RHS, LHS, X86ISD::SUB == N->getOpcode());
+
+  if (auto *Const = dyn_cast<ConstantSDNode>(RHS)) {
+    SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
+    if (X86ISD::SUB == N->getOpcode()) {
+      // Fold generic add(LHS, -C) to X86ISD::SUB(LHS, C).
+      MatchGeneric(ISD::ADD, LHS, NegC, false);
+    } else {
+      // Negate X86ISD::ADD(LHS, C) and replace generic sub(-C, LHS).
+      MatchGeneric(ISD::SUB, NegC, LHS, true);
+    }
+  } else if (auto *Const = dyn_cast<ConstantSDNode>(LHS)) {
+    if (X86ISD::SUB == N->getOpcode()) {
+      SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
+      // Negate X86ISD::SUB(C, RHS) and replace generic add(RHS, -C).
+      MatchGeneric(ISD::ADD, RHS, NegC, true);
+    }
+  }
 
   // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
   // EFLAGS result doesn't change.
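Note on the new constant cases: they let a generic node reuse an existing flag-producing x86 node when the two differ only by a negated constant. A minimal IR sketch of the first case, distilled from the sbb_merge_add1 test below (the function name @sketch is illustrative, not from the patch):

    declare { i8, i32 } @llvm.x86.subborrow.32(i8, i32, i32)

    define i32 @sketch(i32 %a0) {
      ; Lowers to X86ISD::SUB(%a0, 42), kept alive by its borrow flag.
      %sbb = call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 %a0, i32 42)
      %diff = extractvalue { i8, i32 } %sbb, 1
      ; The generic ISD::ADD of the negated constant computes the same value,
      ; so MatchGeneric(ISD::ADD, LHS, NegC, false) can CombineTo the node above
      ; instead of emitting a second subtraction.
      %add = add i32 %a0, -42
      %result = xor i32 %diff, %add
      ret i32 %result
    }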
diff --git a/llvm/test/CodeGen/X86/combine-adc.ll b/llvm/test/CodeGen/X86/combine-adc.ll
index 22417363f1093..a2aaea31aa6ff 100644
--- a/llvm/test/CodeGen/X86/combine-adc.ll
+++ b/llvm/test/CodeGen/X86/combine-adc.ll
@@ -89,4 +89,52 @@ define i32 @adc_merge_constants(i32 %a0) nounwind {
   ret i32 %sum
 }
 
+define i32 @adc_merge_sub(i32 %a0) nounwind {
+; X86-LABEL: adc_merge_sub:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    addl $42, %edi
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll use@PLT
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    retl
+;
+; X64-LABEL: adc_merge_sub:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl %edi, %ebx
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    addl $42, %ebx
+; X64-NEXT:    setb %dil
+; X64-NEXT:    movl %ebx, %ebp
+; X64-NEXT:    negl %ebp
+; X64-NEXT:    callq use@PLT
+; X64-NEXT:    xorl %ebx, %ebp
+; X64-NEXT:    movl %ebp, %eax
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+  %adc = tail call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %a0, i32 42)
+  %carry = extractvalue { i8, i32 } %adc, 0
+  call void @use(i8 %carry)
+  %sum = extractvalue { i8, i32 } %adc, 1
+  %sub = sub i32 -42, %a0
+  %result = xor i32 %sum, %sub
+  ret i32 %result
+}
+
 declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
+declare void @use(i8)
diff --git a/llvm/test/CodeGen/X86/combine-sbb.ll b/llvm/test/CodeGen/X86/combine-sbb.ll
index 89aee965a2c1f..62744d4f3050a 100644
--- a/llvm/test/CodeGen/X86/combine-sbb.ll
+++ b/llvm/test/CodeGen/X86/combine-sbb.ll
@@ -333,4 +333,85 @@ define i32 @PR40483_sub6(ptr, i32) nounwind {
   ret i32 %10
 }
 
+define i32 @sbb_merge_add1(i32 %a0) nounwind {
+; X86-LABEL: sbb_merge_add1:
+; X86:       # %bb.0:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $42, {{[0-9]+}}(%esp)
+; X86-NEXT:    setb %al
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll use@PLT
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: sbb_merge_add1:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $42, %edi
+; X64-NEXT:    setb %al
+; X64-NEXT:    movl %eax, %edi
+; X64-NEXT:    callq use@PLT
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %sbb = tail call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 %a0, i32 42)
+  %borrow = extractvalue { i8, i32 } %sbb, 0
+  call void @use(i8 %borrow)
+  %diff = extractvalue { i8, i32 } %sbb, 1
+  %add = add i32 %a0, -42
+  %result = xor i32 %diff, %add
+  ret i32 %result
+}
+
+define i32 @sbb_merge_add2(i32 %a0) nounwind {
+; X86-LABEL: sbb_merge_add2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl $42, %edi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll use@PLT
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    retl
+;
+; X64-LABEL: sbb_merge_add2:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $42, %ebp
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    subl %edi, %ebp
+; X64-NEXT:    setb %al
+; X64-NEXT:    movl %ebp, %ebx
+; X64-NEXT:    negl %ebx
+; X64-NEXT:    movl %eax, %edi
+; X64-NEXT:    callq use@PLT
+; X64-NEXT:    xorl %ebp, %ebx
+; X64-NEXT:    movl %ebx, %eax
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+  %sbb = tail call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 42, i32 %a0)
+  %borrow = extractvalue { i8, i32 } %sbb, 0
+  call void @use(i8 %borrow)
+  %diff = extractvalue { i8, i32 } %sbb, 1
+  %add = add i32 %a0, -42
+  %result = xor i32 %diff, %add
+  ret i32 %result
+}
+
 declare { i8, i32 } @llvm.x86.subborrow.32(i8, i32, i32)
+declare void @use(i8)
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index b428ce457ff40..71ad598abe683 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -96,6 +96,17 @@ entry:
 define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-LABEL: _Z2x6v:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq x1@GOTPCREL(%rip), %rax
+; CHECK-NEXT:    movl (%rax), %edx
+; CHECK-NEXT:    andl $511, %edx # imm = 0x1FF
+; CHECK-NEXT:    leaq 1(%rdx), %rax
+; CHECK-NEXT:    movq x4@GOTPCREL(%rip), %rcx
+; CHECK-NEXT:    movl %eax, (%rcx)
+; CHECK-NEXT:    movq x3@GOTPCREL(%rip), %rcx
+; CHECK-NEXT:    movl (%rcx), %ecx
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    je .LBB1_18
+; CHECK-NEXT:  # %bb.1: # %for.cond1thread-pre-split.lr.ph
 ; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    pushq %r15
@@ -114,58 +125,47 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    .cfi_offset %r14, -32
 ; CHECK-NEXT:    .cfi_offset %r15, -24
 ; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movq x1@GOTPCREL(%rip), %rax
-; CHECK-NEXT:    movl (%rax), %ebx
-; CHECK-NEXT:    andl $511, %ebx # imm = 0x1FF
-; CHECK-NEXT:    leaq 1(%rbx), %rax
-; CHECK-NEXT:    movq x4@GOTPCREL(%rip), %rcx
-; CHECK-NEXT:    movl %eax, (%rcx)
-; CHECK-NEXT:    movq x3@GOTPCREL(%rip), %rcx
-; CHECK-NEXT:    movl (%rcx), %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
-; CHECK-NEXT:    je .LBB1_18
-; CHECK-NEXT:  # %bb.1: # %for.cond1thread-pre-split.lr.ph
-; CHECK-NEXT:    movq x5@GOTPCREL(%rip), %rdx
-; CHECK-NEXT:    movq (%rdx), %rsi
-; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    notl %edx
-; CHECK-NEXT:    leaq 8(,%rdx,8), %rdi
+; CHECK-NEXT:    movq x5@GOTPCREL(%rip), %rsi
+; CHECK-NEXT:    movq (%rsi), %rsi
+; CHECK-NEXT:    movl %ecx, %edi
+; CHECK-NEXT:    notl %edi
+; CHECK-NEXT:    leaq 8(,%rdi,8), %rdi
 ; CHECK-NEXT:    imulq %rax, %rdi
 ; CHECK-NEXT:    addq %rsi, %rdi
 ; CHECK-NEXT:    movq x2@GOTPCREL(%rip), %r8
-; CHECK-NEXT:    movl (%r8), %edx
-; CHECK-NEXT:    leal 8(,%rbx,8), %eax
+; CHECK-NEXT:    movl (%r8), %r9d
+; CHECK-NEXT:    leal 8(,%rdx,8), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    leaq 32(%rsi), %r11
-; CHECK-NEXT:    leaq 8(,%rbx,8), %rbx
-; CHECK-NEXT:    xorl %r14d, %r14d
-; CHECK-NEXT:    movq x0@GOTPCREL(%rip), %r15
-; CHECK-NEXT:    movq %rsi, %r12
+; CHECK-NEXT:    leaq 32(%rsi), %rbx
+; CHECK-NEXT:    leaq 8(,%rdx,8), %r14
+; CHECK-NEXT:    xorl %r15d, %r15d
+; CHECK-NEXT:    movq x0@GOTPCREL(%rip), %r12
+; CHECK-NEXT:    movq %rsi, %r13
 ; CHECK-NEXT:    jmp .LBB1_2
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_15: # %for.cond1.for.inc3_crit_edge
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    movl %edx, (%r8)
+; CHECK-NEXT:    movl %r9d, (%r8)
 ; CHECK-NEXT:  .LBB1_16: # %for.inc3
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    addq %rbx, %r12
-; CHECK-NEXT:    incq %r14
-; CHECK-NEXT:    addq %rbx, %r11
+; CHECK-NEXT:    addq %r14, %r13
+; CHECK-NEXT:    incq %r15
+; CHECK-NEXT:    addq %r14, %rbx
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    je .LBB1_17
 ; CHECK-NEXT:  .LBB1_2: # %for.cond1thread-pre-split
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB1_12 Depth 2
 ; CHECK-NEXT:    # Child Loop BB1_14 Depth 2
-; CHECK-NEXT:    testl %edx, %edx
+; CHECK-NEXT:    testl %r9d, %r9d
 ; CHECK-NEXT:    jns .LBB1_16
 ; CHECK-NEXT:  # %bb.3: # %for.body2.preheader
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    movslq %edx, %r13
-; CHECK-NEXT:    testq %r13, %r13
+; CHECK-NEXT:    movslq %r9d, %r9
+; CHECK-NEXT:    testq %r9, %r9
 ; CHECK-NEXT:    movq $-1, %rbp
-; CHECK-NEXT:    cmovnsq %r13, %rbp
-; CHECK-NEXT:    subq %r13, %rbp
+; CHECK-NEXT:    cmovnsq %r9, %rbp
+; CHECK-NEXT:    subq %r9, %rbp
 ; CHECK-NEXT:    incq %rbp
 ; CHECK-NEXT:    cmpq $4, %rbp
 ; CHECK-NEXT:    jb .LBB1_14
@@ -177,20 +177,20 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:  # %bb.5: # %vector.memcheck
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT:    imulq %r14, %rax
-; CHECK-NEXT:    leaq (%rsi,%rax), %r10
-; CHECK-NEXT:    leaq (%r10,%r13,8), %r9
-; CHECK-NEXT:    testq %r13, %r13
-; CHECK-NEXT:    movq $-1, %r10
-; CHECK-NEXT:    cmovnsq %r13, %r10
-; CHECK-NEXT:    cmpq %r15, %r9
+; CHECK-NEXT:    imulq %r15, %rax
+; CHECK-NEXT:    leaq (%rsi,%rax), %r11
+; CHECK-NEXT:    leaq (%r11,%r9,8), %r10
+; CHECK-NEXT:    testq %r9, %r9
+; CHECK-NEXT:    movq $-1, %r11
+; CHECK-NEXT:    cmovnsq %r9, %r11
+; CHECK-NEXT:    cmpq %r12, %r10
 ; CHECK-NEXT:    jae .LBB1_7
 ; CHECK-NEXT:  # %bb.6: # %vector.memcheck
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    leaq 8(%rsi), %r9
-; CHECK-NEXT:    addq %r9, %rax
-; CHECK-NEXT:    leaq (%rax,%r10,8), %rax
-; CHECK-NEXT:    cmpq %r15, %rax
+; CHECK-NEXT:    leaq 8(%rsi), %r10
+; CHECK-NEXT:    addq %r10, %rax
+; CHECK-NEXT:    leaq (%rax,%r11,8), %rax
+; CHECK-NEXT:    cmpq %r12, %rax
 ; CHECK-NEXT:    ja .LBB1_14
 ; CHECK-NEXT:  .LBB1_7: # %vector.body.preheader
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
@@ -201,50 +201,47 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT:    movdqu %xmm0, (%r12,%r13,8)
-; CHECK-NEXT:    movdqu %xmm0, 16(%r12,%r13,8)
-; CHECK-NEXT:    movl $4, %r10d
+; CHECK-NEXT:    movdqu %xmm0, (%r13,%r9,8)
+; CHECK-NEXT:    movdqu %xmm0, 16(%r13,%r9,8)
+; CHECK-NEXT:    movl $4, %r11d
 ; CHECK-NEXT:    shrq $2, %rax
 ; CHECK-NEXT:    jne .LBB1_11
 ; CHECK-NEXT:    jmp .LBB1_13
 ; CHECK-NEXT:  .LBB1_8: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    xorl %r10d, %r10d
+; CHECK-NEXT:    xorl %r11d, %r11d
 ; CHECK-NEXT:    shrq $2, %rax
 ; CHECK-NEXT:    je .LBB1_13
 ; CHECK-NEXT:  .LBB1_11: # %vector.body.preheader.new
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT:    movq %r10, %rax
+; CHECK-NEXT:    movq %r11, %rax
 ; CHECK-NEXT:    subq %rdx, %rax
-; CHECK-NEXT:    addq %r13, %r10
-; CHECK-NEXT:    leaq (%r11,%r10,8), %r10
+; CHECK-NEXT:    addq %r9, %r11
+; CHECK-NEXT:    leaq (%rbx,%r11,8), %r11
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_12: # %vector.body
 ; CHECK-NEXT:    # Parent Loop BB1_2 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    movdqu %xmm0, -32(%r10)
-; CHECK-NEXT:    movdqu %xmm0, -16(%r10)
-; CHECK-NEXT:    movdqu %xmm0, (%r10)
-; CHECK-NEXT:    movdqu %xmm0, 16(%r10)
-; CHECK-NEXT:    addq $64, %r10
+; CHECK-NEXT:    movdqu %xmm0, -32(%r11)
+; CHECK-NEXT:    movdqu %xmm0, -16(%r11)
+; CHECK-NEXT:    movdqu %xmm0, (%r11)
+; CHECK-NEXT:    movdqu %xmm0, 16(%r11)
+; CHECK-NEXT:    addq $64, %r11
 ; CHECK-NEXT:    addq $8, %rax
 ; CHECK-NEXT:    jne .LBB1_12
 ; CHECK-NEXT:  .LBB1_13: # %middle.block
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    addq %rdx, %r13
+; CHECK-NEXT:    addq %rdx, %r9
 ; CHECK-NEXT:    cmpq %rdx, %rbp
-; CHECK-NEXT:    movq %r13, %rdx
 ; CHECK-NEXT:    je .LBB1_15
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_14: # %for.body2
 ; CHECK-NEXT:    # Parent Loop BB1_2 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    movq (%r15), %rax
-; CHECK-NEXT:    movq %rax, (%r12,%r13,8)
-; CHECK-NEXT:    leaq 1(%r13), %rdx
-; CHECK-NEXT:    cmpq $-1, %r13
-; CHECK-NEXT:    movq %rdx, %r13
+; CHECK-NEXT:    movq (%r12), %rax
+; CHECK-NEXT:    movq %rax, (%r13,%r9,8)
+; CHECK-NEXT:    incq %r9
 ; CHECK-NEXT:    jl .LBB1_14
 ; CHECK-NEXT:    jmp .LBB1_15
 ; CHECK-NEXT:  .LBB1_17: # %for.cond.for.end5_crit_edge
@@ -252,7 +249,6 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    movq %rdi, (%rax)
 ; CHECK-NEXT:    movq x3@GOTPCREL(%rip), %rax
 ; CHECK-NEXT:    movl $0, (%rax)
-; CHECK-NEXT:  .LBB1_18: # %for.end5
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12
@@ -265,6 +261,13 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    .cfi_restore %rbx
+; CHECK-NEXT:    .cfi_restore %r12
+; CHECK-NEXT:    .cfi_restore %r13
+; CHECK-NEXT:    .cfi_restore %r14
+; CHECK-NEXT:    .cfi_restore %r15
+; CHECK-NEXT:    .cfi_restore %rbp
+; CHECK-NEXT:  .LBB1_18: # %for.end5
 ; CHECK-NEXT:    retq
 entry:
   %0 = load i32, ptr @x1, align 4