Skip to content

Commit

Permalink
[x86] use more shift or LEA for select-of-constants
Browse files Browse the repository at this point in the history
We can convert any select-of-constants to math ops:
http://rise4fun.com/Alive/d7d

For this patch, I'm enhancing an existing x86 transform that uses fake multiplies 
(they always become shl/lea) to avoid cmov or branching. The current code misses 
cases where we have a negative constant and a positive constant, so this is just 
trying to plug that hole.

The DAGCombiner diff prevents us from hitting a terrible inefficiency: we can start 
with a select in IR, create a select DAG node, convert it into a sext, convert it 
back into a select, and then lower it to sext machine code.

Some notes about the test diffs:

1. 2010-08-04-MaskedSignedCompare.ll - We were creating control flow that didn't exist in the IR.
2. memcmp.ll - The "choose -1 or 1" case is what got me looking at this again. I 
   think we could avoid the push/pop in some cases if we used 'movzbl %al' instead of an xor on 
   a different reg. That's a post-DAG problem though.
3. mul-constant-result.ll - The trade-off between sbb+not and setne+neg could be addressed if 
   it turns out to be a regression, but I think those would always be nearly equivalent.
4. pr22338.ll and sext-i1.ll - These tests have undef operands, so I don't think we actually care about these diffs.
5. sbb.ll - This shows a win for what I think is a common case: choose -1 or 0.
6. select.ll - There's another borderline case here: cmp+sbb+or vs. test+set+lea? Also, sbb+not vs. setae+neg shows up again.
7. select_const.ll - These are motivating cases for the enhancement; replace cmov with cheaper ops.

Assembly differences between movzbl and xor to avoid a partial reg stall are caused later by the X86 Fixup SetCC pass.

Differential Revision: https://reviews.llvm.org/D35340

llvm-svn: 310208
  • Loading branch information
rotateright committed Aug 6, 2017
1 parent a9b5bba commit a923c2e
Show file tree
Hide file tree
Showing 12 changed files with 178 additions and 251 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Expand Up @@ -7394,7 +7394,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
return SCC;

if (!VT.isVector()) {
if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath()) {
EVT SetCCVT = getSetCCResultType(N00VT);
// Don't do this transform for i1 because there's a select transform
// that would reverse it.
Expand Down
95 changes: 31 additions & 64 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -30060,78 +30060,45 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
return SDValue();

// Don't do this for crazy integer types.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
EVT VT = N->getValueType(0);
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();

// If this is efficiently invertible, canonicalize the LHSC/RHSC values
// so that TrueC (the true value) is larger than FalseC.
bool NeedsCondInvert = false;
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
// Efficiently invertible.
(Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
(Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
isa<ConstantSDNode>(Cond.getOperand(1))))) {
NeedsCondInvert = true;
std::swap(TrueC, FalseC);
}

// Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
DAG.getConstant(1, DL, Cond.getValueType()));

// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

unsigned ShAmt = TrueC->getAPIntValue().logBase2();
return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
DAG.getConstant(ShAmt, DL, MVT::i8));
}
// We're going to use the condition bit in math or logic ops. We could allow
// this with a wider condition value (post-legalization it becomes an i8),
// but if nothing is creating selects that late, it doesn't matter.
if (Cond.getValueType() != MVT::i1)
return SDValue();

// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
if (N->getValueType(0) == MVT::i32)
Diff = (unsigned)Diff;
// A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
// 3, 5, or 9 with i32/i64, so those get transformed too.
// TODO: For constants that do not differ by power-of-2 or small multiplier,
// convert to 'and' + 'add'.
APInt AbsDiff = (TrueC->getAPIntValue() - FalseC->getAPIntValue()).abs();
if (AbsDiff.isPowerOf2() ||
((VT == MVT::i32 || VT == MVT::i64) &&
(AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {

bool IsFastMultiplier = false;
if (Diff < 10) {
switch ((unsigned char)Diff) {
default:
break;
case 1: // result = add base, cond
case 2: // result = lea base( , cond*2)
case 3: // result = lea base(cond, cond*2)
case 4: // result = lea base( , cond*4)
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
IsFastMultiplier = true;
break;
}
// We need a positive multiplier constant for shift/LEA codegen. The 'not'
// of the condition can usually be folded into a compare predicate, but even
// without that, the sequence should be cheaper than a CMOV alternative.
if (TrueC->getAPIntValue().slt(FalseC->getAPIntValue())) {
Cond = DAG.getNOT(DL, Cond, MVT::i1);
std::swap(TrueC, FalseC);
}

if (IsFastMultiplier) {
APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
DAG.getConstant(1, DL, Cond.getValueType()));
// select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);

// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
// Scale the condition by the difference.
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
DAG.getConstant(Diff, DL, Cond.getValueType()));
// Multiply condition by the difference if non-one.
if (!AbsDiff.isOneValue())
R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));

// Add the base if non-zero.
if (FalseC->getAPIntValue() != 0)
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
}
// Add the base if non-zero.
if (!FalseC->isNullValue())
R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));

return R;
}

return SDValue();
Expand Down
18 changes: 8 additions & 10 deletions llvm/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
Expand Up @@ -9,21 +9,19 @@
define i32 @main() nounwind {
; CHECK-LABEL: main:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: cmpq $0, {{.*}}(%rip)
; CHECK-NEXT: movb $-106, %al
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # BB#1: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: .LBB0_2: # %entry
; CHECK-NEXT: cmpq {{.*}}(%rip), %rax
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: andl $150, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jle .LBB0_3
; CHECK-NEXT: # BB#4: # %if.then
; CHECK-NEXT: jle .LBB0_1
; CHECK-NEXT: # BB#2: # %if.then
; CHECK-NEXT: movl $1, {{.*}}(%rip)
; CHECK-NEXT: movl $1, %esi
; CHECK-NEXT: jmp .LBB0_5
; CHECK-NEXT: .LBB0_3: # %entry.if.end_crit_edge
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_1: # %entry.if.end_crit_edge
; CHECK-NEXT: movl {{.*}}(%rip), %esi
; CHECK-NEXT: .LBB0_5: # %if.end
; CHECK-NEXT: .LBB0_3: # %if.end
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: movl $.L.str, %edi
; CHECK-NEXT: xorl %eax, %eax
Expand Down
48 changes: 20 additions & 28 deletions llvm/test/CodeGen/X86/memcmp-optsize.ll
Expand Up @@ -125,12 +125,9 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: jmp .LBB4_3
; X86-NEXT: .LBB4_1: # %res_block
; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: incl %ecx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: decl %eax
; X86-NEXT: cmpw %si, %dx
; X86-NEXT: cmovael %ecx, %eax
; X86-NEXT: setae %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB4_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
Expand All @@ -149,9 +146,9 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize {
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB4_1: # %res_block
; X64-NEXT: movl $-1, %ecx
; X64-NEXT: movl $1, %eax
; X64-NEXT: cmovbl %ecx, %eax
; X64-NEXT: setae %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
ret i32 %m
Expand Down Expand Up @@ -286,12 +283,9 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: jmp .LBB9_3
; X86-NEXT: .LBB9_1: # %res_block
; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: incl %ecx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: decl %eax
; X86-NEXT: cmpl %esi, %edx
; X86-NEXT: cmovael %ecx, %eax
; X86-NEXT: setae %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB9_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
Expand All @@ -310,9 +304,9 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize {
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB9_1: # %res_block
; X64-NEXT: movl $-1, %ecx
; X64-NEXT: movl $1, %eax
; X64-NEXT: cmovbl %ecx, %eax
; X64-NEXT: setae %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
ret i32 %m
Expand Down Expand Up @@ -381,12 +375,10 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: je .LBB11_3
; X86-NEXT: .LBB11_1: # %res_block
; X86-NEXT: xorl %esi, %esi
; X86-NEXT: incl %esi
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: decl %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: cmovael %esi, %eax
; X86-NEXT: setae %al
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB11_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
Expand Down Expand Up @@ -531,10 +523,10 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize {
; X64-NEXT: # BB#3: # %endblock
; X64-NEXT: retq
; X64-NEXT: .LBB15_1: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: movl $-1, %ecx
; X64-NEXT: movl $1, %eax
; X64-NEXT: cmovbl %ecx, %eax
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
ret i32 %m
Expand Down Expand Up @@ -572,10 +564,10 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize {
; X64-NEXT: # BB#3: # %endblock
; X64-NEXT: retq
; X64-NEXT: .LBB16_1: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: movl $-1, %ecx
; X64-NEXT: movl $1, %eax
; X64-NEXT: cmovbl %ecx, %eax
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
ret i32 %m
Expand Down
42 changes: 21 additions & 21 deletions llvm/test/CodeGen/X86/memcmp.ll
Expand Up @@ -126,9 +126,9 @@ define i32 @length3(i8* %X, i8* %Y) nounwind {
; X86-NEXT: popl %esi
; X86-NEXT: retl
; X86-NEXT: .LBB4_1: # %res_block
; X86-NEXT: movl $-1, %ecx
; X86-NEXT: movl $1, %eax
; X86-NEXT: cmovbl %ecx, %eax
; X86-NEXT: setae %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
Expand All @@ -146,9 +146,9 @@ define i32 @length3(i8* %X, i8* %Y) nounwind {
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB4_1: # %res_block
; X64-NEXT: movl $-1, %ecx
; X64-NEXT: movl $1, %eax
; X64-NEXT: cmovbl %ecx, %eax
; X64-NEXT: setae %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
ret i32 %m
Expand Down Expand Up @@ -283,9 +283,9 @@ define i32 @length5(i8* %X, i8* %Y) nounwind {
; X86-NEXT: popl %esi
; X86-NEXT: retl
; X86-NEXT: .LBB9_1: # %res_block
; X86-NEXT: movl $-1, %ecx
; X86-NEXT: movl $1, %eax
; X86-NEXT: cmovbl %ecx, %eax
; X86-NEXT: setae %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
Expand All @@ -303,9 +303,9 @@ define i32 @length5(i8* %X, i8* %Y) nounwind {
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB9_1: # %res_block
; X64-NEXT: movl $-1, %ecx
; X64-NEXT: movl $1, %eax
; X64-NEXT: cmovbl %ecx, %eax
; X64-NEXT: setae %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
ret i32 %m
Expand Down Expand Up @@ -376,10 +376,10 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
; X86-NEXT: popl %esi
; X86-NEXT: retl
; X86-NEXT: .LBB11_1: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: movl $-1, %ecx
; X86-NEXT: movl $1, %eax
; X86-NEXT: cmovbl %ecx, %eax
; X86-NEXT: setae %al
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
Expand Down Expand Up @@ -521,10 +521,10 @@ define i32 @length12(i8* %X, i8* %Y) nounwind {
; X64-NEXT: # BB#3: # %endblock
; X64-NEXT: retq
; X64-NEXT: .LBB15_1: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: movl $-1, %ecx
; X64-NEXT: movl $1, %eax
; X64-NEXT: cmovbl %ecx, %eax
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
ret i32 %m
Expand Down Expand Up @@ -562,10 +562,10 @@ define i32 @length16(i8* %X, i8* %Y) nounwind {
; X64-NEXT: # BB#3: # %endblock
; X64-NEXT: retq
; X64-NEXT: .LBB16_1: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: movl $-1, %ecx
; X64-NEXT: movl $1, %eax
; X64-NEXT: cmovbl %ecx, %eax
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
ret i32 %m
Expand Down
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/X86/merge-consecutive-stores.ll
Expand Up @@ -16,11 +16,9 @@ define i32 @foo (i64* %so) nounwind uwtable ssp {
; CHECK-NEXT: cmpl 16(%eax), %edx
; CHECK-NEXT: movl $0, 16(%eax)
; CHECK-NEXT: sbbl %ecx, %edx
; CHECK-NEXT: movl $-1, %eax
; CHECK-NEXT: jl .LBB0_2
; CHECK-NEXT: # BB#1:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: setl %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retl
%used = getelementptr inbounds i64, i64* %so, i32 3
store i64 0, i64* %used, align 8
Expand Down

0 comments on commit a923c2e

Please sign in to comment.