[CodeGen] [SelectionDAG] More efficient code for X % C == 0 (UREM case) (try 3)

Summary:
I'm submitting a new revision since I don't understand how to reclaim/reopen/take over the existing one, D50222.
There is no such action in the "Add Action" menu...

This implements an optimization described in Hacker's Delight 10-17: when `C` is constant,
the result of `X % C == 0` can be computed more cheaply without actually calculating the remainder.
The motivation is discussed here: https://bugs.llvm.org/show_bug.cgi?id=35479.
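
For intuition, here is a minimal self-contained C++ sketch of the odd-divisor case (divisibility by 5 at 32 bits). This is an illustration only, not code from the patch; the helper name is made up, though P matches the `mov`/`movk` pair in `test_urem_odd` below (the test itself compares against Q + 1 = 0x33333334 with `lo`, a strict unsigned less-than, which is equivalent):

```cpp
#include <cassert>
#include <cstdint>

// N % 5 == 0  <=>  N * P <= Q  (all arithmetic unsigned, wrapping mod 2^32),
// where P is the multiplicative inverse of 5 modulo 2^32
// and Q = floor((2^32 - 1) / 5).
bool isDivisibleBy5(uint32_t N) {
  const uint32_t P = 0xCCCCCCCDu; // 5 * P == 1 (mod 2^32)
  const uint32_t Q = 0x33333333u; // floor(0xFFFFFFFF / 5)
  return N * P <= Q;              // no division, no remainder
}

int main() {
  for (uint32_t N = 0; N < 1000; ++N)
    assert(isDivisibleBy5(N) == (N % 5 == 0));
}
```

Multiplication by P permutes the 32-bit values so that exactly the multiples of 5 land in [0, Q]; a single multiply and compare therefore replaces the division.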

This is a recommit; the original commit rL364563 was reverted in rL364568
because the test-suite detected a miscompile: the new comparison constant 'Q'
was being computed incorrectly (we divided by `D0` instead of `D`).
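
(To illustrate the miscompile with small numbers of my own choosing, not taken from the test-suite failure: at W = 8, D = 6 gives D0 = 3, K = 1, P = 171, and the correct Q = floor(255 / 6) = 42, while dividing by D0 yields 85. For X = 2 we get rotr(2 * 171 mod 256, 1) = 43, which is correctly rejected against Q = 42 but wrongly accepted against 85, i.e. `2 % 6 == 0` would evaluate to true.)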

Original patch D50222 by @hermord (Dmytro Shynkevych)

Notes:
- In principle, it's possible to also handle the `X % C1 == C2` case, as discussed on bugzilla.
  This seems to require an extra branch on overflow, so I refrained from implementing this for now.
- An explicit check for when the `REM` can be reduced to just its LHS is included:
  the `X % C == 0` optimization breaks `test1` in `test/CodeGen/X86/jump_sign.ll` otherwise.
  I haven't managed to find a better way to avoid generating worse output in this case.
- The test `test/CodeGen/X86/jump_sign.ll` regresses; the follow-up patch D63390 fixes it.
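
The even-divisor path (the `ROTR` in the patch) can be illustrated the same way. Below is a hedged C++ sketch, with made-up helper names, for `X % 14 == 0` at 32 bits; the constants match those checked by `test_urem_even` below (P = 0xB6DB6DB7 materialized via `mov`/`movk`, the `ror` by 1, and Q = 0x12492492 compared with `hi` since that test uses `icmp ne`):

```cpp
#include <cassert>
#include <cstdint>

// Rotate right by K bits (valid for K in [1, 31]).
static uint32_t rotr32(uint32_t V, unsigned K) {
  return (V >> K) | (V << (32 - K));
}

// X % 14 == 0  <=>  rotr(X * P, K) <= Q, with D = 14 = D0 * 2^K,
// D0 = 7, K = 1, P = inv(7) mod 2^32, Q = floor((2^32 - 1) / 14).
bool isDivisibleBy14(uint32_t X) {
  const uint32_t P = 0xB6DB6DB7u; // 7 * P == 1 (mod 2^32)
  const uint32_t Q = 0x12492492u; // floor(0xFFFFFFFF / 14)
  return rotr32(X * P, 1) <= Q;
}

int main() {
  for (uint32_t X = 0; X < 1000; ++X)
    assert(isDivisibleBy14(X) == (X % 14 == 0));
}
```

The rotate handles the factor of 2^K: after multiplying by the inverse of the odd part, a multiple of D has its low K bits clear, so rotating them out yields a value in [0, Q] exactly when D divides X.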

Reviewers: RKSimon, craig.topper, spatel, hermord, xbolva00

Reviewed By: RKSimon, xbolva00

Subscribers: dexonsmith, kristina, xbolva00, javed.absar, llvm-commits, hermord

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D63391

llvm-svn: 364600
LebedevRI committed Jun 27, 2019
1 parent a59cf87 commit 29d05c0
Showing 13 changed files with 284 additions and 316 deletions.
6 changes: 6 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4054,6 +4054,12 @@ class TargetLowering : public TargetLoweringBase {
SDValue N1, ISD::CondCode Cond,
DAGCombinerInfo &DCI,
const SDLoc &DL) const;

SDValue prepareUREMEqFold(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
DAGCombinerInfo &DCI, const SDLoc &DL,
SmallVectorImpl<SDNode *> &Created) const;
SDValue buildUREMEqFold(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
DAGCombinerInfo &DCI, const SDLoc &DL) const;
};

/// Given an LLVM IR type and return type attributes, compute the return value
109 changes: 109 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3460,6 +3460,18 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
return V;
}

// Fold remainder of division by a constant.
if (N0.getOpcode() == ISD::UREM && N0.hasOneUse() &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();

// When division is cheap or optimizing for minimum size,
// fall through to DIVREM creation by skipping this fold.
if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize))
if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl))
return Folded;
}

// Fold away ALL boolean setcc's.
if (N0.getValueType().getScalarType() == MVT::i1 && foldBooleans) {
SDValue Temp;
@@ -4445,6 +4457,103 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
return DAG.getSelect(dl, VT, IsOne, N0, Q);
}

/// Given an ISD::UREM used only by an ISD::SETEQ or ISD::SETNE
/// where the divisor is constant and the comparison target is zero,
/// return a DAG expression that will generate the same comparison result
/// using only multiplications, additions and shifts/rotations.
/// Ref: "Hacker's Delight" 10-17.
SDValue TargetLowering::buildUREMEqFold(EVT VT, SDValue REMNode,
SDValue CompNode, ISD::CondCode Cond,
DAGCombinerInfo &DCI,
const SDLoc &DL) const {
SmallVector<SDNode *, 2> Built;
if (SDValue Folded =
prepareUREMEqFold(VT, REMNode, CompNode, Cond, DCI, DL, Built)) {
for (SDNode *N : Built)
DCI.AddToWorklist(N);
return Folded;
}

return SDValue();
}

SDValue
TargetLowering::prepareUREMEqFold(EVT VT, SDValue REMNode, SDValue CompNode,
ISD::CondCode Cond, DAGCombinerInfo &DCI,
const SDLoc &DL,
SmallVectorImpl<SDNode *> &Created) const {
// fold (seteq/ne (urem N, D), 0) -> (setule/ugt (rotr (mul N, P), K), Q)
// - D must be constant with D = D0 * 2^K where D0 is odd and D0 != 1
// - P is the multiplicative inverse of D0 modulo 2^W
// - Q = floor((2^W - 1) / D)
// where W is the width of the common type of N and D.
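// Illustrative (hypothetical) worked example, for intuition only:
// with W = 32 and D = 5 (so D0 = 5, K = 0), P = 0xCCCCCCCD since
// 5 * 0xCCCCCCCD == 1 (mod 2^32), and Q = floor(0xFFFFFFFF / 5) = 0x33333333;
// the fold then emits (setule (mul N, 0xCCCCCCCD), 0x33333333).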
assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
"Only applicable for (in)equality comparisons.");

EVT REMVT = REMNode->getValueType(0);

// If MUL is unavailable, we cannot proceed in any case.
if (!isOperationLegalOrCustom(ISD::MUL, REMVT))
return SDValue();

// TODO: Add non-uniform constant support.
ConstantSDNode *Divisor = isConstOrConstSplat(REMNode->getOperand(1));
ConstantSDNode *CompTarget = isConstOrConstSplat(CompNode);
if (!Divisor || !CompTarget || Divisor->isNullValue() ||
!CompTarget->isNullValue())
return SDValue();

const APInt &D = Divisor->getAPIntValue();

// Decompose D into D0 * 2^K
unsigned K = D.countTrailingZeros();
bool DivisorIsEven = (K != 0);
APInt D0 = D.lshr(K);

// The fold is invalid when D0 == 1.
// This is reachable because visitSetCC happens before visitREM.
if (D0.isOneValue())
return SDValue();

// P = inv(D0, 2^W)
// 2^W requires W + 1 bits, so we have to extend and then truncate.
unsigned W = D.getBitWidth();
APInt P = D0.zext(W + 1)
.multiplicativeInverse(APInt::getSignedMinValue(W + 1))
.trunc(W);
assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable
assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check.");

// Q = floor((2^W - 1) / D)
APInt Q = APInt::getAllOnesValue(W).udiv(D);

SelectionDAG &DAG = DCI.DAG;

SDValue PVal = DAG.getConstant(P, DL, REMVT);
SDValue QVal = DAG.getConstant(Q, DL, REMVT);
// (mul N, P)
SDValue Op1 = DAG.getNode(ISD::MUL, DL, REMVT, REMNode->getOperand(0), PVal);
Created.push_back(Op1.getNode());

// Rotate right only if D was even.
if (DivisorIsEven) {
// We need ROTR to do this.
if (!isOperationLegalOrCustom(ISD::ROTR, REMVT))
return SDValue();
SDValue ShAmt =
DAG.getConstant(K, DL, getShiftAmountTy(REMVT, DAG.getDataLayout()));
SDNodeFlags Flags;
Flags.setExact(true);
// UREM: (rotr (mul N, P), K)
Op1 = DAG.getNode(ISD::ROTR, DL, REMVT, Op1, ShAmt, Flags);
Created.push_back(Op1.getNode());
}

// UREM: (setule/setugt (rotr (mul N, P), K), Q)
return DAG.getSetCC(DL, VT, Op1, QVal,
((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT));
}

bool TargetLowering::
verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
if (!isa<ConstantSDNode>(Op.getOperand(0))) {
2 changes: 2 additions & 0 deletions llvm/lib/Support/APInt.cpp
@@ -1095,6 +1095,8 @@ APInt APInt::sqrt() const {
/// however we simplify it to speed up calculating only the inverse, and take
/// advantage of div+rem calculations. We also use some tricks to avoid copying
/// (potentially large) APInts around.
/// WARNING: a value of '0' may be returned,
/// signifying that no multiplicative inverse exists!
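/// (Illustrative example, not from the original comment: an even value has
/// no inverse modulo a power of two, so e.g.
/// APInt(8, 4).multiplicativeInverse(APInt(8, 16)) returns 0.)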
APInt APInt::multiplicativeInverse(const APInt& modulo) const {
assert(ult(modulo) && "This APInt must be smaller than the modulo");

12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll
@@ -26,13 +26,13 @@ define i32 @test_optsize(i32 %X) optsize nounwind readnone {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: umull x8, w0, w8
; CHECK-NEXT: lsr x8, x8, #34
; CHECK-NEXT: add w8, w8, w8, lsl #2
; CHECK-NEXT: mov w9, #-10
; CHECK-NEXT: cmp w0, w8
; CHECK-NEXT: mov w9, #13108
; CHECK-NEXT: movk w9, #13107, lsl #16
; CHECK-NEXT: mul w8, w0, w8
; CHECK-NEXT: mov w10, #-10
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: mov w8, #42
; CHECK-NEXT: csel w0, w8, w9, eq
; CHECK-NEXT: csel w0, w8, w10, lo
; CHECK-NEXT: ret
%rem = urem i32 %X, 5
%cmp = icmp eq i32 %rem, 0
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
@@ -123,7 +123,6 @@ define <4 x i32> @test_urem_one(<4 x i32> %X) nounwind readnone {
ret <4 x i32> %ret
}

; Can't fold due to second line
define <4 x i32> @test_urem_nomulinv(<4 x i32> %X) nounwind readnone {
; CHECK-LABEL: test_urem_nomulinv:
; CHECK: // %bb.0:
20 changes: 6 additions & 14 deletions llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
@@ -9,13 +9,9 @@ define <4 x i32> @test_urem_odd_vec_i32(<4 x i32> %X) nounwind readnone {
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s
; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s
; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
; CHECK-NEXT: movi v1.4s, #5
; CHECK-NEXT: ushr v2.4s, v2.4s, #2
; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: movi v1.16b, #51
; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -31,13 +27,9 @@ define <8 x i16> @test_urem_odd_vec_i16(<8 x i16> %X) nounwind readnone {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: dup v2.8h, w8
; CHECK-NEXT: umull2 v3.4s, v0.8h, v2.8h
; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-NEXT: uzp2 v2.8h, v2.8h, v3.8h
; CHECK-NEXT: movi v1.8h, #5
; CHECK-NEXT: ushr v2.8h, v2.8h, #2
; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h
; CHECK-NEXT: cmeq v0.8h, v0.8h, #0
; CHECK-NEXT: movi v1.16b, #51
; CHECK-NEXT: mul v0.8h, v0.8h, v2.8h
; CHECK-NEXT: cmhs v0.8h, v1.8h, v0.8h
; CHECK-NEXT: movi v1.8h, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
100 changes: 43 additions & 57 deletions llvm/test/CodeGen/AArch64/urem-seteq.ll
@@ -10,11 +10,11 @@ define i32 @test_urem_odd(i32 %X) nounwind readnone {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: umull x8, w0, w8
; CHECK-NEXT: lsr x8, x8, #34
; CHECK-NEXT: add w8, w8, w8, lsl #2
; CHECK-NEXT: cmp w0, w8
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: mov w9, #13108
; CHECK-NEXT: mul w8, w0, w8
; CHECK-NEXT: movk w9, #13107, lsl #16
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 5
%cmp = icmp eq i32 %urem, 0
@@ -26,14 +26,11 @@ define i32 @test_urem_odd(i32 %X) nounwind readnone {
define i32 @test_urem_odd_bit30(i32 %X) nounwind readnone {
; CHECK-LABEL: test_urem_odd_bit30:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-11
; CHECK-NEXT: umull x8, w0, w8
; CHECK-NEXT: mov w9, #3
; CHECK-NEXT: lsr x8, x8, #62
; CHECK-NEXT: movk w9, #16384, lsl #16
; CHECK-NEXT: msub w8, w8, w9, w0
; CHECK-NEXT: cmp w8, #0 // =0
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #27306, lsl #16
; CHECK-NEXT: mul w8, w0, w8
; CHECK-NEXT: cmp w8, #4 // =4
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 1073741827
%cmp = icmp eq i32 %urem, 0
@@ -45,14 +42,11 @@ define i32 @test_urem_odd_bit30(i32 %X) nounwind readnone {
define i32 @test_urem_odd_bit31(i32 %X) nounwind readnone {
; CHECK-LABEL: test_urem_odd_bit31:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: lsl x9, x8, #30
; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: lsr x8, x8, #61
; CHECK-NEXT: mov w9, #-2147483645
; CHECK-NEXT: msub w8, w8, w9, w0
; CHECK-NEXT: cmp w8, #0 // =0
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: mov w8, #43691
; CHECK-NEXT: movk w8, #10922, lsl #16
; CHECK-NEXT: mul w8, w0, w8
; CHECK-NEXT: cmp w8, #2 // =2
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 2147483651
%cmp = icmp eq i32 %urem, 0
@@ -69,16 +63,15 @@ define i32 @test_urem_odd_bit31(i32 %X) nounwind readnone {
define i16 @test_urem_even(i16 %X) nounwind readnone {
; CHECK-LABEL: test_urem_even:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w10, #9363
; CHECK-NEXT: ubfx w9, w0, #1, #15
; CHECK-NEXT: movk w10, #37449, lsl #16
; CHECK-NEXT: umull x9, w9, w10
; CHECK-NEXT: mov w9, #28087
; CHECK-NEXT: and w8, w0, #0xffff
; CHECK-NEXT: lsr x9, x9, #34
; CHECK-NEXT: mov w10, #14
; CHECK-NEXT: msub w8, w9, w10, w8
; CHECK-NEXT: cmp w8, #0 // =0
; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: movk w9, #46811, lsl #16
; CHECK-NEXT: mul w8, w8, w9
; CHECK-NEXT: mov w9, #9362
; CHECK-NEXT: ror w8, w8, #1
; CHECK-NEXT: movk w9, #4681, lsl #16
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: cset w0, hi
; CHECK-NEXT: ret
%urem = urem i16 %X, 14
%cmp = icmp ne i16 %urem, 0
@@ -90,14 +83,12 @@ define i16 @test_urem_even(i16 %X) nounwind readnone {
define i32 @test_urem_even_bit30(i32 %X) nounwind readnone {
; CHECK-LABEL: test_urem_even_bit30:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-415
; CHECK-NEXT: umull x8, w0, w8
; CHECK-NEXT: mov w9, #104
; CHECK-NEXT: lsr x8, x8, #62
; CHECK-NEXT: movk w9, #16384, lsl #16
; CHECK-NEXT: msub w8, w8, w9, w0
; CHECK-NEXT: cmp w8, #0 // =0
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: mov w8, #20165
; CHECK-NEXT: movk w8, #64748, lsl #16
; CHECK-NEXT: mul w8, w0, w8
; CHECK-NEXT: ror w8, w8, #3
; CHECK-NEXT: cmp w8, #4 // =4
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 1073741928
%cmp = icmp eq i32 %urem, 0
@@ -109,15 +100,12 @@ define i32 @test_urem_even_bit30(i32 %X) nounwind readnone {
define i32 @test_urem_even_bit31(i32 %X) nounwind readnone {
; CHECK-LABEL: test_urem_even_bit31:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #65435
; CHECK-NEXT: movk w8, #32767, lsl #16
; CHECK-NEXT: umull x8, w0, w8
; CHECK-NEXT: mov w9, #102
; CHECK-NEXT: lsr x8, x8, #62
; CHECK-NEXT: movk w9, #32768, lsl #16
; CHECK-NEXT: msub w8, w8, w9, w0
; CHECK-NEXT: cmp w8, #0 // =0
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: mov w8, #64251
; CHECK-NEXT: movk w8, #47866, lsl #16
; CHECK-NEXT: mul w8, w0, w8
; CHECK-NEXT: ror w8, w8, #1
; CHECK-NEXT: cmp w8, #2 // =2
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 2147483750
%cmp = icmp eq i32 %urem, 0
@@ -137,19 +125,17 @@ define i32 @test_urem_one(i32 %X) nounwind readnone {
ret i32 %ret
}

; We should not proceed with this fold if we can not compute
; multiplicative inverse
define i32 @test_urem_100(i32 %X) nounwind readnone {
; CHECK-LABEL: test_urem_100:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #34079
; CHECK-NEXT: movk w8, #20971, lsl #16
; CHECK-NEXT: umull x8, w0, w8
; CHECK-NEXT: lsr x8, x8, #37
; CHECK-NEXT: mov w9, #100
; CHECK-NEXT: msub w8, w8, w9, w0
; CHECK-NEXT: cmp w8, #0 // =0
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: mov w8, #23593
; CHECK-NEXT: movk w8, #49807, lsl #16
; CHECK-NEXT: mul w8, w0, w8
; CHECK-NEXT: mov w9, #23593
; CHECK-NEXT: ror w8, w8, #2
; CHECK-NEXT: movk w9, #655, lsl #16
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%urem = urem i32 %X, 100
%cmp = icmp eq i32 %urem, 0
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/X86/jump_sign.ll
@@ -236,13 +236,11 @@ define void @func_o() nounwind uwtable {
; CHECK-NEXT: jne .LBB12_8
; CHECK-NEXT: # %bb.4: # %if.end29
; CHECK-NEXT: movzwl (%eax), %eax
; CHECK-NEXT: imull $-13107, %eax, %eax # imm = 0xCCCD
; CHECK-NEXT: rorw %ax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: imull $52429, %eax, %ecx # imm = 0xCCCD
; CHECK-NEXT: shrl $18, %ecx
; CHECK-NEXT: andl $-2, %ecx
; CHECK-NEXT: leal (%ecx,%ecx,4), %ecx
; CHECK-NEXT: cmpw %cx, %ax
; CHECK-NEXT: jne .LBB12_5
; CHECK-NEXT: cmpl $6554, %eax # imm = 0x199A
; CHECK-NEXT: jae .LBB12_5
; CHECK-NEXT: .LBB12_8: # %if.then44
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
