[Codegen] (X & (C l>>/<< Y)) ==/!= 0 --> ((X <</l>> Y) & C) ==/!= 0 fold
Summary:
This was originally reported in D62818.
https://rise4fun.com/Alive/oPH

InstCombine does the opposite fold, in the hope that the `C l>>/<< Y` expression
will be hoisted out of a loop if `Y` is invariant and `X` is not.
But as can be seen from the diffs here, if it doesn't get hoisted,
the produced assembly is almost universally worse.

Much like with my recent "hoist add/sub by/from const" patches,
we should get an almost universal win if we hoist the constant:
there is almost always an "and/test by imm" instruction,
but "shift of imm" not so much, so we may avoid having to
materialize the immediate, and thus need one less register.
And since we now shift not by a constant but by something else,
the live range of that something else may shrink.
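
As a sanity check of the underlying identity (purely illustrative, not part of
this patch), the equivalence can be verified exhaustively on 8-bit operands with
a small standalone C++ program; the masking emulates 8-bit logical shifts:

  #include <cassert>

  int main() {
    // For every X, C and shift amount Y < bit-width:
    //   (X & (C l>> Y)) == 0  <=>  ((X <<  Y) & C) == 0
    //   (X & (C <<  Y)) == 0  <=>  ((X l>> Y) & C) == 0
    for (unsigned X = 0; X < 256; ++X)
      for (unsigned C = 0; C < 256; ++C)
        for (unsigned Y = 0; Y < 8; ++Y) {
          bool SrlBefore = (X & (C >> Y)) == 0;
          bool SrlAfter = (((X << Y) & 0xFFu) & C) == 0;
          assert(SrlBefore == SrlAfter);
          bool ShlBefore = (X & ((C << Y) & 0xFFu)) == 0;
          bool ShlAfter = ((X >> Y) & C) == 0;
          assert(ShlBefore == ShlAfter);
        }
    return 0;
  }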

Special care needs to be taken not to disturb the x86 `BT` / Hexagon `tstbit`
instruction patterns, and not to get into an endless combine loop.
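
For illustration (a hypothetical example, not from this patch), this is the
single-bit-test shape the new hasBitTest() hook is meant to protect: on x86 it
is expected to select `BT`, and Hexagon has `tstbit` for the same shape, so
rewriting it into `((X >> Y) & 1) != 0` would be a regression on those targets:

  #include <cstdint>

  // Expected to lower to a 'bt'-style bit test; the fold must leave it alone.
  bool isBitSet(uint32_t X, unsigned Y) {
    return (X & (1u << Y)) != 0;
  }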

Reviewers: RKSimon, efriedma, t.p.northover, craig.topper, spatel, arsenm

Reviewed By: spatel

Subscribers: hiraditya, MaskRay, wuzish, xbolva00, nikic, nemanjai, jvesely, wdng, nhaehnle, javed.absar, tpr, kristof.beyls, jsji, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D62871

llvm-svn: 366955
LebedevRI committed Jul 24, 2019
1 parent 6849911 commit 017e272
Showing 16 changed files with 1,330 additions and 1,500 deletions.
43 changes: 43 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -539,6 +539,12 @@ class TargetLoweringBase {
    return hasAndNotCompare(X);
  }

  /// Return true if the target has a bit-test instruction:
  ///   (X & (1 << Y)) ==/!= 0
  /// This knowledge can be used to prevent breaking the pattern,
  /// or creating it if it could be recognized.
  virtual bool hasBitTest(SDValue X, SDValue Y) const { return false; }

  /// There are two ways to clear extreme bits (either low or high):
  ///   Mask:    x &  (-1 << y)  (the instcombine canonical form)
  ///   Shifts:  x >> y << y
@@ -571,6 +577,38 @@
    return false;
  }

  /// Given the pattern
  ///   (X & (C l>>/<< Y)) ==/!= 0
  /// return true if it should be transformed into:
  ///   ((X <</l>> Y) & C) ==/!= 0
  /// WARNING: if 'X' is a constant, the fold may deadlock!
  /// FIXME: we could avoid passing XC, but we can't use isConstOrConstSplat()
  ///        here because it can end up being not linked in.
  virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
      SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
      unsigned OldShiftOpcode, unsigned NewShiftOpcode,
      SelectionDAG &DAG) const {
    if (hasBitTest(X, Y)) {
      // One interesting pattern that we'd want to form is 'bit test':
      //   ((1 << Y) & C) ==/!= 0
      // But we also need to be careful not to try to reverse that fold.

      // Is this '1 << Y' ?
      if (OldShiftOpcode == ISD::SHL && CC->isOne())
        return false; // Keep the 'bit test' pattern.

      // Will it be '1 << Y' after the transform ?
      if (XC && NewShiftOpcode == ISD::SHL && XC->isOne())
        return true; // Do form the 'bit test' pattern.
    }

    // If 'X' is a constant, and we transform, then we will immediately
    // try to undo the fold, thus causing endless combine loop.
    // So by default, let's assume everyone prefers the fold
    // iff 'X' is not a constant.
    return !XC;
  }

  /// These two forms are equivalent:
  ///   sub %y, (xor %x, -1)
  ///   add (add %x, 1), %y
@@ -4108,6 +4146,11 @@ class TargetLowering : public TargetLoweringBase {
                                               DAGCombinerInfo &DCI,
                                               const SDLoc &DL) const;

  // (X & (C l>>/<< Y)) ==/!= 0  -->  ((X <</l>> Y) & C) ==/!= 0
  SDValue optimizeSetCCByHoistingAndByConstFromLogicalShift(
      EVT SCCVT, SDValue N0, SDValue N1C, ISD::CondCode Cond,
      DAGCombinerInfo &DCI, const SDLoc &DL) const;

  SDValue prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
                            SDValue CompTargetNode, ISD::CondCode Cond,
                            DAGCombinerInfo &DCI, const SDLoc &DL,
79 changes: 79 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2760,6 +2760,77 @@ SDValue TargetLowering::optimizeSetCCOfSignedTruncationCheck(
  return T2;
}

// (X & (C l>>/<< Y)) ==/!= 0  -->  ((X <</l>> Y) & C) ==/!= 0
SDValue TargetLowering::optimizeSetCCByHoistingAndByConstFromLogicalShift(
    EVT SCCVT, SDValue N0, SDValue N1C, ISD::CondCode Cond,
    DAGCombinerInfo &DCI, const SDLoc &DL) const {
  assert(isConstOrConstSplat(N1C) &&
         isConstOrConstSplat(N1C)->getAPIntValue().isNullValue() &&
         "Should be a comparison with 0.");
  assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
         "Valid only for [in]equality comparisons.");

  unsigned NewShiftOpcode;
  SDValue X, C, Y;

  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Look for '(C l>>/<< Y)'.
  auto Match = [&NewShiftOpcode, &X, &C, &Y, &TLI, &DAG](SDValue V) {
    // The shift should be one-use.
    if (!V.hasOneUse())
      return false;
    unsigned OldShiftOpcode = V.getOpcode();
    switch (OldShiftOpcode) {
    case ISD::SHL:
      NewShiftOpcode = ISD::SRL;
      break;
    case ISD::SRL:
      NewShiftOpcode = ISD::SHL;
      break;
    default:
      return false; // must be a logical shift.
    }
    // We should be shifting a constant.
    // FIXME: best to use isConstantOrConstantVector().
    C = V.getOperand(0);
    ConstantSDNode *CC =
        isConstOrConstSplat(C, /*AllowUndefs=*/true, /*AllowTruncation=*/true);
    if (!CC)
      return false;
    Y = V.getOperand(1);

    ConstantSDNode *XC =
        isConstOrConstSplat(X, /*AllowUndefs=*/true, /*AllowTruncation=*/true);
    return TLI.shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
        X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG);
  };

  // LHS of comparison should be an one-use 'and'.
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
    return SDValue();

  X = N0.getOperand(0);
  SDValue Mask = N0.getOperand(1);

  // 'and' is commutative!
  if (!Match(Mask)) {
    std::swap(X, Mask);
    if (!Match(Mask))
      return SDValue();
  }

  EVT VT = X.getValueType();

  // Produce:
  // ((X 'OppositeShiftOpcode' Y) & C) Cond 0
  SDValue T0 = DAG.getNode(NewShiftOpcode, DL, VT, X, Y);
  SDValue T1 = DAG.getNode(ISD::AND, DL, VT, T0, C);
  SDValue T2 = DAG.getSetCC(DL, SCCVT, T1, N1C, Cond);
  return T2;
}

/// Try to fold an equality comparison with a {add/sub/xor} binary operation as
/// the 1st operand (N0). Callers are expected to swap the N0/N1 parameters to
/// handle the commuted versions of these patterns.
@@ -3328,6 +3399,14 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
      }
    }

    if (Cond == ISD::SETEQ || Cond == ISD::SETNE) {
      // (X & (C l>>/<< Y)) ==/!= 0  -->  ((X <</l>> Y) & C) ==/!= 0
      if (C1.isNullValue())
        if (SDValue CC = optimizeSetCCByHoistingAndByConstFromLogicalShift(
                VT, N0, N1, Cond, DCI, dl))
          return CC;
    }

    // If we have "setcc X, C0", check to see if we can shrink the immediate
    // by changing cc.
    // TODO: Support this for vectors after legalize ops.
13 changes: 13 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12042,6 +12042,19 @@ bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
  return Mask->getValue().isPowerOf2();
}

bool AArch64TargetLowering::
    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
        SelectionDAG &DAG) const {
  // Does baseline recommend not to perform the fold by default?
  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
    return false;
  // Else, if this is a vector shift, prefer 'shl'.
  return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
}

void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64unctionInfo.
  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
5 changes: 5 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -488,6 +488,11 @@ class AArch64TargetLowering : public TargetLowering {
    return VT.getSizeInBits() >= 64; // vector 'bic'
  }

  bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
      SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
      unsigned OldShiftOpcode, unsigned NewShiftOpcode,
      SelectionDAG &DAG) const override;

  bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
    if (DAG.getMachineFunction().getFunction().hasMinSize())
      return false;
4 changes: 4 additions & 0 deletions llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1817,6 +1817,10 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
  return false;
}

bool HexagonTargetLowering::hasBitTest(SDValue X, SDValue Y) const {
  return X.getValueType().isScalarInteger(); // 'tstbit'
}

bool HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  return isTruncateFree(EVT::getEVT(Ty1), EVT::getEVT(Ty2));
}
2 changes: 2 additions & 0 deletions llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -127,6 +127,8 @@ namespace HexagonISD {
    bool isCheapToSpeculateCtlz() const override { return true; }
    bool isCtlzFast() const override { return true; }

    bool hasBitTest(SDValue X, SDValue Y) const override;

    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;

    /// Return true if an FMA operation is faster than a pair of mul and add
27 changes: 27 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5022,6 +5022,33 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const {
  return Subtarget.hasSSE2();
}

bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
  return X.getValueType().isScalarInteger(); // 'bt'
}

bool X86TargetLowering::
    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
        SelectionDAG &DAG) const {
  // Does baseline recommend not to perform the fold by default?
  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
    return false;
  // For scalars this transform is always beneficial.
  if (X.getValueType().isScalarInteger())
    return true;
  // If all the shift amounts are identical, then transform is beneficial even
  // with rudimentary SSE2 shifts.
  if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
    return true;
  // If we have AVX2 with it's powerful shift operations, then it's also good.
  if (Subtarget.hasAVX2())
    return true;
  // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
  return NewShiftOpcode == ISD::SHL;
}

bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  assert(((N->getOpcode() == ISD::SHL &&
7 changes: 7 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
@@ -840,6 +840,13 @@ namespace llvm {

    bool hasAndNot(SDValue Y) const override;

    bool hasBitTest(SDValue X, SDValue Y) const override;

    bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
        SelectionDAG &DAG) const override;

    bool shouldFoldConstantShiftPairToMask(const SDNode *N,
                                           CombineLevel Level) const override;
