Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RISCV] Strength reduce mul by 2^N - 2^M #88983

Merged
merged 4 commits into from
Jun 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 107 additions & 91 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13681,8 +13681,8 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
if (VT != Subtarget.getXLenVT())
return SDValue();

if (!Subtarget.hasStdExtZba() && !Subtarget.hasVendorXTHeadBa())
return SDValue();
const bool HasShlAdd =
Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa();

ConstantSDNode *CNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!CNode)
Expand All @@ -13695,107 +13695,123 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
// other target properly freezes X in these cases either.
SDValue X = N->getOperand(0);

for (uint64_t Divisor : {3, 5, 9}) {
if (MulAmt % Divisor != 0)
continue;
uint64_t MulAmt2 = MulAmt / Divisor;
// 3/5/9 * 2^N -> shl (shXadd X, X), N
if (isPowerOf2_64(MulAmt2)) {
SDLoc DL(N);
SDValue X = N->getOperand(0);
// Put the shift first if we can fold a zext into the
// shift forming a slli.uw.
if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) {
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
DAG.getConstant(Log2_64(MulAmt2), DL, VT));
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
DAG.getConstant(Log2_64(Divisor - 1), DL, VT), Shl);
if (HasShlAdd) {
for (uint64_t Divisor : {3, 5, 9}) {
if (MulAmt % Divisor != 0)
continue;
uint64_t MulAmt2 = MulAmt / Divisor;
// 3/5/9 * 2^N -> shl (shXadd X, X), N
if (isPowerOf2_64(MulAmt2)) {
SDLoc DL(N);
SDValue X = N->getOperand(0);
// Put the shift first if we can fold a zext into the
// shift forming a slli.uw.
if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) &&
X.getConstantOperandVal(1) == UINT64_C(0xffffffff)) {
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X,
DAG.getConstant(Log2_64(MulAmt2), DL, VT));
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl,
DAG.getConstant(Log2_64(Divisor - 1), DL, VT),
Shl);
}
// Otherwise, put the shl second so that it can fold with following
// instructions (e.g. sext or add).
SDValue Mul359 =
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
return DAG.getNode(ISD::SHL, DL, VT, Mul359,
DAG.getConstant(Log2_64(MulAmt2), DL, VT));
}

// 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) {
SDLoc DL(N);
SDValue Mul359 =
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
DAG.getConstant(Log2_64(MulAmt2 - 1), DL, VT),
Mul359);
}
// Otherwise, put the shl second so that it can fold with following
// instructions (e.g. sext or add).
SDValue Mul359 =
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
return DAG.getNode(ISD::SHL, DL, VT, Mul359,
DAG.getConstant(Log2_64(MulAmt2), DL, VT));
}

// 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
if (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9) {
SDLoc DL(N);
SDValue Mul359 =
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
DAG.getConstant(Log2_64(MulAmt2 - 1), DL, VT),
Mul359);
}
}

// If this is a power of 2 + 2/4/8, we can use a shift followed by a single
// shXadd. First check if this is a sum of two powers of 2 because that's
// easy. Then count how many zeros are up to the first bit.
if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
unsigned ScaleShift = llvm::countr_zero(MulAmt);
if (ScaleShift >= 1 && ScaleShift < 4) {
unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
SDLoc DL(N);
SDValue Shift1 =
DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(ScaleShift, DL, VT), Shift1);
// If this is a power of 2 + 2/4/8, we can use a shift followed by a single
// shXadd. First check if this is a sum of two powers of 2 because that's
// easy. Then count how many zeros are up to the first bit.
if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
unsigned ScaleShift = llvm::countr_zero(MulAmt);
if (ScaleShift >= 1 && ScaleShift < 4) {
unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
SDLoc DL(N);
SDValue Shift1 =
DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(ScaleShift, DL, VT), Shift1);
}
}
}

// 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x)
// This is the two instruction form, there are also three instruction
// variants we could implement. e.g.
// (2^(1,2,3) * 3,5,9 + 1) << C2
// 2^(C1>3) * 3,5,9 +/- 1
for (uint64_t Divisor : {3, 5, 9}) {
uint64_t C = MulAmt - 1;
if (C <= Divisor)
continue;
unsigned TZ = llvm::countr_zero(C);
if ((C >> TZ) == Divisor && (TZ == 1 || TZ == 2 || TZ == 3)) {
SDLoc DL(N);
SDValue Mul359 =
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
DAG.getConstant(TZ, DL, VT), X);
// 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x)
// This is the two instruction form, there are also three instruction
// variants we could implement. e.g.
// (2^(1,2,3) * 3,5,9 + 1) << C2
// 2^(C1>3) * 3,5,9 +/- 1
for (uint64_t Divisor : {3, 5, 9}) {
uint64_t C = MulAmt - 1;
if (C <= Divisor)
continue;
unsigned TZ = llvm::countr_zero(C);
if ((C >> TZ) == Divisor && (TZ == 1 || TZ == 2 || TZ == 3)) {
SDLoc DL(N);
SDValue Mul359 =
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X);
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
DAG.getConstant(TZ, DL, VT), X);
}
}
}

// 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X))
if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
unsigned ScaleShift = llvm::countr_zero(MulAmt - 1);
if (ScaleShift >= 1 && ScaleShift < 4) {
unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2)));
SDLoc DL(N);
SDValue Shift1 =
DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
return DAG.getNode(ISD::ADD, DL, VT, Shift1,
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(ScaleShift, DL, VT), X));
// 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X))
if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
unsigned ScaleShift = llvm::countr_zero(MulAmt - 1);
if (ScaleShift >= 1 && ScaleShift < 4) {
unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2)));
SDLoc DL(N);
SDValue Shift1 =
DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT));
return DAG.getNode(ISD::ADD, DL, VT, Shift1,
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(ScaleShift, DL, VT), X));
}
}
}

// 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x))
for (uint64_t Offset : {3, 5, 9}) {
if (isPowerOf2_64(MulAmt + Offset)) {
SDLoc DL(N);
SDValue Shift1 =
DAG.getNode(ISD::SHL, DL, VT, X,
DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT));
SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(Log2_64(Offset - 1), DL, VT),
X);
return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359);
// 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x))
for (uint64_t Offset : {3, 5, 9}) {
if (isPowerOf2_64(MulAmt + Offset)) {
SDLoc DL(N);
SDValue Shift1 =
DAG.getNode(ISD::SHL, DL, VT, X,
DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT));
SDValue Mul359 =
DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(Log2_64(Offset - 1), DL, VT), X);
return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359);
}
}
}

// 2^N - 2^M -> (sub (shl X, C1), (shl X, C2))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't it handled by DAGCombiner::visitMUL?
https://github.com/llvm/llvm-project/blob/e11b17a4ed90e74147594012207fc35a60515944/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp#L4325-L4371

I'd like to introduce ISD::SHXADD node and move all logic into DAGCombiner.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, I think it is time to switch to search-based methods. I believe it will address @wangpc-pp's concern about the cost model :)

References:

Bernstein, Robert. "Multiplication by Integer Constants." Software—Practice and Experience 16, 7 (July 1986), 641–652.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we missed the rule in RISCV's TLI.decomposeMulByConstant?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One stupid question: why can't it be a signed 12-bit integer in this line?

if (!Imm.isSignedIntN(12) && Imm.countr_zero() < 12 &&

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #88791 for adding ISD::SHL_ADD. Once that lands, my hope is to common most of this code across at least x86 and RISCV.

Please see discussion on #87105 with regards to decomposeMulByConstant.

As a meta point, please don't let perfection be the enemy of the good here. :)

uint64_t MulAmtLowBit = MulAmt & (-MulAmt);
if (isPowerOf2_64(MulAmt + MulAmtLowBit)) {
uint64_t ShiftAmt1 = MulAmt + MulAmtLowBit;
SDLoc DL(N);
SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(ShiftAmt1), DL, VT));
SDValue Shift2 =
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(MulAmtLowBit), DL, VT));
return DAG.getNode(ISD::SUB, DL, VT, Shift1, Shift2);
}

return SDValue();
}

Expand Down
Loading
Loading