[RISCV] Expand multiplication by (2/4/8 * 3/5/9 + 1) << N with SHL_ADD
#166372
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Piotr Fusik (pfusik)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/166372.diff

3 Files Affected:
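For context (a minimal standalone sketch, not part of the patch): the new AddX form relies on the identity that a shYadd of X onto itself yields (2^ShY + 1) * X, a 3/5/9 multiple, and a second shXadd that adds the original X back in yields (2^ShX * (2^ShY + 1) + 1) * X, i.e. the 2/4/8 * 3/5/9 + 1 multiples in the title. The check below exercises all nine ShX/ShY combinations:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned ShX = 1; ShX <= 3; ++ShX)
    for (unsigned ShY = 1; ShY <= 3; ++ShY)
      for (uint64_t X : {1ULL, 7ULL, 1234567ULL}) {
        uint64_t Mul359 = (X << ShY) + X;      // shYadd X, X      -> 3/5/9 * X
        uint64_t Result = (Mul359 << ShX) + X; // shXadd Mul359, X -> (2/4/8 * 3/5/9 + 1) * X
        assert(Result == ((1ULL << ShX) * ((1ULL << ShY) + 1) + 1) * X);
      }
  return 0;
}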
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c56ce3fd2a5a4..b597cbf8b2ba3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16496,29 +16496,47 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG,
}
static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX,
- unsigned ShY) {
+ unsigned ShY, bool AddX) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue X = N->getOperand(0);
SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
DAG.getConstant(ShY, DL, VT), X);
return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getConstant(ShX, DL, VT), Mul359);
+ DAG.getConstant(ShX, DL, VT), AddX ? X : Mul359);
}
static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG,
uint64_t MulAmt) {
switch (MulAmt) {
case 5 * 3:
- return getShlAddShlAdd(N, DAG, 2, 1);
+ return getShlAddShlAdd(N, DAG, 2, 1, false);
case 9 * 3:
- return getShlAddShlAdd(N, DAG, 3, 1);
+ return getShlAddShlAdd(N, DAG, 3, 1, false);
case 5 * 5:
- return getShlAddShlAdd(N, DAG, 2, 2);
+ return getShlAddShlAdd(N, DAG, 2, 2, false);
case 9 * 5:
- return getShlAddShlAdd(N, DAG, 3, 2);
+ return getShlAddShlAdd(N, DAG, 3, 2, false);
case 9 * 9:
- return getShlAddShlAdd(N, DAG, 3, 3);
+ return getShlAddShlAdd(N, DAG, 3, 3, false);
+ case 2 * 3 + 1:
+ return getShlAddShlAdd(N, DAG, 1, 1, true);
+ case 4 * 3 + 1:
+ return getShlAddShlAdd(N, DAG, 2, 1, true);
+ // case 8 * 3 + 1:
+ // Prefer 5 * 5 above because it doesn't require a register to hold X.
+ case 2 * 5 + 1:
+ return getShlAddShlAdd(N, DAG, 1, 2, true);
+ case 4 * 5 + 1:
+ return getShlAddShlAdd(N, DAG, 2, 2, true);
+ case 8 * 5 + 1:
+ return getShlAddShlAdd(N, DAG, 3, 2, true);
+ case 2 * 9 + 1:
+ return getShlAddShlAdd(N, DAG, 1, 3, true);
+ case 4 * 9 + 1:
+ return getShlAddShlAdd(N, DAG, 2, 3, true);
+ case 8 * 9 + 1:
+ return getShlAddShlAdd(N, DAG, 3, 3, true);
default:
return SDValue();
}
@@ -16581,7 +16599,8 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(Shift, DL, VT));
}
- // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X)
+ // 3/5/9 * 3/5/9 -> (shXadd (shYadd X, X), (shYadd X, X))
+ // 2/4/8 * 3/5/9 + 1 -> (shXadd (shYadd X, X), X)
if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt))
return V;
@@ -16600,21 +16619,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
}
}
- // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x)
- // This is the two instruction form, there are also three instruction
- // variants we could implement. e.g.
- // (2^(1,2,3) * 3,5,9 + 1) << C2
- // 2^(C1>3) * 3,5,9 +/- 1
- if (int ShXAmount = isShifted359(MulAmt - 1, Shift)) {
- assert(Shift != 0 && "MulAmt=4,6,10 handled before");
- if (Shift <= 3) {
- SDLoc DL(N);
- SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X,
- DAG.getConstant(ShXAmount, DL, VT), X);
- return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359,
- DAG.getConstant(Shift, DL, VT), X);
- }
- }
+ // TODO: 2^(C1>3) * 3,5,9 +/- 1
// 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X))
if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) {
@@ -16648,6 +16653,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
// 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples
// of 25 which happen to be quite common.
+ // (2/4/8 * 3/5/9 + 1) * 2^N
Shift = llvm::countr_zero(MulAmt);
if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) {
SDLoc DL(N);
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
index 50bd22bf5fd69..f4964288e3541 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
@@ -205,12 +205,19 @@ define i64 @addmul20(i64 %a, i64 %b) {
}
define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 22
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: addmul22:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 22
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64XTHEADBA-LABEL: addmul22:
+; RV64XTHEADBA: # %bb.0:
+; RV64XTHEADBA-NEXT: th.addsl a2, a0, a0, 2
+; RV64XTHEADBA-NEXT: th.addsl a0, a0, a2, 1
+; RV64XTHEADBA-NEXT: th.addsl a0, a1, a0, 1
+; RV64XTHEADBA-NEXT: ret
%c = mul i64 %a, 22
%d = add i64 %c, %b
ret i64 %d
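Tracing the expected XTheadBa sequence for addmul22 above (22 = (2 * 5 + 1) << 1; the trailing shift by 1 is folded into the final th.addsl together with the add of %b) — a standalone check, not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t a = 0x0123456789abcdefULL, b = 42;
  uint64_t t = (a << 2) + a; // th.addsl a2, a0, a0, 2 -> 5*a
  uint64_t m = (t << 1) + a; // th.addsl a0, a0, a2, 1 -> 11*a
  uint64_t r = (m << 1) + b; // th.addsl a0, a1, a0, 1 -> 22*a + b
  assert(r == 22 * a + b);   // both sides wrap consistently mod 2^64
  return 0;
}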
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 7fd76262d547a..d4b228828c04d 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -585,6 +585,33 @@ define i64 @addmul12(i64 %a, i64 %b) {
ret i64 %d
}
+define i64 @addmul14(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul14:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a2, a0, 1
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: sub a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul14:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh1add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul14:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 14
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
define i64 @addmul18(i64 %a, i64 %b) {
; RV64I-LABEL: addmul18:
; RV64I: # %bb.0:
@@ -636,12 +663,26 @@ define i64 @addmul20(i64 %a, i64 %b) {
}
define i64 @addmul22(i64 %a, i64 %b) {
-; CHECK-LABEL: addmul22:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 22
-; CHECK-NEXT: mul a0, a0, a2
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: addmul22:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 22
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul22:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul22:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
%c = mul i64 %a, 22
%d = add i64 %c, %b
ret i64 %d
@@ -672,6 +713,32 @@ define i64 @addmul24(i64 %a, i64 %b) {
ret i64 %d
}
+define i64 @addmul26(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul26:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 26
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul26:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh1add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul26:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.h a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 26
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
define i64 @addmul36(i64 %a, i64 %b) {
; RV64I-LABEL: addmul36:
; RV64I: # %bb.0:
@@ -722,6 +789,58 @@ define i64 @addmul40(i64 %a, i64 %b) {
ret i64 %d
}
+define i64 @addmul38(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul38:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 38
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul38:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh1add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul38:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 38
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
+define i64 @addmul42(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul42:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 42
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul42:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul42:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 42
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
define i64 @addmul72(i64 %a, i64 %b) {
; RV64I-LABEL: addmul72:
; RV64I: # %bb.0:
@@ -747,6 +866,84 @@ define i64 @addmul72(i64 %a, i64 %b) {
ret i64 %d
}
+define i64 @addmul74(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul74:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 74
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul74:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul74:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.w a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 74
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
+define i64 @addmul82(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul82:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 82
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul82:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a2, a0, a0
+; RV64ZBA-NEXT: sh3add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul82:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.w a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 82
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
+define i64 @addmul146(i64 %a, i64 %b) {
+; RV64I-LABEL: addmul146:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a2, 146
+; RV64I-NEXT: mul a0, a0, a2
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: addmul146:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a2, a0, a0
+; RV64ZBA-NEXT: sh3add a0, a2, a0
+; RV64ZBA-NEXT: sh1add a0, a0, a1
+; RV64ZBA-NEXT: ret
+;
+; RV64XANDESPERF-LABEL: addmul146:
+; RV64XANDESPERF: # %bb.0:
+; RV64XANDESPERF-NEXT: nds.lea.d a2, a0, a0
+; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a2
+; RV64XANDESPERF-NEXT: nds.lea.h a0, a1, a0
+; RV64XANDESPERF-NEXT: ret
+ %c = mul i64 %a, 146
+ %d = add i64 %c, %b
+ ret i64 %d
+}
+
define i64 @mul50(i64 %a) {
; RV64I-LABEL: mul50:
; RV64I: # %bb.0:
Rebased, resolving conflicts, and added two NFC commits.
> ;
> ; RV64ZBA-LABEL: addmul22:
> ; RV64ZBA: # %bb.0:
> ; RV64ZBA-NEXT: sh2add a2, a0, a0
This sequence is twice the size of the original code when Zca and Zcb are enabled. Do we have any opt for size limits on these transforms?
Good point! But the very same problem already affects other multipliers, e.g. addmul100. I suggest we deal with it in a separate PR.
Now I see these transforms are disabled when optimizing for size:

> // LI + MUL is usually smaller than the alternative sequence.
topperc left a comment:
LGTM