-
Notifications
You must be signed in to change notification settings - Fork 11k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RISCV] Strength reduce mul by 2^M - 3/5/9 #88993
Conversation
We can expand these as the three instruction sequence: (sub (shl X, C1), (shXadd X, x)) Note: This is a companion to https://github.com/llvm/llvm-project/pull/88983/files which handles the 2^N - 2^M case, but neither is stacked on the other. Whichever lands second needs to be rebased to adjust the shadd check.
@llvm/pr-subscribers-backend-risc-v Author: Philip Reames (preames) ChangesWe can expand these as the three instruction sequence: (sub (shl X, C1), (shXadd X, x)) Note: This is a companion to https://github.com/llvm/llvm-project/pull/88983/files which handles the 2^N - 2^M case, but neither is stacked on the other. Whichever lands second needs to be rebased to adjust the shadd check. Full diff: https://github.com/llvm/llvm-project/pull/88993.diff 3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7b4bec2f65b741..4443e08394211a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -13447,7 +13447,6 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
case 21:
case 25:
case 27:
- case 29:
case 37:
case 41:
case 45:
@@ -13474,6 +13473,23 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
}
}
+ // 2^N - 3/5/9 --> (sub (shl X, C1), (shXadd X, x))
+ for (uint64_t Offset : {3, 5, 9}) {
+ if ((VT == MVT::i64 || MulAmt + Offset <= UINT32_MAX) &&
+ isPowerOf2_64(MulAmt + Offset)) {
+ SDLoc DL(N);
+ SDValue Shift1 =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt + Offset), DL, VT));
+ SDValue Shift2 =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(Offset - 1), DL, VT));
+ return DAG.getNode(
+ ISD::SUB, DL, VT, Shift1,
+ DAG.getNode(ISD::ADD, DL, VT, Shift2, N->getOperand(0)));
+ }
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
index 48fa69e1045656..a35d0c1d7b9db1 100644
--- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -10,15 +10,17 @@
define i32 @add_mul_combine_accept_a1(i32 %x) {
; RV32IMB-LABEL: add_mul_combine_accept_a1:
; RV32IMB: # %bb.0:
-; RV32IMB-NEXT: li a1, 29
-; RV32IMB-NEXT: mul a0, a0, a1
+; RV32IMB-NEXT: sh1add a1, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a0, a0, a1
; RV32IMB-NEXT: addi a0, a0, 1073
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_accept_a1:
; RV64IMB: # %bb.0:
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mul a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: subw a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 1073
; RV64IMB-NEXT: ret
%tmp0 = add i32 %x, 37
@@ -29,15 +31,17 @@ define i32 @add_mul_combine_accept_a1(i32 %x) {
define signext i32 @add_mul_combine_accept_a2(i32 signext %x) {
; RV32IMB-LABEL: add_mul_combine_accept_a2:
; RV32IMB: # %bb.0:
-; RV32IMB-NEXT: li a1, 29
-; RV32IMB-NEXT: mul a0, a0, a1
+; RV32IMB-NEXT: sh1add a1, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a0, a0, a1
; RV32IMB-NEXT: addi a0, a0, 1073
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_accept_a2:
; RV64IMB: # %bb.0:
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mul a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: subw a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 1073
; RV64IMB-NEXT: ret
%tmp0 = add i32 %x, 37
@@ -49,10 +53,14 @@ define i64 @add_mul_combine_accept_a3(i64 %x) {
; RV32IMB-LABEL: add_mul_combine_accept_a3:
; RV32IMB: # %bb.0:
; RV32IMB-NEXT: li a2, 29
-; RV32IMB-NEXT: mul a1, a1, a2
-; RV32IMB-NEXT: mulhu a3, a0, a2
-; RV32IMB-NEXT: add a1, a3, a1
-; RV32IMB-NEXT: mul a2, a0, a2
+; RV32IMB-NEXT: mulhu a2, a0, a2
+; RV32IMB-NEXT: sh1add a3, a1, a1
+; RV32IMB-NEXT: slli a1, a1, 5
+; RV32IMB-NEXT: sub a1, a1, a3
+; RV32IMB-NEXT: add a1, a2, a1
+; RV32IMB-NEXT: sh1add a2, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a2, a0, a2
; RV32IMB-NEXT: addi a0, a2, 1073
; RV32IMB-NEXT: sltu a2, a0, a2
; RV32IMB-NEXT: add a1, a1, a2
@@ -60,8 +68,9 @@ define i64 @add_mul_combine_accept_a3(i64 %x) {
;
; RV64IMB-LABEL: add_mul_combine_accept_a3:
; RV64IMB: # %bb.0:
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mul a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addi a0, a0, 1073
; RV64IMB-NEXT: ret
%tmp0 = add i64 %x, 37
@@ -72,8 +81,9 @@ define i64 @add_mul_combine_accept_a3(i64 %x) {
define i32 @add_mul_combine_accept_b1(i32 %x) {
; RV32IMB-LABEL: add_mul_combine_accept_b1:
; RV32IMB: # %bb.0:
-; RV32IMB-NEXT: li a1, 23
-; RV32IMB-NEXT: mul a0, a0, a1
+; RV32IMB-NEXT: sh3add a1, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a0, a0, a1
; RV32IMB-NEXT: lui a1, 50
; RV32IMB-NEXT: addi a1, a1, 1119
; RV32IMB-NEXT: add a0, a0, a1
@@ -81,8 +91,9 @@ define i32 @add_mul_combine_accept_b1(i32 %x) {
;
; RV64IMB-LABEL: add_mul_combine_accept_b1:
; RV64IMB: # %bb.0:
-; RV64IMB-NEXT: li a1, 23
-; RV64IMB-NEXT: mul a0, a0, a1
+; RV64IMB-NEXT: sh3add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: subw a0, a0, a1
; RV64IMB-NEXT: lui a1, 50
; RV64IMB-NEXT: addi a1, a1, 1119
; RV64IMB-NEXT: addw a0, a0, a1
@@ -95,8 +106,9 @@ define i32 @add_mul_combine_accept_b1(i32 %x) {
define signext i32 @add_mul_combine_accept_b2(i32 signext %x) {
; RV32IMB-LABEL: add_mul_combine_accept_b2:
; RV32IMB: # %bb.0:
-; RV32IMB-NEXT: li a1, 23
-; RV32IMB-NEXT: mul a0, a0, a1
+; RV32IMB-NEXT: sh3add a1, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a0, a0, a1
; RV32IMB-NEXT: lui a1, 50
; RV32IMB-NEXT: addi a1, a1, 1119
; RV32IMB-NEXT: add a0, a0, a1
@@ -104,8 +116,9 @@ define signext i32 @add_mul_combine_accept_b2(i32 signext %x) {
;
; RV64IMB-LABEL: add_mul_combine_accept_b2:
; RV64IMB: # %bb.0:
-; RV64IMB-NEXT: li a1, 23
-; RV64IMB-NEXT: mul a0, a0, a1
+; RV64IMB-NEXT: sh3add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: subw a0, a0, a1
; RV64IMB-NEXT: lui a1, 50
; RV64IMB-NEXT: addi a1, a1, 1119
; RV64IMB-NEXT: addw a0, a0, a1
@@ -119,10 +132,14 @@ define i64 @add_mul_combine_accept_b3(i64 %x) {
; RV32IMB-LABEL: add_mul_combine_accept_b3:
; RV32IMB: # %bb.0:
; RV32IMB-NEXT: li a2, 23
-; RV32IMB-NEXT: mul a1, a1, a2
-; RV32IMB-NEXT: mulhu a3, a0, a2
-; RV32IMB-NEXT: add a1, a3, a1
-; RV32IMB-NEXT: mul a2, a0, a2
+; RV32IMB-NEXT: mulhu a2, a0, a2
+; RV32IMB-NEXT: sh3add a3, a1, a1
+; RV32IMB-NEXT: slli a1, a1, 5
+; RV32IMB-NEXT: sub a1, a1, a3
+; RV32IMB-NEXT: add a1, a2, a1
+; RV32IMB-NEXT: sh3add a2, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a2, a0, a2
; RV32IMB-NEXT: lui a0, 50
; RV32IMB-NEXT: addi a0, a0, 1119
; RV32IMB-NEXT: add a0, a2, a0
@@ -132,8 +149,9 @@ define i64 @add_mul_combine_accept_b3(i64 %x) {
;
; RV64IMB-LABEL: add_mul_combine_accept_b3:
; RV64IMB: # %bb.0:
-; RV64IMB-NEXT: li a1, 23
-; RV64IMB-NEXT: mul a0, a0, a1
+; RV64IMB-NEXT: sh3add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: lui a1, 50
; RV64IMB-NEXT: addiw a1, a1, 1119
; RV64IMB-NEXT: add a0, a0, a1
@@ -147,15 +165,17 @@ define i32 @add_mul_combine_reject_a1(i32 %x) {
; RV32IMB-LABEL: add_mul_combine_reject_a1:
; RV32IMB: # %bb.0:
; RV32IMB-NEXT: addi a0, a0, 1971
-; RV32IMB-NEXT: li a1, 29
-; RV32IMB-NEXT: mul a0, a0, a1
+; RV32IMB-NEXT: sh1add a1, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a0, a0, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_a1:
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: addi a0, a0, 1971
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mulw a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: subw a0, a0, a1
; RV64IMB-NEXT: ret
%tmp0 = add i32 %x, 1971
%tmp1 = mul i32 %tmp0, 29
@@ -166,15 +186,17 @@ define signext i32 @add_mul_combine_reject_a2(i32 signext %x) {
; RV32IMB-LABEL: add_mul_combine_reject_a2:
; RV32IMB: # %bb.0:
; RV32IMB-NEXT: addi a0, a0, 1971
-; RV32IMB-NEXT: li a1, 29
-; RV32IMB-NEXT: mul a0, a0, a1
+; RV32IMB-NEXT: sh1add a1, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a0, a0, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_a2:
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: addi a0, a0, 1971
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mulw a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: subw a0, a0, a1
; RV64IMB-NEXT: ret
%tmp0 = add i32 %x, 1971
%tmp1 = mul i32 %tmp0, 29
@@ -185,10 +207,14 @@ define i64 @add_mul_combine_reject_a3(i64 %x) {
; RV32IMB-LABEL: add_mul_combine_reject_a3:
; RV32IMB: # %bb.0:
; RV32IMB-NEXT: li a2, 29
-; RV32IMB-NEXT: mul a1, a1, a2
-; RV32IMB-NEXT: mulhu a3, a0, a2
-; RV32IMB-NEXT: add a1, a3, a1
-; RV32IMB-NEXT: mul a2, a0, a2
+; RV32IMB-NEXT: mulhu a2, a0, a2
+; RV32IMB-NEXT: sh1add a3, a1, a1
+; RV32IMB-NEXT: slli a1, a1, 5
+; RV32IMB-NEXT: sub a1, a1, a3
+; RV32IMB-NEXT: add a1, a2, a1
+; RV32IMB-NEXT: sh1add a2, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a2, a0, a2
; RV32IMB-NEXT: lui a0, 14
; RV32IMB-NEXT: addi a0, a0, -185
; RV32IMB-NEXT: add a0, a2, a0
@@ -199,8 +225,9 @@ define i64 @add_mul_combine_reject_a3(i64 %x) {
; RV64IMB-LABEL: add_mul_combine_reject_a3:
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: addi a0, a0, 1971
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mul a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: ret
%tmp0 = add i64 %x, 1971
%tmp1 = mul i64 %tmp0, 29
@@ -343,15 +370,17 @@ define i32 @add_mul_combine_reject_e1(i32 %x) {
; RV32IMB-LABEL: add_mul_combine_reject_e1:
; RV32IMB: # %bb.0:
; RV32IMB-NEXT: addi a0, a0, 1971
-; RV32IMB-NEXT: li a1, 29
-; RV32IMB-NEXT: mul a0, a0, a1
+; RV32IMB-NEXT: sh1add a1, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a0, a0, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_e1:
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: addi a0, a0, 1971
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mulw a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: subw a0, a0, a1
; RV64IMB-NEXT: ret
%tmp0 = mul i32 %x, 29
%tmp1 = add i32 %tmp0, 57159
@@ -362,15 +391,17 @@ define signext i32 @add_mul_combine_reject_e2(i32 signext %x) {
; RV32IMB-LABEL: add_mul_combine_reject_e2:
; RV32IMB: # %bb.0:
; RV32IMB-NEXT: addi a0, a0, 1971
-; RV32IMB-NEXT: li a1, 29
-; RV32IMB-NEXT: mul a0, a0, a1
+; RV32IMB-NEXT: sh1add a1, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a0, a0, a1
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_e2:
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: addi a0, a0, 1971
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mulw a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: subw a0, a0, a1
; RV64IMB-NEXT: ret
%tmp0 = mul i32 %x, 29
%tmp1 = add i32 %tmp0, 57159
@@ -381,10 +412,14 @@ define i64 @add_mul_combine_reject_e3(i64 %x) {
; RV32IMB-LABEL: add_mul_combine_reject_e3:
; RV32IMB: # %bb.0:
; RV32IMB-NEXT: li a2, 29
-; RV32IMB-NEXT: mul a1, a1, a2
-; RV32IMB-NEXT: mulhu a3, a0, a2
-; RV32IMB-NEXT: add a1, a3, a1
-; RV32IMB-NEXT: mul a2, a0, a2
+; RV32IMB-NEXT: mulhu a2, a0, a2
+; RV32IMB-NEXT: sh1add a3, a1, a1
+; RV32IMB-NEXT: slli a1, a1, 5
+; RV32IMB-NEXT: sub a1, a1, a3
+; RV32IMB-NEXT: add a1, a2, a1
+; RV32IMB-NEXT: sh1add a2, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a2, a0, a2
; RV32IMB-NEXT: lui a0, 14
; RV32IMB-NEXT: addi a0, a0, -185
; RV32IMB-NEXT: add a0, a2, a0
@@ -395,8 +430,9 @@ define i64 @add_mul_combine_reject_e3(i64 %x) {
; RV64IMB-LABEL: add_mul_combine_reject_e3:
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: addi a0, a0, 1971
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mul a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: ret
%tmp0 = mul i64 %x, 29
%tmp1 = add i64 %tmp0, 57159
@@ -407,16 +443,18 @@ define i32 @add_mul_combine_reject_f1(i32 %x) {
; RV32IMB-LABEL: add_mul_combine_reject_f1:
; RV32IMB: # %bb.0:
; RV32IMB-NEXT: addi a0, a0, 1972
-; RV32IMB-NEXT: li a1, 29
-; RV32IMB-NEXT: mul a0, a0, a1
+; RV32IMB-NEXT: sh1add a1, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a0, a0, a1
; RV32IMB-NEXT: addi a0, a0, 11
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_f1:
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: addi a0, a0, 1972
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mul a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: subw a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 11
; RV64IMB-NEXT: ret
%tmp0 = mul i32 %x, 29
@@ -428,16 +466,18 @@ define signext i32 @add_mul_combine_reject_f2(i32 signext %x) {
; RV32IMB-LABEL: add_mul_combine_reject_f2:
; RV32IMB: # %bb.0:
; RV32IMB-NEXT: addi a0, a0, 1972
-; RV32IMB-NEXT: li a1, 29
-; RV32IMB-NEXT: mul a0, a0, a1
+; RV32IMB-NEXT: sh1add a1, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a0, a0, a1
; RV32IMB-NEXT: addi a0, a0, 11
; RV32IMB-NEXT: ret
;
; RV64IMB-LABEL: add_mul_combine_reject_f2:
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: addi a0, a0, 1972
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mul a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: subw a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 11
; RV64IMB-NEXT: ret
%tmp0 = mul i32 %x, 29
@@ -449,10 +489,14 @@ define i64 @add_mul_combine_reject_f3(i64 %x) {
; RV32IMB-LABEL: add_mul_combine_reject_f3:
; RV32IMB: # %bb.0:
; RV32IMB-NEXT: li a2, 29
-; RV32IMB-NEXT: mul a1, a1, a2
-; RV32IMB-NEXT: mulhu a3, a0, a2
-; RV32IMB-NEXT: add a1, a3, a1
-; RV32IMB-NEXT: mul a2, a0, a2
+; RV32IMB-NEXT: mulhu a2, a0, a2
+; RV32IMB-NEXT: sh1add a3, a1, a1
+; RV32IMB-NEXT: slli a1, a1, 5
+; RV32IMB-NEXT: sub a1, a1, a3
+; RV32IMB-NEXT: add a1, a2, a1
+; RV32IMB-NEXT: sh1add a2, a0, a0
+; RV32IMB-NEXT: slli a0, a0, 5
+; RV32IMB-NEXT: sub a2, a0, a2
; RV32IMB-NEXT: lui a0, 14
; RV32IMB-NEXT: addi a0, a0, -145
; RV32IMB-NEXT: add a0, a2, a0
@@ -463,8 +507,9 @@ define i64 @add_mul_combine_reject_f3(i64 %x) {
; RV64IMB-LABEL: add_mul_combine_reject_f3:
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: addi a0, a0, 1972
-; RV64IMB-NEXT: li a1, 29
-; RV64IMB-NEXT: mul a0, a0, a1
+; RV64IMB-NEXT: sh1add a1, a0, a0
+; RV64IMB-NEXT: slli a0, a0, 5
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addi a0, a0, 11
; RV64IMB-NEXT: ret
%tmp0 = mul i64 %x, 29
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index b4c80b60e0bad5..3b32112d975273 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -568,31 +568,52 @@ define i64 @mul96(i64 %a) {
}
define i64 @mul119(i64 %a) {
-; CHECK-LABEL: mul119:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 119
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: mul119:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 119
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul119:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh3add a1, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 7
+; RV64ZBA-NEXT: sub a0, a0, a1
+; RV64ZBA-NEXT: ret
%c = mul i64 %a, 119
ret i64 %c
}
define i64 @mul123(i64 %a) {
-; CHECK-LABEL: mul123:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 123
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: mul123:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 123
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul123:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh2add a1, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 7
+; RV64ZBA-NEXT: sub a0, a0, a1
+; RV64ZBA-NEXT: ret
%c = mul i64 %a, 123
ret i64 %c
}
define i64 @mul125(i64 %a) {
-; CHECK-LABEL: mul125:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 125
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: ret
+; RV64I-LABEL: mul125:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 125
+; RV64I-NEXT: mul a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: mul125:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: sh1add a1, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 7
+; RV64ZBA-NEXT: sub a0, a0, a1
+; RV64ZBA-NEXT: ret
%c = mul i64 %a, 125
ret i64 %c
}
|
@@ -13447,7 +13447,6 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, | |||
case 21: | |||
case 25: | |||
case 27: | |||
case 29: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is it removed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It was a mistake to start with. We don't actually have a TD pattern for 29.
; RV32IMB-NEXT: mulhu a3, a0, a2 | ||
; RV32IMB-NEXT: add a1, a3, a1 | ||
; RV32IMB-NEXT: mul a2, a0, a2 | ||
; RV32IMB-NEXT: mulhu a2, a0, a2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We may have latency=2 multiply instructions on a high performance RV64 core, so this may be a regression. But this is an RV32 test (multiply won't be so fast), so I just leave this comment for the record.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FYI, Syntacore SCR1 has single-cycle integer multiplier :(
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A couple things here.
First, I agree that we are likely to want a per processor cost based threshold here. The only question for me is order of work and relative importance, not net result.
I wasn't aware SCR1 has a single cycle latency. I glanced at the schedule model for that just now, and given the only latencies set appear to be div/rem, I have to ask. Are you sure this is actually true?
Second, I'll argue that instruction count != latency. In this particular case, we have two independent instructions followed by a dependent subtract. On any core with multiple issue and single cycle shl/add, I'd expect that to have a latency of 2. Obviously, this argument can be taken to a ridiculous extreme, but it isn't obvious to me that a core with single cycle mul only wants single cycle expansions. The SCR1 might be a weird example here - in the schedule model it looks like there's only one ALU? Even then, that's not solely a function of latency.
For comparison, gcc appears to be defaulting to an expansion budget of at least four instructions for rv64gc. (example: https://godbolt.org/z/zzvqfha8f)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not opposed to this optimization.
For high performance core (the latency will be 3-4 normally, 2 is too extreme), the units for simple ALU should be more than multiplier units, so this optimization should be an uplift.
For RV32 cores, which are normally single-issue with fewer units, the latency of the multiplier can be very small because of the low clock frequency (if you agree), so I think this optimization will be a regression there.
First, I agree that we are likely to want a per processor cost based threshold here.
For comparison, gcc appears to be defaulting to an expansion budget of at least four instructions for rv64gc. (example: https://godbolt.org/z/zzvqfha8f)
Yeah, I agree too. Actually this is what GCC is doing. The cost of mul is 2 for generic OoO, 1 for size, 4 for rocket and others (https://godbolt.org/z/8nvdbo5s1)
https://github.com/gcc-mirror/gcc/blob/58a0b190a256bd2a184554de0fae0031a614ec67/gcc/config/riscv/riscv.cc#L275-L292
We may add similar subtarget feature in LLVM too.
There's a bug here I need to fix, will post a revised version later this morning. |
At i64, this discards the carry bit, and thus any power of two detection only considers the bottom 64 in the result. I believe this is the correct result. For i32, the temporary is 64 bits, so an i32 is sign extended in the temporary, and this reduces to the i64 case.
Actually, it appears the original code was correct and I had a redundant check instead. Overflow logic makes my head hurt, so a skeptical second look at this is appreciated. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
JFYI, this needs a significant rebase. I'm going to wait until #89789 lands, and then revise this to use SHL_ADD directly. |
We can expand these as the three instruction sequence: (sub (shl X, C1), (shXadd X, x))
Note: This is a companion to https://github.com/llvm/llvm-project/pull/88983/files which handles the 2^N - 2^M case, but neither is stacked on the other. Whichever lands second needs to be rebased to adjust the shadd check.