-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[DAG] Fold mul 0 -> 0 when expanding mul into parts. #168780
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-risc-v Author: David Green (davemgreen) Changes: If the upper bits are zero, but we expand multiply then immediately convert the multiply into a libcall, there is no opportunity to optimize away the mul. Do so manually to make sure extending multiplies optimise cleanly. Full diff: https://github.com/llvm/llvm-project/pull/168780.diff 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index bb64f4ee70280..f1924a8900044 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11095,10 +11095,14 @@ void TargetLowering::forceExpandMultiply(SelectionDAG &DAG, const SDLoc &dl,
// If HiLHS and HiRHS are set, multiply them by the opposite low part and add
// the products to Hi.
if (HiLHS) {
- Hi = DAG.getNode(ISD::ADD, dl, VT, Hi,
- DAG.getNode(ISD::ADD, dl, VT,
- DAG.getNode(ISD::MUL, dl, VT, HiRHS, LHS),
- DAG.getNode(ISD::MUL, dl, VT, RHS, HiLHS)));
+ Hi = DAG.getNode(
+ ISD::ADD, dl, VT, Hi,
+ DAG.getNode(
+ ISD::ADD, dl, VT,
+ isNullConstant(HiRHS) ? HiRHS
+ : DAG.getNode(ISD::MUL, dl, VT, HiRHS, LHS),
+ isNullConstant(HiLHS) ? HiLHS
+ : DAG.getNode(ISD::MUL, dl, VT, RHS, HiLHS)));
}
}
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 4533e14c672e7..d691b1c278a48 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1829,67 +1829,53 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
; RV32I-NEXT: sw s5, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s2, a3
-; RV32I-NEXT: mv s3, a2
-; RV32I-NEXT: mv s0, a1
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s0, a3
+; RV32I-NEXT: mv s1, a2
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: srai s4, a3, 31
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: li a3, 0
; RV32I-NEXT: call __muldi3
; RV32I-NEXT: mv s5, a1
-; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: mv a0, s2
; RV32I-NEXT: li a1, 0
-; RV32I-NEXT: mv a2, s3
+; RV32I-NEXT: mv a2, s1
; RV32I-NEXT: li a3, 0
; RV32I-NEXT: call __muldi3
; RV32I-NEXT: add s5, a0, s5
; RV32I-NEXT: sltu a0, s5, a0
-; RV32I-NEXT: add s7, a1, a0
-; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: add s6, a1, a0
+; RV32I-NEXT: mv a0, s3
; RV32I-NEXT: li a1, 0
-; RV32I-NEXT: mv a2, s2
+; RV32I-NEXT: mv a2, s0
; RV32I-NEXT: li a3, 0
; RV32I-NEXT: call __muldi3
; RV32I-NEXT: add s5, a0, s5
; RV32I-NEXT: sltu a0, s5, a0
; RV32I-NEXT: add a0, a1, a0
-; RV32I-NEXT: add s8, s7, a0
-; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: add s5, s6, a0
+; RV32I-NEXT: mv a0, s2
; RV32I-NEXT: li a1, 0
-; RV32I-NEXT: mv a2, s2
+; RV32I-NEXT: mv a2, s0
; RV32I-NEXT: li a3, 0
; RV32I-NEXT: call __muldi3
-; RV32I-NEXT: mv s5, a0
-; RV32I-NEXT: mv s6, a1
-; RV32I-NEXT: add s9, a0, s8
-; RV32I-NEXT: mv a0, s3
-; RV32I-NEXT: mv a1, s2
-; RV32I-NEXT: li a2, 0
-; RV32I-NEXT: li a3, 0
-; RV32I-NEXT: call __muldi3
-; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s1, a1
+; RV32I-NEXT: add s7, a0, s5
; RV32I-NEXT: mv a0, s4
; RV32I-NEXT: mv a1, s4
-; RV32I-NEXT: mv a2, s1
-; RV32I-NEXT: mv a3, s0
+; RV32I-NEXT: mv a2, s3
+; RV32I-NEXT: mv a3, s2
; RV32I-NEXT: call __muldi3
-; RV32I-NEXT: add s2, a0, s2
-; RV32I-NEXT: sltu a3, s9, s5
-; RV32I-NEXT: sltu a4, s8, s7
-; RV32I-NEXT: add a1, a1, s3
-; RV32I-NEXT: add a2, s9, s2
-; RV32I-NEXT: add a4, s6, a4
-; RV32I-NEXT: sltu a0, s2, a0
-; RV32I-NEXT: sltu a5, a2, s9
-; RV32I-NEXT: add a3, a4, a3
-; RV32I-NEXT: add a0, a1, a0
-; RV32I-NEXT: add a0, a3, a0
-; RV32I-NEXT: add a1, a0, a5
-; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: add a0, s7, a0
+; RV32I-NEXT: sltu a2, s7, s0
+; RV32I-NEXT: sltu a3, s5, s6
+; RV32I-NEXT: sltu a4, a0, s7
+; RV32I-NEXT: add a3, s1, a3
+; RV32I-NEXT: add a2, a3, a2
+; RV32I-NEXT: add a1, a2, a1
+; RV32I-NEXT: add a1, a1, a4
; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
@@ -1899,8 +1885,6 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
; RV32I-NEXT: lw s5, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s6, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s7, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 4(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 48
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
index 9b5fa1c2bc811..94080c02ded80 100644
--- a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
@@ -10,11 +10,11 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; THUMBV6-NEXT: sub sp, #60
; THUMBV6-NEXT: mov r6, r3
; THUMBV6-NEXT: mov r1, r2
-; THUMBV6-NEXT: str r2, [sp, #52] @ 4-byte Spill
+; THUMBV6-NEXT: str r2, [sp, #36] @ 4-byte Spill
; THUMBV6-NEXT: mov r4, r0
-; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT: str r0, [sp, #48] @ 4-byte Spill
; THUMBV6-NEXT: ldr r2, [sp, #88]
-; THUMBV6-NEXT: str r2, [sp, #48] @ 4-byte Spill
+; THUMBV6-NEXT: str r2, [sp, #56] @ 4-byte Spill
; THUMBV6-NEXT: movs r5, #0
; THUMBV6-NEXT: mov r0, r1
; THUMBV6-NEXT: mov r1, r5
@@ -23,21 +23,21 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; THUMBV6-NEXT: str r1, [sp, #28] @ 4-byte Spill
; THUMBV6-NEXT: str r0, [r4]
; THUMBV6-NEXT: ldr r2, [sp, #96]
-; THUMBV6-NEXT: str r2, [sp, #36] @ 4-byte Spill
+; THUMBV6-NEXT: str r2, [sp, #40] @ 4-byte Spill
; THUMBV6-NEXT: mov r4, r6
-; THUMBV6-NEXT: str r6, [sp, #56] @ 4-byte Spill
+; THUMBV6-NEXT: str r6, [sp, #44] @ 4-byte Spill
; THUMBV6-NEXT: mov r0, r6
; THUMBV6-NEXT: mov r1, r5
; THUMBV6-NEXT: mov r3, r5
; THUMBV6-NEXT: bl __aeabi_lmul
-; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT: str r0, [sp, #52] @ 4-byte Spill
; THUMBV6-NEXT: mov r7, r1
; THUMBV6-NEXT: subs r0, r1, #1
; THUMBV6-NEXT: sbcs r7, r0
; THUMBV6-NEXT: ldr r0, [sp, #100]
; THUMBV6-NEXT: str r0, [sp, #32] @ 4-byte Spill
; THUMBV6-NEXT: mov r1, r5
-; THUMBV6-NEXT: ldr r6, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r6, [sp, #36] @ 4-byte Reload
; THUMBV6-NEXT: mov r2, r6
; THUMBV6-NEXT: mov r3, r5
; THUMBV6-NEXT: bl __aeabi_lmul
@@ -53,10 +53,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; THUMBV6-NEXT: ands r4, r3
; THUMBV6-NEXT: orrs r4, r1
; THUMBV6-NEXT: orrs r4, r7
-; THUMBV6-NEXT: ldr r0, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r0, [sp, #52] @ 4-byte Reload
; THUMBV6-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; THUMBV6-NEXT: adds r7, r1, r0
-; THUMBV6-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r0, [sp, #40] @ 4-byte Reload
; THUMBV6-NEXT: mov r1, r5
; THUMBV6-NEXT: mov r2, r6
; THUMBV6-NEXT: mov r3, r5
@@ -69,7 +69,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; THUMBV6-NEXT: orrs r0, r4
; THUMBV6-NEXT: str r0, [sp, #16] @ 4-byte Spill
; THUMBV6-NEXT: ldr r0, [sp, #92]
-; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT: str r0, [sp, #52] @ 4-byte Spill
; THUMBV6-NEXT: ldr r7, [sp, #80]
; THUMBV6-NEXT: mov r1, r5
; THUMBV6-NEXT: mov r2, r7
@@ -82,13 +82,13 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; THUMBV6-NEXT: ldr r6, [sp, #84]
; THUMBV6-NEXT: mov r0, r6
; THUMBV6-NEXT: mov r1, r5
-; THUMBV6-NEXT: ldr r2, [sp, #48] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload
; THUMBV6-NEXT: mov r3, r5
; THUMBV6-NEXT: bl __aeabi_lmul
; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill
; THUMBV6-NEXT: subs r2, r1, #1
; THUMBV6-NEXT: sbcs r1, r2
-; THUMBV6-NEXT: ldr r3, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r3, [sp, #52] @ 4-byte Reload
; THUMBV6-NEXT: subs r2, r3, #1
; THUMBV6-NEXT: sbcs r3, r2
; THUMBV6-NEXT: str r6, [sp, #8] @ 4-byte Spill
@@ -99,21 +99,17 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; THUMBV6-NEXT: orrs r6, r4
; THUMBV6-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; THUMBV6-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
-; THUMBV6-NEXT: adds r0, r1, r0
-; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; THUMBV6-NEXT: adds r4, r1, r0
; THUMBV6-NEXT: mov r0, r7
; THUMBV6-NEXT: mov r1, r5
-; THUMBV6-NEXT: ldr r4, [sp, #48] @ 4-byte Reload
-; THUMBV6-NEXT: mov r2, r4
+; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload
; THUMBV6-NEXT: mov r3, r5
; THUMBV6-NEXT: bl __aeabi_lmul
-; THUMBV6-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; THUMBV6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; THUMBV6-NEXT: adds r0, r1, r0
+; THUMBV6-NEXT: adds r4, r1, r4
; THUMBV6-NEXT: mov r1, r5
; THUMBV6-NEXT: adcs r1, r5
; THUMBV6-NEXT: orrs r1, r6
-; THUMBV6-NEXT: ldr r3, [sp, #36] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r3, [sp, #40] @ 4-byte Reload
; THUMBV6-NEXT: ldr r2, [sp, #32] @ 4-byte Reload
; THUMBV6-NEXT: orrs r3, r2
; THUMBV6-NEXT: subs r2, r3, #1
@@ -127,68 +123,44 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; THUMBV6-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; THUMBV6-NEXT: orrs r7, r1
; THUMBV6-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
-; THUMBV6-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
-; THUMBV6-NEXT: adds r1, r2, r1
-; THUMBV6-NEXT: str r1, [sp, #32] @ 4-byte Spill
-; THUMBV6-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
-; THUMBV6-NEXT: adcs r0, r1
-; THUMBV6-NEXT: str r0, [sp, #36] @ 4-byte Spill
-; THUMBV6-NEXT: ldr r0, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT: adds r0, r0, r1
+; THUMBV6-NEXT: str r0, [sp, #32] @ 4-byte Spill
+; THUMBV6-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; THUMBV6-NEXT: adcs r4, r0
+; THUMBV6-NEXT: str r4, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT: ldr r0, [sp, #44] @ 4-byte Reload
; THUMBV6-NEXT: mov r1, r5
-; THUMBV6-NEXT: mov r2, r4
+; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload
; THUMBV6-NEXT: mov r3, r5
; THUMBV6-NEXT: bl __aeabi_lmul
; THUMBV6-NEXT: mov r4, r1
; THUMBV6-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; THUMBV6-NEXT: adds r6, r0, r1
; THUMBV6-NEXT: adcs r4, r5
-; THUMBV6-NEXT: ldr r0, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
; THUMBV6-NEXT: mov r1, r5
-; THUMBV6-NEXT: ldr r2, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r2, [sp, #52] @ 4-byte Reload
; THUMBV6-NEXT: mov r3, r5
; THUMBV6-NEXT: bl __aeabi_lmul
; THUMBV6-NEXT: adds r0, r0, r6
-; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r2, [sp, #48] @ 4-byte Reload
; THUMBV6-NEXT: str r0, [r2, #4]
; THUMBV6-NEXT: adcs r1, r5
-; THUMBV6-NEXT: adds r0, r4, r1
-; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill
+; THUMBV6-NEXT: adds r4, r4, r1
; THUMBV6-NEXT: mov r6, r5
; THUMBV6-NEXT: adcs r6, r5
-; THUMBV6-NEXT: ldr r0, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r0, [sp, #44] @ 4-byte Reload
; THUMBV6-NEXT: mov r1, r5
-; THUMBV6-NEXT: ldr r4, [sp, #44] @ 4-byte Reload
-; THUMBV6-NEXT: mov r2, r4
+; THUMBV6-NEXT: ldr r2, [sp, #52] @ 4-byte Reload
; THUMBV6-NEXT: mov r3, r5
; THUMBV6-NEXT: bl __aeabi_lmul
-; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload
-; THUMBV6-NEXT: adds r0, r0, r2
-; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill
+; THUMBV6-NEXT: adds r0, r0, r4
; THUMBV6-NEXT: adcs r1, r6
-; THUMBV6-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; THUMBV6-NEXT: ldr r0, [sp, #48] @ 4-byte Reload
-; THUMBV6-NEXT: mov r1, r4
-; THUMBV6-NEXT: mov r2, r5
-; THUMBV6-NEXT: mov r3, r5
-; THUMBV6-NEXT: bl __aeabi_lmul
-; THUMBV6-NEXT: mov r6, r0
-; THUMBV6-NEXT: mov r4, r1
-; THUMBV6-NEXT: ldr r0, [sp, #52] @ 4-byte Reload
-; THUMBV6-NEXT: ldr r1, [sp, #56] @ 4-byte Reload
-; THUMBV6-NEXT: mov r2, r5
-; THUMBV6-NEXT: mov r3, r5
-; THUMBV6-NEXT: bl __aeabi_lmul
-; THUMBV6-NEXT: adds r0, r0, r6
-; THUMBV6-NEXT: adcs r1, r4
-; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload
-; THUMBV6-NEXT: adds r0, r2, r0
-; THUMBV6-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
-; THUMBV6-NEXT: adcs r1, r2
; THUMBV6-NEXT: ldr r2, [sp, #32] @ 4-byte Reload
; THUMBV6-NEXT: adds r0, r0, r2
-; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r2, [sp, #48] @ 4-byte Reload
; THUMBV6-NEXT: str r0, [r2, #8]
-; THUMBV6-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
+; THUMBV6-NEXT: ldr r0, [sp, #40] @ 4-byte Reload
; THUMBV6-NEXT: adcs r1, r0
; THUMBV6-NEXT: str r1, [r2, #12]
; THUMBV6-NEXT: adcs r5, r5
|
| ISD::ADD, dl, VT, Hi, | ||
| DAG.getNode( | ||
| ISD::ADD, dl, VT, | ||
| isNullConstant(HiRHS) ? HiRHS |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Isn't there a simplify function you can call instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the impact if we had a zero check to getNode like we do for ISD::AND/OR/XOR/ADD/SUB?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1 I'd much prefer we just added similar N2CV->isZero() handling to what we do for ISD::AND
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, see the first patch in the series. I will see about fixing the issues that went wrong doing it that way.
🐧 Linux x64 Test Results
|
| ; RV32I-NEXT: mv a2, s1 | ||
| ; RV32I-NEXT: mv a3, s0 | ||
| ; RV32I-NEXT: mv a2, s3 | ||
| ; RV32I-NEXT: mv a3, s2 | ||
| ; RV32I-NEXT: call __muldi3 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that this multiplies {s2, s3} by s4, which is either 0 or -1. This is another opportunity for optimization: the multiplication is equivalent to {-s2 & s4, -s3 & s4}.
https://alive2.llvm.org/ce/z/C7SJX9
This didn't work well on some tests, but they can be fixed with some better tablegen patterns.
2f655fe to
2e3cc1e
Compare
If the upper bits are zero, but we expand multiply then immediately convert the multiply into a libcall, there is no opportunity to optimize away the mul. Do so in getNode to make sure extending multiplies optimise cleanly.
This didn't work well on some tests, but they can (mostly) be fixed with some better tablegen patterns.