Skip to content

Commit

Permalink
[DAGCombiner] Optimize SMULO/UMULO if we can prove that overflow is impossible.
Browse files Browse the repository at this point in the history

Using ComputeNumSignBits or computeKnownBits we might be able
to determine that overflow is impossible.

This especially helps after type legalization if the type was
promoted from a type with half the bits or more. Type legalization
conservatively creates a promoted smulo/umulo and an overflow
check for the promoted bits. The overflow from the promoted
smulo/umulo is ORed with the result of the promoted bits
overflow check. Proving that the promoted smulo/umulo can never
overflow will leave us with just the promoted bits overflow check.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D97160
  • Loading branch information
topperc committed Feb 26, 2021
1 parent 07de084 commit eea53b1
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 371 deletions.
22 changes: 22 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4660,6 +4660,28 @@ SDValue DAGCombiner::visitMULO(SDNode *N) {
return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL,
N->getVTList(), N0, N0);

if (IsSigned) {
// Multiplying n * m significant bits yields a result of n + m significant
// bits. If the total number of significant bits does not exceed the
// result bit width (minus 1), there is no overflow.
unsigned SignBits = DAG.ComputeNumSignBits(N0);
if (SignBits > 1)
SignBits += DAG.ComputeNumSignBits(N1);
if (SignBits > VT.getScalarSizeInBits() + 1)
return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
DAG.getConstant(0, DL, CarryVT));
} else {
KnownBits N1Known = DAG.computeKnownBits(N1);
if (N1Known.Zero.getBoolValue()) {
KnownBits N0Known = DAG.computeKnownBits(N0);
bool Overflow;
(void)N0Known.getMaxValue().umul_ov(N1Known.getMaxValue(), Overflow);
if (!Overflow)
return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
DAG.getConstant(0, DL, CarryVT));
}
}

return SDValue();
}

Expand Down
18 changes: 7 additions & 11 deletions llvm/test/CodeGen/AArch64/vec_umulo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -294,21 +294,17 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; CHECK-NEXT: movi v2.4h, #1
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: umov w9, v2.h[1]
; CHECK-NEXT: umov w8, v2.h[0]
; CHECK-NEXT: mul v1.4h, v0.4h, v1.4h
; CHECK-NEXT: umov w9, v1.h[1]
; CHECK-NEXT: umov w8, v1.h[0]
; CHECK-NEXT: and w9, w9, #0x1
; CHECK-NEXT: shrn v1.4h, v0.4s, #16
; CHECK-NEXT: bfi w8, w9, #1, #1
; CHECK-NEXT: umov w9, v2.h[2]
; CHECK-NEXT: cmeq v0.4h, v1.4h, #0
; CHECK-NEXT: ushr v1.4h, v2.4h, #1
; CHECK-NEXT: umov w9, v1.h[2]
; CHECK-NEXT: and w9, w9, #0x1
; CHECK-NEXT: cmtst v1.4h, v1.4h, v1.4h
; CHECK-NEXT: ushr v0.4h, v1.4h, #1
; CHECK-NEXT: bfi w8, w9, #2, #1
; CHECK-NEXT: umov w9, v2.h[3]
; CHECK-NEXT: orn v0.8b, v1.8b, v0.8b
; CHECK-NEXT: umov w9, v1.h[3]
; CHECK-NEXT: cmtst v0.4h, v0.4h, v0.4h
; CHECK-NEXT: bfi w8, w9, #3, #29
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: and w8, w8, #0xf
Expand Down
54 changes: 10 additions & 44 deletions llvm/test/CodeGen/RISCV/xaluo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -425,12 +425,8 @@ define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
; RV64-NEXT: sext.w a1, a1
; RV64-NEXT: sext.w a0, a0
; RV64-NEXT: mul a3, a0, a1
; RV64-NEXT: mulw a4, a0, a1
; RV64-NEXT: xor a4, a4, a3
; RV64-NEXT: mulh a0, a0, a1
; RV64-NEXT: srai a1, a3, 63
; RV64-NEXT: xor a0, a0, a1
; RV64-NEXT: or a0, a4, a0
; RV64-NEXT: mulw a0, a0, a1
; RV64-NEXT: xor a0, a0, a3
; RV64-NEXT: snez a0, a0
; RV64-NEXT: sw a3, 0(a2)
; RV64-NEXT: ret
Expand Down Expand Up @@ -459,12 +455,8 @@ define zeroext i1 @smulo2.i32(i32 %v1, i32* %res) {
; RV64-NEXT: sext.w a0, a0
; RV64-NEXT: addi a2, zero, 13
; RV64-NEXT: mul a3, a0, a2
; RV64-NEXT: mulw a4, a0, a2
; RV64-NEXT: xor a4, a4, a3
; RV64-NEXT: mulh a0, a0, a2
; RV64-NEXT: srai a2, a3, 63
; RV64-NEXT: xor a0, a0, a2
; RV64-NEXT: or a0, a4, a0
; RV64-NEXT: mulw a0, a0, a2
; RV64-NEXT: xor a0, a0, a3
; RV64-NEXT: snez a0, a0
; RV64-NEXT: sw a3, 0(a1)
; RV64-NEXT: ret
Expand Down Expand Up @@ -575,10 +567,8 @@ define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
; RV64-NEXT: srli a1, a1, 32
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: mulhu a3, a0, a1
; RV64-NEXT: mul a1, a0, a1
; RV64-NEXT: srli a0, a1, 32
; RV64-NEXT: or a0, a0, a3
; RV64-NEXT: snez a0, a0
; RV64-NEXT: sw a1, 0(a2)
; RV64-NEXT: ret
Expand Down Expand Up @@ -606,10 +596,8 @@ define zeroext i1 @umulo2.i32(i32 %v1, i32* %res) {
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: addi a2, zero, 13
; RV64-NEXT: mulhu a3, a0, a2
; RV64-NEXT: mul a2, a0, a2
; RV64-NEXT: srli a0, a2, 32
; RV64-NEXT: or a0, a0, a3
; RV64-NEXT: snez a0, a0
; RV64-NEXT: sw a2, 0(a1)
; RV64-NEXT: ret
Expand Down Expand Up @@ -1209,14 +1197,8 @@ define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: sext.w a2, a1
; RV64-NEXT: sext.w a3, a0
; RV64-NEXT: mul a4, a3, a2
; RV64-NEXT: mulw a5, a3, a2
; RV64-NEXT: xor a5, a5, a4
; RV64-NEXT: mulh a2, a3, a2
; RV64-NEXT: srai a3, a4, 63
; RV64-NEXT: xor a2, a2, a3
; RV64-NEXT: or a2, a5, a2
; RV64-NEXT: snez a2, a2
; RV64-NEXT: bnez a2, .LBB38_2
; RV64-NEXT: mulw a2, a3, a2
; RV64-NEXT: bne a2, a4, .LBB38_2
; RV64-NEXT: # %bb.1: # %entry
; RV64-NEXT: mv a0, a1
; RV64-NEXT: .LBB38_2: # %entry
Expand All @@ -1243,12 +1225,8 @@ define i1 @smulo.not.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: sext.w a1, a1
; RV64-NEXT: sext.w a0, a0
; RV64-NEXT: mul a2, a0, a1
; RV64-NEXT: mulw a3, a0, a1
; RV64-NEXT: xor a3, a3, a2
; RV64-NEXT: mulh a0, a0, a1
; RV64-NEXT: srai a1, a2, 63
; RV64-NEXT: xor a0, a0, a1
; RV64-NEXT: or a0, a3, a0
; RV64-NEXT: mulw a0, a0, a1
; RV64-NEXT: xor a0, a0, a2
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: ret
entry:
Expand Down Expand Up @@ -1363,11 +1341,8 @@ define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: srli a2, a2, 32
; RV64-NEXT: slli a3, a0, 32
; RV64-NEXT: srli a3, a3, 32
; RV64-NEXT: mulhu a4, a3, a2
; RV64-NEXT: mul a2, a3, a2
; RV64-NEXT: srli a2, a2, 32
; RV64-NEXT: or a2, a2, a4
; RV64-NEXT: snez a2, a2
; RV64-NEXT: bnez a2, .LBB42_2
; RV64-NEXT: # %bb.1: # %entry
; RV64-NEXT: mv a0, a1
Expand All @@ -1393,10 +1368,8 @@ define i1 @umulo.not.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: srli a1, a1, 32
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: mulhu a2, a0, a1
; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: or a0, a0, a2
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: ret
entry:
Expand Down Expand Up @@ -1843,13 +1816,8 @@ define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: sext.w a1, a1
; RV64-NEXT: sext.w a0, a0
; RV64-NEXT: mul a2, a0, a1
; RV64-NEXT: mulw a3, a0, a1
; RV64-NEXT: xor a3, a3, a2
; RV64-NEXT: mulh a0, a0, a1
; RV64-NEXT: srai a1, a2, 63
; RV64-NEXT: xor a0, a0, a1
; RV64-NEXT: or a0, a3, a0
; RV64-NEXT: beqz a0, .LBB54_2
; RV64-NEXT: mulw a0, a0, a1
; RV64-NEXT: beq a0, a2, .LBB54_2
; RV64-NEXT: # %bb.1: # %overflow
; RV64-NEXT: mv a0, zero
; RV64-NEXT: ret
Expand Down Expand Up @@ -1984,10 +1952,8 @@ define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: srli a1, a1, 32
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: mulhu a2, a0, a1
; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: or a0, a0, a2
; RV64-NEXT: beqz a0, .LBB57_2
; RV64-NEXT: # %bb.1: # %overflow
; RV64-NEXT: mv a0, zero
Expand Down
Loading

0 comments on commit eea53b1

Please sign in to comment.