Skip to content

Commit

Permalink
[DAGCombiner] Optimize SMULO/UMULO if we can prove that overflow is impossible.
Browse files Browse the repository at this point in the history

Using ComputeNumSignBits or computeKnownBits we might be able
to determine that overflow is impossible.

This especially helps after type legalization if the type was
promoted from a type with half the bits or more. Type legalization
conservatively creates a promoted smulo/umulo and an overflow
check for the promoted bits. The overflow from the promoted
smulo/umulo is ORed with the result of the promoted bits
overflow check. Proving that the promoted smulo/umulo can never
overflow will leave us with just the promoted bits overflow check.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D97160
  • Loading branch information
topperc committed Feb 26, 2021
1 parent 07de084 commit eea53b1
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 371 deletions.
22 changes: 22 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4660,6 +4660,28 @@ SDValue DAGCombiner::visitMULO(SDNode *N) {
return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL,
N->getVTList(), N0, N0);

if (IsSigned) {
// Multiplying n * m significant bits yields a result of n + m significant
// bits. If the total number of significant bits does not exceed the
// result bit width (minus 1), there is no overflow.
unsigned SignBits = DAG.ComputeNumSignBits(N0);
if (SignBits > 1)
SignBits += DAG.ComputeNumSignBits(N1);
if (SignBits > VT.getScalarSizeInBits() + 1)
return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
DAG.getConstant(0, DL, CarryVT));
} else {
KnownBits N1Known = DAG.computeKnownBits(N1);
if (N1Known.Zero.getBoolValue()) {
KnownBits N0Known = DAG.computeKnownBits(N0);
bool Overflow;
(void)N0Known.getMaxValue().umul_ov(N1Known.getMaxValue(), Overflow);
if (!Overflow)
return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
DAG.getConstant(0, DL, CarryVT));
}
}

return SDValue();
}

Expand Down
18 changes: 7 additions & 11 deletions llvm/test/CodeGen/AArch64/vec_umulo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -294,21 +294,17 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; CHECK-NEXT: movi v2.4h, #1
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: umov w9, v2.h[1]
; CHECK-NEXT: umov w8, v2.h[0]
; CHECK-NEXT: mul v1.4h, v0.4h, v1.4h
; CHECK-NEXT: umov w9, v1.h[1]
; CHECK-NEXT: umov w8, v1.h[0]
; CHECK-NEXT: and w9, w9, #0x1
; CHECK-NEXT: shrn v1.4h, v0.4s, #16
; CHECK-NEXT: bfi w8, w9, #1, #1
; CHECK-NEXT: umov w9, v2.h[2]
; CHECK-NEXT: cmeq v0.4h, v1.4h, #0
; CHECK-NEXT: ushr v1.4h, v2.4h, #1
; CHECK-NEXT: umov w9, v1.h[2]
; CHECK-NEXT: and w9, w9, #0x1
; CHECK-NEXT: cmtst v1.4h, v1.4h, v1.4h
; CHECK-NEXT: ushr v0.4h, v1.4h, #1
; CHECK-NEXT: bfi w8, w9, #2, #1
; CHECK-NEXT: umov w9, v2.h[3]
; CHECK-NEXT: orn v0.8b, v1.8b, v0.8b
; CHECK-NEXT: umov w9, v1.h[3]
; CHECK-NEXT: cmtst v0.4h, v0.4h, v0.4h
; CHECK-NEXT: bfi w8, w9, #3, #29
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: and w8, w8, #0xf
Expand Down
54 changes: 10 additions & 44 deletions llvm/test/CodeGen/RISCV/xaluo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -425,12 +425,8 @@ define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
; RV64-NEXT: sext.w a1, a1
; RV64-NEXT: sext.w a0, a0
; RV64-NEXT: mul a3, a0, a1
; RV64-NEXT: mulw a4, a0, a1
; RV64-NEXT: xor a4, a4, a3
; RV64-NEXT: mulh a0, a0, a1
; RV64-NEXT: srai a1, a3, 63
; RV64-NEXT: xor a0, a0, a1
; RV64-NEXT: or a0, a4, a0
; RV64-NEXT: mulw a0, a0, a1
; RV64-NEXT: xor a0, a0, a3
; RV64-NEXT: snez a0, a0
; RV64-NEXT: sw a3, 0(a2)
; RV64-NEXT: ret
Expand Down Expand Up @@ -459,12 +455,8 @@ define zeroext i1 @smulo2.i32(i32 %v1, i32* %res) {
; RV64-NEXT: sext.w a0, a0
; RV64-NEXT: addi a2, zero, 13
; RV64-NEXT: mul a3, a0, a2
; RV64-NEXT: mulw a4, a0, a2
; RV64-NEXT: xor a4, a4, a3
; RV64-NEXT: mulh a0, a0, a2
; RV64-NEXT: srai a2, a3, 63
; RV64-NEXT: xor a0, a0, a2
; RV64-NEXT: or a0, a4, a0
; RV64-NEXT: mulw a0, a0, a2
; RV64-NEXT: xor a0, a0, a3
; RV64-NEXT: snez a0, a0
; RV64-NEXT: sw a3, 0(a1)
; RV64-NEXT: ret
Expand Down Expand Up @@ -575,10 +567,8 @@ define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
; RV64-NEXT: srli a1, a1, 32
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: mulhu a3, a0, a1
; RV64-NEXT: mul a1, a0, a1
; RV64-NEXT: srli a0, a1, 32
; RV64-NEXT: or a0, a0, a3
; RV64-NEXT: snez a0, a0
; RV64-NEXT: sw a1, 0(a2)
; RV64-NEXT: ret
Expand Down Expand Up @@ -606,10 +596,8 @@ define zeroext i1 @umulo2.i32(i32 %v1, i32* %res) {
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: addi a2, zero, 13
; RV64-NEXT: mulhu a3, a0, a2
; RV64-NEXT: mul a2, a0, a2
; RV64-NEXT: srli a0, a2, 32
; RV64-NEXT: or a0, a0, a3
; RV64-NEXT: snez a0, a0
; RV64-NEXT: sw a2, 0(a1)
; RV64-NEXT: ret
Expand Down Expand Up @@ -1209,14 +1197,8 @@ define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: sext.w a2, a1
; RV64-NEXT: sext.w a3, a0
; RV64-NEXT: mul a4, a3, a2
; RV64-NEXT: mulw a5, a3, a2
; RV64-NEXT: xor a5, a5, a4
; RV64-NEXT: mulh a2, a3, a2
; RV64-NEXT: srai a3, a4, 63
; RV64-NEXT: xor a2, a2, a3
; RV64-NEXT: or a2, a5, a2
; RV64-NEXT: snez a2, a2
; RV64-NEXT: bnez a2, .LBB38_2
; RV64-NEXT: mulw a2, a3, a2
; RV64-NEXT: bne a2, a4, .LBB38_2
; RV64-NEXT: # %bb.1: # %entry
; RV64-NEXT: mv a0, a1
; RV64-NEXT: .LBB38_2: # %entry
Expand All @@ -1243,12 +1225,8 @@ define i1 @smulo.not.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: sext.w a1, a1
; RV64-NEXT: sext.w a0, a0
; RV64-NEXT: mul a2, a0, a1
; RV64-NEXT: mulw a3, a0, a1
; RV64-NEXT: xor a3, a3, a2
; RV64-NEXT: mulh a0, a0, a1
; RV64-NEXT: srai a1, a2, 63
; RV64-NEXT: xor a0, a0, a1
; RV64-NEXT: or a0, a3, a0
; RV64-NEXT: mulw a0, a0, a1
; RV64-NEXT: xor a0, a0, a2
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: ret
entry:
Expand Down Expand Up @@ -1363,11 +1341,8 @@ define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: srli a2, a2, 32
; RV64-NEXT: slli a3, a0, 32
; RV64-NEXT: srli a3, a3, 32
; RV64-NEXT: mulhu a4, a3, a2
; RV64-NEXT: mul a2, a3, a2
; RV64-NEXT: srli a2, a2, 32
; RV64-NEXT: or a2, a2, a4
; RV64-NEXT: snez a2, a2
; RV64-NEXT: bnez a2, .LBB42_2
; RV64-NEXT: # %bb.1: # %entry
; RV64-NEXT: mv a0, a1
Expand All @@ -1393,10 +1368,8 @@ define i1 @umulo.not.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: srli a1, a1, 32
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: mulhu a2, a0, a1
; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: or a0, a0, a2
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: ret
entry:
Expand Down Expand Up @@ -1843,13 +1816,8 @@ define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: sext.w a1, a1
; RV64-NEXT: sext.w a0, a0
; RV64-NEXT: mul a2, a0, a1
; RV64-NEXT: mulw a3, a0, a1
; RV64-NEXT: xor a3, a3, a2
; RV64-NEXT: mulh a0, a0, a1
; RV64-NEXT: srai a1, a2, 63
; RV64-NEXT: xor a0, a0, a1
; RV64-NEXT: or a0, a3, a0
; RV64-NEXT: beqz a0, .LBB54_2
; RV64-NEXT: mulw a0, a0, a1
; RV64-NEXT: beq a0, a2, .LBB54_2
; RV64-NEXT: # %bb.1: # %overflow
; RV64-NEXT: mv a0, zero
; RV64-NEXT: ret
Expand Down Expand Up @@ -1984,10 +1952,8 @@ define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
; RV64-NEXT: srli a1, a1, 32
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: mulhu a2, a0, a1
; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: or a0, a0, a2
; RV64-NEXT: beqz a0, .LBB57_2
; RV64-NEXT: # %bb.1: # %overflow
; RV64-NEXT: mv a0, zero
Expand Down
Loading

0 comments on commit eea53b1

Please sign in to comment.