[DAGCombiner][AArch64][VE] Teach BuildUDIV/SDIV to use 2x mul when mu…

…lh/mul_lohi are not available. Correct the legality of i32 mul_lohi on AArch64. Previously, AArch64 incorrectly reported i32 mul_lohi as Legal. This allowed BuildUDIV/SDIV to use them. A later DAGCombiner run would replace them with MULHS/MULHU because only the high half was used. This conversion does not check the legality of MULHS/MULHU, under the assumption that LegalizeDAG can turn them back into MUL_LOHI later. After they were converted to MULHS/MULHU, DAGCombine ran again and saw that these operations aren't supported but an i64 MUL is, so they were converted to that plus a shift. Without that conversion, LegalizeDAG would convert them back to MUL_LOHI and isel would fail to find a pattern. This patch teaches BuildUDIV/SDIV to create the wide mul and shift directly so that we can report the correct operation legality on AArch64. It also enables div-by-constant folding for more cases on VE. I don't know if VE wants this div-by-constant optimization or not. If they don't want it, they can use the isIntDivCheap hook to disable it. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D150333
llvm · May 12, 2023 · a983ef2 · a983ef2
1 parent 2da2995
commit a983ef2
Show file tree

Hide file tree

Showing 5 changed files with 70 additions and 19 deletions.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5990,6 +5990,19 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
           DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
       return SDValue(LoHi.getNode(), 1);
     }
+    // If a type twice as wide is legal, widen and use a mul plus a shift.
+    if (!VT.isVector()) {
+      unsigned Size = VT.getSizeInBits();
+      EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2);
+      if (isOperationLegal(ISD::MUL, WideVT)) {
+        X = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, X);
+        Y = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Y);
+        Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y);
+        Y = DAG.getNode(ISD::SRL, dl, WideVT, Y,
+                        DAG.getShiftAmountConstant(EltBits, WideVT, dl));
+        return DAG.getNode(ISD::TRUNCATE, dl, VT, Y);
+      }
+    }
     return SDValue();
   };
 
@@ -6163,6 +6176,19 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
           DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
       return SDValue(LoHi.getNode(), 1);
     }
+    // If a type twice as wide is legal, widen and use a mul plus a shift.
+    if (!VT.isVector()) {
+      unsigned Size = VT.getSizeInBits();
+      EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2);
+      if (isOperationLegal(ISD::MUL, WideVT)) {
+        X = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, X);
+        Y = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Y);
+        Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y);
+        Y = DAG.getNode(ISD::SRL, dl, WideVT, Y,
+                        DAG.getShiftAmountConstant(EltBits, WideVT, dl));
+        return DAG.getNode(ISD::TRUNCATE, dl, VT, Y);
+      }
+    }
     return SDValue(); // No mulhu or equivalent
   };
 

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -580,6 +580,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::MULHS, MVT::i32, Expand);
 
   // AArch64 doesn't have {U|S}MUL_LOHI.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 

diff --git a/llvm/test/CodeGen/VE/Scalar/div.ll b/llvm/test/CodeGen/VE/Scalar/div.ll
@@ -149,7 +149,11 @@ define i64 @divi64ri(i64 %a, i64 %b) {
 define signext i32 @divi32ri(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: divi32ri:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    divs.w.sx %s0, %s0, (62)0
+; CHECK-NEXT:    lea %s1, 1431655766
+; CHECK-NEXT:    muls.l %s0, %s0, %s1
+; CHECK-NEXT:    srl %s1, %s0, 63
+; CHECK-NEXT:    srl %s0, %s0, 32
+; CHECK-NEXT:    adds.w.sx %s0, %s0, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %r = sdiv i32 %a, 3
@@ -185,8 +189,10 @@ define i64 @divu64ri(i64 %a, i64 %b) {
 define zeroext i32 @divu32ri(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-LABEL: divu32ri:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    divu.w %s0, %s0, (62)0
-; CHECK-NEXT:    adds.w.zx %s0, %s0, (0)1
+; CHECK-NEXT:    lea %s1, -1431655765
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    muls.l %s0, %s0, %s1
+; CHECK-NEXT:    srl %s0, %s0, 33
 ; CHECK-NEXT:    b.l.t (, %s10)
   %r = udiv i32 %a, 3
   ret i32 %r

diff --git a/llvm/test/CodeGen/VE/Scalar/rem.ll b/llvm/test/CodeGen/VE/Scalar/rem.ll
@@ -165,7 +165,11 @@ define i64 @remi64ri(i64 %a) {
 define signext i32 @remi32ri(i32 signext %a) {
 ; CHECK-LABEL: remi32ri:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    divs.w.sx %s1, %s0, (62)0
+; CHECK-NEXT:    lea %s1, 1431655766
+; CHECK-NEXT:    muls.l %s1, %s0, %s1
+; CHECK-NEXT:    srl %s2, %s1, 63
+; CHECK-NEXT:    srl %s1, %s1, 32
+; CHECK-NEXT:    adds.w.sx %s1, %s1, %s2
 ; CHECK-NEXT:    muls.w.sx %s1, 3, %s1
 ; CHECK-NEXT:    subs.w.sx %s0, %s0, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
@@ -205,7 +209,10 @@ define i64 @remu64ri(i64 %a) {
 define zeroext i32 @remu32ri(i32 zeroext %a) {
 ; CHECK-LABEL: remu32ri:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    divu.w %s1, %s0, (62)0
+; CHECK-NEXT:    lea %s1, -1431655765
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    muls.l %s1, %s0, %s1
+; CHECK-NEXT:    srl %s1, %s1, 33
 ; CHECK-NEXT:    muls.w.sx %s1, 3, %s1
 ; CHECK-NEXT:    subs.w.sx %s0, %s0, %s1
 ; CHECK-NEXT:    adds.w.zx %s0, %s0, (0)1

diff --git a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
@@ -8,13 +8,18 @@ define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
 ; CHECK-LABEL: udiv_by_minus_one:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    and %s0, %s0, (56)0
+; CHECK-NEXT:    lea %s4, 16843010
+; CHECK-NEXT:    muls.l %s0, %s0, %s4
+; CHECK-NEXT:    srl %s0, %s0, 32
 ; CHECK-NEXT:    and %s1, %s1, (56)0
+; CHECK-NEXT:    muls.l %s1, %s1, %s4
+; CHECK-NEXT:    srl %s1, %s1, 32
 ; CHECK-NEXT:    and %s2, %s2, (56)0
+; CHECK-NEXT:    muls.l %s2, %s2, %s4
+; CHECK-NEXT:    srl %s2, %s2, 32
 ; CHECK-NEXT:    and %s3, %s3, (56)0
-; CHECK-NEXT:    divu.w %s3, %s3, (56)0
-; CHECK-NEXT:    divu.w %s2, %s2, (56)0
-; CHECK-NEXT:    divu.w %s1, %s1, (56)0
-; CHECK-NEXT:    divu.w %s0, %s0, (56)0
+; CHECK-NEXT:    muls.l %s3, %s3, %s4
+; CHECK-NEXT:    srl %s3, %s3, 32
 ; CHECK-NEXT:    b.l.t (, %s10)
   %r = udiv <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
   ret <4 x i8> %r
@@ -27,16 +32,21 @@ define <4 x i8> @urem_by_minus_one(<4 x i8> %x) {
 ; CHECK-NEXT:    and %s1, %s1, (56)0
 ; CHECK-NEXT:    and %s2, %s2, (56)0
 ; CHECK-NEXT:    and %s3, %s3, (56)0
-; CHECK-NEXT:    divu.w %s4, %s3, (56)0
-; CHECK-NEXT:    muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT:    subs.w.sx %s3, %s3, %s4
-; CHECK-NEXT:    divu.w %s4, %s2, (56)0
-; CHECK-NEXT:    muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT:    subs.w.sx %s2, %s2, %s4
-; CHECK-NEXT:    divu.w %s4, %s1, (56)0
-; CHECK-NEXT:    muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT:    subs.w.sx %s1, %s1, %s4
-; CHECK-NEXT:    divu.w %s4, %s0, (56)0
+; CHECK-NEXT:    lea %s4, 16843010
+; CHECK-NEXT:    muls.l %s5, %s3, %s4
+; CHECK-NEXT:    srl %s5, %s5, 32
+; CHECK-NEXT:    muls.w.sx %s5, %s5, (56)0
+; CHECK-NEXT:    subs.w.sx %s3, %s3, %s5
+; CHECK-NEXT:    muls.l %s5, %s2, %s4
+; CHECK-NEXT:    srl %s5, %s5, 32
+; CHECK-NEXT:    muls.w.sx %s5, %s5, (56)0
+; CHECK-NEXT:    subs.w.sx %s2, %s2, %s5
+; CHECK-NEXT:    muls.l %s5, %s1, %s4
+; CHECK-NEXT:    srl %s5, %s5, 32
+; CHECK-NEXT:    muls.w.sx %s5, %s5, (56)0
+; CHECK-NEXT:    subs.w.sx %s1, %s1, %s5
+; CHECK-NEXT:    muls.l %s4, %s0, %s4
+; CHECK-NEXT:    srl %s4, %s4, 32
 ; CHECK-NEXT:    muls.w.sx %s4, %s4, (56)0
 ; CHECK-NEXT:    subs.w.sx %s0, %s0, %s4
 ; CHECK-NEXT:    b.l.t (, %s10)