Merging r338658:

------------------------------------------------------------------------ r338658 | nemanjai | 2018-08-02 02:03:22 +0200 (Thu, 02 Aug 2018) | 13 lines [PowerPC] Do not round values prior to converting to integer Adding the FP_ROUND nodes when combining FP_TO_[SU]INT of elements feeding a BUILD_VECTOR into an FP_TO_[SU]INT of the built vector loses precision. This patch removes the code that adds these nodes to true f64 operands. It also adds patterns required to ensure the code is still vectorized rather than converting individual elements and inserting into a vector. Fixes https://bugs.llvm.org/show_bug.cgi?id=38342 Differential Revision: https://reviews.llvm.org/D50121 ------------------------------------------------------------------------ llvm-svn: 338678
llvm · Aug 2, 2018 · 63740db · 63740db
1 parent 41c19c9
commit 63740db
Show file tree

Hide file tree

Showing 3 changed files with 258 additions and 207 deletions.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11761,6 +11761,14 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
       ShiftCst);
 }
 
+// Returns true if Op is a load with extension type ISD::EXTLOAD whose result
+// type is f64 (the memory type of the load is not inspected here).
+static bool isFPExtLoad(SDValue Op) {
+  const auto *Load = dyn_cast<LoadSDNode>(Op.getNode());
+  return Load && Op.getValueType() == MVT::f64 &&
+         Load->getExtensionType() == ISD::EXTLOAD;
+}
+
 /// Reduces the number of fp-to-int conversion when building a vector.
 ///
 /// If this vector is built out of floating to integer conversions,
@@ -11795,11 +11803,18 @@ combineElementTruncationToVectorTruncation(SDNode *N,
     SmallVector<SDValue, 4> Ops;
     EVT TargetVT = N->getValueType(0);
     for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
-      if (N->getOperand(i).getOpcode() != PPCISD::MFVSR)
+      SDValue NextOp = N->getOperand(i);
+      if (NextOp.getOpcode() != PPCISD::MFVSR)
         return SDValue();
-      unsigned NextConversion = N->getOperand(i).getOperand(0).getOpcode();
+      unsigned NextConversion = NextOp.getOperand(0).getOpcode();
       if (NextConversion != FirstConversion)
         return SDValue();
+      // If we are converting to 32-bit integers, we need to add an FP_ROUND.
+      // This is not valid if the input was originally double precision. It is
+      // also not profitable to do unless this is an extending load in which
+      // case doing this combine will allow us to combine consecutive loads.
+      if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0)))
+        return SDValue();
       if (N->getOperand(i) != FirstInput)
         IsSplat = false;
     }
@@ -11813,8 +11828,9 @@ combineElementTruncationToVectorTruncation(SDNode *N,
     // Now that we know we have the right type of node, get its operands
     for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
       SDValue In = N->getOperand(i).getOperand(0);
-      // For 32-bit values, we need to add an FP_ROUND node.
       if (Is32Bit) {
+        // For 32-bit values, we need to add an FP_ROUND node (if we made it
+        // here, we know that all inputs are extending loads so this is safe).
         if (In.isUndef())
           Ops.push_back(DAG.getUNDEF(SrcVT));
         else {

diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -3494,6 +3494,17 @@ def DblToFlt {
   dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1))));
 }
 
+def ExtDbl {  // i32 conversions of elements 0/1 of v2f64 $A/$B; S=fctiwz, U=fctiwuz.
+  dag A0S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$A, 0))))));
+  dag A1S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$A, 1))))));
+  dag B0S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$B, 0))))));
+  dag B1S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$B, 1))))));
+  dag A0U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$A, 0))))));
+  dag A1U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$A, 1))))));
+  dag B0U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$B, 0))))));
+  dag B1U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$B, 1))))));
+}
+
 def ByteToWord {
   dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8));
   dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8));
@@ -3571,9 +3582,15 @@ def FltToULong {
 }
 def DblToInt {  // i32 via PPCmfvsr of PPCfctiwz, one dag per scalar f64 $A..$D.
   dag A = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$A))));
+  dag B = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$B))));
+  dag C = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$C))));
+  dag D = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$D))));
 }
 def DblToUInt {  // i32 via PPCmfvsr of PPCfctiwuz, one dag per scalar f64 $A..$D.
   dag A = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$A))));
+  dag B = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$B))));
+  dag C = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$C))));
+  dag D = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$D))));
 }
 def DblToLong {
   dag A = (i64 (PPCmfvsr (f64 (PPCfctidz f64:$A))));
@@ -3612,6 +3629,47 @@ def MrgFP {
   dag BAlToFlt = (XVCVDPSP (XXPERMDI $B, $A, 3));
 }
 
+// Word-element merge dags: XXPERMDI-merged f64 pairs converted to v4i32 words.
+def MrgWords {
+  // For big endian, merge doubleword 0 of (A, B) and doubleword 1 of (A, B).
+  dag A0B0 = (v2f64 (XXPERMDI v2f64:$A, v2f64:$B, 0));
+  dag A1B1 = (v2f64 (XXPERMDI v2f64:$A, v2f64:$B, 3));
+  dag CVA1B1S = (v4i32 (XVCVDPSXWS A1B1));
+  dag CVA0B0S = (v4i32 (XVCVDPSXWS A0B0));
+  dag CVA1B1U = (v4i32 (XVCVDPUXWS A1B1));
+  dag CVA0B0U = (v4i32 (XVCVDPUXWS A0B0));
+
+  // For little endian, operands are swapped (B, A): element-1 pair, element-0 pair.
+  dag B1A1 = (v2f64 (XXPERMDI v2f64:$B, v2f64:$A, 0));
+  dag B0A0 = (v2f64 (XXPERMDI v2f64:$B, v2f64:$A, 3));
+  dag CVB1A1S = (v4i32 (XVCVDPSXWS B1A1));
+  dag CVB0A0S = (v4i32 (XVCVDPSXWS B0A0));
+  dag CVB1A1U = (v4i32 (XVCVDPUXWS B1A1));
+  dag CVB0A0U = (v4i32 (XVCVDPUXWS B0A0));
+
+  // For big endian, we merge hi doublewords of (A, C) and (B, D) (scalar f64
+  // inputs copied to VSRC), convert each pair, then merge (VMRGEW in the Pats).
+  dag AC = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$A, VSRC),
+                            (COPY_TO_REGCLASS f64:$C, VSRC), 0));
+  dag BD = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$B, VSRC),
+                            (COPY_TO_REGCLASS f64:$D, VSRC), 0));
+  dag CVACS = (v4i32 (XVCVDPSXWS AC));
+  dag CVBDS = (v4i32 (XVCVDPSXWS BD));
+  dag CVACU = (v4i32 (XVCVDPUXWS AC));
+  dag CVBDU = (v4i32 (XVCVDPUXWS BD));
+
+  // For little endian, we merge hi doublewords of (D, B) and (C, A) (scalar f64
+  // inputs copied to VSRC), convert each pair, then merge (VMRGEW in the Pats).
+  dag DB = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$D, VSRC),
+                            (COPY_TO_REGCLASS f64:$B, VSRC), 0));
+  dag CA = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$C, VSRC),
+                            (COPY_TO_REGCLASS f64:$A, VSRC), 0));
+  dag CVDBS = (v4i32 (XVCVDPSXWS DB));
+  dag CVCAS = (v4i32 (XVCVDPSXWS CA));
+  dag CVDBU = (v4i32 (XVCVDPUXWS DB));
+  dag CVCAU = (v4i32 (XVCVDPUXWS CA));
+}
+
 // Patterns for BUILD_VECTOR nodes.
 let AddedComplexity = 400 in {
 
@@ -3679,6 +3737,20 @@ let AddedComplexity = 400 in {
     def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
                                    DblToFlt.B0, DblToFlt.B1)),
               (v4f32 (VMRGEW MrgFP.ABhToFlt, MrgFP.ABlToFlt))>;
+
+    // Convert 4 doubles (4 f64 scalars or the elements of 2 v2f64) to a v4i32.
+    def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B,
+                                   DblToInt.C, DblToInt.D)),
+              (v4i32 (VMRGEW MrgWords.CVACS, MrgWords.CVBDS))>;
+    def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B,
+                                   DblToUInt.C, DblToUInt.D)),
+              (v4i32 (VMRGEW MrgWords.CVACU, MrgWords.CVBDU))>;
+    def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S,
+                                   ExtDbl.B0S, ExtDbl.B1S)),
+              (v4i32 (VMRGEW MrgWords.CVA0B0S, MrgWords.CVA1B1S))>;
+    def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
+                                   ExtDbl.B0U, ExtDbl.B1U)),
+              (v4i32 (VMRGEW MrgWords.CVA0B0U, MrgWords.CVA1B1U))>;
   }
 
   let Predicates = [IsLittleEndian, HasVSX] in {
@@ -3693,6 +3765,20 @@ let AddedComplexity = 400 in {
     def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
                                    DblToFlt.B0, DblToFlt.B1)),
               (v4f32 (VMRGEW MrgFP.BAhToFlt, MrgFP.BAlToFlt))>;
+
+    // Convert 4 doubles (4 f64 scalars or the elements of 2 v2f64) to a v4i32.
+    def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B,
+                                   DblToInt.C, DblToInt.D)),
+              (v4i32 (VMRGEW MrgWords.CVDBS, MrgWords.CVCAS))>;
+    def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B,
+                                   DblToUInt.C, DblToUInt.D)),
+              (v4i32 (VMRGEW MrgWords.CVDBU, MrgWords.CVCAU))>;
+    def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S,
+                                   ExtDbl.B0S, ExtDbl.B1S)),
+              (v4i32 (VMRGEW MrgWords.CVB1A1S, MrgWords.CVB0A0S))>;
+    def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
+                                   ExtDbl.B0U, ExtDbl.B1U)),
+              (v4i32 (VMRGEW MrgWords.CVB1A1U, MrgWords.CVB0A0U))>;
   }
 
   let Predicates = [HasDirectMove] in {