[RISCV] Optimize INSERT_VECTOR_ELT sequences
This patch optimizes the codegen for INSERT_VECTOR_ELT in various ways.
Primarily, it removes the use of vslidedown during lowering: the element
is now inserted entirely using vslideup, with a custom VL and slide
index.

Additionally, lowering of i64-element vectors on RV32 has been optimized
in several ways. When the 64-bit value to insert is the same as the
sign-extension of its lower 32 bits, the codegen can follow the regular
path. When that is not possible, a new sequence of two i32 vslide1up
instructions is used to get the element into a vector. This sequence was
suggested by @craig.topper. From there, the value is slid into its final
position, for more consistent lowering across RV32 and RV64.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D98250
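To make the intent concrete, below is a rough sketch of the kind of sequence the new lowering aims for in the common (XLEN >= SEW) case, e.g. inserting an i32 value in a0 at index a1 of an LMUL=1 vector in v8. It is hand-written for illustration; the registers, vtype operands and exact scheduling are assumptions, not output copied from the patch's updated tests.

    # illustrative only -- not taken from the updated tests
    vsetvli a2, zero, e32,m1,ta,mu    # assumed SEW=32, LMUL=1
    vmv.s.x v25, a0                   # move the scalar into element 0 of a scratch vector
    addi    a2, a1, 1                 # VL = insert index + 1
    vsetvli zero, a2, e32,m1,tu,mu    # tail-undisturbed: elements >= VL keep v8's values
    vslideup.vx v8, v25, a1           # slide element 0 of v25 up into element a1 of v8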
frasercrmck committed Mar 12, 2021
1 parent 4d2d585 commit 641f570
Showing 8 changed files with 539 additions and 523 deletions.
101 changes: 62 additions & 39 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2218,6 +2218,12 @@ SDValue RISCVTargetLowering::lowerVectorMaskTrunc(SDValue Op,
   return convertFromScalableVector(MaskVT, Trunc, DAG, Subtarget);
 }
 
+// Custom-legalize INSERT_VECTOR_ELT so that the value is inserted into the
+// first position of a vector, and that vector is slid up to the insert index.
+// By limiting the active vector length to index+1 and merging with the
+// original vector (with an undisturbed tail policy for elements >= VL), we
+// achieve the desired result of leaving all elements untouched except the one
+// at VL-1, which is replaced with the desired value.
 SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                     SelectionDAG &DAG) const {
   SDLoc DL(Op);
@@ -2233,51 +2239,67 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
     Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
   }
 
+  MVT XLenVT = Subtarget.getXLenVT();
+
+  SDValue Zero = DAG.getConstant(0, DL, XLenVT);
+  bool IsLegalInsert = Subtarget.is64Bit() || Val.getValueType() != MVT::i64;
+  // Even i64-element vectors on RV32 can be lowered without scalar
+  // legalization if the most-significant 32 bits of the value are not affected
+  // by the sign-extension of the lower 32 bits.
+  // TODO: We could also catch sign extensions of a 32-bit value.
+  if (!IsLegalInsert && isa<ConstantSDNode>(Val)) {
+    const auto *CVal = cast<ConstantSDNode>(Val);
+    if (isInt<32>(CVal->getSExtValue())) {
+      IsLegalInsert = true;
+      Val = DAG.getConstant(CVal->getSExtValue(), DL, MVT::i32);
+    }
+  }
+
   SDValue Mask, VL;
   std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
 
-  // Custom-legalize INSERT_VECTOR_ELT where XLEN>=SEW, so that the vector is
-  // first slid down into position, the value is inserted into the first
-  // position, and the vector is slid back up. We do this to simplify patterns.
-  //   (slideup vec, (insertelt (slidedown impdef, vec, idx), val, 0), idx),
-  if (Subtarget.is64Bit() || Val.getValueType() != MVT::i64) {
+  SDValue ValInVec;
+
+  if (IsLegalInsert) {
     if (isNullConstant(Idx))
       return DAG.getNode(RISCVISD::VMV_S_XF_VL, DL, ContainerVT, Vec, Val, VL);
-    SDValue Slidedown =
-        DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT,
-                    DAG.getUNDEF(ContainerVT), Vec, Idx, Mask, VL);
-    SDValue InsertElt0 =
-        DAG.getNode(RISCVISD::VMV_S_XF_VL, DL, ContainerVT, Slidedown, Val, VL);
-    return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, Vec, InsertElt0,
-                       Idx, Mask, VL);
-  }
-
-  // Custom-legalize INSERT_VECTOR_ELT where XLEN<SEW, as the SEW element type
-  // is illegal (currently only vXi64 RV32).
-  // Since there is no easy way of getting a single element into a vector when
-  // XLEN<SEW, we lower the operation to the following sequence:
-  //   splat      vVal, rVal
-  //   vid.v      vVid
-  //   vmseq.vx   mMask, vVid, rIdx
-  //   vmerge.vvm vDest, vSrc, vVal, mMask
-  // This essentially merges the original vector with the inserted element by
-  // using a mask whose only set bit is that corresponding to the insert
-  // index.
-  SDValue SplattedVal = DAG.getSplatVector(ContainerVT, DL, Val);
-  SDValue SplattedIdx =
-      DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, Idx, VL);
-
-  SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, ContainerVT, Mask, VL);
-  auto SetCCVT =
-      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ContainerVT);
-  SDValue SelectCond =
-      DAG.getNode(RISCVISD::SETCC_VL, DL, SetCCVT, VID, SplattedIdx,
-                  DAG.getCondCode(ISD::SETEQ), Mask, VL);
-  SDValue Select = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT,
-                               SelectCond, SplattedVal, Vec, VL);
+    ValInVec = DAG.getNode(RISCVISD::VMV_S_XF_VL, DL, ContainerVT,
+                           DAG.getUNDEF(ContainerVT), Val, VL);
+  } else {
+    // On RV32, i64-element vectors must be specially handled to place the
+    // value at element 0, by using two vslide1up instructions in sequence on
+    // the i32 split lo/hi value. Use an equivalently-sized i32 vector for
+    // this.
+    SDValue One = DAG.getConstant(1, DL, XLenVT);
+    SDValue ValLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Val, Zero);
+    SDValue ValHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Val, One);
+    MVT I32ContainerVT =
+        MVT::getVectorVT(MVT::i32, ContainerVT.getVectorElementCount() * 2);
+    SDValue I32Mask =
+        getDefaultScalableVLOps(I32ContainerVT, DL, DAG, Subtarget).first;
+    // Limit the active VL to two.
+    SDValue InsertI64VL = DAG.getConstant(2, DL, XLenVT);
+    // Note: We can't pass a UNDEF to the first VSLIDE1UP_VL since an untied
+    // undef doesn't obey the earlyclobber constraint. Just splat a zero value.
+    ValInVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, I32ContainerVT, Zero,
+                           InsertI64VL);
+    // First slide in the hi value, then the lo in underneath it.
+    ValInVec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32ContainerVT, ValInVec,
+                           ValHi, I32Mask, InsertI64VL);
+    ValInVec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32ContainerVT, ValInVec,
+                           ValLo, I32Mask, InsertI64VL);
+    // Bitcast back to the right container type.
+    ValInVec = DAG.getBitcast(ContainerVT, ValInVec);
+  }
 
+  // Now that the value is in a vector, slide it into position.
+  SDValue InsertVL =
+      DAG.getNode(ISD::ADD, DL, XLenVT, Idx, DAG.getConstant(1, DL, XLenVT));
+  SDValue Slideup = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, Vec,
+                                ValInVec, Idx, Mask, InsertVL);
   if (!VecVT.isFixedLengthVector())
-    return Select;
-  return convertFromScalableVector(VecVT, Select, DAG, Subtarget);
+    return Slideup;
+  return convertFromScalableVector(VecVT, Slideup, DAG, Subtarget);
 }
 
 // Custom-lower EXTRACT_VECTOR_ELT operations to slide the vector down, then
@@ -6100,6 +6122,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(VLEFF)
   NODE_NAME_CASE(VLEFF_MASK)
   NODE_NAME_CASE(VSLIDEUP_VL)
+  NODE_NAME_CASE(VSLIDE1UP_VL)
   NODE_NAME_CASE(VSLIDEDOWN_VL)
   NODE_NAME_CASE(VID_VL)
   NODE_NAME_CASE(VFNCVT_ROD_VL)
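The RV32 path for i64 elements in the hunk above first builds the 64-bit value in an i32 vector with two vslide1up steps at VL=2, then reuses the common vslideup step. A hedged sketch of the shape of code this is intended to produce (again hand-written; register choices and vtype operands are assumptions, not test output):

    # illustrative only: insert the i64 in a1:a0 at index a2 of an LMUL=1 vXi64 vector in v8 (RV32)
    addi    a3, zero, 2
    vsetvli zero, a3, e32,m1,ta,mu    # VL=2: the two i32 halves of one i64 element
    vmv.v.i v25, 0                    # start from a zero splat (an untied undef would
                                      # violate vslide1up's earlyclobber constraint)
    vslide1up.vx v26, v25, a1         # v26 = { hi, 0, ... }
    vslide1up.vx v25, v26, a0         # v25 = { lo, hi, ... } == the i64 value in element 0
    addi    a3, a2, 1                 # VL = insert index + 1
    vsetvli zero, a3, e64,m1,tu,mu    # back to SEW=64, tail-undisturbed
    vslideup.vx v8, v25, a2           # slide the value into element a2 of v8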
4 changes: 4 additions & 0 deletions llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -120,6 +120,10 @@ enum NodeType : unsigned {
   // and the fifth the VL.
   VSLIDEUP_VL,
   VSLIDEDOWN_VL,
+  // Matches the semantics of vslide1up. The first operand is the source
+  // vector, the second is the XLenVT scalar value. The third and fourth
+  // operands are the mask and VL operands.
+  VSLIDE1UP_VL,
   // Matches the semantics of the vid.v instruction, with a mask and VL
   // operand.
   VID_VL,
11 changes: 11 additions & 0 deletions llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -1129,8 +1129,13 @@ def SDTRVVSlide : SDTypeProfile<1, 5, [
   SDTCisVec<0>, SDTCisSameAs<1, 0>, SDTCisSameAs<2, 0>, SDTCisVT<3, XLenVT>,
   SDTCVecEltisVT<4, i1>, SDTCisSameNumEltsAs<0, 4>, SDTCisVT<5, XLenVT>
 ]>;
+def SDTRVVSlide1 : SDTypeProfile<1, 4, [
+  SDTCisVec<0>, SDTCisSameAs<1, 0>, SDTCisVT<2, XLenVT>,
+  SDTCVecEltisVT<3, i1>, SDTCisSameNumEltsAs<0, 3>, SDTCisVT<4, XLenVT>
+]>;
 
 def riscv_slideup_vl : SDNode<"RISCVISD::VSLIDEUP_VL", SDTRVVSlide, []>;
+def riscv_slide1up_vl : SDNode<"RISCVISD::VSLIDE1UP_VL", SDTRVVSlide1, []>;
 def riscv_slidedown_vl : SDNode<"RISCVISD::VSLIDEDOWN_VL", SDTRVVSlide, []>;
 
 let Predicates = [HasStdExtV] in {
@@ -1157,6 +1162,12 @@ foreach vti = !listconcat(AllIntegerVectors, AllFloatVectors) in {
                                        vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2,
                                        GPR:$vl, vti.SEW)>;
 
+  def : Pat<(vti.Vector (riscv_slide1up_vl (vti.Vector vti.RegClass:$rs1),
+                                            GPR:$rs2, (vti.Mask true_mask),
+                                            (XLenVT (VLOp GPR:$vl)))),
+            (!cast<Instruction>("PseudoVSLIDE1UP_VX_"#vti.LMul.MX)
+                vti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.SEW)>;
+
   def : Pat<(vti.Vector (riscv_slidedown_vl (vti.Vector vti.RegClass:$rs3),
                                             (vti.Vector vti.RegClass:$rs1),
                                             uimm5:$rs2, (vti.Mask true_mask),
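For reference, the vslide1up.vx instruction that the new pattern selects writes the scalar operand into element 0 and shifts each remaining element of the source vector up by one position. A minimal illustration with made-up register contents:

    # assume vl = 4, SEW=32, v8 = { 10, 20, 30, 40 }, a0 = 7
    vslide1up.vx v9, v8, a0           # v9[0] = a0, v9[i] = v8[i-1]  =>  v9 = { 7, 10, 20, 30 }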