[RISCV] Avoid generating large LMUL vmv.s.x or vfmv.s.f
This is a follow-up to the patch discussion on D139656. As noted there, the M2/M4/M8 versions of these instructions don't actually exist, and using them results in overly constrained register allocation.

In that review, we'd talked about moving towards a variant of the instructions which ignores LMUL. I decided to see what happened if we simply stopped generating the high-LMUL variants, and the results are surprisingly neutral. Among all the churn, I see only one minor change that looks like a real regression. I think this is worth doing now to loosen register allocation constraints, and to avoid digging the hole around these instructions any deeper while we think about the right model change.
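In concrete terms, the new lowering performs the single-element insert at LMUL <= 1 and, when a wider result type was requested, drops that LMUL1 value into element 0 of an undef vector of the original type via INSERT_SUBVECTOR. Below is a condensed sketch of the integer path (the floating-point path is identical with VFMV_S_F_VL); the helper name lowerScalarInsertM1 is illustrative only, and the snippet assumes the surrounding RISC-V backend context from RISCVISelLowering.cpp (getLMUL1VT, the RISCVISD node kinds).

  // Sketch: do the vmv.s.x at M1 (or smaller), then widen the result with an
  // INSERT_SUBVECTOR into an undef vector of the requested type.
  static SDValue lowerScalarInsertM1(SDValue Scalar, SDValue VL, MVT VT,
                                     SDLoc DL, SelectionDAG &DAG,
                                     const RISCVSubtarget &Subtarget) {
    const MVT XLenVT = Subtarget.getXLenVT();
    const MVT M1VT = getLMUL1VT(VT);
    MVT InnerVT = VT.bitsLE(M1VT) ? VT : M1VT;
    // The scalar insert only ever writes element 0, so an LMUL1 (or smaller)
    // destination is always sufficient for the vmv.s.x.
    SDValue Result = DAG.getNode(RISCVISD::VMV_S_X_VL, DL, InnerVT,
                                 DAG.getUNDEF(InnerVT), Scalar, VL);
    if (VT != InnerVT)
      Result = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
                           Result, DAG.getConstant(0, DL, XLenVT));
    return Result;
  }

Since the subvector lands at index 0 of an undef vector, the INSERT_SUBVECTOR is a subregister insert that generates no code; the benefit is that an M1-typed vmv.s.x destination can be allocated to any vector register, whereas an M4 or M8 destination would have to be group-aligned.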

Differential Revision: https://reviews.llvm.org/D140027
preames committed Dec 14, 2022 · 1 parent 7b68411 · commit d860119
Showing 8 changed files with 278 additions and 255 deletions.
30 changes: 26 additions & 4 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2816,14 +2816,26 @@ static MVT getLMUL1VT(MVT VT) {
 static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL,
                                  MVT VT, SDLoc DL, SelectionDAG &DAG,
                                  const RISCVSubtarget &Subtarget) {
+  const MVT XLenVT = Subtarget.getXLenVT();
+
   SDValue Passthru = DAG.getUNDEF(VT);
-  if (VT.isFloatingPoint())
+  if (VT.isFloatingPoint()) {
     // TODO: Use vmv.v.i for appropriate constants
-    return DAG.getNode(RISCVISD::VFMV_S_F_VL, DL, VT, Passthru, Scalar, VL);
+    // Use M1 or smaller to avoid over constraining register allocation
+    const MVT M1VT = getLMUL1VT(VT);
+    auto InnerVT = VT.bitsLE(M1VT) ? VT : M1VT;
+    SDValue Result = DAG.getNode(RISCVISD::VFMV_S_F_VL, DL, InnerVT,
+                                 DAG.getUNDEF(InnerVT), Scalar, VL);
+    if (VT != InnerVT)
+      Result = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+                           DAG.getUNDEF(VT),
+                           Result, DAG.getConstant(0, DL, XLenVT));
+    return Result;
+  }
+
 
   // Avoid the tricky legalization cases by falling back to using the
   // splat code which already handles it gracefully.
-  const MVT XLenVT = Subtarget.getXLenVT();
   if (!Scalar.getValueType().bitsLE(XLenVT))
     return lowerScalarSplat(DAG.getUNDEF(VT), Scalar,
                             DAG.getConstant(1, DL, XLenVT),
@@ -2844,7 +2856,17 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL,
         VT.bitsLE(getLMUL1VT(VT)))
       return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Scalar, VL);
   }
-  return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, Passthru, Scalar, VL);
+  // Use M1 or smaller to avoid over constraining register allocation
+  const MVT M1VT = getLMUL1VT(VT);
+  auto InnerVT = VT.bitsLE(M1VT) ? VT : M1VT;
+  SDValue Result = DAG.getNode(RISCVISD::VMV_S_X_VL, DL, InnerVT,
+                               DAG.getUNDEF(InnerVT), Scalar, VL);
+  if (VT != InnerVT)
+    Result = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+                         DAG.getUNDEF(VT),
+                         Result, DAG.getConstant(0, DL, XLenVT));
+  return Result;
+
 }
 
 static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, bool &SwapSources,
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
@@ -109,13 +109,13 @@ define <64 x i1> @insertelt_v64i1(<64 x i1> %x, i1 %elt) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmv.s.x v12, a0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v12, v8, 1
+; CHECK-NEXT:    vslideup.vi v8, v12, 1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vand.vi v8, v12, 1
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %y = insertelement <64 x i1> %x, i1 %elt, i64 1
@@ -127,14 +127,14 @@ define <64 x i1> @insertelt_idx_v64i1(<64 x i1> %x, i1 %elt, i32 zeroext %idx) n
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 64
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT:    vmv.s.x v12, a0
 ; CHECK-NEXT:    addi a0, a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, tu, ma
-; CHECK-NEXT:    vslideup.vx v12, v8, a1
+; CHECK-NEXT:    vslideup.vx v8, v12, a1
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-NEXT:    vand.vi v8, v12, 1
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %y = insertelement <64 x i1> %x, i1 %elt, i32 %idx
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -202,8 +202,9 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
 ; RV64-LABEL: vrgather_shuffle_vv_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a0, 5
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.s.x v16, a0
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vmv.v.i v20, 2
 ; RV64-NEXT:    vsetvli zero, zero, e64, m4, tu, ma
 ; RV64-NEXT:    vslideup.vi v20, v16, 7
