diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e9ae4e685b298..1fbbf0b3699fe 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1971,6 +1971,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
   bool InvertMask = IsSelect == SwapOps;
 
+  // Keep track of which non-undef indices are used by each LHS/RHS shuffle
+  // half.
+  DenseMap<int, unsigned> LHSIndexCounts, RHSIndexCounts;
+
   // Now construct the mask that will be used by the vselect or blended
   // vrgather operation. For vrgathers, construct the appropriate indices into
   // each vector.
@@ -1985,6 +1989,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
       GatherIndicesRHS.push_back(
           IsLHSOrUndefIndex ? DAG.getUNDEF(XLenVT)
                             : DAG.getConstant(MaskIndex - NumElts, DL, XLenVT));
+      if (IsLHSOrUndefIndex && MaskIndex >= 0)
+        ++LHSIndexCounts[MaskIndex];
+      if (!IsLHSOrUndefIndex)
+        ++RHSIndexCounts[MaskIndex - NumElts];
     }
   }
 
@@ -2008,13 +2016,14 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     return SDValue();
   }
 
-  unsigned GatherOpc = RISCVISD::VRGATHER_VV_VL;
+  unsigned GatherVXOpc = RISCVISD::VRGATHER_VX_VL;
+  unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL;
   MVT IndexVT = VT.changeTypeToInteger();
   // Since we can't introduce illegal index types at this stage, use i16 and
   // vrgatherei16 if the corresponding index type for plain vrgather is greater
   // than XLenVT.
   if (IndexVT.getScalarType().bitsGT(XLenVT)) {
-    GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
+    GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL;
     IndexVT = IndexVT.changeVectorElementType(MVT::i16);
   }
 
@@ -2027,28 +2036,48 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) {
     Gather = lowerScalarSplat(SplatValue, VL, ContainerVT, DL, DAG, Subtarget);
   } else {
-    SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
-    LHSIndices =
-        convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
-
     V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
-    Gather =
-        DAG.getNode(GatherOpc, DL, ContainerVT, V1, LHSIndices, TrueMask, VL);
+    // If only one index is used, we can use a "splat" vrgather.
+    // TODO: We can splat the most-common index and fix up any stragglers, if
+    // that's beneficial.
+    if (LHSIndexCounts.size() == 1) {
+      int SplatIndex = LHSIndexCounts.begin()->getFirst();
+      Gather =
+          DAG.getNode(GatherVXOpc, DL, ContainerVT, V1,
+                      DAG.getConstant(SplatIndex, DL, XLenVT), TrueMask, VL);
+    } else {
+      SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+      LHSIndices =
+          convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
+
+      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
+                           TrueMask, VL);
+    }
   }
 
   // If a second vector operand is used by this shuffle, blend it in with an
   // additional vrgather.
   if (!V2.isUndef()) {
+    V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+    // If only one index is used, we can use a "splat" vrgather.
+    // TODO: We can splat the most-common index and fix up any stragglers, if
+    // that's beneficial.
+    if (RHSIndexCounts.size() == 1) {
+      int SplatIndex = RHSIndexCounts.begin()->getFirst();
+      V2 = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
+                       DAG.getConstant(SplatIndex, DL, XLenVT), TrueMask, VL);
+    } else {
+      SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
+      RHSIndices =
+          convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
+      V2 = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, TrueMask,
+                       VL);
+    }
+
     MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
     SelectMask =
         convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);
 
-    SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
-    RHSIndices =
-        convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
-
-    V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
-    V2 = DAG.getNode(GatherOpc, DL, ContainerVT, V2, RHSIndices, TrueMask, VL);
     Gather = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, SelectMask, V2,
                          Gather, VL);
   }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index d8eee8ddbd3ec..7a4f133989f27 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -1314,6 +1314,18 @@ foreach vti = AllIntegerVectors in {
                 vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
                 vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
 
+  def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+                                          (riscv_vrgather_vx_vl
+                                             vti.RegClass:$rs2,
+                                             uimm5:$imm,
+                                             (vti.Mask true_mask),
+                                             VLOpFrag),
+                                          vti.RegClass:$merge,
+                                          VLOpFrag)),
+            (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
+                 vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
+                 vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+
   // emul = lmul * 16 / sew
   defvar vlmul = vti.LMul;
   defvar octuple_lmul = vlmul.octuple;
@@ -1385,6 +1397,18 @@ foreach vti = AllFloatVectors in {
                 vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
                 vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
 
+  def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
+                                          (riscv_vrgather_vx_vl
+                                             vti.RegClass:$rs2,
+                                             uimm5:$imm,
+                                             (vti.Mask true_mask),
+                                             VLOpFrag),
+                                          vti.RegClass:$merge,
+                                          VLOpFrag)),
+            (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
+                 vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
+                 vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;
+
   defvar vlmul = vti.LMul;
   defvar octuple_lmul = vlmul.octuple;
   defvar octuple_emul = !srl(!mul(octuple_lmul, 16), vti.Log2SEW);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 12823121f6ff5..b4d28a3579b78 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -57,28 +57,25 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x,
 ;
 ; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization:
 ; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-NEXT:    vmv.v.i v25, 0
-; LMULMAX2-NEXT:    vrgather.vv v26, v8, v25
 ; LMULMAX2-NEXT:    addi a0, zero, 2
 ; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; LMULMAX2-NEXT:    vmv.s.x v0, a0
 ; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-NEXT:    vmv.v.i v27, 3
+; LMULMAX2-NEXT:    vrgather.vi v25, v8, 0
 ; LMULMAX2-NEXT:    vsetvli zero, zero, e32, m1, tu, mu
-; LMULMAX2-NEXT:    vrgather.vv v26, v9, v27, v0.t
-; LMULMAX2-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
-; LMULMAX2-NEXT:    vrgather.vv v28, v10, v25
+; LMULMAX2-NEXT:    vrgather.vi v25, v9, 3, v0.t
 ; LMULMAX2-NEXT:    addi a0, zero, 8
 ; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; LMULMAX2-NEXT:    vmv.s.x v0, a0
-; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, tu, mu
-; LMULMAX2-NEXT:    vrgather.vv v28, v11, v27, v0.t
+; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; LMULMAX2-NEXT:    vrgather.vi v26, v10, 0
+; LMULMAX2-NEXT:    vsetvli zero, zero, e32, m1, tu, mu
+; LMULMAX2-NEXT:    vrgather.vi v26, v11, 3, v0.t
 ; LMULMAX2-NEXT:    addi a0, zero, 3
 ; LMULMAX2-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; LMULMAX2-NEXT:    vmv.s.x v0, a0
 ; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; LMULMAX2-NEXT:    vmerge.vvm v8, v28, v26, v0
+; LMULMAX2-NEXT:    vmerge.vvm v8, v26, v25, v0
 ; LMULMAX2-NEXT:    ret
   %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32>
   ret <4 x float> %z
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
index 5393c7adf64be..ced3d6d87a1e7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -142,10 +142,8 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
 ; RV32-NEXT:    addi a0, zero, 8
 ; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
-; RV32-NEXT:    vmv.v.i v25, 1
-; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, mu
-; RV32-NEXT:    vrgatherei16.vv v26, v10, v25, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, tu, mu
+; RV32-NEXT:    vrgather.vi v26, v10, 1, v0.t
 ; RV32-NEXT:    vmv2r.v v8, v26
 ; RV32-NEXT:    ret
 ;
@@ -159,10 +157,8 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
 ; RV64-NEXT:    addi a0, zero, 8
 ; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; RV64-NEXT:    vmv.s.x v0, a0
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; RV64-NEXT:    vmv.v.i v28, 1
-; RV64-NEXT:    vsetvli zero, zero, e64, m2, tu, mu
-; RV64-NEXT:    vrgather.vv v26, v10, v28, v0.t
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, tu, mu
+; RV64-NEXT:    vrgather.vi v26, v10, 1, v0.t
 ; RV64-NEXT:    vmv2r.v v8, v26
 ; RV64-NEXT:    ret
   %s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index ca76aea58a853..c953e81decea8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -93,10 +93,8 @@ define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-NEXT:    addi a0, zero, 8
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 1
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, tu, mu
-; CHECK-NEXT:    vrgather.vv v25, v9, v26, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, tu, mu
+; CHECK-NEXT:    vrgather.vi v25, v9, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v25
 ; CHECK-NEXT:    ret
   %s = shufflevector <4 x i16> %x, <4 x i16> %y, <4 x i32>
@@ -388,16 +386,13 @@ define <8 x i8> @splat_ve4_ins_i1ve3(<8 x i8> %v) {
 define <8 x i8> @splat_ve2_we0(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 2
-; CHECK-NEXT:    vrgather.vv v25, v8, v26
 ; CHECK-NEXT:    addi a0, zero, 66
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vrgather.vi v25, v8, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, mu
-; CHECK-NEXT:    vrgather.vv v25, v9, v26, v0.t
+; CHECK-NEXT:    vrgather.vi v25, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v25
 ; CHECK-NEXT:    ret
   %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32>
@@ -417,10 +412,8 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    addi a0, zero, 66
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 0
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, mu
-; CHECK-NEXT:    vrgather.vv v25, v9, v26, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, tu, mu
+; CHECK-NEXT:    vrgather.vi v25, v9, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v25
 ; CHECK-NEXT:    ret
   %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32>
@@ -430,12 +423,11 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) {
 define <8 x i8> @splat_ve2_we0_ins_i0we4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0_ins_i0we4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 2
-; CHECK-NEXT:    vrgather.vv v25, v8, v26
 ; CHECK-NEXT:    addi a0, zero, 67
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vrgather.vi v25, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmv.v.i v26, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, tu, mu
@@ -458,10 +450,8 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
 ; RV32-NEXT:    addi a0, zero, 66
 ; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT:    vmv.v.i v26, 0
-; RV32-NEXT:    vsetvli zero, zero, e8, mf2, tu, mu
-; RV32-NEXT:    vrgather.vv v25, v9, v26, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, tu, mu
+; RV32-NEXT:    vrgather.vi v25, v9, 0, v0.t
 ; RV32-NEXT:    vmv1r.v v8, v25
 ; RV32-NEXT:    ret
 ;
@@ -476,10 +466,8 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
 ; RV64-NEXT:    addi a0, zero, 66
 ; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; RV64-NEXT:    vmv.s.x v0, a0
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT:    vmv.v.i v26, 0
-; RV64-NEXT:    vsetvli zero, zero, e8, mf2, tu, mu
-; RV64-NEXT:    vrgather.vv v25, v9, v26, v0.t
+; RV64-NEXT:    vsetivli zero, 8, e8, mf2, tu, mu
+; RV64-NEXT:    vrgather.vi v25, v9, 0, v0.t
 ; RV64-NEXT:    vmv1r.v v8, v25
 ; RV64-NEXT:    ret
   %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32>
@@ -489,19 +477,19 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-LABEL: splat_ve2_we0_ins_i2we4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmv.v.i v26, 2
-; CHECK-NEXT:    vrgather.vv v25, v8, v26
 ; CHECK-NEXT:    addi a0, zero, 4
-; CHECK-NEXT:    vmv.s.x v26, a0
-; CHECK-NEXT:    vmv.v.i v27, 0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vmv.s.x v25, a0
+; CHECK-NEXT:    vmv.v.i v26, 0
 ; CHECK-NEXT:    vsetivli zero, 3, e8, mf2, tu, mu
-; CHECK-NEXT:    vslideup.vi v27, v26, 2
+; CHECK-NEXT:    vslideup.vi v26, v25, 2
 ; CHECK-NEXT:    addi a0, zero, 70
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, mu
 ; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, tu, mu
-; CHECK-NEXT:    vrgather.vv v25, v9, v27, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vrgather.vi v25, v8, 2
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, tu, mu
+; CHECK-NEXT:    vrgather.vv v25, v9, v26, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v25
 ; CHECK-NEXT:    ret
   %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32>
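
As an illustrative sketch only (not part of the patch): after this change, a two-source shuffle whose non-undef indices select exactly one element from each source should lower to an unmasked vrgather.vi plus a masked vrgather.vi feeding the blend, with no index vectors materialized. The function name below is hypothetical; its mask mirrors the splat_ve2_we0 test above.

; Hypothetical example: every LHS index is 2 and every RHS index is 8
; (element 0 of %w), so LHSIndexCounts and RHSIndexCounts each have size 1
; and both gathers can use vrgather.vi.
define <8 x i8> @splat_v2_from_each(<8 x i8> %v, <8 x i8> %w) {
  %s = shufflevector <8 x i8> %v, <8 x i8> %w,
         <8 x i32> <i32 2, i32 8, i32 2, i32 2, i32 2, i32 2, i32 8, i32 2>
  ret <8 x i8> %s
}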