diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 165c71d8e03f1..4f0904ed0df5d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5350,6 +5350,12 @@ static bool isLocalRepeatingShuffle(ArrayRef<int> Mask, int Span) {
   return true;
 }
 
+/// Is this mask only using elements from the first span of the input?
+static bool isLowSourceShuffle(ArrayRef<int> Mask, int Span) {
+  return all_of(Mask,
+                [&](const auto &Idx) { return Idx == -1 || Idx < Span; });
+}
+
 /// Try to widen element type to get a new mask value for a better permutation
 /// sequence. This doesn't try to inspect the widened mask for profitability;
 /// we speculate the widened form is equal or better. This has the effect of
@@ -5766,6 +5772,39 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
         Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather,
                              SubVec, SubIdx);
       }
+    } else if (ContainerVT.bitsGT(M1VT) && isLowSourceShuffle(Mask, VLMAX)) {
+      // If we have a shuffle which only uses the first register in our
+      // source register group, we can do a linear number of m1 vrgathers
+      // reusing the same source register (but with different indices)
+      // TODO: This can be generalized for m2 or m4, or for any shuffle
+      // for which we can do a vslidedown followed by this expansion.
+      EVT SubIndexVT = M1VT.changeVectorElementType(IndexVT.getScalarType());
+      auto [InnerTrueMask, InnerVL] =
+          getDefaultScalableVLOps(M1VT, DL, DAG, Subtarget);
+      int N = ContainerVT.getVectorMinNumElements() /
+              M1VT.getVectorMinNumElements();
+      assert(isPowerOf2_32(N) && N <= 8);
+      Gather = DAG.getUNDEF(ContainerVT);
+      SDValue SlideAmt =
+          DAG.getElementCount(DL, XLenVT, M1VT.getVectorElementCount());
+      for (int i = 0; i < N; i++) {
+        if (i != 0)
+          LHSIndices = getVSlidedown(DAG, Subtarget, DL, IndexContainerVT,
+                                     DAG.getUNDEF(IndexContainerVT), LHSIndices,
+                                     SlideAmt, TrueMask, VL);
+        SDValue SubIdx =
+            DAG.getVectorIdxConstant(M1VT.getVectorMinNumElements() * i, DL);
+        SDValue SubV1 =
+            DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, V1, SubIdx);
+        SDValue SubIndex =
+            DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices,
+                        DAG.getVectorIdxConstant(0, DL));
+        SDValue SubVec =
+            DAG.getNode(GatherVVOpc, DL, M1VT, SubV1, SubIndex,
+                        DAG.getUNDEF(M1VT), InnerTrueMask, InnerVL);
+        Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather,
+                             SubVec, SubIdx);
+      }
     } else {
       Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
                            DAG.getUNDEF(ContainerVT), TrueMask, VL);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index 4b09b571b9406..30b2181ece1eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -38,15 +38,20 @@ define <4 x float> @interleave_v2f32(<2 x float> %x, <2 x float> %y) {
 define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
 ; V128-LABEL: interleave_v2f64:
 ; V128:       # %bb.0:
+; V128-NEXT:    csrr a0, vlenb
 ; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; V128-NEXT:    vmv1r.v v12, v9
-; V128-NEXT:    vid.v v9
+; V128-NEXT:    vid.v v10
+; V128-NEXT:    srli a0, a0, 3
+; V128-NEXT:    vsrl.vi v10, v10, 1
+; V128-NEXT:    vslidedown.vx v12, v10, a0
+; V128-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; V128-NEXT:    vrgatherei16.vv v13, v11, v12
+; V128-NEXT:    vrgatherei16.vv v12, v9, v10
 ; V128-NEXT:    vmv.v.i v0, 10
-; V128-NEXT:    vsrl.vi v14, v9, 1
-; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; V128-NEXT:    vrgatherei16.vv v10, v8, v14
-; V128-NEXT:    vrgatherei16.vv v10, v12, v14, v0.t
-; V128-NEXT:    vmv.v.v v8, v10
+; V128-NEXT:    vrgatherei16.vv v14, v8, v10
+; V128-NEXT:    vmv.v.v v15, v13
+; V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; V128-NEXT:    vmerge.vvm v8, v14, v12, v0
 ; V128-NEXT:    ret
 ;
 ; RV32-V512-LABEL: interleave_v2f64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index da7cdf3ba8ec0..ac70e5a3081c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -51,15 +51,20 @@ define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) {
 define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; V128-LABEL: interleave_v2i64:
 ; V128:       # %bb.0:
+; V128-NEXT:    csrr a0, vlenb
 ; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; V128-NEXT:    vmv1r.v v12, v9
-; V128-NEXT:    vid.v v9
+; V128-NEXT:    vid.v v10
+; V128-NEXT:    srli a0, a0, 3
+; V128-NEXT:    vsrl.vi v10, v10, 1
+; V128-NEXT:    vslidedown.vx v12, v10, a0
+; V128-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; V128-NEXT:    vrgatherei16.vv v13, v11, v12
+; V128-NEXT:    vrgatherei16.vv v12, v9, v10
 ; V128-NEXT:    vmv.v.i v0, 10
-; V128-NEXT:    vsrl.vi v14, v9, 1
-; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; V128-NEXT:    vrgatherei16.vv v10, v8, v14
-; V128-NEXT:    vrgatherei16.vv v10, v12, v14, v0.t
-; V128-NEXT:    vmv.v.v v8, v10
+; V128-NEXT:    vrgatherei16.vv v14, v8, v10
+; V128-NEXT:    vmv.v.v v15, v13
+; V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; V128-NEXT:    vmerge.vvm v8, v14, v12, v0
 ; V128-NEXT:    ret
 ;
 ; RV32-V512-LABEL: interleave_v2i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index a5039c58fccb1..a6a79f57a3da6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -817,13 +817,17 @@ define <8 x i32> @shuffle_spread2_singlesrc_e32_index1(<8 x i32> %v) {
 define <8 x i32> @shuffle_spread2_singlesrc_e32_index2(<8 x i32> %v) {
 ; CHECK-LABEL: shuffle_spread2_singlesrc_e32_index2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsrl.vi v10, v10, 1
 ; CHECK-NEXT:    vadd.vi v12, v10, -1
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v10, v12, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32>
   ret <8 x i32> %out
@@ -836,9 +840,13 @@ define <8 x i32> @shuffle_spread3_singlesrc_e32(<8 x i32> %v) {
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vslide1down.vx v12, v10, a0
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vslidedown.vx v10, v12, a0
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32>
   ret <8 x i32> %out
@@ -848,12 +856,16 @@ define <8 x i32> @shuffle_spread3_singlesrc_e32(<8 x i32> %v) {
 define <8 x i32> @shuffle_spread4_singlesrc_e32(<8 x i32> %v) {
 ; CHECK-LABEL: shuffle_spread4_singlesrc_e32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsrl.vi v12, v10, 2
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v10, v12, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32>
   ret <8 x i32> %out
@@ -980,12 +992,16 @@ define <8 x i32> @shuffle_repeat3_singlesrc_e32(<8 x i32> %v) {
 ; CHECK-NEXT:    vmv.v.i v11, 1
 ; CHECK-NEXT:    li a0, 192
 ; CHECK-NEXT:    vmv.s.x v10, a0
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vmerge.vim v11, v11, 0, v0
 ; CHECK-NEXT:    vmv.v.v v0, v10
 ; CHECK-NEXT:    vmerge.vim v12, v11, 2, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vslidedown.vx v10, v12, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32>
   ret <8 x i32> %out
@@ -994,12 +1010,16 @@ define <8 x i32> @shuffle_repeat3_singlesrc_e32(<8 x i32> %v) {
 define <8 x i32> @shuffle_repeat4_singlesrc_e32(<8 x i32> %v) {
 ; CHECK-LABEL: shuffle_repeat4_singlesrc_e32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsrl.vi v12, v10, 2
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v10, v12, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32>
   ret <8 x i32> %out
@@ -1291,11 +1311,23 @@ define void @shuffle_i128_splat(ptr %p) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    lui a1, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    lui a2, 16
+; CHECK-NEXT:    srli a1, a1, 3
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a1
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vmv.v.x v12, a2
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v13, v12, a1
+; CHECK-NEXT:    vslidedown.vx v14, v13, a1
+; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v17, v9, v13
 ; CHECK-NEXT:    vrgatherei16.vv v16, v8, v12
+; CHECK-NEXT:    vrgatherei16.vv v18, v10, v14
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v14, a1
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v19, v11, v8
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vse64.v v16, (a0)
 ; CHECK-NEXT:    ret
   %a = load <4 x i128>, ptr %p
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
index abbbfe8f252fb..b7b5ca870bd90 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
@@ -237,10 +237,15 @@ define <8 x i32> @v8i32_v4i32(<4 x i32>) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI5_0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle16.v v12, (a0)
-; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vslidedown.vx v10, v9, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v11, v12, v10
+; CHECK-NEXT:    vrgatherei16.vv v10, v8, v9
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <8 x i32>
   ret <8 x i32> %2
@@ -249,30 +254,38 @@ define <8 x i32> @v8i32_v4i32(<4 x i32>) {
 define <16 x i32> @v16i32_v4i32(<4 x i32>) {
 ; CHECK-LABEL: v16i32_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 2
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 3
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    lui a0, 2
+; CHECK-NEXT:    vmv.v.i v11, 3
 ; CHECK-NEXT:    addi a1, a0, 265
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 4
 ; CHECK-NEXT:    addi a1, a1, 548
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vim v9, v9, 2, v0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    addi a0, a0, -1856
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vim v9, v9, 0, v0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v11, v11, 2, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v8, v11, 0, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vsext.vf2 v16, v9
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vsext.vf2 v14, v8
+; CHECK-NEXT:    vslidedown.vx v16, v14, a1
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v9, v12, v16
+; CHECK-NEXT:    vrgatherei16.vv v8, v10, v14
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v12, v16, a1
+; CHECK-NEXT:    vslidedown.vx v14, v12, a1
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v10, v11, v12
+; CHECK-NEXT:    vrgatherei16.vv v11, v12, v14
 ; CHECK-NEXT:    ret
   %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <16 x i32>
   ret <16 x i32> %2
@@ -281,31 +294,55 @@ define <16 x i32> @v16i32_v4i32(<4 x i32>) {
 define <32 x i32> @v32i32_v4i32(<4 x i32>) {
 ; CHECK-LABEL: v32i32_v4i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m4, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    lui a1, 135432
 ; CHECK-NEXT:    addi a1, a1, 1161
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 270865
 ; CHECK-NEXT:    addi a1, a1, 548
-; CHECK-NEXT:    vmv.s.x v9, a1
+; CHECK-NEXT:    vmv.s.x v8, a1
 ; CHECK-NEXT:    lui a1, 100550
+; CHECK-NEXT:    addi a1, a1, 64
+; CHECK-NEXT:    vmv.s.x v9, a1
+; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 3
-; CHECK-NEXT:    addi a0, a1, 64
-; CHECK-NEXT:    vmerge.vim v18, v10, 2, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.s.x v16, a0
+; CHECK-NEXT:    vmv.v.i v12, 3
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    vmerge.vim v12, v12, 2, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v12, v12, 0, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v18, v18, 0, v0
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmerge.vim v16, v18, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v24, v16
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v16, v8, v24
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vsext.vf2 v16, v8
+; CHECK-NEXT:    vslidedown.vx v12, v16, a1
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v9, v11, v12
+; CHECK-NEXT:    vrgatherei16.vv v8, v10, v16
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v12, v12, a1
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v10, v11, v12
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v12, v12, a1
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v11, v16, v12
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v20, v12, a1
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v12, v17, v20
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v20, v20, a1
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v13, v18, v20
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v20, v20, a1
+; CHECK-NEXT:    vslidedown.vx v24, v20, a1
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v14, v19, v20
+; CHECK-NEXT:    vrgatherei16.vv v15, v16, v24
 ; CHECK-NEXT:    ret
   %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <32 x i32>
   ret <32 x i32> %2
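For reference, here is a minimal scalar model (plain C++, not LLVM code) of the expansion added to lowerVECTOR_SHUFFLE above: when every mask index selects from the first VLEN/SEW elements of the source register group, a gather over the whole LMUL>1 group decomposes into one m1 gather per destination register, where destination register i consumes span i of the index vector (the span the repeated vslidedown exposes) and only the low source register is ever read. The names VL, Src, Idx and both helper functions are illustrative assumptions, not anything from the patch or the LLVM API.

// Scalar model of the "low source" vrgather expansion (illustrative only).
#include <cassert>
#include <cstdint>
#include <vector>

// Reference semantics: one gather across the whole register group.
static std::vector<uint64_t> wholeGroupGather(const std::vector<uint64_t> &Src,
                                              const std::vector<size_t> &Idx) {
  std::vector<uint64_t> Out(Idx.size());
  for (size_t K = 0; K < Idx.size(); ++K)
    Out[K] = Src[Idx[K]];
  return Out;
}

// Expanded form: N gathers of VL elements each.  Destination register I uses
// span I of the index vector, and since every index is < VL only the low
// source register is read.
static std::vector<uint64_t> perRegisterGather(const std::vector<uint64_t> &Src,
                                               const std::vector<size_t> &Idx,
                                               size_t VL) {
  size_t N = Idx.size() / VL; // number of m1 destination registers
  std::vector<uint64_t> Out(Idx.size());
  for (size_t I = 0; I < N; ++I)
    for (size_t J = 0; J < VL; ++J) {
      size_t Index = Idx[I * VL + J];
      assert(Index < VL && "low-source shuffle: indices stay in register 0");
      Out[I * VL + J] = Src[Index];
    }
  return Out;
}

int main() {
  const size_t VL = 4;                                          // VLEN/SEW
  std::vector<uint64_t> Src = {10, 11, 12, 13, 20, 21, 22, 23}; // m2 group
  std::vector<size_t> Idx = {3, 3, 2, 2, 1, 1, 0, 0};           // all < VL
  assert(wholeGroupGather(Src, Idx) == perRegisterGather(Src, Idx, VL));
  return 0;
}

This equivalence is what the updated m2 test output above relies on: a pair of m1 vrgatherei16.vv plus a vslidedown of the index vector replaces a single vrgatherei16.vv at m2, which is typically considerably more expensive than m1 gathers on real implementations.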