diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4c3dc63afd878..750d70c03eabd 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -15283,13 +15283,62 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); } +// Recursively split up concat_vectors with more than 2 operands: +// +// concat_vector op1, op2, op3, op4 +// -> +// concat_vector (concat_vector op1, op2), (concat_vector op3, op4) +// +// This reduces the length of the chain of vslideups and allows us to perform +// the vslideups at a smaller LMUL, limited to MF2. +// +// We do this as a DAG combine rather than during lowering so that any undef +// operands can get combined away. +static SDValue +performCONCAT_VECTORSSplitCombine(SDNode *N, SelectionDAG &DAG, + const RISCVTargetLowering &TLI) { + SDLoc DL(N); + + if (N->getNumOperands() <= 2) + return SDValue(); + + if (!TLI.isTypeLegal(N->getValueType(0))) + return SDValue(); + MVT VT = N->getSimpleValueType(0); + + // Don't split any further than MF2. + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) + ContainerVT = getContainerForFixedLengthVector(DAG, VT, TLI.getSubtarget()); + if (ContainerVT.bitsLT(getLMUL1VT(ContainerVT))) + return SDValue(); + + MVT HalfVT = VT.getHalfNumVectorElementsVT(); + assert(isPowerOf2_32(N->getNumOperands())); + size_t HalfNumOps = N->getNumOperands() / 2; + SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, + N->ops().take_front(HalfNumOps)); + SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, + N->ops().drop_front(HalfNumOps)); + + // Lower to an insert_subvector directly so the concat_vectors don't get + // recombined. + SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Lo, + DAG.getVectorIdxConstant(0, DL)); + Vec = DAG.getNode( + ISD::INSERT_SUBVECTOR, DL, VT, Vec, Hi, + DAG.getVectorIdxConstant(HalfVT.getVectorMinNumElements(), DL)); + return Vec; +} + // If we're concatenating a series of vector loads like // concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2) ... // Then we can turn this into a strided load by widening the vector elements // vlse32 p, stride=n -static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget, - const RISCVTargetLowering &TLI) { +static SDValue +performCONCAT_VECTORSStridedLoadCombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget, + const RISCVTargetLowering &TLI) { SDLoc DL(N); EVT VT = N->getValueType(0); @@ -16394,7 +16443,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return V; break; case ISD::CONCAT_VECTORS: - if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this)) + if (SDValue V = + performCONCAT_VECTORSStridedLoadCombine(N, DAG, Subtarget, *this)) + return V; + if (SDValue V = performCONCAT_VECTORSSplitCombine(N, DAG, *this)) return V; break; case ISD::INSERT_VECTOR_ELT: diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll index 87d95d7596d4f..139579b3d2a36 100644 --- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -161,72 +161,71 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) { define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-LABEL: fv128: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: lui a0, %hi(.LCPI10_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0) +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v0, v16, a2 -; CHECK-NEXT: vsext.vf8 v16, v8 -; CHECK-NEXT: vsaddu.vx v8, v16, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 -; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_1) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1) -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vsext.vf8 v16, v8 -; CHECK-NEXT: vsaddu.vx v8, v16, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 -; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 4 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v10, v16, a2 +; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v8, v16, a2 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v10, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_2) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsext.vf8 v16, v8 -; CHECK-NEXT: vsaddu.vx v8, v16, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 6 +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v9, v16, a2 +; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v9, 4 ; CHECK-NEXT: lui a0, %hi(.LCPI10_3) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsext.vf8 v16, v8 -; CHECK-NEXT: vsaddu.vx v8, v16, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 -; CHECK-NEXT: vsetivli zero, 10, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 8 +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v9, v16, a2 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 6 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: lui a0, %hi(.LCPI10_4) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4) -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsext.vf8 v16, v8 -; CHECK-NEXT: vsaddu.vx v8, v16, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 -; CHECK-NEXT: vsetivli zero, 12, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 10 +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v0, v16, a2 +; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v9, v16, a2 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v0, v9, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_5) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsext.vf8 v16, v8 -; CHECK-NEXT: vsaddu.vx v8, v16, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 -; CHECK-NEXT: vsetivli zero, 14, e8, m1, tu, ma -; CHECK-NEXT: vslideup.vi v0, v16, 12 +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v9, v16, a2 +; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v0, v9, 4 ; CHECK-NEXT: lui a0, %hi(.LCPI10_6) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsext.vf8 v16, v8 -; CHECK-NEXT: vsaddu.vx v8, v16, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vi v0, v16, 14 +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsext.vf8 v16, v9 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v9, v16, a2 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v0, v9, 6 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vi v0, v8, 8 ; CHECK-NEXT: ret %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc) ret <128 x i1> %mask diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll index c64216180c2af..ed434deea1a83 100644 --- a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll @@ -19,7 +19,7 @@ define void @test(ptr %ref_array, ptr %sad_array) { ; RV32-NEXT: th.swia a0, (a1), 4, 0 ; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; RV32-NEXT: vle8.v v10, (a3) -; RV32-NEXT: vsetivli zero, 8, e8, m1, tu, ma +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vslideup.vi v10, v9, 4 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vzext.vf4 v12, v10 @@ -42,7 +42,7 @@ define void @test(ptr %ref_array, ptr %sad_array) { ; RV64-NEXT: th.swia a0, (a1), 4, 0 ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; RV64-NEXT: vle8.v v10, (a3) -; RV64-NEXT: vsetivli zero, 8, e8, m1, tu, ma +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vslideup.vi v10, v9, 4 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vzext.vf4 v12, v10 diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll index 76aa2b913c652..e15e6452163b1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll @@ -469,9 +469,8 @@ define @extract_nxv6f16_nxv12f16_6( %in) ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v13, v10, a0 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v13, v10, a0 ; CHECK-NEXT: vslidedown.vx v12, v9, a0 ; CHECK-NEXT: add a1, a0, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll index e5bef20fd9e24..8474f95edd813 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll @@ -5,6 +5,59 @@ ; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s ; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s +define <8 x i16> @concat_2xv4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: concat_2xv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: ret + %ab = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> + ret <8 x i16> %ab +} + +define <8 x i16> @concat_4xv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { +; CHECK-LABEL: concat_4xv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v10, v11, 2 +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: ret + %ab = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> + %cd = shufflevector <2 x i16> %c, <2 x i16> %d, <4 x i32> + %abcd = shufflevector <4 x i16> %ab, <4 x i16> %cd, <8 x i32> + ret <8 x i16> %abcd +} + +define <8 x i16> @concat_8xv1i16(<1 x i16> %a, <1 x i16> %b, <1 x i16> %c, <1 x i16> %d, <1 x i16> %e, <1 x i16> %f, <1 x i16> %g, <1 x i16> %h) { +; CHECK-LABEL: concat_8xv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v13, 1 +; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v14, 2 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v15, 3 +; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v11, 3 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 4 +; CHECK-NEXT: ret + %ab = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> + %cd = shufflevector <1 x i16> %c, <1 x i16> %d, <2 x i32> + %abcd = shufflevector <2 x i16> %ab, <2 x i16> %cd, <4 x i32> + %ef = shufflevector <1 x i16> %e, <1 x i16> %f, <2 x i32> + %gh = shufflevector <1 x i16> %g, <1 x i16> %h, <2 x i32> + %efgh = shufflevector <2 x i16> %ef, <2 x i16> %gh, <4 x i32> + %abcdefgh = shufflevector <4 x i16> %abcd, <4 x i16> %efgh, <8 x i32> + ret <8 x i16> %abcdefgh +} + define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: concat_2xv4i32: ; CHECK: # %bb.0: @@ -19,14 +72,11 @@ define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; CHECK-LABEL: concat_4xv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v12, v11 -; CHECK-NEXT: vmv1r.v v14, v9 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v14, 2 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v10, v11, 2 +; CHECK-NEXT: vslideup.vi v8, v9, 2 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 6 +; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: ret %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> @@ -37,24 +87,18 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) { ; CHECK-LABEL: concat_8xv1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v16, v15 -; CHECK-NEXT: vmv1r.v v18, v13 -; CHECK-NEXT: vmv1r.v v20, v11 -; CHECK-NEXT: vmv1r.v v22, v9 -; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v22, 1 -; CHECK-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v14, v15, 1 +; CHECK-NEXT: vslideup.vi v12, v13, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v12, v14, 2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v10, v11, 1 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vslideup.vi v8, v10, 2 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v20, 3 -; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 4 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v18, 5 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v14, 6 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v16, 7 +; CHECK-NEXT: vslideup.vi v8, v12, 4 ; CHECK-NEXT: ret %ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> %cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> @@ -80,15 +124,14 @@ define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) { define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; CHECK-LABEL: concat_4xv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v12, v11 -; CHECK-NEXT: vmv1r.v v16, v10 -; CHECK-NEXT: vmv1r.v v20, v9 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v20, 4 -; CHECK-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v16, 8 +; CHECK-NEXT: vmv1r.v v14, v11 +; CHECK-NEXT: vmv1r.v v12, v10 +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v14, 4 +; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 12 +; CHECK-NEXT: vslideup.vi v8, v12, 8 ; CHECK-NEXT: ret %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> @@ -99,26 +142,18 @@ define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) { ; CHECK-LABEL: concat_8xv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v16, v15 -; CHECK-NEXT: vmv1r.v v20, v14 -; CHECK-NEXT: vmv1r.v v24, v13 -; CHECK-NEXT: vmv1r.v v28, v11 -; CHECK-NEXT: vmv1r.v v4, v10 -; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v0, 2 -; CHECK-NEXT: vsetivli zero, 6, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v4, 4 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v28, 6 -; CHECK-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v24, 10 -; CHECK-NEXT: vsetivli zero, 14, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v20, 12 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v14, v15, 2 +; CHECK-NEXT: vslideup.vi v12, v13, 2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v14, 4 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v10, v11, 2 +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v16, 14 +; CHECK-NEXT: vslideup.vi v8, v12, 8 ; CHECK-NEXT: ret %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> @@ -152,29 +187,27 @@ define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) { define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; VLA-LABEL: concat_4xv8i32: ; VLA: # %bb.0: -; VLA-NEXT: vmv2r.v v16, v14 -; VLA-NEXT: vmv2r.v v24, v12 -; VLA-NEXT: vmv2r.v v0, v10 -; VLA-NEXT: vsetivli zero, 16, e32, m8, tu, ma -; VLA-NEXT: vslideup.vi v8, v0, 8 -; VLA-NEXT: vsetivli zero, 24, e32, m8, tu, ma -; VLA-NEXT: vslideup.vi v8, v24, 16 +; VLA-NEXT: vmv2r.v v20, v14 +; VLA-NEXT: vmv2r.v v16, v12 +; VLA-NEXT: vmv2r.v v12, v10 +; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLA-NEXT: vslideup.vi v16, v20, 8 +; VLA-NEXT: vslideup.vi v8, v12, 8 ; VLA-NEXT: li a0, 32 ; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; VLA-NEXT: vslideup.vi v8, v16, 24 +; VLA-NEXT: vslideup.vi v8, v16, 16 ; VLA-NEXT: ret ; ; VLS-LABEL: concat_4xv8i32: ; VLS: # %bb.0: -; VLS-NEXT: vmv2r.v v16, v14 -; VLS-NEXT: vmv2r.v v24, v12 -; VLS-NEXT: vmv2r.v v0, v10 -; VLS-NEXT: vsetivli zero, 16, e32, m8, tu, ma -; VLS-NEXT: vslideup.vi v8, v0, 8 -; VLS-NEXT: vsetivli zero, 24, e32, m8, tu, ma -; VLS-NEXT: vslideup.vi v8, v24, 16 +; VLS-NEXT: vmv2r.v v20, v14 +; VLS-NEXT: vmv2r.v v16, v12 +; VLS-NEXT: vmv2r.v v12, v10 +; VLS-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLS-NEXT: vslideup.vi v16, v20, 8 +; VLS-NEXT: vslideup.vi v8, v12, 8 ; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; VLS-NEXT: vslideup.vi v8, v16, 24 +; VLS-NEXT: vslideup.vi v8, v16, 16 ; VLS-NEXT: ret %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> @@ -185,123 +218,49 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) { ; VLA-LABEL: concat_8xv4i32: ; VLA: # %bb.0: -; VLA-NEXT: addi sp, sp, -16 -; VLA-NEXT: .cfi_def_cfa_offset 16 -; VLA-NEXT: csrr a0, vlenb -; VLA-NEXT: slli a0, a0, 5 -; VLA-NEXT: sub sp, sp, a0 -; VLA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; VLA-NEXT: vmv1r.v v16, v15 -; VLA-NEXT: csrr a0, vlenb -; VLA-NEXT: slli a0, a0, 3 -; VLA-NEXT: mv a1, a0 -; VLA-NEXT: slli a0, a0, 1 -; VLA-NEXT: add a0, a0, a1 -; VLA-NEXT: add a0, sp, a0 -; VLA-NEXT: addi a0, a0, 16 -; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; VLA-NEXT: vmv1r.v v16, v14 -; VLA-NEXT: csrr a0, vlenb -; VLA-NEXT: slli a0, a0, 4 -; VLA-NEXT: add a0, sp, a0 -; VLA-NEXT: addi a0, a0, 16 -; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; VLA-NEXT: vmv1r.v v16, v13 -; VLA-NEXT: csrr a0, vlenb -; VLA-NEXT: slli a0, a0, 3 -; VLA-NEXT: add a0, sp, a0 -; VLA-NEXT: addi a0, a0, 16 -; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLA-NEXT: vmv1r.v v18, v15 +; VLA-NEXT: vmv1r.v v20, v14 +; VLA-NEXT: vmv1r.v v22, v13 ; VLA-NEXT: vmv1r.v v16, v12 -; VLA-NEXT: addi a0, sp, 16 -; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; VLA-NEXT: vmv1r.v v0, v11 -; VLA-NEXT: vmv1r.v v24, v10 -; VLA-NEXT: vmv1r.v v16, v9 -; VLA-NEXT: vsetivli zero, 8, e32, m8, tu, ma -; VLA-NEXT: vslideup.vi v8, v16, 4 -; VLA-NEXT: vsetivli zero, 12, e32, m8, tu, ma -; VLA-NEXT: vslideup.vi v8, v24, 8 -; VLA-NEXT: vsetivli zero, 16, e32, m8, tu, ma -; VLA-NEXT: vslideup.vi v8, v0, 12 -; VLA-NEXT: vsetivli zero, 20, e32, m8, tu, ma -; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; VLA-NEXT: vslideup.vi v8, v16, 16 -; VLA-NEXT: vsetivli zero, 24, e32, m8, tu, ma -; VLA-NEXT: csrr a0, vlenb -; VLA-NEXT: slli a0, a0, 3 -; VLA-NEXT: add a0, sp, a0 -; VLA-NEXT: addi a0, a0, 16 -; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; VLA-NEXT: vslideup.vi v8, v16, 20 -; VLA-NEXT: vsetivli zero, 28, e32, m8, tu, ma -; VLA-NEXT: csrr a0, vlenb -; VLA-NEXT: slli a0, a0, 4 -; VLA-NEXT: add a0, sp, a0 -; VLA-NEXT: addi a0, a0, 16 -; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; VLA-NEXT: vslideup.vi v8, v16, 24 +; VLA-NEXT: vmv1r.v v14, v11 +; VLA-NEXT: vmv1r.v v12, v10 +; VLA-NEXT: vmv1r.v v10, v9 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v20, v18, 4 +; VLA-NEXT: vslideup.vi v16, v22, 4 +; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLA-NEXT: vslideup.vi v16, v20, 8 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v12, v14, 4 +; VLA-NEXT: vslideup.vi v8, v10, 4 +; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLA-NEXT: vslideup.vi v8, v12, 8 ; VLA-NEXT: li a0, 32 ; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; VLA-NEXT: csrr a0, vlenb -; VLA-NEXT: slli a0, a0, 3 -; VLA-NEXT: mv a1, a0 -; VLA-NEXT: slli a0, a0, 1 -; VLA-NEXT: add a0, a0, a1 -; VLA-NEXT: add a0, sp, a0 -; VLA-NEXT: addi a0, a0, 16 -; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; VLA-NEXT: vslideup.vi v8, v16, 28 -; VLA-NEXT: csrr a0, vlenb -; VLA-NEXT: slli a0, a0, 5 -; VLA-NEXT: add sp, sp, a0 -; VLA-NEXT: addi sp, sp, 16 +; VLA-NEXT: vslideup.vi v8, v16, 16 ; VLA-NEXT: ret ; ; VLS-LABEL: concat_8xv4i32: ; VLS: # %bb.0: -; VLS-NEXT: addi sp, sp, -16 -; VLS-NEXT: .cfi_def_cfa_offset 16 -; VLS-NEXT: addi sp, sp, -512 -; VLS-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; VLS-NEXT: vmv1r.v v16, v15 -; VLS-NEXT: addi a0, sp, 400 -; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; VLS-NEXT: vmv1r.v v16, v14 -; VLS-NEXT: addi a0, sp, 272 -; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; VLS-NEXT: vmv1r.v v16, v13 -; VLS-NEXT: addi a0, sp, 144 -; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLS-NEXT: vmv1r.v v18, v15 +; VLS-NEXT: vmv1r.v v20, v14 +; VLS-NEXT: vmv1r.v v22, v13 ; VLS-NEXT: vmv1r.v v16, v12 -; VLS-NEXT: addi a0, sp, 16 -; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; VLS-NEXT: vmv1r.v v0, v11 -; VLS-NEXT: vmv1r.v v24, v10 -; VLS-NEXT: vmv1r.v v16, v9 -; VLS-NEXT: vsetivli zero, 8, e32, m8, tu, ma -; VLS-NEXT: vslideup.vi v8, v16, 4 -; VLS-NEXT: vsetivli zero, 12, e32, m8, tu, ma -; VLS-NEXT: vslideup.vi v8, v24, 8 -; VLS-NEXT: vsetivli zero, 16, e32, m8, tu, ma -; VLS-NEXT: vslideup.vi v8, v0, 12 -; VLS-NEXT: vsetivli zero, 20, e32, m8, tu, ma -; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; VLS-NEXT: vslideup.vi v8, v16, 16 -; VLS-NEXT: vsetivli zero, 24, e32, m8, tu, ma -; VLS-NEXT: addi a0, sp, 144 -; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; VLS-NEXT: vslideup.vi v8, v16, 20 -; VLS-NEXT: vsetivli zero, 28, e32, m8, tu, ma -; VLS-NEXT: addi a0, sp, 272 -; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; VLS-NEXT: vslideup.vi v8, v16, 24 +; VLS-NEXT: vmv1r.v v14, v11 +; VLS-NEXT: vmv1r.v v12, v10 +; VLS-NEXT: vmv1r.v v10, v9 +; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLS-NEXT: vslideup.vi v20, v18, 4 +; VLS-NEXT: vslideup.vi v16, v22, 4 +; VLS-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLS-NEXT: vslideup.vi v16, v20, 8 +; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLS-NEXT: vslideup.vi v12, v14, 4 +; VLS-NEXT: vslideup.vi v8, v10, 4 +; VLS-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; VLS-NEXT: vslideup.vi v8, v12, 8 ; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; VLS-NEXT: addi a0, sp, 400 -; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; VLS-NEXT: vslideup.vi v8, v16, 28 -; VLS-NEXT: addi sp, sp, 512 -; VLS-NEXT: addi sp, sp, 16 +; VLS-NEXT: vslideup.vi v8, v16, 16 ; VLS-NEXT: ret %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll index ba5db552b8544..37902aa187321 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll @@ -24,17 +24,15 @@ define void @widen_2xv4i16(ptr %x, ptr %z) { define void @widen_3xv4i16(ptr %x, ptr %z) { ; CHECK-LABEL: widen_3xv4i16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, 16 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a2, a0, 8 -; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vsetivli zero, 12, e16, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v10, v8, 8 +; CHECK-NEXT: vsetivli zero, 12, e16, m2, ta, ma +; CHECK-NEXT: vse16.v v10, (a1) ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b.gep = getelementptr i8, ptr %x, i64 8 @@ -72,20 +70,18 @@ define void @widen_4xv4i16(ptr %x, ptr %z) { define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) { ; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned: ; CHECK-NO-MISALIGN: # %bb.0: -; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NO-MISALIGN-NEXT: vle8.v v8, (a0) -; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 8 -; CHECK-NO-MISALIGN-NEXT: vle8.v v10, (a2) ; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 16 -; CHECK-NO-MISALIGN-NEXT: vle8.v v12, (a2) +; CHECK-NO-MISALIGN-NEXT: vle8.v v10, (a2) +; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 8 ; CHECK-NO-MISALIGN-NEXT: addi a0, a0, 24 -; CHECK-NO-MISALIGN-NEXT: vle8.v v14, (a0) -; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e16, m2, tu, ma -; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 12, e16, m2, tu, ma -; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NO-MISALIGN-NEXT: vle8.v v9, (a0) +; CHECK-NO-MISALIGN-NEXT: vle8.v v11, (a2) +; CHECK-NO-MISALIGN-NEXT: vslideup.vi v10, v9, 4 +; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v11, 4 ; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v14, 12 +; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v10, 8 ; CHECK-NO-MISALIGN-NEXT: vse16.v v8, (a1) ; CHECK-NO-MISALIGN-NEXT: ret ; @@ -185,21 +181,14 @@ define void @strided_constant_0(ptr %x, ptr %z) { define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) { ; CHECK-LABEL: strided_constant_mismatch_4xv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a2, a0, 2 -; CHECK-NEXT: vle16.v v10, (a2) ; CHECK-NEXT: addi a2, a0, 6 -; CHECK-NEXT: vle16.v v12, (a2) -; CHECK-NEXT: addi a0, a0, 8 -; CHECK-NEXT: vle16.v v14, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vsetivli zero, 12, e16, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v14, 12 -; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: li a3, 2 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a3 +; CHECK-NEXT: vlse64.v v10, (a2), a3 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b.gep = getelementptr i8, ptr %x, i64 2 @@ -255,59 +244,38 @@ define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) { define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) { ; RV32-LABEL: strided_runtime_mismatch_4xv4i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV32-NEXT: vle16.v v8, (a0) -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: vle16.v v10, (a0) -; RV32-NEXT: add a0, a0, a4 -; RV32-NEXT: vle16.v v12, (a0) -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: vle16.v v14, (a0) -; RV32-NEXT: vsetivli zero, 8, e16, m2, tu, ma -; RV32-NEXT: vslideup.vi v8, v10, 4 -; RV32-NEXT: vsetivli zero, 12, e16, m2, tu, ma -; RV32-NEXT: vslideup.vi v8, v12, 8 -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vslideup.vi v8, v14, 12 -; RV32-NEXT: vse16.v v8, (a1) +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), a2 +; RV32-NEXT: vlse64.v v10, (a3), a2 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vslideup.vi v8, v10, 2 +; RV32-NEXT: vse64.v v8, (a1) ; RV32-NEXT: ret ; ; RV64-LABEL: strided_runtime_mismatch_4xv4i16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64-NEXT: vle16.v v8, (a0) -; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: vle16.v v10, (a0) -; RV64-NEXT: add a0, a0, a3 -; RV64-NEXT: vle16.v v12, (a0) -; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: vle16.v v14, (a0) -; RV64-NEXT: vsetivli zero, 8, e16, m2, tu, ma -; RV64-NEXT: vslideup.vi v8, v10, 4 -; RV64-NEXT: vsetivli zero, 12, e16, m2, tu, ma -; RV64-NEXT: vslideup.vi v8, v12, 8 -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vslideup.vi v8, v14, 12 -; RV64-NEXT: vse16.v v8, (a1) +; RV64-NEXT: add a4, a0, a2 +; RV64-NEXT: add a3, a4, a3 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vlse64.v v8, (a0), a2 +; RV64-NEXT: vlse64.v v10, (a3), a2 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vslideup.vi v8, v10, 2 +; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: ret ; ; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16: ; ZVE64F: # %bb.0: -; ZVE64F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVE64F-NEXT: vle16.v v8, (a0) -; ZVE64F-NEXT: add a0, a0, a2 -; ZVE64F-NEXT: vle16.v v10, (a0) -; ZVE64F-NEXT: add a0, a0, a3 -; ZVE64F-NEXT: vle16.v v12, (a0) -; ZVE64F-NEXT: add a0, a0, a2 -; ZVE64F-NEXT: vle16.v v14, (a0) -; ZVE64F-NEXT: vsetivli zero, 8, e16, m2, tu, ma -; ZVE64F-NEXT: vslideup.vi v8, v10, 4 -; ZVE64F-NEXT: vsetivli zero, 12, e16, m2, tu, ma -; ZVE64F-NEXT: vslideup.vi v8, v12, 8 -; ZVE64F-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVE64F-NEXT: vslideup.vi v8, v14, 12 -; ZVE64F-NEXT: vse16.v v8, (a1) +; ZVE64F-NEXT: add a4, a0, a2 +; ZVE64F-NEXT: add a3, a4, a3 +; ZVE64F-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVE64F-NEXT: vlse64.v v8, (a0), a2 +; ZVE64F-NEXT: vlse64.v v10, (a3), a2 +; ZVE64F-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZVE64F-NEXT: vslideup.vi v8, v10, 2 +; ZVE64F-NEXT: vse64.v v8, (a1) ; ZVE64F-NEXT: ret %a = load <4 x i16>, ptr %x %b.gep = getelementptr i8, ptr %x, i64 %s diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll index 48ce7d623475c..cbdabab65cc67 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll @@ -441,57 +441,50 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) -; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu a0, 24(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-V-NEXT: vmv.s.x v10, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -609,57 +602,50 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) -; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu a0, 24(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-V-NEXT: vmv.s.x v10, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -787,60 +773,53 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) -; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu a0, 24(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 +; CHECK-V-NEXT: vslideup.vi v8, v10, 2 ; CHECK-V-NEXT: li a0, -1 ; CHECK-V-NEXT: srli a0, a0, 32 -; CHECK-V-NEXT: vmin.vx v8, v10, a0 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 @@ -1404,90 +1383,125 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s5, -56 ; CHECK-V-NEXT: .cfi_offset s6, -64 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 1 +; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) -; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu s3, 24(a0) +; CHECK-V-NEXT: lhu s4, 32(a0) +; CHECK-V-NEXT: lhu s5, 40(a0) +; CHECK-V-NEXT: lhu s6, 48(a0) +; CHECK-V-NEXT: lhu a0, 56(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 4 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 5 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 6 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-V-NEXT: vmv.s.x v10, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 7 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 +; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 4 ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: slli a0, a0, 2 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -1682,90 +1696,125 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s5, -56 ; CHECK-V-NEXT: .cfi_offset s6, -64 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 1 +; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) -; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu s3, 24(a0) +; CHECK-V-NEXT: lhu s4, 32(a0) +; CHECK-V-NEXT: lhu s5, 40(a0) +; CHECK-V-NEXT: lhu s6, 48(a0) +; CHECK-V-NEXT: lhu a0, 56(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 4 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 5 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 6 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-V-NEXT: vmv.s.x v10, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 7 -; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 +; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 4 +; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 2 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -1982,94 +2031,129 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s5, -56 ; CHECK-V-NEXT: .cfi_offset s6, -64 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 1 +; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) -; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu s3, 24(a0) +; CHECK-V-NEXT: lhu s4, 32(a0) +; CHECK-V-NEXT: lhu s5, 40(a0) +; CHECK-V-NEXT: lhu s6, 48(a0) +; CHECK-V-NEXT: lhu a0, 56(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 4 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 5 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 6 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 7 +; CHECK-V-NEXT: vslideup.vi v8, v10, 4 ; CHECK-V-NEXT: lui a0, 16 ; CHECK-V-NEXT: addi a0, a0, -1 -; CHECK-V-NEXT: vmin.vx v8, v10, a0 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: slli a0, a0, 2 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -3728,57 +3812,50 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) -; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu a0, 24(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-V-NEXT: vmv.s.x v10, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -3894,57 +3971,50 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) -; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu a0, 24(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-V-NEXT: vmv.s.x v10, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -4071,60 +4141,53 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) -; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu a0, 24(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 +; CHECK-V-NEXT: vslideup.vi v8, v10, 2 ; CHECK-V-NEXT: li a0, -1 ; CHECK-V-NEXT: srli a0, a0, 32 -; CHECK-V-NEXT: vmin.vx v8, v10, a0 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 @@ -4676,90 +4739,125 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s5, -56 ; CHECK-V-NEXT: .cfi_offset s6, -64 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 1 +; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) -; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu s3, 24(a0) +; CHECK-V-NEXT: lhu s4, 32(a0) +; CHECK-V-NEXT: lhu s5, 40(a0) +; CHECK-V-NEXT: lhu s6, 48(a0) +; CHECK-V-NEXT: lhu a0, 56(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 4 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 5 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 6 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-V-NEXT: vmv.s.x v10, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 7 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 +; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 4 ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: slli a0, a0, 2 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -4952,90 +5050,125 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s5, -56 ; CHECK-V-NEXT: .cfi_offset s6, -64 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 1 +; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) -; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu s3, 24(a0) +; CHECK-V-NEXT: lhu s4, 32(a0) +; CHECK-V-NEXT: lhu s5, 40(a0) +; CHECK-V-NEXT: lhu s6, 48(a0) +; CHECK-V-NEXT: lhu a0, 56(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 4 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 5 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 6 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-V-NEXT: vmv.s.x v10, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 7 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 +; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 4 ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: slli a0, a0, 2 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -5251,94 +5384,129 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s5, -56 ; CHECK-V-NEXT: .cfi_offset s6, -64 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 1 +; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) -; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu s3, 24(a0) +; CHECK-V-NEXT: lhu s4, 32(a0) +; CHECK-V-NEXT: lhu s5, 40(a0) +; CHECK-V-NEXT: lhu s6, 48(a0) +; CHECK-V-NEXT: lhu a0, 56(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 4 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 5 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 6 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 7 +; CHECK-V-NEXT: vslideup.vi v8, v10, 4 ; CHECK-V-NEXT: lui a0, 16 ; CHECK-V-NEXT: addi a0, a0, -1 -; CHECK-V-NEXT: vmin.vx v8, v10, a0 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: slli a0, a0, 2 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll index f3ae03af7c786..0b236f6d3ff38 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -2136,17 +2136,18 @@ define @mgather_baseidx_nxv32i8(ptr %base, ; RV64-NEXT: vluxei64.v v13, (a0), v24, v0.t ; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vx v0, v16, a1 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v16, v10 -; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v14, (a0), v16, v0.t +; RV64-NEXT: vslidedown.vx v8, v16, a1 ; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vx v0, v0, a2 +; RV64-NEXT: vslidedown.vx v0, v8, a2 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf8 v16, v11 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v15, (a0), v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vluxei64.v v14, (a0), v16, v0.t ; RV64-NEXT: vmv4r.v v8, v12 ; RV64-NEXT: ret %ptrs = getelementptr inbounds i8, ptr %base, %idxs diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll index c27488b18a017..d13d67fd0a882 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll @@ -9,39 +9,38 @@ define <4 x float> @foo(ptr %0) nounwind { ; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: lhu s0, 6(a0) -; CHECK-NEXT: lhu s1, 4(a0) -; CHECK-NEXT: lhu s2, 0(a0) -; CHECK-NEXT: lhu a0, 2(a0) +; CHECK-NEXT: lhu s0, 0(a0) +; CHECK-NEXT: lhu s1, 2(a0) +; CHECK-NEXT: lhu s2, 4(a0) +; CHECK-NEXT: lhu a0, 6(a0) ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 8(sp) +; CHECK-NEXT: fsw fa0, 4(sp) ; CHECK-NEXT: fmv.w.x fa0, s2 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 0(sp) +; CHECK-NEXT: fsw fa0, 12(sp) ; CHECK-NEXT: fmv.w.x fa0, s1 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 12(sp) +; CHECK-NEXT: fsw fa0, 8(sp) ; CHECK-NEXT: fmv.w.x fa0, s0 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 4(sp) -; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: fsw fa0, 0(sp) +; CHECK-NEXT: addi a0, sp, 4 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: addi a0, sp, 12 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: addi a0, sp, 4 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: addi a0, sp, 8 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 1 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: vslideup.vi v8, v9, 2 ; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload