diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f1cea6c6756f4..d176fcaf54c2d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -8629,6 +8629,18 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     ContainerVT = getContainerForFixedLengthVector(VecVT);
     Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
   }
+
+  // Shrink down Vec so we're performing the slideup on a smaller LMUL.
+  unsigned LastIdx = OrigIdx + SubVecVT.getVectorNumElements() - 1;
+  MVT OrigContainerVT = ContainerVT;
+  SDValue OrigVec = Vec;
+  if (auto ShrunkVT =
+          getSmallestVTForIndex(ContainerVT, LastIdx, DL, DAG, Subtarget)) {
+    ContainerVT = *ShrunkVT;
+    Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
+                      DAG.getVectorIdxConstant(0, DL));
+  }
+
   SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
                        DAG.getUNDEF(ContainerVT), SubVec,
                        DAG.getConstant(0, DL, XLenVT));
@@ -8659,6 +8671,12 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                       SlideupAmt, Mask, VL, Policy);
   }
 
+  // If we performed the slideup on a smaller LMUL, insert the result back
+  // into the rest of the vector.
+  if (ContainerVT != OrigContainerVT)
+    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigContainerVT, OrigVec,
+                         SubVec, DAG.getVectorIdxConstant(0, DL));
+
   if (VecVT.isFixedLengthVector())
     SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget);
   return DAG.getBitcast(Op.getValueType(), SubVec);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 1d6a45ed36f33..6a9212ed309a8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -14,7 +14,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
@@ -27,7 +27,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 2
 ; CHECK-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
@@ -40,7 +40,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 6
 ; CHECK-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
@@ -51,22 +51,19 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
 define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
 ; LMULMAX2-LABEL: insert_nxv8i32_v8i32_0:
 ; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-NEXT:    vle32.v v12, (a0)
-; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; LMULMAX2-NEXT:    vmv.v.v v8, v12
+; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, tu, ma
+; LMULMAX2-NEXT:    vle32.v v8, (a0)
 ; LMULMAX2-NEXT:    ret
 ;
 ; LMULMAX1-LABEL: insert_nxv8i32_v8i32_0:
 ; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    addi a1, a0, 16
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT:    vle32.v v12, (a0)
-; LMULMAX1-NEXT:    addi a0, a0, 16
-; LMULMAX1-NEXT:    vle32.v v16, (a0)
-; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
-; LMULMAX1-NEXT:    vmv.v.v v8, v12
-; LMULMAX1-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; LMULMAX1-NEXT:    vslideup.vi v8, v16, 4
+; LMULMAX1-NEXT:    vle32.v v12, (a1)
+; LMULMAX1-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
+; LMULMAX1-NEXT:    vle32.v v8, (a0)
+; LMULMAX1-NEXT:    vsetivli zero, 8, e32, m2, tu, ma
+; LMULMAX1-NEXT:    vslideup.vi v8, v12, 4
 ; LMULMAX1-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
@@ -84,14 +81,14 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %
 ;
 ; LMULMAX1-LABEL: insert_nxv8i32_v8i32_8:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    addi a1, a0, 16
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT:    vle32.v v12, (a1)
+; LMULMAX1-NEXT:    vle32.v v12, (a0)
+; LMULMAX1-NEXT:    addi a0, a0, 16
 ; LMULMAX1-NEXT:    vle32.v v16, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; LMULMAX1-NEXT:    vslideup.vi v8, v16, 8
+; LMULMAX1-NEXT:    vslideup.vi v8, v12, 8
 ; LMULMAX1-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
-; LMULMAX1-NEXT:    vslideup.vi v8, v12, 12
+; LMULMAX1-NEXT:    vslideup.vi v8, v16, 12
 ; LMULMAX1-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
@@ -166,7 +163,7 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; LMULMAX2-NEXT:    vle32.v v8, (a1)
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vle32.v v10, (a0)
-; LMULMAX2-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; LMULMAX2-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; LMULMAX2-NEXT:    vmv.v.v v10, v8
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vse32.v v10, (a0)
@@ -197,7 +194,7 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 ; LMULMAX2-NEXT:    vle32.v v8, (a1)
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vle32.v v10, (a0)
-; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
 ; LMULMAX2-NEXT:    vslideup.vi v10, v8, 2
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vse32.v v10, (a0)
@@ -508,9 +505,9 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, <vscale x 16 x i64>* %o
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    vsetivli zero, 6, e64, m8, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 4
+; CHECK-NEXT:    vle64.v v12, (a1)
+; CHECK-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
 ; CHECK-NEXT:    vs8r.v v8, (a2)
 ; CHECK-NEXT:    ret
   %sv0 = load <2 x i64>, ptr %psv0
@@ -539,7 +536,7 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, <vscale x 16 x i64>* %out) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e64, m8, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v16, v8, 2
 ; CHECK-NEXT:    vs8r.v v16, (a1)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
index f52ba6f51d5c8..805557905117a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
@@ -27,13 +27,13 @@ define void @widen_3xv4i16(ptr %x, ptr %z) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a0, 8
-; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vle16.v v9, (a2)
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vle16.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -75,17 +75,17 @@ define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
 ; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
 ; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a2)
 ; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v12, (a2)
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
 ; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v14, (a0)
-; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v12, (a0)
+; CHECK-NO-MISALIGN-NEXT:    vsetvli zero, zero, e16, m1, tu, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v12, 12
 ; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
 ; CHECK-NO-MISALIGN-NEXT:    ret
 ;
@@ -188,17 +188,17 @@ define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a0, 2
-; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vle16.v v9, (a2)
 ; CHECK-NEXT:    addi a2, a0, 6
-; CHECK-NEXT:    vle16.v v12, (a2)
+; CHECK-NEXT:    vle16.v v10, (a2)
 ; CHECK-NEXT:    addi a0, a0, 8
-; CHECK-NEXT:    vle16.v v14, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NEXT:    vslideup.vi v8, v12, 12
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -258,17 +258,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a0)
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    vle16.v v10, (a0)
+; RV32-NEXT:    vle16.v v9, (a0)
 ; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    vle16.v v12, (a0)
+; RV32-NEXT:    vle16.v v10, (a0)
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    vle16.v v14, (a0)
-; RV32-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; RV32-NEXT:    vslideup.vi v8, v10, 4
+; RV32-NEXT:    vle16.v v12, (a0)
+; RV32-NEXT:    vsetivli zero, 8, e16, m1, tu, ma
+; RV32-NEXT:    vslideup.vi v8, v9, 4
 ; RV32-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; RV32-NEXT:    vslideup.vi v8, v12, 8
+; RV32-NEXT:    vslideup.vi v8, v10, 8
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vslideup.vi v8, v14, 12
+; RV32-NEXT:    vslideup.vi v8, v12, 12
 ; RV32-NEXT:    vse16.v v8, (a1)
 ; RV32-NEXT:    ret
 ;
@@ -277,17 +277,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV64-NEXT:    vle16.v v8, (a0)
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    vle16.v v10, (a0)
+; RV64-NEXT:    vle16.v v9, (a0)
 ; RV64-NEXT:    add a0, a0, a3
-; RV64-NEXT:    vle16.v v12, (a0)
+; RV64-NEXT:    vle16.v v10, (a0)
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    vle16.v v14, (a0)
-; RV64-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; RV64-NEXT:    vslideup.vi v8, v10, 4
+; RV64-NEXT:    vle16.v v12, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e16, m1, tu, ma
+; RV64-NEXT:    vslideup.vi v8, v9, 4
 ; RV64-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; RV64-NEXT:    vslideup.vi v8, v12, 8
+; RV64-NEXT:    vslideup.vi v8, v10, 8
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vslideup.vi v8, v14, 12
+; RV64-NEXT:    vslideup.vi v8, v12, 12
 ; RV64-NEXT:    vse16.v v8, (a1)
 ; RV64-NEXT:    ret
 ;
@@ -296,17 +296,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVE64F-NEXT:    vle16.v v8, (a0)
 ; ZVE64F-NEXT:    add a0, a0, a2
-; ZVE64F-NEXT:    vle16.v v10, (a0)
+; ZVE64F-NEXT:    vle16.v v9, (a0)
 ; ZVE64F-NEXT:    add a0, a0, a3
-; ZVE64F-NEXT:    vle16.v v12, (a0)
+; ZVE64F-NEXT:    vle16.v v10, (a0)
 ; ZVE64F-NEXT:    add a0, a0, a2
-; ZVE64F-NEXT:    vle16.v v14, (a0)
-; ZVE64F-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; ZVE64F-NEXT:    vslideup.vi v8, v10, 4
+; ZVE64F-NEXT:    vle16.v v12, (a0)
+; ZVE64F-NEXT:    vsetivli zero, 8, e16, m1, tu, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v9, 4
 ; ZVE64F-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; ZVE64F-NEXT:    vslideup.vi v8, v12, 8
+; ZVE64F-NEXT:    vslideup.vi v8, v10, 8
 ; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVE64F-NEXT:    vslideup.vi v8, v14, 12
+; ZVE64F-NEXT:    vslideup.vi v8, v12, 12
 ; ZVE64F-NEXT:    vse16.v v8, (a1)
 ; ZVE64F-NEXT:    ret
   %a = load <4 x i16>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 8f3ad6bf6e65e..8e47dd72ae218 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -460,54 +460,49 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 0(a0)
+; CHECK-V-NEXT:    lhu s2, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 16(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
-; CHECK-V-NEXT:    call __extendhfsf2@plt
-; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    call __extendhfsf2@plt
+; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT:    vmv.s.x v10, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    lui a0, 524288
@@ -632,54 +627,49 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 0(a0)
+; CHECK-V-NEXT:    lhu s2, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 16(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
-; CHECK-V-NEXT:    call __extendhfsf2@plt
-; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    call __extendhfsf2@plt
+; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT:    vmv.s.x v10, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    li a0, -1
@@ -813,54 +803,49 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 0(a0)
+; CHECK-V-NEXT:    lhu s2, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 16(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
-; CHECK-V-NEXT:    call __extendhfsf2@plt
-; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    call __extendhfsf2@plt
+; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT:    vmv.s.x v10, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    li a0, -1
@@ -1454,8 +1439,8 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
 ; CHECK-V-NEXT:    lhu s4, 24(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s6, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 0(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -1466,16 +1451,16 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -1484,7 +1469,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -1737,8 +1722,8 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
 ; CHECK-V-NEXT:    lhu s4, 24(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s6, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 0(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
@@ -1749,16 +1734,16 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -1767,7 +1752,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -2040,8 +2025,8 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
 ; CHECK-V-NEXT:    lhu s4, 24(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s6, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 0(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -2052,16 +2037,16 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -2070,7 +2055,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -3796,54 +3781,49 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 0(a0)
+; CHECK-V-NEXT:    lhu s2, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 16(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
-; CHECK-V-NEXT:    call __extendhfsf2@plt
-; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    call __extendhfsf2@plt
+; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT:    vmv.s.x v10, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    lui a0, 524288
@@ -3966,54 +3946,49 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 0(a0)
+; CHECK-V-NEXT:    lhu s2, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 16(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
-; CHECK-V-NEXT:    call __extendhfsf2@plt
-; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    call __extendhfsf2@plt
+; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT:    vmv.s.x v10, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    li a0, -1
@@ -4146,54 +4121,49 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s1, 0(a0)
+; CHECK-V-NEXT:    lhu s2, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 16(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
-; CHECK-V-NEXT:    call __extendhfsf2@plt
-; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    call __extendhfsf2@plt
+; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, tu, ma
+; CHECK-V-NEXT:    vmv.s.x v10, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    li a0, -1
@@ -4775,8 +4745,8 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
 ; CHECK-V-NEXT:    lhu s4, 24(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s6, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 0(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -4787,16 +4757,16 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -4805,7 +4775,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -5054,8 +5024,8 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
 ; CHECK-V-NEXT:    lhu s4, 24(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s6, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 0(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
@@ -5066,16 +5036,16 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -5084,7 +5054,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -5356,8 +5326,8 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    lhu s3, 32(a0)
 ; CHECK-V-NEXT:    lhu s4, 24(a0)
 ; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s6, 8(a0)
+; CHECK-V-NEXT:    lhu a0, 0(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
@@ -5368,16 +5338,16 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
@@ -5386,7 +5356,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2@plt
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload