From fdb976a96abb380986ba2e46459bf310c46114d2 Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Wed, 27 Nov 2024 09:31:56 -0300 Subject: [PATCH 01/10] [RISCV] Use vrgather in llvm.experimental.vector.match This patch changes matchSplatAsGather to use vrgather not only when we see an EXTRACT_VECTOR_ELT but also when we see a RISCVISD::VMV_X_S. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 13 +- .../RISCV/rvv/intrinsic-vector-match.ll | 1477 +++++------------ 2 files changed, 397 insertions(+), 1093 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 78dc3cb27a698..65aaa595a0a4e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3493,13 +3493,12 @@ static std::optional isSimpleVIDSequence(SDValue Op, static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { - if (SplatVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + if (SplatVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT && (SplatVal.getOpcode() != RISCVISD::VMV_X_S)) return SDValue(); SDValue Vec = SplatVal.getOperand(0); - // Only perform this optimization on vectors of the same size for simplicity. // Don't perform this optimization for i1 vectors. // FIXME: Support i1 vectors, maybe by promoting to i8? - if (Vec.getValueType() != VT || VT.getVectorElementType() == MVT::i1) + if (VT.getVectorElementType() == MVT::i1) return SDValue(); SDValue Idx = SplatVal.getOperand(1); // The index must be a legal type. @@ -3507,10 +3506,12 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL, return SDValue(); MVT ContainerVT = VT; - if (VT.isFixedLengthVector()) { + if (VT.isFixedLengthVector()) ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); - Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget); - } + + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), Vec, + DAG.getVectorIdxConstant(0, DL)); auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll index e70dcd16d02cd..295562f7f1beb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll +++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll @@ -6,8 +6,8 @@ define @match_nxv16i8_v1i8( %op1, <1 x i8> ; CHECK-LABEL: match_nxv16i8_v1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vmseq.vx v10, v8, a0 +; CHECK-NEXT: vrgather.vi v12, v10, 0 +; CHECK-NEXT: vmseq.vv v10, v8, v12 ; CHECK-NEXT: vmand.mm v0, v10, v0 ; CHECK-NEXT: ret %r = tail call @llvm.experimental.vector.match( %op1, <1 x i8> %op2, %mask) @@ -17,14 +17,12 @@ define @match_nxv16i8_v1i8( %op1, <1 x i8> define @match_nxv16i8_v2i8( %op1, <2 x i8> %op2, %mask) { ; CHECK-LABEL: match_nxv16i8_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v10, 1 -; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; CHECK-NEXT: vmseq.vx v11, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vmseq.vx v10, v8, a0 -; CHECK-NEXT: vmor.mm v8, v11, v10 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vrgather.vi v12, v10, 1 +; CHECK-NEXT: vmseq.vv v14, v8, v12 +; CHECK-NEXT: vrgather.vi v12, v10, 0 +; CHECK-NEXT: vmseq.vv v10, v8, v12 +; CHECK-NEXT: vmor.mm v8, 
v10, v14 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret %r = tail call @llvm.experimental.vector.match( %op1, <2 x i8> %op2, %mask) @@ -34,21 +32,17 @@ define @match_nxv16i8_v2i8( %op1, <2 x i8> define @match_nxv16i8_v4i8( %op1, <4 x i8> %op2, %mask) { ; CHECK-LABEL: match_nxv16i8_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v11, v10, 1 -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vslidedown.vi v10, v10, 3 -; CHECK-NEXT: vmv.x.s a1, v11 -; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma -; CHECK-NEXT: vmseq.vx v11, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a1 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: vmseq.vx v10, v8, a0 -; CHECK-NEXT: vmor.mm v11, v11, v12 -; CHECK-NEXT: vmor.mm v10, v11, v10 -; CHECK-NEXT: vmseq.vx v11, v8, a1 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vrgather.vi v12, v10, 1 +; CHECK-NEXT: vmseq.vv v14, v8, v12 +; CHECK-NEXT: vrgather.vi v12, v10, 0 +; CHECK-NEXT: vmseq.vv v15, v8, v12 +; CHECK-NEXT: vmor.mm v12, v15, v14 +; CHECK-NEXT: vrgather.vi v14, v10, 2 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vrgather.vi v14, v10, 3 +; CHECK-NEXT: vmor.mm v10, v12, v13 +; CHECK-NEXT: vmseq.vv v11, v8, v14 ; CHECK-NEXT: vmor.mm v8, v10, v11 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret @@ -59,37 +53,29 @@ define @match_nxv16i8_v4i8( %op1, <4 x i8> define @match_nxv16i8_v8i8( %op1, <8 x i8> %op2, %mask) { ; CHECK-LABEL: match_nxv16i8_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v11, v10, 1 -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vmv.x.s a1, v11 -; CHECK-NEXT: vslidedown.vi v11, v10, 3 -; CHECK-NEXT: vmv.x.s a2, v12 -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vmv.x.s a3, v11 -; CHECK-NEXT: vslidedown.vi v11, v10, 5 -; CHECK-NEXT: vmv.x.s a4, v12 -; CHECK-NEXT: vslidedown.vi v12, v10, 6 -; CHECK-NEXT: vslidedown.vi v10, v10, 7 -; CHECK-NEXT: vmv.x.s a5, v11 -; CHECK-NEXT: vsetvli a6, zero, e8, m2, ta, ma -; CHECK-NEXT: vmseq.vx v11, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a1 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: vmseq.vx v10, v8, a2 -; CHECK-NEXT: vmor.mm v11, v11, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a3 -; CHECK-NEXT: vmor.mm v10, v11, v10 -; CHECK-NEXT: vmseq.vx v11, v8, a4 -; CHECK-NEXT: vmor.mm v10, v10, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a5 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a0 -; CHECK-NEXT: vmor.mm v10, v10, v12 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a1 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vrgather.vi v12, v10, 1 +; CHECK-NEXT: vmseq.vv v14, v8, v12 +; CHECK-NEXT: vrgather.vi v12, v10, 0 +; CHECK-NEXT: vmseq.vv v15, v8, v12 +; CHECK-NEXT: vmor.mm v12, v15, v14 +; CHECK-NEXT: vrgather.vi v14, v10, 2 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 3 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 4 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 5 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 6 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vrgather.vi v14, v10, 7 +; CHECK-NEXT: vmor.mm v10, v12, v13 +; 
CHECK-NEXT: vmseq.vv v11, v8, v14 ; CHECK-NEXT: vmor.mm v8, v10, v11 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret @@ -100,69 +86,53 @@ define @match_nxv16i8_v8i8( %op1, <8 x i8> define @match_nxv16i8_v16i8( %op1, <16 x i8> %op2, %mask) { ; CHECK-LABEL: match_nxv16i8_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v11, v10, 1 -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vmv.x.s a1, v11 -; CHECK-NEXT: vslidedown.vi v11, v10, 3 -; CHECK-NEXT: vmv.x.s a2, v12 -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vmv.x.s a3, v11 -; CHECK-NEXT: vslidedown.vi v11, v10, 5 -; CHECK-NEXT: vmv.x.s a4, v12 -; CHECK-NEXT: vslidedown.vi v12, v10, 6 -; CHECK-NEXT: vmv.x.s a5, v11 -; CHECK-NEXT: vslidedown.vi v11, v10, 7 -; CHECK-NEXT: vmv.x.s a6, v12 -; CHECK-NEXT: vslidedown.vi v12, v10, 8 -; CHECK-NEXT: vmv.x.s a7, v11 -; CHECK-NEXT: vslidedown.vi v11, v10, 9 -; CHECK-NEXT: vmv.x.s t0, v12 -; CHECK-NEXT: vslidedown.vi v12, v10, 10 -; CHECK-NEXT: vmv.x.s t1, v11 -; CHECK-NEXT: vslidedown.vi v11, v10, 11 -; CHECK-NEXT: vmv.x.s t2, v12 -; CHECK-NEXT: vslidedown.vi v12, v10, 12 -; CHECK-NEXT: vmv.x.s t3, v11 -; CHECK-NEXT: vslidedown.vi v11, v10, 13 -; CHECK-NEXT: vmv.x.s t4, v12 -; CHECK-NEXT: vslidedown.vi v12, v10, 14 -; CHECK-NEXT: vslidedown.vi v10, v10, 15 -; CHECK-NEXT: vmv.x.s t5, v11 -; CHECK-NEXT: vsetvli t6, zero, e8, m2, ta, ma -; CHECK-NEXT: vmseq.vx v11, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a1 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: vmseq.vx v10, v8, a2 -; CHECK-NEXT: vmor.mm v11, v11, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a3 -; CHECK-NEXT: vmor.mm v10, v11, v10 -; CHECK-NEXT: vmseq.vx v11, v8, a4 -; CHECK-NEXT: vmor.mm v10, v10, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a5 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a6 -; CHECK-NEXT: vmor.mm v10, v10, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a7 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, t0 -; CHECK-NEXT: vmor.mm v10, v10, v12 -; CHECK-NEXT: vmseq.vx v12, v8, t1 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, t2 -; CHECK-NEXT: vmor.mm v10, v10, v12 -; CHECK-NEXT: vmseq.vx v12, v8, t3 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, t4 -; CHECK-NEXT: vmor.mm v10, v10, v12 -; CHECK-NEXT: vmseq.vx v12, v8, t5 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a0 -; CHECK-NEXT: vmor.mm v10, v10, v12 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a1 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vrgather.vi v12, v10, 1 +; CHECK-NEXT: vmseq.vv v14, v8, v12 +; CHECK-NEXT: vrgather.vi v12, v10, 0 +; CHECK-NEXT: vmseq.vv v15, v8, v12 +; CHECK-NEXT: vmor.mm v12, v15, v14 +; CHECK-NEXT: vrgather.vi v14, v10, 2 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 3 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 4 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 5 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 6 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 7 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, 
v10, 8 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 9 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 10 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 11 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 12 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 13 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 14 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vrgather.vi v14, v10, 15 +; CHECK-NEXT: vmor.mm v10, v12, v13 +; CHECK-NEXT: vmseq.vv v11, v8, v14 ; CHECK-NEXT: vmor.mm v8, v10, v11 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret @@ -174,8 +144,8 @@ define <16 x i1> @match_v16i8_v1i8(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mas ; CHECK-LABEL: match_v16i8_v1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vmseq.vx v8, v8, a0 +; CHECK-NEXT: vrgather.vi v10, v9, 0 +; CHECK-NEXT: vmseq.vv v8, v8, v10 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mask) @@ -185,14 +155,12 @@ define <16 x i1> @match_v16i8_v1i8(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mas define <16 x i1> @match_v16i8_v2i8(<16 x i8> %op1, <2 x i8> %op2, <16 x i1> %mask) { ; CHECK-LABEL: match_v16i8_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v9, v9, 1 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmseq.vx v10, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vmseq.vx v8, v8, a0 -; CHECK-NEXT: vmor.mm v8, v10, v8 +; CHECK-NEXT: vrgather.vi v10, v9, 1 +; CHECK-NEXT: vrgather.vi v11, v9, 0 +; CHECK-NEXT: vmseq.vv v9, v8, v10 +; CHECK-NEXT: vmseq.vv v8, v8, v11 +; CHECK-NEXT: vmor.mm v8, v8, v9 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <2 x i8> %op2, <16 x i1> %mask) @@ -202,21 +170,17 @@ define <16 x i1> @match_v16i8_v2i8(<16 x i8> %op1, <2 x i8> %op2, <16 x i1> %mas define <16 x i1> @match_v16i8_v4i8(<16 x i8> %op1, <4 x i8> %op2, <16 x i1> %mask) { ; CHECK-LABEL: match_v16i8_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vslidedown.vi v11, v9, 2 -; CHECK-NEXT: vslidedown.vi v9, v9, 3 -; CHECK-NEXT: vmv.x.s a1, v10 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmseq.vx v10, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a1 -; CHECK-NEXT: vmv.x.s a1, v9 -; CHECK-NEXT: vmseq.vx v9, v8, a0 -; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v10, v9, 1 +; CHECK-NEXT: vrgather.vi v11, v9, 0 +; CHECK-NEXT: vmseq.vv v10, v8, v10 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v11, v10 +; CHECK-NEXT: vrgather.vi v11, v9, 2 +; CHECK-NEXT: vrgather.vi v12, v9, 3 +; CHECK-NEXT: vmseq.vv v9, v8, v11 ; CHECK-NEXT: vmor.mm v9, v10, v9 -; CHECK-NEXT: vmseq.vx v8, v8, a1 +; CHECK-NEXT: vmseq.vv v8, v8, v12 ; CHECK-NEXT: vmor.mm v8, v9, v8 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret @@ -227,37 +191,29 @@ define <16 x i1> 
@match_v16i8_v4i8(<16 x i8> %op1, <4 x i8> %op2, <16 x i1> %mas define <16 x i1> @match_v16i8_v8i8(<16 x i8> %op1, <8 x i8> %op2, <16 x i1> %mask) { ; CHECK-LABEL: match_v16i8_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vslidedown.vi v11, v9, 2 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: vslidedown.vi v10, v9, 3 -; CHECK-NEXT: vmv.x.s a2, v11 -; CHECK-NEXT: vslidedown.vi v11, v9, 4 -; CHECK-NEXT: vmv.x.s a3, v10 -; CHECK-NEXT: vslidedown.vi v10, v9, 5 -; CHECK-NEXT: vmv.x.s a4, v11 -; CHECK-NEXT: vslidedown.vi v11, v9, 6 -; CHECK-NEXT: vslidedown.vi v9, v9, 7 -; CHECK-NEXT: vmv.x.s a5, v10 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmseq.vx v10, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a1 -; CHECK-NEXT: vmv.x.s a1, v9 -; CHECK-NEXT: vmseq.vx v9, v8, a2 +; CHECK-NEXT: vrgather.vi v10, v9, 1 +; CHECK-NEXT: vrgather.vi v11, v9, 0 +; CHECK-NEXT: vmseq.vv v10, v8, v10 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v11, v10 +; CHECK-NEXT: vrgather.vi v11, v9, 2 +; CHECK-NEXT: vmseq.vv v11, v8, v11 ; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a3 +; CHECK-NEXT: vrgather.vi v11, v9, 3 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 4 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 5 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 6 +; CHECK-NEXT: vrgather.vi v12, v9, 7 +; CHECK-NEXT: vmseq.vv v9, v8, v11 ; CHECK-NEXT: vmor.mm v9, v10, v9 -; CHECK-NEXT: vmseq.vx v10, v8, a4 -; CHECK-NEXT: vmor.mm v9, v9, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a5 -; CHECK-NEXT: vmor.mm v9, v9, v10 -; CHECK-NEXT: vmseq.vx v10, v8, a0 -; CHECK-NEXT: vmor.mm v9, v9, v11 -; CHECK-NEXT: vmor.mm v9, v9, v10 -; CHECK-NEXT: vmseq.vx v8, v8, a1 +; CHECK-NEXT: vmseq.vv v8, v8, v12 ; CHECK-NEXT: vmor.mm v8, v9, v8 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret @@ -358,37 +314,29 @@ define <8 x i1> @match_v8i8_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) { define @match_nxv8i16_v8i16( %op1, <8 x i16> %op2, %mask) { ; CHECK-LABEL: match_nxv8i16_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v11, v10, 1 -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vmv.x.s a1, v11 -; CHECK-NEXT: vslidedown.vi v11, v10, 3 -; CHECK-NEXT: vmv.x.s a2, v12 -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vmv.x.s a3, v11 -; CHECK-NEXT: vslidedown.vi v11, v10, 5 -; CHECK-NEXT: vmv.x.s a4, v12 -; CHECK-NEXT: vslidedown.vi v12, v10, 6 -; CHECK-NEXT: vslidedown.vi v10, v10, 7 -; CHECK-NEXT: vmv.x.s a5, v11 -; CHECK-NEXT: vsetvli a6, zero, e16, m2, ta, ma -; CHECK-NEXT: vmseq.vx v11, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a1 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: vmseq.vx v10, v8, a2 -; CHECK-NEXT: vmor.mm v11, v11, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a3 -; CHECK-NEXT: vmor.mm v10, v11, v10 -; CHECK-NEXT: vmseq.vx v11, v8, a4 -; CHECK-NEXT: vmor.mm v10, v10, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a5 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a0 -; CHECK-NEXT: vmor.mm v10, v10, v12 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a1 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, 
ma +; CHECK-NEXT: vrgather.vi v12, v10, 1 +; CHECK-NEXT: vmseq.vv v14, v8, v12 +; CHECK-NEXT: vrgather.vi v12, v10, 0 +; CHECK-NEXT: vmseq.vv v15, v8, v12 +; CHECK-NEXT: vmor.mm v12, v15, v14 +; CHECK-NEXT: vrgather.vi v14, v10, 2 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 3 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 4 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 5 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vrgather.vi v14, v10, 6 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vrgather.vi v14, v10, 7 +; CHECK-NEXT: vmor.mm v10, v12, v13 +; CHECK-NEXT: vmseq.vv v11, v8, v14 ; CHECK-NEXT: vmor.mm v8, v10, v11 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret @@ -429,74 +377,57 @@ define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) { ret <8 x i1> %r } -; Cases where op2 has more elements than op1. define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) { ; CHECK-LABEL: match_v8i8_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vslidedown.vi v11, v9, 2 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: vslidedown.vi v10, v9, 3 -; CHECK-NEXT: vmv.x.s a2, v11 -; CHECK-NEXT: vslidedown.vi v11, v9, 4 -; CHECK-NEXT: vmv.x.s a3, v10 -; CHECK-NEXT: vslidedown.vi v10, v9, 5 -; CHECK-NEXT: vmv.x.s a4, v11 -; CHECK-NEXT: vslidedown.vi v11, v9, 6 -; CHECK-NEXT: vmv.x.s a5, v10 -; CHECK-NEXT: vslidedown.vi v10, v9, 7 -; CHECK-NEXT: vmv.x.s a6, v11 -; CHECK-NEXT: vslidedown.vi v11, v9, 8 -; CHECK-NEXT: vmv.x.s a7, v10 -; CHECK-NEXT: vslidedown.vi v10, v9, 9 -; CHECK-NEXT: vmv.x.s t0, v11 -; CHECK-NEXT: vslidedown.vi v11, v9, 10 -; CHECK-NEXT: vmv.x.s t1, v10 -; CHECK-NEXT: vslidedown.vi v10, v9, 11 -; CHECK-NEXT: vmv.x.s t2, v11 -; CHECK-NEXT: vslidedown.vi v11, v9, 12 -; CHECK-NEXT: vmv.x.s t3, v10 -; CHECK-NEXT: vslidedown.vi v10, v9, 13 -; CHECK-NEXT: vmv.x.s t4, v11 -; CHECK-NEXT: vslidedown.vi v11, v9, 14 -; CHECK-NEXT: vslidedown.vi v9, v9, 15 -; CHECK-NEXT: vmv.x.s t5, v10 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v10, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a1 -; CHECK-NEXT: vmv.x.s a1, v9 -; CHECK-NEXT: vmseq.vx v9, v8, a2 +; CHECK-NEXT: vrgather.vi v10, v9, 1 +; CHECK-NEXT: vrgather.vi v11, v9, 0 +; CHECK-NEXT: vmseq.vv v10, v8, v10 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v11, v10 +; CHECK-NEXT: vrgather.vi v11, v9, 2 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 3 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 4 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 5 +; CHECK-NEXT: vmseq.vv v11, v8, v11 ; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a3 +; CHECK-NEXT: vrgather.vi v11, v9, 6 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 7 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 8 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi 
v11, v9, 9 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 10 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 11 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 12 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 13 +; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vrgather.vi v11, v9, 14 +; CHECK-NEXT: vrgather.vi v12, v9, 15 +; CHECK-NEXT: vmseq.vv v9, v8, v11 ; CHECK-NEXT: vmor.mm v9, v10, v9 -; CHECK-NEXT: vmseq.vx v10, v8, a4 -; CHECK-NEXT: vmor.mm v9, v9, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a5 -; CHECK-NEXT: vmor.mm v9, v9, v10 -; CHECK-NEXT: vmseq.vx v10, v8, a6 -; CHECK-NEXT: vmor.mm v9, v9, v11 -; CHECK-NEXT: vmseq.vx v11, v8, a7 -; CHECK-NEXT: vmor.mm v9, v9, v10 -; CHECK-NEXT: vmseq.vx v10, v8, t0 -; CHECK-NEXT: vmor.mm v9, v9, v11 -; CHECK-NEXT: vmseq.vx v11, v8, t1 -; CHECK-NEXT: vmor.mm v9, v9, v10 -; CHECK-NEXT: vmseq.vx v10, v8, t2 -; CHECK-NEXT: vmor.mm v9, v9, v11 -; CHECK-NEXT: vmseq.vx v11, v8, t3 -; CHECK-NEXT: vmor.mm v9, v9, v10 -; CHECK-NEXT: vmseq.vx v10, v8, t4 -; CHECK-NEXT: vmor.mm v9, v9, v11 -; CHECK-NEXT: vmseq.vx v11, v8, t5 -; CHECK-NEXT: vmor.mm v9, v9, v10 -; CHECK-NEXT: vmseq.vx v10, v8, a0 -; CHECK-NEXT: vmor.mm v9, v9, v11 -; CHECK-NEXT: vmor.mm v9, v9, v10 -; CHECK-NEXT: vmseq.vx v8, v8, a1 +; CHECK-NEXT: vmseq.vv v8, v8, v12 ; CHECK-NEXT: vmor.mm v8, v9, v8 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret @@ -505,797 +436,198 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) } define @match_nxv16i8_v32i8( %op1, <32 x i8> %op2, %mask) { -; RV32-LABEL: match_nxv16i8_v32i8: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: .cfi_offset s1, -12 -; RV32-NEXT: .cfi_offset s2, -16 -; RV32-NEXT: .cfi_offset s3, -20 -; RV32-NEXT: .cfi_offset s4, -24 -; RV32-NEXT: .cfi_offset s5, -28 -; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: .cfi_offset s7, -36 -; RV32-NEXT: .cfi_offset s8, -40 -; RV32-NEXT: .cfi_offset s9, -44 -; RV32-NEXT: .cfi_offset s10, -48 -; RV32-NEXT: .cfi_offset s11, -52 -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32-NEXT: vslidedown.vi v12, v10, 1 -; RV32-NEXT: vslidedown.vi v13, v10, 2 -; RV32-NEXT: vslidedown.vi v14, v10, 3 -; RV32-NEXT: vslidedown.vi v15, v10, 4 -; RV32-NEXT: vslidedown.vi v16, v10, 5 -; RV32-NEXT: vslidedown.vi v17, v10, 6 -; RV32-NEXT: vslidedown.vi v18, v10, 7 -; RV32-NEXT: vslidedown.vi v19, v10, 8 -; RV32-NEXT: vslidedown.vi v20, v10, 9 -; RV32-NEXT: vslidedown.vi 
v21, v10, 10 -; RV32-NEXT: vslidedown.vi v22, v10, 11 -; RV32-NEXT: vslidedown.vi v23, v10, 12 -; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV32-NEXT: vslidedown.vi v24, v10, 16 -; RV32-NEXT: vmv.x.s a1, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 17 -; RV32-NEXT: vmv.x.s a2, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 18 -; RV32-NEXT: vmv.x.s a3, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 19 -; RV32-NEXT: vmv.x.s a4, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 20 -; RV32-NEXT: vmv.x.s a5, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 21 -; RV32-NEXT: vmv.x.s a6, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 22 -; RV32-NEXT: vmv.x.s a7, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 23 -; RV32-NEXT: vmv.x.s t0, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 24 -; RV32-NEXT: vmv.x.s t1, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 25 -; RV32-NEXT: vmv.x.s t2, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 26 -; RV32-NEXT: vmv.x.s t3, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 27 -; RV32-NEXT: vmv.x.s t4, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 28 -; RV32-NEXT: vmv.x.s t5, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 29 -; RV32-NEXT: vmv.x.s t6, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 30 -; RV32-NEXT: vmv.x.s s0, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 31 -; RV32-NEXT: vmv.x.s s1, v24 -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vslidedown.vi v11, v10, 13 -; RV32-NEXT: vslidedown.vi v24, v10, 14 -; RV32-NEXT: vslidedown.vi v10, v10, 15 -; RV32-NEXT: vmv.x.s s2, v12 -; RV32-NEXT: vmv.x.s s3, v13 -; RV32-NEXT: vmv.x.s s4, v14 -; RV32-NEXT: vmv.x.s s5, v15 -; RV32-NEXT: vmv.x.s s6, v16 -; RV32-NEXT: vmv.x.s s7, v17 -; RV32-NEXT: vmv.x.s s8, v18 -; RV32-NEXT: vmv.x.s s9, v19 -; RV32-NEXT: vmv.x.s s10, v20 -; RV32-NEXT: vmv.x.s s11, v21 -; RV32-NEXT: vmv.x.s ra, v22 -; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: vmseq.vx v12, v8, a0 -; RV32-NEXT: vmv.x.s a0, v23 -; RV32-NEXT: vmseq.vx v13, v8, s2 -; RV32-NEXT: vmv.x.s s2, v11 -; RV32-NEXT: vmseq.vx v11, v8, s3 -; RV32-NEXT: vmv.x.s s3, v24 -; RV32-NEXT: vmseq.vx v14, v8, s4 -; RV32-NEXT: vmv.x.s s4, v10 -; RV32-NEXT: vmseq.vx v10, v8, s5 -; RV32-NEXT: vmor.mm v12, v12, v13 -; RV32-NEXT: vmseq.vx v13, v8, s6 -; RV32-NEXT: vmor.mm v11, v12, v11 -; RV32-NEXT: vmseq.vx v12, v8, s7 -; RV32-NEXT: vmor.mm v11, v11, v14 -; RV32-NEXT: vmseq.vx v14, v8, s8 -; RV32-NEXT: vmor.mm v10, v11, v10 -; RV32-NEXT: vmseq.vx v11, v8, s9 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, s10 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, s11 -; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v14, v8, ra -; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, a0 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, s2 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, s3 -; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v14, v8, s4 -; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, a1 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, a2 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, a3 -; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v14, v8, a4 -; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, a5 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, a6 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, a7 -; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v14, v8, t0 -; RV32-NEXT: vmor.mm v10, v10, 
v11 -; RV32-NEXT: vmseq.vx v11, v8, t1 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, t2 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, t3 -; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmseq.vx v14, v8, t4 -; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmseq.vx v11, v8, t5 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmseq.vx v13, v8, t6 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v12, v8, s0 -; RV32-NEXT: vmor.mm v10, v10, v14 -; RV32-NEXT: vmor.mm v10, v10, v11 -; RV32-NEXT: vmor.mm v10, v10, v13 -; RV32-NEXT: vmor.mm v10, v10, v12 -; RV32-NEXT: vmseq.vx v11, v8, s1 -; RV32-NEXT: vmor.mm v8, v10, v11 -; RV32-NEXT: vmand.mm v0, v8, v0 -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: .cfi_restore s1 -; RV32-NEXT: .cfi_restore s2 -; RV32-NEXT: .cfi_restore s3 -; RV32-NEXT: .cfi_restore s4 -; RV32-NEXT: .cfi_restore s5 -; RV32-NEXT: .cfi_restore s6 -; RV32-NEXT: .cfi_restore s7 -; RV32-NEXT: .cfi_restore s8 -; RV32-NEXT: .cfi_restore s9 -; RV32-NEXT: .cfi_restore s10 -; RV32-NEXT: .cfi_restore s11 -; RV32-NEXT: addi sp, sp, 64 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: match_nxv16i8_v32i8: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -112 -; RV64-NEXT: .cfi_def_cfa_offset 112 -; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: .cfi_offset s1, -24 -; RV64-NEXT: .cfi_offset s2, -32 -; RV64-NEXT: .cfi_offset s3, -40 -; RV64-NEXT: .cfi_offset s4, -48 -; RV64-NEXT: .cfi_offset s5, -56 -; RV64-NEXT: .cfi_offset s6, -64 -; RV64-NEXT: .cfi_offset s7, -72 -; RV64-NEXT: .cfi_offset s8, -80 -; RV64-NEXT: .cfi_offset s9, -88 -; RV64-NEXT: .cfi_offset s10, -96 -; RV64-NEXT: .cfi_offset s11, -104 -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a0, v10 -; RV64-NEXT: sd a0, 0(sp) # 8-byte Folded Spill -; RV64-NEXT: vslidedown.vi v12, v10, 1 -; RV64-NEXT: vslidedown.vi v13, v10, 2 -; RV64-NEXT: vslidedown.vi v14, v10, 3 -; RV64-NEXT: vslidedown.vi v15, v10, 4 -; RV64-NEXT: vslidedown.vi v16, v10, 5 -; RV64-NEXT: vslidedown.vi v17, v10, 6 -; RV64-NEXT: vslidedown.vi v18, v10, 7 -; RV64-NEXT: vslidedown.vi v19, v10, 8 -; RV64-NEXT: vslidedown.vi 
v20, v10, 9 -; RV64-NEXT: vslidedown.vi v21, v10, 10 -; RV64-NEXT: vslidedown.vi v22, v10, 11 -; RV64-NEXT: vslidedown.vi v23, v10, 12 -; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64-NEXT: vslidedown.vi v24, v10, 16 -; RV64-NEXT: vmv.x.s a1, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 17 -; RV64-NEXT: vmv.x.s a2, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 18 -; RV64-NEXT: vmv.x.s a3, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 19 -; RV64-NEXT: vmv.x.s a4, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 20 -; RV64-NEXT: vmv.x.s a5, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 21 -; RV64-NEXT: vmv.x.s a6, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 22 -; RV64-NEXT: vmv.x.s a7, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 23 -; RV64-NEXT: vmv.x.s t0, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 24 -; RV64-NEXT: vmv.x.s t1, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 25 -; RV64-NEXT: vmv.x.s t2, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 26 -; RV64-NEXT: vmv.x.s t3, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 27 -; RV64-NEXT: vmv.x.s t4, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 28 -; RV64-NEXT: vmv.x.s t5, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 29 -; RV64-NEXT: vmv.x.s t6, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 30 -; RV64-NEXT: vmv.x.s s0, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 31 -; RV64-NEXT: vmv.x.s s1, v24 -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vslidedown.vi v11, v10, 13 -; RV64-NEXT: vslidedown.vi v24, v10, 14 -; RV64-NEXT: vslidedown.vi v10, v10, 15 -; RV64-NEXT: vmv.x.s s2, v12 -; RV64-NEXT: vmv.x.s s3, v13 -; RV64-NEXT: vmv.x.s s4, v14 -; RV64-NEXT: vmv.x.s s5, v15 -; RV64-NEXT: vmv.x.s s6, v16 -; RV64-NEXT: vmv.x.s s7, v17 -; RV64-NEXT: vmv.x.s s8, v18 -; RV64-NEXT: vmv.x.s s9, v19 -; RV64-NEXT: vmv.x.s s10, v20 -; RV64-NEXT: vmv.x.s s11, v21 -; RV64-NEXT: vmv.x.s ra, v22 -; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-NEXT: ld a0, 0(sp) # 8-byte Folded Reload -; RV64-NEXT: vmseq.vx v12, v8, a0 -; RV64-NEXT: vmv.x.s a0, v23 -; RV64-NEXT: vmseq.vx v13, v8, s2 -; RV64-NEXT: vmv.x.s s2, v11 -; RV64-NEXT: vmseq.vx v11, v8, s3 -; RV64-NEXT: vmv.x.s s3, v24 -; RV64-NEXT: vmseq.vx v14, v8, s4 -; RV64-NEXT: vmv.x.s s4, v10 -; RV64-NEXT: vmseq.vx v10, v8, s5 -; RV64-NEXT: vmor.mm v12, v12, v13 -; RV64-NEXT: vmseq.vx v13, v8, s6 -; RV64-NEXT: vmor.mm v11, v12, v11 -; RV64-NEXT: vmseq.vx v12, v8, s7 -; RV64-NEXT: vmor.mm v11, v11, v14 -; RV64-NEXT: vmseq.vx v14, v8, s8 -; RV64-NEXT: vmor.mm v10, v11, v10 -; RV64-NEXT: vmseq.vx v11, v8, s9 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, s10 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, s11 -; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx v14, v8, ra -; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, a0 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, s2 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, s3 -; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx v14, v8, s4 -; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, a1 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, a2 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, a3 -; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx v14, v8, a4 -; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, a5 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, a6 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, a7 -; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx 
v14, v8, t0 -; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, t1 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, t2 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, t3 -; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmseq.vx v14, v8, t4 -; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmseq.vx v11, v8, t5 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmseq.vx v13, v8, t6 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v12, v8, s0 -; RV64-NEXT: vmor.mm v10, v10, v14 -; RV64-NEXT: vmor.mm v10, v10, v11 -; RV64-NEXT: vmor.mm v10, v10, v13 -; RV64-NEXT: vmor.mm v10, v10, v12 -; RV64-NEXT: vmseq.vx v11, v8, s1 -; RV64-NEXT: vmor.mm v8, v10, v11 -; RV64-NEXT: vmand.mm v0, v8, v0 -; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: .cfi_restore s1 -; RV64-NEXT: .cfi_restore s2 -; RV64-NEXT: .cfi_restore s3 -; RV64-NEXT: .cfi_restore s4 -; RV64-NEXT: .cfi_restore s5 -; RV64-NEXT: .cfi_restore s6 -; RV64-NEXT: .cfi_restore s7 -; RV64-NEXT: .cfi_restore s8 -; RV64-NEXT: .cfi_restore s9 -; RV64-NEXT: .cfi_restore s10 -; RV64-NEXT: .cfi_restore s11 -; RV64-NEXT: addi sp, sp, 112 -; RV64-NEXT: .cfi_def_cfa_offset 0 -; RV64-NEXT: ret +; CHECK-LABEL: match_nxv16i8_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x06, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 6 * vlenb +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs1r.v v0, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vrgather.vi v18, v10, 1 +; CHECK-NEXT: vrgather.vi v14, v10, 0 +; CHECK-NEXT: vrgather.vi v20, v10, 2 +; CHECK-NEXT: vrgather.vi v16, v10, 3 +; CHECK-NEXT: vrgather.vi v24, v10, 4 +; CHECK-NEXT: vrgather.vi v22, v10, 5 +; CHECK-NEXT: vrgather.vi v28, v10, 6 +; CHECK-NEXT: vrgather.vi v26, v10, 7 +; CHECK-NEXT: vrgather.vi v6, v10, 8 +; CHECK-NEXT: vrgather.vi v30, v10, 9 +; CHECK-NEXT: vrgather.vi v2, v10, 10 +; CHECK-NEXT: vrgather.vi v4, v10, 11 +; CHECK-NEXT: vrgather.vi v12, v10, 12 +; CHECK-NEXT: vmseq.vv v1, v8, v18 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vrgather.vi v18, v10, 13 +; CHECK-NEXT: vmseq.vv v1, v8, v14 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; 
CHECK-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmseq.vv v14, v8, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs1r.v v14, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vrgather.vi v20, v10, 14 +; CHECK-NEXT: vmseq.vv v14, v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs1r.v v14, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmseq.vv v14, v8, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs1r.v v14, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vrgather.vi v24, v10, 15 +; CHECK-NEXT: vmseq.vv v1, v8, v22 +; CHECK-NEXT: vmseq.vv v0, v8, v28 +; CHECK-NEXT: vrgather.vi v28, v10, 16 +; CHECK-NEXT: vmseq.vv v23, v8, v26 +; CHECK-NEXT: vmseq.vv v26, v8, v6 +; CHECK-NEXT: vrgather.vi v6, v10, 17 +; CHECK-NEXT: vmseq.vv v27, v8, v30 +; CHECK-NEXT: vmseq.vv v30, v8, v2 +; CHECK-NEXT: vrgather.vi v2, v10, 18 +; CHECK-NEXT: vmseq.vv v31, v8, v4 +; CHECK-NEXT: vmseq.vv v5, v8, v12 +; CHECK-NEXT: vrgather.vi v12, v10, 19 +; CHECK-NEXT: vmseq.vv v4, v8, v18 +; CHECK-NEXT: vmseq.vv v22, v8, v20 +; CHECK-NEXT: vrgather.vi v20, v10, 20 +; CHECK-NEXT: vmseq.vv v19, v8, v24 +; CHECK-NEXT: vmseq.vv v24, v8, v28 +; CHECK-NEXT: vrgather.vi v28, v10, 21 +; CHECK-NEXT: vmseq.vv v25, v8, v6 +; CHECK-NEXT: vmseq.vv v7, v8, v2 +; CHECK-NEXT: vrgather.vi v2, v10, 22 +; CHECK-NEXT: vmseq.vv v6, v8, v12 +; CHECK-NEXT: vmseq.vv v18, v8, v20 +; CHECK-NEXT: vrgather.vi v20, v10, 23 +; CHECK-NEXT: vmseq.vv v17, v8, v28 +; CHECK-NEXT: vmseq.vv v28, v8, v2 +; CHECK-NEXT: vrgather.vi v2, v10, 24 +; CHECK-NEXT: vmseq.vv v29, v8, v20 +; CHECK-NEXT: vmseq.vv v20, v8, v2 +; CHECK-NEXT: vrgather.vi v2, v10, 25 +; CHECK-NEXT: vmseq.vv v21, v8, v2 +; CHECK-NEXT: vrgather.vi v2, v10, 26 +; CHECK-NEXT: vmseq.vv v16, v8, v2 +; CHECK-NEXT: vrgather.vi v2, v10, 27 +; CHECK-NEXT: vmseq.vv v15, v8, v2 +; CHECK-NEXT: vrgather.vi v2, v10, 28 +; CHECK-NEXT: vmseq.vv v14, v8, v2 +; CHECK-NEXT: vrgather.vi v2, v10, 29 +; CHECK-NEXT: vmseq.vv v13, v8, v2 +; CHECK-NEXT: vrgather.vi v2, v10, 30 +; CHECK-NEXT: vmseq.vv v12, v8, v2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v3, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v2, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmor.mm v3, v2, v3 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v2, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmor.mm v3, v3, v2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v2, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmor.mm v3, v3, v2 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl1r.v v2, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmor.mm v3, v3, v2 +; CHECK-NEXT: vmor.mm v3, v3, v1 +; CHECK-NEXT: vmor.mm v3, v3, v0 +; CHECK-NEXT: vmor.mm v23, v3, v23 +; CHECK-NEXT: vmor.mm v23, v23, v26 +; CHECK-NEXT: vmor.mm v23, v23, v27 +; CHECK-NEXT: vmor.mm v23, v23, v30 +; CHECK-NEXT: vmor.mm v23, v23, v31 +; CHECK-NEXT: vmor.mm v23, v23, v5 +; CHECK-NEXT: vmor.mm v23, v23, v4 +; CHECK-NEXT: vmor.mm v22, v23, v22 +; CHECK-NEXT: vmor.mm v19, v22, v19 +; 
CHECK-NEXT: vmor.mm v19, v19, v24 +; CHECK-NEXT: vmor.mm v19, v19, v25 +; CHECK-NEXT: vmor.mm v19, v19, v7 +; CHECK-NEXT: vmor.mm v19, v19, v6 +; CHECK-NEXT: vmor.mm v18, v19, v18 +; CHECK-NEXT: vmor.mm v17, v18, v17 +; CHECK-NEXT: vmor.mm v17, v17, v28 +; CHECK-NEXT: vmor.mm v17, v17, v29 +; CHECK-NEXT: vmor.mm v17, v17, v20 +; CHECK-NEXT: vmor.mm v17, v17, v21 +; CHECK-NEXT: vmor.mm v16, v17, v16 +; CHECK-NEXT: vmor.mm v15, v16, v15 +; CHECK-NEXT: vmor.mm v14, v15, v14 +; CHECK-NEXT: vrgather.vi v16, v10, 31 +; CHECK-NEXT: vmor.mm v10, v14, v13 +; CHECK-NEXT: vmor.mm v10, v10, v12 +; CHECK-NEXT: vmseq.vv v11, v8, v16 +; CHECK-NEXT: vmor.mm v8, v10, v11 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret %r = tail call @llvm.experimental.vector.match( %op1, <32 x i8> %op2, %mask) ret %r } -define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) { -; RV32-LABEL: match_v16i8_v32i8: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -64 -; RV32-NEXT: .cfi_def_cfa_offset 64 -; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: .cfi_offset s1, -12 -; RV32-NEXT: .cfi_offset s2, -16 -; RV32-NEXT: .cfi_offset s3, -20 -; RV32-NEXT: .cfi_offset s4, -24 -; RV32-NEXT: .cfi_offset s5, -28 -; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: .cfi_offset s7, -36 -; RV32-NEXT: .cfi_offset s8, -40 -; RV32-NEXT: .cfi_offset s9, -44 -; RV32-NEXT: .cfi_offset s10, -48 -; RV32-NEXT: .cfi_offset s11, -52 -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: vslidedown.vi v9, v10, 1 -; RV32-NEXT: vslidedown.vi v12, v10, 2 -; RV32-NEXT: vslidedown.vi v13, v10, 3 -; RV32-NEXT: vslidedown.vi v14, v10, 4 -; RV32-NEXT: vslidedown.vi v15, v10, 5 -; RV32-NEXT: vslidedown.vi v16, v10, 6 -; RV32-NEXT: vslidedown.vi v17, v10, 7 -; RV32-NEXT: vslidedown.vi v18, v10, 8 -; RV32-NEXT: vslidedown.vi v19, v10, 9 -; RV32-NEXT: vslidedown.vi v20, v10, 10 -; RV32-NEXT: vslidedown.vi v21, v10, 11 -; RV32-NEXT: vslidedown.vi v22, v10, 12 -; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV32-NEXT: vslidedown.vi v24, v10, 16 -; RV32-NEXT: vmv.x.s a1, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 17 -; RV32-NEXT: vmv.x.s a2, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 18 -; RV32-NEXT: vmv.x.s a3, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 19 -; RV32-NEXT: vmv.x.s a4, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 20 -; RV32-NEXT: vmv.x.s a5, v24 
-; RV32-NEXT: vslidedown.vi v24, v10, 21 -; RV32-NEXT: vmv.x.s a6, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 22 -; RV32-NEXT: vmv.x.s a7, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 23 -; RV32-NEXT: vmv.x.s t0, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 24 -; RV32-NEXT: vmv.x.s t1, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 25 -; RV32-NEXT: vmv.x.s t2, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 26 -; RV32-NEXT: vmv.x.s t3, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 27 -; RV32-NEXT: vmv.x.s t4, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 28 -; RV32-NEXT: vmv.x.s t5, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 29 -; RV32-NEXT: vmv.x.s t6, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 30 -; RV32-NEXT: vmv.x.s s0, v24 -; RV32-NEXT: vslidedown.vi v24, v10, 31 -; RV32-NEXT: vmv.x.s s1, v24 -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vslidedown.vi v11, v10, 13 -; RV32-NEXT: vslidedown.vi v23, v10, 14 -; RV32-NEXT: vslidedown.vi v10, v10, 15 -; RV32-NEXT: vmv.x.s s2, v9 -; RV32-NEXT: vmv.x.s s3, v12 -; RV32-NEXT: vmv.x.s s4, v13 -; RV32-NEXT: vmv.x.s s5, v14 -; RV32-NEXT: vmv.x.s s6, v15 -; RV32-NEXT: vmv.x.s s7, v16 -; RV32-NEXT: vmv.x.s s8, v17 -; RV32-NEXT: vmv.x.s s9, v18 -; RV32-NEXT: vmv.x.s s10, v19 -; RV32-NEXT: vmv.x.s s11, v20 -; RV32-NEXT: vmv.x.s ra, v21 -; RV32-NEXT: vmseq.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v22 -; RV32-NEXT: vmseq.vx v12, v8, s2 -; RV32-NEXT: vmv.x.s s2, v11 -; RV32-NEXT: vmseq.vx v11, v8, s3 -; RV32-NEXT: vmv.x.s s3, v23 -; RV32-NEXT: vmseq.vx v13, v8, s4 -; RV32-NEXT: vmv.x.s s4, v10 -; RV32-NEXT: vmseq.vx v10, v8, s5 -; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, s6 -; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, s7 -; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, s8 -; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, s9 -; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, s10 -; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, s11 -; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, ra -; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, a0 -; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, s2 -; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, s3 -; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, s4 -; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, a1 -; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, a2 -; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, a3 -; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, a4 -; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, a5 -; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, a6 -; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, a7 -; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, t0 -; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, t1 -; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, t2 -; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, t3 -; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmseq.vx v13, v8, t4 -; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmseq.vx v10, v8, t5 -; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmseq.vx v12, v8, t6 -; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v11, v8, s0 -; RV32-NEXT: vmor.mm v9, v9, v13 -; RV32-NEXT: vmor.mm v9, v9, v10 -; RV32-NEXT: vmor.mm v9, v9, v12 -; RV32-NEXT: vmor.mm v9, v9, v11 -; RV32-NEXT: vmseq.vx v8, v8, s1 -; 
RV32-NEXT: vmor.mm v8, v9, v8 -; RV32-NEXT: vmand.mm v0, v8, v0 -; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: .cfi_restore s1 -; RV32-NEXT: .cfi_restore s2 -; RV32-NEXT: .cfi_restore s3 -; RV32-NEXT: .cfi_restore s4 -; RV32-NEXT: .cfi_restore s5 -; RV32-NEXT: .cfi_restore s6 -; RV32-NEXT: .cfi_restore s7 -; RV32-NEXT: .cfi_restore s8 -; RV32-NEXT: .cfi_restore s9 -; RV32-NEXT: .cfi_restore s10 -; RV32-NEXT: .cfi_restore s11 -; RV32-NEXT: addi sp, sp, 64 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: match_v16i8_v32i8: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -112 -; RV64-NEXT: .cfi_def_cfa_offset 112 -; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: .cfi_offset s1, -24 -; RV64-NEXT: .cfi_offset s2, -32 -; RV64-NEXT: .cfi_offset s3, -40 -; RV64-NEXT: .cfi_offset s4, -48 -; RV64-NEXT: .cfi_offset s5, -56 -; RV64-NEXT: .cfi_offset s6, -64 -; RV64-NEXT: .cfi_offset s7, -72 -; RV64-NEXT: .cfi_offset s8, -80 -; RV64-NEXT: .cfi_offset s9, -88 -; RV64-NEXT: .cfi_offset s10, -96 -; RV64-NEXT: .cfi_offset s11, -104 -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a0, v10 -; RV64-NEXT: vslidedown.vi v9, v10, 1 -; RV64-NEXT: vslidedown.vi v12, v10, 2 -; RV64-NEXT: vslidedown.vi v13, v10, 3 -; RV64-NEXT: vslidedown.vi v14, v10, 4 -; RV64-NEXT: vslidedown.vi v15, v10, 5 -; RV64-NEXT: vslidedown.vi v16, v10, 6 -; RV64-NEXT: vslidedown.vi v17, v10, 7 -; RV64-NEXT: vslidedown.vi v18, v10, 8 -; RV64-NEXT: vslidedown.vi v19, v10, 9 -; RV64-NEXT: vslidedown.vi v20, v10, 10 -; RV64-NEXT: vslidedown.vi v21, v10, 11 -; RV64-NEXT: vslidedown.vi v22, v10, 12 -; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64-NEXT: vslidedown.vi v24, v10, 16 -; RV64-NEXT: vmv.x.s a1, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 17 -; RV64-NEXT: vmv.x.s a2, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 18 -; RV64-NEXT: vmv.x.s a3, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 19 -; RV64-NEXT: vmv.x.s a4, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 20 -; RV64-NEXT: vmv.x.s a5, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 21 -; RV64-NEXT: vmv.x.s a6, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 22 -; RV64-NEXT: vmv.x.s a7, v24 -; RV64-NEXT: 
vslidedown.vi v24, v10, 23 -; RV64-NEXT: vmv.x.s t0, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 24 -; RV64-NEXT: vmv.x.s t1, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 25 -; RV64-NEXT: vmv.x.s t2, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 26 -; RV64-NEXT: vmv.x.s t3, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 27 -; RV64-NEXT: vmv.x.s t4, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 28 -; RV64-NEXT: vmv.x.s t5, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 29 -; RV64-NEXT: vmv.x.s t6, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 30 -; RV64-NEXT: vmv.x.s s0, v24 -; RV64-NEXT: vslidedown.vi v24, v10, 31 -; RV64-NEXT: vmv.x.s s1, v24 -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vslidedown.vi v11, v10, 13 -; RV64-NEXT: vslidedown.vi v23, v10, 14 -; RV64-NEXT: vslidedown.vi v10, v10, 15 -; RV64-NEXT: vmv.x.s s2, v9 -; RV64-NEXT: vmv.x.s s3, v12 -; RV64-NEXT: vmv.x.s s4, v13 -; RV64-NEXT: vmv.x.s s5, v14 -; RV64-NEXT: vmv.x.s s6, v15 -; RV64-NEXT: vmv.x.s s7, v16 -; RV64-NEXT: vmv.x.s s8, v17 -; RV64-NEXT: vmv.x.s s9, v18 -; RV64-NEXT: vmv.x.s s10, v19 -; RV64-NEXT: vmv.x.s s11, v20 -; RV64-NEXT: vmv.x.s ra, v21 -; RV64-NEXT: vmseq.vx v9, v8, a0 -; RV64-NEXT: vmv.x.s a0, v22 -; RV64-NEXT: vmseq.vx v12, v8, s2 -; RV64-NEXT: vmv.x.s s2, v11 -; RV64-NEXT: vmseq.vx v11, v8, s3 -; RV64-NEXT: vmv.x.s s3, v23 -; RV64-NEXT: vmseq.vx v13, v8, s4 -; RV64-NEXT: vmv.x.s s4, v10 -; RV64-NEXT: vmseq.vx v10, v8, s5 -; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, s6 -; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, s7 -; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, s8 -; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, s9 -; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, s10 -; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, s11 -; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, ra -; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, a0 -; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, s2 -; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, s3 -; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, s4 -; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, a1 -; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, a2 -; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, a3 -; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, a4 -; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, a5 -; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, a6 -; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, a7 -; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, t0 -; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, t1 -; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, t2 -; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, t3 -; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmseq.vx v13, v8, t4 -; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmseq.vx v10, v8, t5 -; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmseq.vx v12, v8, t6 -; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v11, v8, s0 -; RV64-NEXT: vmor.mm v9, v9, v13 -; RV64-NEXT: vmor.mm v9, v9, v10 -; RV64-NEXT: vmor.mm v9, v9, v12 -; RV64-NEXT: vmor.mm v9, v9, v11 -; RV64-NEXT: vmseq.vx v8, v8, s1 -; RV64-NEXT: vmor.mm v8, v9, v8 -; RV64-NEXT: vmand.mm v0, v8, v0 -; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded 
Reload -; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: .cfi_restore s1 -; RV64-NEXT: .cfi_restore s2 -; RV64-NEXT: .cfi_restore s3 -; RV64-NEXT: .cfi_restore s4 -; RV64-NEXT: .cfi_restore s5 -; RV64-NEXT: .cfi_restore s6 -; RV64-NEXT: .cfi_restore s7 -; RV64-NEXT: .cfi_restore s8 -; RV64-NEXT: .cfi_restore s9 -; RV64-NEXT: .cfi_restore s10 -; RV64-NEXT: .cfi_restore s11 -; RV64-NEXT: addi sp, sp, 112 -; RV64-NEXT: .cfi_def_cfa_offset 0 -; RV64-NEXT: ret - %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) - ret <16 x i1> %r -} - define @match_nxv4xi32_v4i32( %op1, <4 x i32> %op2, %mask) { ; CHECK-LABEL: match_nxv4xi32_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v11, v10, 1 -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vslidedown.vi v10, v10, 3 -; CHECK-NEXT: vmv.x.s a1, v11 -; CHECK-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; CHECK-NEXT: vmseq.vx v11, v8, a0 -; CHECK-NEXT: vmv.x.s a0, v12 -; CHECK-NEXT: vmseq.vx v12, v8, a1 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: vmseq.vx v10, v8, a0 -; CHECK-NEXT: vmor.mm v11, v11, v12 -; CHECK-NEXT: vmor.mm v10, v11, v10 -; CHECK-NEXT: vmseq.vx v11, v8, a1 +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vrgather.vi v12, v10, 1 +; CHECK-NEXT: vmseq.vv v14, v8, v12 +; CHECK-NEXT: vrgather.vi v12, v10, 0 +; CHECK-NEXT: vmseq.vv v15, v8, v12 +; CHECK-NEXT: vmor.mm v12, v15, v14 +; CHECK-NEXT: vrgather.vi v14, v10, 2 +; CHECK-NEXT: vmseq.vv v13, v8, v14 +; CHECK-NEXT: vrgather.vi v14, v10, 3 +; CHECK-NEXT: vmor.mm v10, v12, v13 +; CHECK-NEXT: vmseq.vv v11, v8, v14 ; CHECK-NEXT: vmor.mm v8, v10, v11 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret @@ -1304,48 +636,16 @@ define @match_nxv4xi32_v4i32( %op1, <4 x i32 } define @match_nxv2xi64_v2i64( %op1, <2 x i64> %op2, %mask) { -; RV32-LABEL: match_nxv2xi64_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vmv.x.s a0, v10 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vslidedown.vi v11, v10, 1 -; RV32-NEXT: addi a2, sp, 8 -; RV32-NEXT: vsrl.vx v10, v10, a1 -; RV32-NEXT: vmv.x.s a3, v11 -; RV32-NEXT: vsrl.vx v11, v11, a1 -; RV32-NEXT: vmv.x.s a1, v10 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: vmv.x.s a0, v11 -; RV32-NEXT: sw a3, 0(sp) -; RV32-NEXT: sw a0, 4(sp) -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV32-NEXT: vlse64.v v10, (a2), zero -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmseq.vv v14, v8, v10 -; RV32-NEXT: vmseq.vv v10, v8, v12 -; RV32-NEXT: vmor.mm v8, v14, v10 -; RV32-NEXT: vmand.mm v0, v8, v0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: match_nxv2xi64_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; 
RV64-NEXT: vmv.x.s a0, v10 -; RV64-NEXT: vslidedown.vi v10, v10, 1 -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV64-NEXT: vmseq.vx v11, v8, a0 -; RV64-NEXT: vmv.x.s a0, v10 -; RV64-NEXT: vmseq.vx v10, v8, a0 -; RV64-NEXT: vmor.mm v8, v11, v10 -; RV64-NEXT: vmand.mm v0, v8, v0 -; RV64-NEXT: ret +; CHECK-LABEL: match_nxv2xi64_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgather.vi v12, v10, 1 +; CHECK-NEXT: vmseq.vv v14, v8, v12 +; CHECK-NEXT: vrgather.vi v12, v10, 0 +; CHECK-NEXT: vmseq.vv v10, v8, v12 +; CHECK-NEXT: vmor.mm v8, v10, v14 +; CHECK-NEXT: vmand.mm v0, v8, v0 +; CHECK-NEXT: ret %r = tail call @llvm.experimental.vector.match( %op1, <2 x i64> %op2, %mask) ret %r } @@ -1385,3 +685,6 @@ define <2 x i1> @match_v2xi64_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %ma %r = tail call <2 x i1> @llvm.experimental.vector.match(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) ret <2 x i1> %r } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} From 678655fea87849310bc49e830acbc2c52dbff936 Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Wed, 27 Nov 2024 10:26:25 -0300 Subject: [PATCH 02/10] We don't handle vmv_x_s correctly yet Signed-off-by: Mikhail R. Gadelha --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 65aaa595a0a4e..90bf5b8f4e47e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3493,7 +3493,7 @@ static std::optional isSimpleVIDSequence(SDValue Op, static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { - if (SplatVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT && (SplatVal.getOpcode() != RISCVISD::VMV_X_S)) + if (SplatVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); SDValue Vec = SplatVal.getOperand(0); // Don't perform this optimization for i1 vectors. From 61ed7db2bfe6f14f79123fbf6c647fda3165f5ab Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Wed, 27 Nov 2024 10:27:53 -0300 Subject: [PATCH 03/10] code style Signed-off-by: Mikhail R. Gadelha --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 90bf5b8f4e47e..f0d8b8d94b6bf 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3524,7 +3524,6 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL, return convertFromScalableVector(VT, Gather, DAG, Subtarget); } - /// Try and optimize BUILD_VECTORs with "dominant values" - these are values /// which constitute a large proportion of the elements. In such cases we can /// splat a vector with the dominant element and make up the shortfall with From 7afbd3a3413cea5c11c970b99aa524f054153f41 Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Wed, 27 Nov 2024 10:53:38 -0300 Subject: [PATCH 04/10] Comments Signed-off-by: Mikhail R. 
Gadelha --- llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll index 295562f7f1beb..6737e2611f9cc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll +++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll @@ -377,6 +377,7 @@ define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) { ret <8 x i1> %r } +; Cases where op2 has more elements than op1. define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) { ; CHECK-LABEL: match_v8i8_v16i8: @@ -685,6 +686,3 @@ define <2 x i1> @match_v2xi64_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %ma %r = tail call <2 x i1> @llvm.experimental.vector.match(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) ret <2 x i1> %r } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; RV32: {{.*}} -; RV64: {{.*}} From 1cfa8b4c3c2fbf154087da78393760fc03bae8e5 Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Thu, 28 Nov 2024 18:59:04 -0300 Subject: [PATCH 05/10] Address comment Signed-off-by: Mikhail R. Gadelha --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 + .../RISCV/rvv/intrinsic-vector-match.ll | 663 ++++++++++++------ 2 files changed, 449 insertions(+), 218 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f0d8b8d94b6bf..f3fc7536a9c82 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3505,6 +3505,10 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL, if (Idx.getValueType() != Subtarget.getXLenVT()) return SDValue(); + if (Vec.getValueSizeInBits().getKnownMinValue() > + VT.getSizeInBits().getKnownMinValue()) + return SDValue(); + MVT ContainerVT = VT; if (VT.isFixedLengthVector()) ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll index 6737e2611f9cc..5f01434c1fb5a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll +++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll @@ -382,53 +382,69 @@ define <8 x i1> @match_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) { define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) { ; CHECK-LABEL: match_v8i8_v16i8: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv.x.s a0, v9 +; CHECK-NEXT: vslidedown.vi v10, v9, 1 +; CHECK-NEXT: vslidedown.vi v11, v9, 2 +; CHECK-NEXT: vmv.x.s a1, v10 +; CHECK-NEXT: vslidedown.vi v10, v9, 3 +; CHECK-NEXT: vmv.x.s a2, v11 +; CHECK-NEXT: vslidedown.vi v11, v9, 4 +; CHECK-NEXT: vmv.x.s a3, v10 +; CHECK-NEXT: vslidedown.vi v10, v9, 5 +; CHECK-NEXT: vmv.x.s a4, v11 +; CHECK-NEXT: vslidedown.vi v11, v9, 6 +; CHECK-NEXT: vmv.x.s a5, v10 +; CHECK-NEXT: vslidedown.vi v10, v9, 7 +; CHECK-NEXT: vmv.x.s a6, v11 +; CHECK-NEXT: vslidedown.vi v11, v9, 8 +; CHECK-NEXT: vmv.x.s a7, v10 +; CHECK-NEXT: vslidedown.vi v10, v9, 9 +; CHECK-NEXT: vmv.x.s t0, v11 +; CHECK-NEXT: vslidedown.vi v11, v9, 10 +; CHECK-NEXT: vmv.x.s t1, v10 +; CHECK-NEXT: vslidedown.vi v10, v9, 11 +; CHECK-NEXT: vmv.x.s t2, v11 +; CHECK-NEXT: vslidedown.vi v11, v9, 12 +; CHECK-NEXT: vmv.x.s t3, v10 +; CHECK-NEXT: vslidedown.vi v10, v9, 13 +; CHECK-NEXT: 
vmv.x.s t4, v11 +; CHECK-NEXT: vslidedown.vi v11, v9, 14 +; CHECK-NEXT: vslidedown.vi v9, v9, 15 +; CHECK-NEXT: vmv.x.s t5, v10 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vrgather.vi v10, v9, 1 -; CHECK-NEXT: vrgather.vi v11, v9, 0 -; CHECK-NEXT: vmseq.vv v10, v8, v10 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v11, v10 -; CHECK-NEXT: vrgather.vi v11, v9, 2 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 3 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 4 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 5 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 6 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 7 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 8 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 9 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 10 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 11 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 12 -; CHECK-NEXT: vmseq.vv v11, v8, v11 -; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 13 -; CHECK-NEXT: vmseq.vv v11, v8, v11 +; CHECK-NEXT: vmseq.vx v10, v8, a0 +; CHECK-NEXT: vmv.x.s a0, v11 +; CHECK-NEXT: vmseq.vx v11, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v9 +; CHECK-NEXT: vmseq.vx v9, v8, a2 ; CHECK-NEXT: vmor.mm v10, v10, v11 -; CHECK-NEXT: vrgather.vi v11, v9, 14 -; CHECK-NEXT: vrgather.vi v12, v9, 15 -; CHECK-NEXT: vmseq.vv v9, v8, v11 +; CHECK-NEXT: vmseq.vx v11, v8, a3 ; CHECK-NEXT: vmor.mm v9, v10, v9 -; CHECK-NEXT: vmseq.vv v8, v8, v12 +; CHECK-NEXT: vmseq.vx v10, v8, a4 +; CHECK-NEXT: vmor.mm v9, v9, v11 +; CHECK-NEXT: vmseq.vx v11, v8, a5 +; CHECK-NEXT: vmor.mm v9, v9, v10 +; CHECK-NEXT: vmseq.vx v10, v8, a6 +; CHECK-NEXT: vmor.mm v9, v9, v11 +; CHECK-NEXT: vmseq.vx v11, v8, a7 +; CHECK-NEXT: vmor.mm v9, v9, v10 +; CHECK-NEXT: vmseq.vx v10, v8, t0 +; CHECK-NEXT: vmor.mm v9, v9, v11 +; CHECK-NEXT: vmseq.vx v11, v8, t1 +; CHECK-NEXT: vmor.mm v9, v9, v10 +; CHECK-NEXT: vmseq.vx v10, v8, t2 +; CHECK-NEXT: vmor.mm v9, v9, v11 +; CHECK-NEXT: vmseq.vx v11, v8, t3 +; CHECK-NEXT: vmor.mm v9, v9, v10 +; CHECK-NEXT: vmseq.vx v10, v8, t4 +; CHECK-NEXT: vmor.mm v9, v9, v11 +; CHECK-NEXT: vmseq.vx v11, v8, t5 +; CHECK-NEXT: vmor.mm v9, v9, v10 +; CHECK-NEXT: vmseq.vx v10, v8, a0 +; CHECK-NEXT: vmor.mm v9, v9, v11 +; CHECK-NEXT: vmor.mm v9, v9, v10 +; CHECK-NEXT: vmseq.vx v8, v8, a1 ; CHECK-NEXT: vmor.mm v8, v9, v8 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret @@ -437,180 +453,391 @@ define <8 x i1> @match_v8i8_v16i8(<8 x i8> %op1, <16 x i8> %op2, <8 x i1> %mask) } define @match_nxv16i8_v32i8( %op1, <32 x i8> %op2, %mask) { -; CHECK-LABEL: match_nxv16i8_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 
0x11, 0x10, 0x22, 0x11, 0x06, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 6 * vlenb -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vrgather.vi v18, v10, 1 -; CHECK-NEXT: vrgather.vi v14, v10, 0 -; CHECK-NEXT: vrgather.vi v20, v10, 2 -; CHECK-NEXT: vrgather.vi v16, v10, 3 -; CHECK-NEXT: vrgather.vi v24, v10, 4 -; CHECK-NEXT: vrgather.vi v22, v10, 5 -; CHECK-NEXT: vrgather.vi v28, v10, 6 -; CHECK-NEXT: vrgather.vi v26, v10, 7 -; CHECK-NEXT: vrgather.vi v6, v10, 8 -; CHECK-NEXT: vrgather.vi v30, v10, 9 -; CHECK-NEXT: vrgather.vi v2, v10, 10 -; CHECK-NEXT: vrgather.vi v4, v10, 11 -; CHECK-NEXT: vrgather.vi v12, v10, 12 -; CHECK-NEXT: vmseq.vv v1, v8, v18 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vrgather.vi v18, v10, 13 -; CHECK-NEXT: vmseq.vv v1, v8, v14 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmseq.vv v14, v8, v20 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v14, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vrgather.vi v20, v10, 14 -; CHECK-NEXT: vmseq.vv v14, v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v14, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmseq.vv v14, v8, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs1r.v v14, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vrgather.vi v24, v10, 15 -; CHECK-NEXT: vmseq.vv v1, v8, v22 -; CHECK-NEXT: vmseq.vv v0, v8, v28 -; CHECK-NEXT: vrgather.vi v28, v10, 16 -; CHECK-NEXT: vmseq.vv v23, v8, v26 -; CHECK-NEXT: vmseq.vv v26, v8, v6 -; CHECK-NEXT: vrgather.vi v6, v10, 17 -; CHECK-NEXT: vmseq.vv v27, v8, v30 -; CHECK-NEXT: vmseq.vv v30, v8, v2 -; CHECK-NEXT: vrgather.vi v2, v10, 18 -; CHECK-NEXT: vmseq.vv v31, v8, v4 -; CHECK-NEXT: vmseq.vv v5, v8, v12 -; CHECK-NEXT: vrgather.vi v12, v10, 19 -; CHECK-NEXT: vmseq.vv v4, v8, v18 -; CHECK-NEXT: vmseq.vv v22, v8, v20 -; CHECK-NEXT: vrgather.vi v20, v10, 20 -; CHECK-NEXT: vmseq.vv v19, v8, v24 -; CHECK-NEXT: vmseq.vv v24, v8, v28 -; CHECK-NEXT: vrgather.vi v28, v10, 21 -; CHECK-NEXT: vmseq.vv v25, v8, v6 -; CHECK-NEXT: vmseq.vv v7, v8, v2 -; CHECK-NEXT: vrgather.vi v2, v10, 22 -; CHECK-NEXT: vmseq.vv v6, v8, v12 -; CHECK-NEXT: vmseq.vv v18, v8, v20 -; CHECK-NEXT: vrgather.vi v20, v10, 23 -; CHECK-NEXT: vmseq.vv v17, v8, v28 -; CHECK-NEXT: vmseq.vv v28, v8, v2 -; CHECK-NEXT: vrgather.vi v2, v10, 24 -; CHECK-NEXT: vmseq.vv v29, v8, v20 -; CHECK-NEXT: vmseq.vv v20, v8, v2 -; CHECK-NEXT: vrgather.vi v2, v10, 25 -; CHECK-NEXT: vmseq.vv v21, v8, v2 -; CHECK-NEXT: vrgather.vi v2, v10, 26 -; CHECK-NEXT: vmseq.vv v16, v8, v2 -; CHECK-NEXT: vrgather.vi v2, v10, 27 -; CHECK-NEXT: vmseq.vv v15, v8, v2 -; CHECK-NEXT: vrgather.vi v2, v10, 28 -; CHECK-NEXT: vmseq.vv v14, v8, v2 -; CHECK-NEXT: vrgather.vi v2, v10, 29 -; CHECK-NEXT: vmseq.vv v13, v8, v2 -; CHECK-NEXT: vrgather.vi v2, v10, 30 -; CHECK-NEXT: vmseq.vv v12, v8, v2 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, 
a0, 2 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl1r.v v3, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl1r.v v2, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmor.mm v3, v2, v3 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl1r.v v2, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmor.mm v3, v3, v2 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl1r.v v2, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmor.mm v3, v3, v2 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl1r.v v2, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmor.mm v3, v3, v2 -; CHECK-NEXT: vmor.mm v3, v3, v1 -; CHECK-NEXT: vmor.mm v3, v3, v0 -; CHECK-NEXT: vmor.mm v23, v3, v23 -; CHECK-NEXT: vmor.mm v23, v23, v26 -; CHECK-NEXT: vmor.mm v23, v23, v27 -; CHECK-NEXT: vmor.mm v23, v23, v30 -; CHECK-NEXT: vmor.mm v23, v23, v31 -; CHECK-NEXT: vmor.mm v23, v23, v5 -; CHECK-NEXT: vmor.mm v23, v23, v4 -; CHECK-NEXT: vmor.mm v22, v23, v22 -; CHECK-NEXT: vmor.mm v19, v22, v19 -; CHECK-NEXT: vmor.mm v19, v19, v24 -; CHECK-NEXT: vmor.mm v19, v19, v25 -; CHECK-NEXT: vmor.mm v19, v19, v7 -; CHECK-NEXT: vmor.mm v19, v19, v6 -; CHECK-NEXT: vmor.mm v18, v19, v18 -; CHECK-NEXT: vmor.mm v17, v18, v17 -; CHECK-NEXT: vmor.mm v17, v17, v28 -; CHECK-NEXT: vmor.mm v17, v17, v29 -; CHECK-NEXT: vmor.mm v17, v17, v20 -; CHECK-NEXT: vmor.mm v17, v17, v21 -; CHECK-NEXT: vmor.mm v16, v17, v16 -; CHECK-NEXT: vmor.mm v15, v16, v15 -; CHECK-NEXT: vmor.mm v14, v15, v14 -; CHECK-NEXT: vrgather.vi v16, v10, 31 -; CHECK-NEXT: vmor.mm v10, v14, v13 -; CHECK-NEXT: vmor.mm v10, v10, v12 -; CHECK-NEXT: vmseq.vv v11, v8, v16 -; CHECK-NEXT: vmor.mm v8, v10, v11 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmand.mm v0, v8, v9 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: ret +; RV32-LABEL: match_nxv16i8_v32i8: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: .cfi_def_cfa_offset 64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: .cfi_offset s1, -12 +; RV32-NEXT: .cfi_offset s2, -16 +; RV32-NEXT: .cfi_offset s3, -20 +; RV32-NEXT: .cfi_offset s4, -24 +; RV32-NEXT: .cfi_offset s5, -28 +; RV32-NEXT: .cfi_offset s6, -32 +; 
RV32-NEXT: .cfi_offset s7, -36 +; RV32-NEXT: .cfi_offset s8, -40 +; RV32-NEXT: .cfi_offset s9, -44 +; RV32-NEXT: .cfi_offset s10, -48 +; RV32-NEXT: .cfi_offset s11, -52 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: sw a0, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: vslidedown.vi v12, v10, 1 +; RV32-NEXT: vslidedown.vi v13, v10, 2 +; RV32-NEXT: vslidedown.vi v14, v10, 3 +; RV32-NEXT: vslidedown.vi v15, v10, 4 +; RV32-NEXT: vslidedown.vi v16, v10, 5 +; RV32-NEXT: vslidedown.vi v17, v10, 6 +; RV32-NEXT: vslidedown.vi v18, v10, 7 +; RV32-NEXT: vslidedown.vi v19, v10, 8 +; RV32-NEXT: vslidedown.vi v20, v10, 9 +; RV32-NEXT: vslidedown.vi v21, v10, 10 +; RV32-NEXT: vslidedown.vi v22, v10, 11 +; RV32-NEXT: vslidedown.vi v23, v10, 12 +; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV32-NEXT: vslidedown.vi v24, v10, 16 +; RV32-NEXT: vmv.x.s a1, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 17 +; RV32-NEXT: vmv.x.s a2, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 18 +; RV32-NEXT: vmv.x.s a3, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 19 +; RV32-NEXT: vmv.x.s a4, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 20 +; RV32-NEXT: vmv.x.s a5, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 21 +; RV32-NEXT: vmv.x.s a6, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 22 +; RV32-NEXT: vmv.x.s a7, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 23 +; RV32-NEXT: vmv.x.s t0, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 24 +; RV32-NEXT: vmv.x.s t1, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 25 +; RV32-NEXT: vmv.x.s t2, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 26 +; RV32-NEXT: vmv.x.s t3, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 27 +; RV32-NEXT: vmv.x.s t4, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 28 +; RV32-NEXT: vmv.x.s t5, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 29 +; RV32-NEXT: vmv.x.s t6, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 30 +; RV32-NEXT: vmv.x.s s0, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 31 +; RV32-NEXT: vmv.x.s s1, v24 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vslidedown.vi v11, v10, 13 +; RV32-NEXT: vslidedown.vi v24, v10, 14 +; RV32-NEXT: vslidedown.vi v10, v10, 15 +; RV32-NEXT: vmv.x.s s2, v12 +; RV32-NEXT: vmv.x.s s3, v13 +; RV32-NEXT: vmv.x.s s4, v14 +; RV32-NEXT: vmv.x.s s5, v15 +; RV32-NEXT: vmv.x.s s6, v16 +; RV32-NEXT: vmv.x.s s7, v17 +; RV32-NEXT: vmv.x.s s8, v18 +; RV32-NEXT: vmv.x.s s9, v19 +; RV32-NEXT: vmv.x.s s10, v20 +; RV32-NEXT: vmv.x.s s11, v21 +; RV32-NEXT: vmv.x.s ra, v22 +; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV32-NEXT: lw a0, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: vmseq.vx v12, v8, a0 +; RV32-NEXT: vmv.x.s a0, v23 +; RV32-NEXT: vmseq.vx v13, v8, s2 +; RV32-NEXT: vmv.x.s s2, v11 +; RV32-NEXT: vmseq.vx v11, v8, s3 +; RV32-NEXT: vmv.x.s s3, v24 +; RV32-NEXT: vmseq.vx v14, v8, s4 +; RV32-NEXT: vmv.x.s s4, v10 +; RV32-NEXT: vmseq.vx v10, v8, s5 +; RV32-NEXT: vmor.mm v12, v12, v13 +; RV32-NEXT: vmseq.vx v13, v8, s6 +; RV32-NEXT: vmor.mm v11, v12, v11 +; RV32-NEXT: vmseq.vx v12, v8, s7 +; RV32-NEXT: vmor.mm v11, v11, v14 +; RV32-NEXT: vmseq.vx v14, v8, s8 +; RV32-NEXT: vmor.mm v10, v11, v10 +; RV32-NEXT: vmseq.vx v11, v8, s9 +; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmseq.vx v13, v8, s10 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx v12, v8, s11 +; RV32-NEXT: vmor.mm v10, v10, v14 +; RV32-NEXT: vmseq.vx v14, v8, ra +; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmseq.vx v11, v8, a0 +; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmseq.vx v13, v8, s2 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx 
v12, v8, s3 +; RV32-NEXT: vmor.mm v10, v10, v14 +; RV32-NEXT: vmseq.vx v14, v8, s4 +; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmseq.vx v11, v8, a1 +; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmseq.vx v13, v8, a2 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx v12, v8, a3 +; RV32-NEXT: vmor.mm v10, v10, v14 +; RV32-NEXT: vmseq.vx v14, v8, a4 +; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmseq.vx v11, v8, a5 +; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmseq.vx v13, v8, a6 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx v12, v8, a7 +; RV32-NEXT: vmor.mm v10, v10, v14 +; RV32-NEXT: vmseq.vx v14, v8, t0 +; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmseq.vx v11, v8, t1 +; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmseq.vx v13, v8, t2 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx v12, v8, t3 +; RV32-NEXT: vmor.mm v10, v10, v14 +; RV32-NEXT: vmseq.vx v14, v8, t4 +; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmseq.vx v11, v8, t5 +; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmseq.vx v13, v8, t6 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx v12, v8, s0 +; RV32-NEXT: vmor.mm v10, v10, v14 +; RV32-NEXT: vmor.mm v10, v10, v11 +; RV32-NEXT: vmor.mm v10, v10, v13 +; RV32-NEXT: vmor.mm v10, v10, v12 +; RV32-NEXT: vmseq.vx v11, v8, s1 +; RV32-NEXT: vmor.mm v8, v10, v11 +; RV32-NEXT: vmand.mm v0, v8, v0 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: .cfi_restore s1 +; RV32-NEXT: .cfi_restore s2 +; RV32-NEXT: .cfi_restore s3 +; RV32-NEXT: .cfi_restore s4 +; RV32-NEXT: .cfi_restore s5 +; RV32-NEXT: .cfi_restore s6 +; RV32-NEXT: .cfi_restore s7 +; RV32-NEXT: .cfi_restore s8 +; RV32-NEXT: .cfi_restore s9 +; RV32-NEXT: .cfi_restore s10 +; RV32-NEXT: .cfi_restore s11 +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: match_nxv16i8_v32i8: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -112 +; RV64-NEXT: .cfi_def_cfa_offset 112 +; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s1, -24 +; RV64-NEXT: .cfi_offset s2, -32 +; RV64-NEXT: .cfi_offset s3, -40 +; RV64-NEXT: .cfi_offset s4, -48 +; RV64-NEXT: .cfi_offset s5, -56 
+; RV64-NEXT: .cfi_offset s6, -64 +; RV64-NEXT: .cfi_offset s7, -72 +; RV64-NEXT: .cfi_offset s8, -80 +; RV64-NEXT: .cfi_offset s9, -88 +; RV64-NEXT: .cfi_offset s10, -96 +; RV64-NEXT: .cfi_offset s11, -104 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: sd a0, 0(sp) # 8-byte Folded Spill +; RV64-NEXT: vslidedown.vi v12, v10, 1 +; RV64-NEXT: vslidedown.vi v13, v10, 2 +; RV64-NEXT: vslidedown.vi v14, v10, 3 +; RV64-NEXT: vslidedown.vi v15, v10, 4 +; RV64-NEXT: vslidedown.vi v16, v10, 5 +; RV64-NEXT: vslidedown.vi v17, v10, 6 +; RV64-NEXT: vslidedown.vi v18, v10, 7 +; RV64-NEXT: vslidedown.vi v19, v10, 8 +; RV64-NEXT: vslidedown.vi v20, v10, 9 +; RV64-NEXT: vslidedown.vi v21, v10, 10 +; RV64-NEXT: vslidedown.vi v22, v10, 11 +; RV64-NEXT: vslidedown.vi v23, v10, 12 +; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64-NEXT: vslidedown.vi v24, v10, 16 +; RV64-NEXT: vmv.x.s a1, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 17 +; RV64-NEXT: vmv.x.s a2, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 18 +; RV64-NEXT: vmv.x.s a3, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 19 +; RV64-NEXT: vmv.x.s a4, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 20 +; RV64-NEXT: vmv.x.s a5, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 21 +; RV64-NEXT: vmv.x.s a6, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 22 +; RV64-NEXT: vmv.x.s a7, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 23 +; RV64-NEXT: vmv.x.s t0, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 24 +; RV64-NEXT: vmv.x.s t1, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 25 +; RV64-NEXT: vmv.x.s t2, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 26 +; RV64-NEXT: vmv.x.s t3, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 27 +; RV64-NEXT: vmv.x.s t4, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 28 +; RV64-NEXT: vmv.x.s t5, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 29 +; RV64-NEXT: vmv.x.s t6, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 30 +; RV64-NEXT: vmv.x.s s0, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 31 +; RV64-NEXT: vmv.x.s s1, v24 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vslidedown.vi v11, v10, 13 +; RV64-NEXT: vslidedown.vi v24, v10, 14 +; RV64-NEXT: vslidedown.vi v10, v10, 15 +; RV64-NEXT: vmv.x.s s2, v12 +; RV64-NEXT: vmv.x.s s3, v13 +; RV64-NEXT: vmv.x.s s4, v14 +; RV64-NEXT: vmv.x.s s5, v15 +; RV64-NEXT: vmv.x.s s6, v16 +; RV64-NEXT: vmv.x.s s7, v17 +; RV64-NEXT: vmv.x.s s8, v18 +; RV64-NEXT: vmv.x.s s9, v19 +; RV64-NEXT: vmv.x.s s10, v20 +; RV64-NEXT: vmv.x.s s11, v21 +; RV64-NEXT: vmv.x.s ra, v22 +; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-NEXT: ld a0, 0(sp) # 8-byte Folded Reload +; RV64-NEXT: vmseq.vx v12, v8, a0 +; RV64-NEXT: vmv.x.s a0, v23 +; RV64-NEXT: vmseq.vx v13, v8, s2 +; RV64-NEXT: vmv.x.s s2, v11 +; RV64-NEXT: vmseq.vx v11, v8, s3 +; RV64-NEXT: vmv.x.s s3, v24 +; RV64-NEXT: vmseq.vx v14, v8, s4 +; RV64-NEXT: vmv.x.s s4, v10 +; RV64-NEXT: vmseq.vx v10, v8, s5 +; RV64-NEXT: vmor.mm v12, v12, v13 +; RV64-NEXT: vmseq.vx v13, v8, s6 +; RV64-NEXT: vmor.mm v11, v12, v11 +; RV64-NEXT: vmseq.vx v12, v8, s7 +; RV64-NEXT: vmor.mm v11, v11, v14 +; RV64-NEXT: vmseq.vx v14, v8, s8 +; RV64-NEXT: vmor.mm v10, v11, v10 +; RV64-NEXT: vmseq.vx v11, v8, s9 +; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmseq.vx v13, v8, s10 +; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v12, v8, s11 +; RV64-NEXT: vmor.mm v10, v10, v14 +; RV64-NEXT: vmseq.vx v14, v8, ra +; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmseq.vx v11, v8, a0 +; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmseq.vx v13, v8, s2 +; RV64-NEXT: 
vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v12, v8, s3 +; RV64-NEXT: vmor.mm v10, v10, v14 +; RV64-NEXT: vmseq.vx v14, v8, s4 +; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmseq.vx v11, v8, a1 +; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmseq.vx v13, v8, a2 +; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v12, v8, a3 +; RV64-NEXT: vmor.mm v10, v10, v14 +; RV64-NEXT: vmseq.vx v14, v8, a4 +; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmseq.vx v11, v8, a5 +; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmseq.vx v13, v8, a6 +; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v12, v8, a7 +; RV64-NEXT: vmor.mm v10, v10, v14 +; RV64-NEXT: vmseq.vx v14, v8, t0 +; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmseq.vx v11, v8, t1 +; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmseq.vx v13, v8, t2 +; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v12, v8, t3 +; RV64-NEXT: vmor.mm v10, v10, v14 +; RV64-NEXT: vmseq.vx v14, v8, t4 +; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmseq.vx v11, v8, t5 +; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmseq.vx v13, v8, t6 +; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v12, v8, s0 +; RV64-NEXT: vmor.mm v10, v10, v14 +; RV64-NEXT: vmor.mm v10, v10, v11 +; RV64-NEXT: vmor.mm v10, v10, v13 +; RV64-NEXT: vmor.mm v10, v10, v12 +; RV64-NEXT: vmseq.vx v11, v8, s1 +; RV64-NEXT: vmor.mm v8, v10, v11 +; RV64-NEXT: vmand.mm v0, v8, v0 +; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: .cfi_restore s1 +; RV64-NEXT: .cfi_restore s2 +; RV64-NEXT: .cfi_restore s3 +; RV64-NEXT: .cfi_restore s4 +; RV64-NEXT: .cfi_restore s5 +; RV64-NEXT: .cfi_restore s6 +; RV64-NEXT: .cfi_restore s7 +; RV64-NEXT: .cfi_restore s8 +; RV64-NEXT: .cfi_restore s9 +; RV64-NEXT: .cfi_restore s10 +; RV64-NEXT: .cfi_restore s11 +; RV64-NEXT: addi sp, sp, 112 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret %r = tail call @llvm.experimental.vector.match( %op1, <32 x i8> %op2, %mask) ret %r } From 8eb53a955bf77cbcdd5af5db9fd1e53a91ae210a Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Thu, 28 Nov 2024 19:01:29 -0300 Subject: [PATCH 06/10] Readded test Signed-off-by: Mikhail R. 
Gadelha --- .../RISCV/rvv/intrinsic-vector-match.ll | 384 ++++++++++++++++++ 1 file changed, 384 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll index 5f01434c1fb5a..c71a6a85755b7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll +++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll @@ -842,6 +842,390 @@ define @match_nxv16i8_v32i8( %op1, <32 x i8 ret %r } +define <16 x i1> @match_v16i8_v32i8(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) { +; RV32-LABEL: match_v16i8_v32i8: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -64 +; RV32-NEXT: .cfi_def_cfa_offset 64 +; RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 32(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s10, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s11, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: .cfi_offset s1, -12 +; RV32-NEXT: .cfi_offset s2, -16 +; RV32-NEXT: .cfi_offset s3, -20 +; RV32-NEXT: .cfi_offset s4, -24 +; RV32-NEXT: .cfi_offset s5, -28 +; RV32-NEXT: .cfi_offset s6, -32 +; RV32-NEXT: .cfi_offset s7, -36 +; RV32-NEXT: .cfi_offset s8, -40 +; RV32-NEXT: .cfi_offset s9, -44 +; RV32-NEXT: .cfi_offset s10, -48 +; RV32-NEXT: .cfi_offset s11, -52 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: vslidedown.vi v9, v10, 1 +; RV32-NEXT: vslidedown.vi v12, v10, 2 +; RV32-NEXT: vslidedown.vi v13, v10, 3 +; RV32-NEXT: vslidedown.vi v14, v10, 4 +; RV32-NEXT: vslidedown.vi v15, v10, 5 +; RV32-NEXT: vslidedown.vi v16, v10, 6 +; RV32-NEXT: vslidedown.vi v17, v10, 7 +; RV32-NEXT: vslidedown.vi v18, v10, 8 +; RV32-NEXT: vslidedown.vi v19, v10, 9 +; RV32-NEXT: vslidedown.vi v20, v10, 10 +; RV32-NEXT: vslidedown.vi v21, v10, 11 +; RV32-NEXT: vslidedown.vi v22, v10, 12 +; RV32-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV32-NEXT: vslidedown.vi v24, v10, 16 +; RV32-NEXT: vmv.x.s a1, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 17 +; RV32-NEXT: vmv.x.s a2, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 18 +; RV32-NEXT: vmv.x.s a3, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 19 +; RV32-NEXT: vmv.x.s a4, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 20 +; RV32-NEXT: vmv.x.s a5, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 21 +; RV32-NEXT: vmv.x.s a6, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 22 +; RV32-NEXT: vmv.x.s a7, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 23 +; RV32-NEXT: vmv.x.s t0, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 24 +; RV32-NEXT: vmv.x.s t1, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 25 +; RV32-NEXT: vmv.x.s t2, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 26 +; RV32-NEXT: vmv.x.s t3, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 27 +; RV32-NEXT: vmv.x.s t4, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 28 +; RV32-NEXT: vmv.x.s t5, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 29 +; RV32-NEXT: vmv.x.s t6, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 30 +; RV32-NEXT: vmv.x.s s0, v24 +; RV32-NEXT: vslidedown.vi v24, v10, 31 +; RV32-NEXT: vmv.x.s s1, v24 +; RV32-NEXT: vsetivli zero, 16, e8, 
m1, ta, ma +; RV32-NEXT: vslidedown.vi v11, v10, 13 +; RV32-NEXT: vslidedown.vi v23, v10, 14 +; RV32-NEXT: vslidedown.vi v10, v10, 15 +; RV32-NEXT: vmv.x.s s2, v9 +; RV32-NEXT: vmv.x.s s3, v12 +; RV32-NEXT: vmv.x.s s4, v13 +; RV32-NEXT: vmv.x.s s5, v14 +; RV32-NEXT: vmv.x.s s6, v15 +; RV32-NEXT: vmv.x.s s7, v16 +; RV32-NEXT: vmv.x.s s8, v17 +; RV32-NEXT: vmv.x.s s9, v18 +; RV32-NEXT: vmv.x.s s10, v19 +; RV32-NEXT: vmv.x.s s11, v20 +; RV32-NEXT: vmv.x.s ra, v21 +; RV32-NEXT: vmseq.vx v9, v8, a0 +; RV32-NEXT: vmv.x.s a0, v22 +; RV32-NEXT: vmseq.vx v12, v8, s2 +; RV32-NEXT: vmv.x.s s2, v11 +; RV32-NEXT: vmseq.vx v11, v8, s3 +; RV32-NEXT: vmv.x.s s3, v23 +; RV32-NEXT: vmseq.vx v13, v8, s4 +; RV32-NEXT: vmv.x.s s4, v10 +; RV32-NEXT: vmseq.vx v10, v8, s5 +; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmseq.vx v12, v8, s6 +; RV32-NEXT: vmor.mm v9, v9, v11 +; RV32-NEXT: vmseq.vx v11, v8, s7 +; RV32-NEXT: vmor.mm v9, v9, v13 +; RV32-NEXT: vmseq.vx v13, v8, s8 +; RV32-NEXT: vmor.mm v9, v9, v10 +; RV32-NEXT: vmseq.vx v10, v8, s9 +; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmseq.vx v12, v8, s10 +; RV32-NEXT: vmor.mm v9, v9, v11 +; RV32-NEXT: vmseq.vx v11, v8, s11 +; RV32-NEXT: vmor.mm v9, v9, v13 +; RV32-NEXT: vmseq.vx v13, v8, ra +; RV32-NEXT: vmor.mm v9, v9, v10 +; RV32-NEXT: vmseq.vx v10, v8, a0 +; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmseq.vx v12, v8, s2 +; RV32-NEXT: vmor.mm v9, v9, v11 +; RV32-NEXT: vmseq.vx v11, v8, s3 +; RV32-NEXT: vmor.mm v9, v9, v13 +; RV32-NEXT: vmseq.vx v13, v8, s4 +; RV32-NEXT: vmor.mm v9, v9, v10 +; RV32-NEXT: vmseq.vx v10, v8, a1 +; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmseq.vx v12, v8, a2 +; RV32-NEXT: vmor.mm v9, v9, v11 +; RV32-NEXT: vmseq.vx v11, v8, a3 +; RV32-NEXT: vmor.mm v9, v9, v13 +; RV32-NEXT: vmseq.vx v13, v8, a4 +; RV32-NEXT: vmor.mm v9, v9, v10 +; RV32-NEXT: vmseq.vx v10, v8, a5 +; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmseq.vx v12, v8, a6 +; RV32-NEXT: vmor.mm v9, v9, v11 +; RV32-NEXT: vmseq.vx v11, v8, a7 +; RV32-NEXT: vmor.mm v9, v9, v13 +; RV32-NEXT: vmseq.vx v13, v8, t0 +; RV32-NEXT: vmor.mm v9, v9, v10 +; RV32-NEXT: vmseq.vx v10, v8, t1 +; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmseq.vx v12, v8, t2 +; RV32-NEXT: vmor.mm v9, v9, v11 +; RV32-NEXT: vmseq.vx v11, v8, t3 +; RV32-NEXT: vmor.mm v9, v9, v13 +; RV32-NEXT: vmseq.vx v13, v8, t4 +; RV32-NEXT: vmor.mm v9, v9, v10 +; RV32-NEXT: vmseq.vx v10, v8, t5 +; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmseq.vx v12, v8, t6 +; RV32-NEXT: vmor.mm v9, v9, v11 +; RV32-NEXT: vmseq.vx v11, v8, s0 +; RV32-NEXT: vmor.mm v9, v9, v13 +; RV32-NEXT: vmor.mm v9, v9, v10 +; RV32-NEXT: vmor.mm v9, v9, v12 +; RV32-NEXT: vmor.mm v9, v9, v11 +; RV32-NEXT: vmseq.vx v8, v8, s1 +; RV32-NEXT: vmor.mm v8, v9, v8 +; RV32-NEXT: vmand.mm v0, v8, v0 +; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 48(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 32(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s9, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s10, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s11, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: .cfi_restore s1 +; 
RV32-NEXT: .cfi_restore s2 +; RV32-NEXT: .cfi_restore s3 +; RV32-NEXT: .cfi_restore s4 +; RV32-NEXT: .cfi_restore s5 +; RV32-NEXT: .cfi_restore s6 +; RV32-NEXT: .cfi_restore s7 +; RV32-NEXT: .cfi_restore s8 +; RV32-NEXT: .cfi_restore s9 +; RV32-NEXT: .cfi_restore s10 +; RV32-NEXT: .cfi_restore s11 +; RV32-NEXT: addi sp, sp, 64 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: match_v16i8_v32i8: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -112 +; RV64-NEXT: .cfi_def_cfa_offset 112 +; RV64-NEXT: sd ra, 104(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 96(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 80(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s7, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s8, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s9, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s10, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s11, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s1, -24 +; RV64-NEXT: .cfi_offset s2, -32 +; RV64-NEXT: .cfi_offset s3, -40 +; RV64-NEXT: .cfi_offset s4, -48 +; RV64-NEXT: .cfi_offset s5, -56 +; RV64-NEXT: .cfi_offset s6, -64 +; RV64-NEXT: .cfi_offset s7, -72 +; RV64-NEXT: .cfi_offset s8, -80 +; RV64-NEXT: .cfi_offset s9, -88 +; RV64-NEXT: .cfi_offset s10, -96 +; RV64-NEXT: .cfi_offset s11, -104 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: vslidedown.vi v9, v10, 1 +; RV64-NEXT: vslidedown.vi v12, v10, 2 +; RV64-NEXT: vslidedown.vi v13, v10, 3 +; RV64-NEXT: vslidedown.vi v14, v10, 4 +; RV64-NEXT: vslidedown.vi v15, v10, 5 +; RV64-NEXT: vslidedown.vi v16, v10, 6 +; RV64-NEXT: vslidedown.vi v17, v10, 7 +; RV64-NEXT: vslidedown.vi v18, v10, 8 +; RV64-NEXT: vslidedown.vi v19, v10, 9 +; RV64-NEXT: vslidedown.vi v20, v10, 10 +; RV64-NEXT: vslidedown.vi v21, v10, 11 +; RV64-NEXT: vslidedown.vi v22, v10, 12 +; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64-NEXT: vslidedown.vi v24, v10, 16 +; RV64-NEXT: vmv.x.s a1, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 17 +; RV64-NEXT: vmv.x.s a2, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 18 +; RV64-NEXT: vmv.x.s a3, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 19 +; RV64-NEXT: vmv.x.s a4, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 20 +; RV64-NEXT: vmv.x.s a5, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 21 +; RV64-NEXT: vmv.x.s a6, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 22 +; RV64-NEXT: vmv.x.s a7, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 23 +; RV64-NEXT: vmv.x.s t0, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 24 +; RV64-NEXT: vmv.x.s t1, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 25 +; RV64-NEXT: vmv.x.s t2, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 26 +; RV64-NEXT: vmv.x.s t3, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 27 +; RV64-NEXT: vmv.x.s t4, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 28 +; RV64-NEXT: vmv.x.s t5, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 29 +; RV64-NEXT: vmv.x.s t6, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 30 +; RV64-NEXT: vmv.x.s s0, v24 +; RV64-NEXT: vslidedown.vi v24, v10, 31 +; RV64-NEXT: vmv.x.s s1, v24 +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vslidedown.vi v11, v10, 13 +; RV64-NEXT: vslidedown.vi v23, v10, 14 +; RV64-NEXT: vslidedown.vi v10, v10, 15 +; RV64-NEXT: vmv.x.s s2, v9 +; 
RV64-NEXT: vmv.x.s s3, v12 +; RV64-NEXT: vmv.x.s s4, v13 +; RV64-NEXT: vmv.x.s s5, v14 +; RV64-NEXT: vmv.x.s s6, v15 +; RV64-NEXT: vmv.x.s s7, v16 +; RV64-NEXT: vmv.x.s s8, v17 +; RV64-NEXT: vmv.x.s s9, v18 +; RV64-NEXT: vmv.x.s s10, v19 +; RV64-NEXT: vmv.x.s s11, v20 +; RV64-NEXT: vmv.x.s ra, v21 +; RV64-NEXT: vmseq.vx v9, v8, a0 +; RV64-NEXT: vmv.x.s a0, v22 +; RV64-NEXT: vmseq.vx v12, v8, s2 +; RV64-NEXT: vmv.x.s s2, v11 +; RV64-NEXT: vmseq.vx v11, v8, s3 +; RV64-NEXT: vmv.x.s s3, v23 +; RV64-NEXT: vmseq.vx v13, v8, s4 +; RV64-NEXT: vmv.x.s s4, v10 +; RV64-NEXT: vmseq.vx v10, v8, s5 +; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmseq.vx v12, v8, s6 +; RV64-NEXT: vmor.mm v9, v9, v11 +; RV64-NEXT: vmseq.vx v11, v8, s7 +; RV64-NEXT: vmor.mm v9, v9, v13 +; RV64-NEXT: vmseq.vx v13, v8, s8 +; RV64-NEXT: vmor.mm v9, v9, v10 +; RV64-NEXT: vmseq.vx v10, v8, s9 +; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmseq.vx v12, v8, s10 +; RV64-NEXT: vmor.mm v9, v9, v11 +; RV64-NEXT: vmseq.vx v11, v8, s11 +; RV64-NEXT: vmor.mm v9, v9, v13 +; RV64-NEXT: vmseq.vx v13, v8, ra +; RV64-NEXT: vmor.mm v9, v9, v10 +; RV64-NEXT: vmseq.vx v10, v8, a0 +; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmseq.vx v12, v8, s2 +; RV64-NEXT: vmor.mm v9, v9, v11 +; RV64-NEXT: vmseq.vx v11, v8, s3 +; RV64-NEXT: vmor.mm v9, v9, v13 +; RV64-NEXT: vmseq.vx v13, v8, s4 +; RV64-NEXT: vmor.mm v9, v9, v10 +; RV64-NEXT: vmseq.vx v10, v8, a1 +; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmseq.vx v12, v8, a2 +; RV64-NEXT: vmor.mm v9, v9, v11 +; RV64-NEXT: vmseq.vx v11, v8, a3 +; RV64-NEXT: vmor.mm v9, v9, v13 +; RV64-NEXT: vmseq.vx v13, v8, a4 +; RV64-NEXT: vmor.mm v9, v9, v10 +; RV64-NEXT: vmseq.vx v10, v8, a5 +; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmseq.vx v12, v8, a6 +; RV64-NEXT: vmor.mm v9, v9, v11 +; RV64-NEXT: vmseq.vx v11, v8, a7 +; RV64-NEXT: vmor.mm v9, v9, v13 +; RV64-NEXT: vmseq.vx v13, v8, t0 +; RV64-NEXT: vmor.mm v9, v9, v10 +; RV64-NEXT: vmseq.vx v10, v8, t1 +; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmseq.vx v12, v8, t2 +; RV64-NEXT: vmor.mm v9, v9, v11 +; RV64-NEXT: vmseq.vx v11, v8, t3 +; RV64-NEXT: vmor.mm v9, v9, v13 +; RV64-NEXT: vmseq.vx v13, v8, t4 +; RV64-NEXT: vmor.mm v9, v9, v10 +; RV64-NEXT: vmseq.vx v10, v8, t5 +; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmseq.vx v12, v8, t6 +; RV64-NEXT: vmor.mm v9, v9, v11 +; RV64-NEXT: vmseq.vx v11, v8, s0 +; RV64-NEXT: vmor.mm v9, v9, v13 +; RV64-NEXT: vmor.mm v9, v9, v10 +; RV64-NEXT: vmor.mm v9, v9, v12 +; RV64-NEXT: vmor.mm v9, v9, v11 +; RV64-NEXT: vmseq.vx v8, v8, s1 +; RV64-NEXT: vmor.mm v8, v9, v8 +; RV64-NEXT: vmand.mm v0, v8, v0 +; RV64-NEXT: ld ra, 104(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 96(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 88(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s7, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s8, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s9, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s10, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: .cfi_restore s1 +; RV64-NEXT: .cfi_restore s2 +; RV64-NEXT: .cfi_restore s3 +; RV64-NEXT: .cfi_restore s4 +; RV64-NEXT: .cfi_restore s5 +; RV64-NEXT: .cfi_restore s6 +; RV64-NEXT: 
.cfi_restore s7 +; RV64-NEXT: .cfi_restore s8 +; RV64-NEXT: .cfi_restore s9 +; RV64-NEXT: .cfi_restore s10 +; RV64-NEXT: .cfi_restore s11 +; RV64-NEXT: addi sp, sp, 112 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %r = tail call <16 x i1> @llvm.experimental.vector.match(<16 x i8> %op1, <32 x i8> %op2, <16 x i1> %mask) + ret <16 x i1> %r +} + define @match_nxv4xi32_v4i32( %op1, <4 x i32> %op2, %mask) { ; CHECK-LABEL: match_nxv4xi32_v4i32: ; CHECK: # %bb.0: From 984c3ccf77d1aa852c044a02ac33abf13c236c37 Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Mon, 2 Dec 2024 22:04:12 -0300 Subject: [PATCH 07/10] Bail if the vector element's types don't match Signed-off-by: Mikhail R. Gadelha --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f3fc7536a9c82..c2cdd73ec869a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3500,6 +3500,10 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL, // FIXME: Support i1 vectors, maybe by promoting to i8? if (VT.getVectorElementType() == MVT::i1) return SDValue(); + // Additionally the element types should match + if (Vec.getSimpleValueType().getVectorElementType() != + VT.getVectorElementType()) + return SDValue(); SDValue Idx = SplatVal.getOperand(1); // The index must be a legal type. if (Idx.getValueType() != Subtarget.getXLenVT()) From 09e4d3ee6debde6ae76d9e9bad41269fee527e9e Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Mon, 2 Dec 2024 22:16:10 -0300 Subject: [PATCH 08/10] Swapped condition to handle cases where one vector is scalable and the other is fixed Signed-off-by: Mikhail R. Gadelha --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 ++-- llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c2cdd73ec869a..fe21b3a16ee56 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3509,8 +3509,8 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL, if (Idx.getValueType() != Subtarget.getXLenVT()) return SDValue(); - if (Vec.getValueSizeInBits().getKnownMinValue() > - VT.getSizeInBits().getKnownMinValue()) + // FIXME: Can we use the indexes that are in-bound here instead? + if (!TypeSize::isKnownLE(Vec.getValueSizeInBits(), VT.getSizeInBits())) return SDValue(); MVT ContainerVT = VT; diff --git a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll index c71a6a85755b7..5d730da09ef83 100644 --- a/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll +++ b/llvm/test/CodeGen/RISCV/rvv/intrinsic-vector-match.ll @@ -143,8 +143,9 @@ define @match_nxv16i8_v16i8( %op1, <16 x i8 define <16 x i1> @match_v16i8_v1i8(<16 x i8> %op1, <1 x i8> %op2, <16 x i1> %mask) { ; CHECK-LABEL: match_v16i8_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-NEXT: vrgather.vi v10, v9, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vmseq.vv v8, v8, v10 ; CHECK-NEXT: vmand.mm v0, v8, v0 ; CHECK-NEXT: ret From 8aa38bdb5aefabcc56e049166b9ee59ddf46facd Mon Sep 17 00:00:00 2001 From: "Mikhail R. 
Gadelha" Date: Mon, 2 Dec 2024 22:39:24 -0300 Subject: [PATCH 09/10] Unify VT element type checks Signed-off-by: Mikhail R. Gadelha --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index fe21b3a16ee56..e2f4fcdeddd9a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3496,13 +3496,12 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL, if (SplatVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); SDValue Vec = SplatVal.getOperand(0); - // Don't perform this optimization for i1 vectors. + // Don't perform this optimization for i1 vectors, or if the element types are + // different // FIXME: Support i1 vectors, maybe by promoting to i8? - if (VT.getVectorElementType() == MVT::i1) - return SDValue(); - // Additionally the element types should match - if (Vec.getSimpleValueType().getVectorElementType() != - VT.getVectorElementType()) + MVT EltTy = VT.getVectorElementType(); + if (EltTy == MVT::i1 || + EltTy != Vec.getSimpleValueType().getVectorElementType()) return SDValue(); SDValue Idx = SplatVal.getOperand(1); // The index must be a legal type. From a8c9fb03ce5c712767c7ef3f5130701faa8ed47a Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Mon, 2 Dec 2024 22:39:40 -0300 Subject: [PATCH 10/10] FIXME -> TODO Signed-off-by: Mikhail R. Gadelha --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e2f4fcdeddd9a..b170ce4e13051 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3508,7 +3508,8 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL, if (Idx.getValueType() != Subtarget.getXLenVT()) return SDValue(); - // FIXME: Can we use the indexes that are in-bound here instead? + // Check that Index lies within VT + // TODO: Can we check if the Index is constant and known in-bounds? if (!TypeSize::isKnownLE(Vec.getValueSizeInBits(), VT.getSizeInBits())) return SDValue();
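
As a side note for readers working through the test diffs above: below is a minimal, standalone C++ sketch of what the llvm.experimental.vector.match calls in these tests compute, going by the code the backend emits (one vmseq.vv or vmseq.vx per element of %op2, vmor.mm to accumulate the per-needle results, and a final vmand.mm with the mask). This is an illustrative reference model only; the function name match_ref, the i8 element type, and the lane counts are assumptions for the example and do not come from the patch series.

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Reference model: R[i] = Mask[i] && (Op1[i] equals some element of Op2).
template <std::size_t N, std::size_t M>
std::array<bool, N> match_ref(const std::array<int8_t, N> &Op1,
                              const std::array<int8_t, M> &Op2,
                              const std::array<bool, N> &Mask) {
  std::array<bool, N> R{};
  for (std::size_t I = 0; I < N; ++I) {
    bool Found = false;
    for (std::size_t J = 0; J < M; ++J)
      Found |= (Op1[I] == Op2[J]); // one compare per needle element; vmor.mm accumulates
    R[I] = Found && Mask[I];       // final vmand.mm with the incoming mask
  }
  return R;
}

int main() {
  std::array<int8_t, 8> Op1{1, 2, 3, 4, 5, 6, 7, 8};
  std::array<int8_t, 2> Op2{3, 7};
  std::array<bool, 8> Mask{true, true, true, true, true, true, true, false};
  std::array<bool, 8> R = match_ref(Op1, Op2, Mask);
  for (bool B : R)
    std::printf("%d", B); // expected output: 00100010
  std::printf("\n");
  return 0;
}

The quadratic needle loop mirrors the emitted code in both forms shown above: the vrgather.vi lowering broadcasts each element of %op2 as a vector before the compare (vmseq.vv), while the extract-and-splat fallback moves it to a scalar register first (vmv.x.s followed by vmseq.vx); either way each needle element contributes one compare and one OR into the accumulated mask.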