diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e6814c5f71a09..031030990d440 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -391,7 +391,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
     }
   } else if (!Subtarget.hasVendorXCVbitmanip()) {
-    setOperationAction({ISD::CTTZ, ISD::CTPOP}, XLenVT, Expand);
+    setOperationAction(ISD::CTTZ, XLenVT, Expand);
+    setOperationAction(ISD::CTPOP, XLenVT,
+                       Subtarget.is64Bit() ? Custom : Expand);
     if (RV64LegalI32 && Subtarget.is64Bit())
       setOperationAction({ISD::CTTZ, ISD::CTPOP}, MVT::i32, Expand);
   }
@@ -901,11 +903,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                            VT, Custom);
       } else {
         setOperationAction({ISD::BITREVERSE, ISD::VP_BITREVERSE}, VT, Expand);
-        setOperationAction({ISD::CTLZ, ISD::CTTZ, ISD::CTPOP}, VT, Expand);
+        setOperationAction({ISD::CTLZ, ISD::CTTZ}, VT, Expand);
         setOperationAction({ISD::VP_CTLZ, ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ,
-                            ISD::VP_CTTZ_ZERO_UNDEF, ISD::VP_CTPOP},
+                            ISD::VP_CTTZ_ZERO_UNDEF},
                            VT, Expand);
+        setOperationAction({ISD::CTPOP, ISD::VP_CTPOP}, VT, Custom);
+
         // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
         // range of f32.
         EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
@@ -1238,6 +1242,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                             ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTPOP},
                            VT, Custom);
       } else {
+        setOperationAction({ISD::CTPOP, ISD::VP_CTPOP}, VT, Custom);
         // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
         // range of f32.
         EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
@@ -6746,8 +6751,18 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::UDIV:
   case ISD::UREM:
   case ISD::BSWAP:
-  case ISD::CTPOP:
     return lowerToScalableOp(Op, DAG);
+  case ISD::CTPOP: {
+    if (Op.getValueType().isScalarInteger())
+      return lowerScalarCTPOP(Op, DAG);
+    if (Subtarget.hasStdExtZvbb())
+      return lowerToScalableOp(Op, DAG);
+    return lowerVectorCTPOP(Op, DAG);
+  }
+  case ISD::VP_CTPOP:
+    if (Subtarget.hasStdExtZvbb())
+      return lowerVPOp(Op, DAG);
+    return lowerVectorCTPOP(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
@@ -6972,8 +6987,6 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     if (Subtarget.hasStdExtZvbb())
       return lowerVPOp(Op, DAG);
     return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
-  case ISD::VP_CTPOP:
-    return lowerVPOp(Op, DAG);
   case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
     return lowerVPStridedLoad(Op, DAG);
   case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
@@ -10755,6 +10768,182 @@ SDValue RISCVTargetLowering::lowerABS(SDValue Op, SelectionDAG &DAG) const {
   return Max;
 }
 
+SDValue RISCVTargetLowering::lowerScalarCTPOP(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  MVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()).getSimpleVT();
+  unsigned Len = VT.getScalarSizeInBits();
+  assert(VT.isInteger() && "lowerScalarCTPOP not implemented for this type.");
+
+  SDValue V = Op.getOperand(0);
+
+  // This is the same algorithm as TargetLowering::expandCTPOP, from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  // 0x0F0F0F0F...
+  const APInt &Constant0F = APInt::getSplat(Len, APInt(8, 0x0F));
+  SDValue Mask0F = DAG.getConstant(Constant0F, DL, VT, false, true);
+  // 0x33333333... = (0x0F0F0F0F... ^ (0x0F0F0F0F... << 2))
+  const APInt &Constant33 = APInt::getSplat(Len, APInt(8, 0x33));
+  SDValue Mask33 =
+      RISCVMatInt::getIntMatCost(Constant33, VT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(ISD::XOR, DL, VT, Mask0F,
+                        DAG.getNode(ISD::SHL, DL, VT, Mask0F,
+                                    DAG.getShiftAmountConstant(2, VT, DL)))
+          : DAG.getConstant(Constant33, DL, VT);
+  // 0x55555555... = (0x33333333... ^ (0x33333333... << 1))
+  const APInt &Constant55 = APInt::getSplat(Len, APInt(8, 0x55));
+  SDValue Mask55 =
+      RISCVMatInt::getIntMatCost(Constant55, VT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(ISD::XOR, DL, VT, Mask33,
+                        DAG.getNode(ISD::SHL, DL, VT, Mask33,
+                                    DAG.getShiftAmountConstant(1, VT, DL)))
+          : DAG.getConstant(Constant55, DL, VT);
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  V = DAG.getNode(ISD::SUB, DL, VT, V,
+                  DAG.getNode(ISD::AND, DL, VT,
+                              DAG.getNode(ISD::SRL, DL, VT, V,
+                                          DAG.getConstant(1, DL, ShVT)),
+                              Mask55));
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  V = DAG.getNode(ISD::ADD, DL, VT, DAG.getNode(ISD::AND, DL, VT, V, Mask33),
+                  DAG.getNode(ISD::AND, DL, VT,
+                              DAG.getNode(ISD::SRL, DL, VT, V,
+                                          DAG.getConstant(2, DL, ShVT)),
+                              Mask33));
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  V = DAG.getNode(ISD::AND, DL, VT,
+                  DAG.getNode(ISD::ADD, DL, VT, V,
+                              DAG.getNode(ISD::SRL, DL, VT, V,
+                                          DAG.getConstant(4, DL, ShVT))),
+                  Mask0F);
+
+  // v = (v * 0x01010101...) >> (Len - 8)
+  // 0x01010101... == (0x0F0F0F0F... & (0x0F0F0F0F... >> 3))
+  const APInt &Constant01 = APInt::getSplat(Len, APInt(8, 0x01));
+  SDValue Mask01 =
+      RISCVMatInt::getIntMatCost(Constant01, VT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(ISD::AND, DL, VT, Mask0F,
+                        DAG.getNode(ISD::SRL, DL, VT, Mask0F,
+                                    DAG.getShiftAmountConstant(3, VT, DL)))
+          : DAG.getConstant(Constant01, DL, VT);
+  return DAG.getNode(ISD::SRL, DL, VT, DAG.getNode(ISD::MUL, DL, VT, V, Mask01),
+                     DAG.getConstant(Len - 8, DL, ShVT));
+}
+
+SDValue RISCVTargetLowering::lowerVectorCTPOP(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  unsigned Len = VT.getScalarSizeInBits();
+  assert(VT.isInteger() && "lowerVectorCTPOP not implemented for this type.");
+
+  SDValue V = Op.getOperand(0);
+  MVT ContainerVT = VT;
+  if (VT.isFixedLengthVector()) {
+    ContainerVT = getContainerForFixedLengthVector(VT);
+    V = convertToScalableVector(ContainerVT, V, DAG, Subtarget);
+  }
+
+  SDValue Mask, VL;
+  if (Op->getOpcode() == ISD::VP_CTPOP) {
+    Mask = Op->getOperand(1);
+    if (VT.isFixedLengthVector())
+      Mask = convertToScalableVector(getMaskTypeFor(ContainerVT), Mask, DAG,
+                                     Subtarget);
+    VL = Op->getOperand(2);
+  } else
+    std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+  // This is the same algorithm as TargetLowering::expandVPCTPOP, from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+
+  // 0x0F0F0F0F...
+  const APInt &Constant0F = APInt::getSplat(Len, APInt(8, 0x0F));
+  SDValue Mask0F = DAG.getConstant(Constant0F, DL, ContainerVT);
+  // 0x33333333... = (0x0F0F0F0F... ^ (0x0F0F0F0F... << 2))
+  const APInt &Constant33 = APInt::getSplat(Len, APInt(8, 0x33));
+  SDValue Mask33 =
+      RISCVMatInt::getIntMatCost(Constant33, ContainerVT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(RISCVISD::XOR_VL, DL, ContainerVT, Mask0F,
+                        DAG.getNode(RISCVISD::SHL_VL, DL, ContainerVT, Mask0F,
+                                    DAG.getConstant(2, DL, ContainerVT),
+                                    DAG.getUNDEF(ContainerVT), Mask, VL),
+                        DAG.getUNDEF(ContainerVT), Mask, VL)
+          : DAG.getConstant(Constant33, DL, ContainerVT);
+  // 0x55555555... = (0x33333333... ^ (0x33333333... << 1))
+  const APInt &Constant55 = APInt::getSplat(Len, APInt(8, 0x55));
+  SDValue Mask55 =
+      RISCVMatInt::getIntMatCost(Constant55, ContainerVT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(RISCVISD::XOR_VL, DL, ContainerVT, Mask33,
+                        DAG.getNode(RISCVISD::SHL_VL, DL, ContainerVT, Mask33,
+                                    DAG.getConstant(1, DL, ContainerVT),
+                                    DAG.getUNDEF(ContainerVT), Mask, VL),
+                        DAG.getUNDEF(ContainerVT), Mask, VL)
+          : DAG.getConstant(Constant55, DL, ContainerVT);
+
+  SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5;
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  Tmp1 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT,
+                     DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+                                 DAG.getConstant(1, DL, ContainerVT),
+                                 DAG.getUNDEF(ContainerVT), Mask, VL),
+                     Mask55, DAG.getUNDEF(ContainerVT), Mask, VL);
+  V = DAG.getNode(RISCVISD::SUB_VL, DL, ContainerVT, V, Tmp1,
+                  DAG.getUNDEF(ContainerVT), Mask, VL);
+
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  Tmp2 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, V, Mask33,
+                     DAG.getUNDEF(ContainerVT), Mask, VL);
+  Tmp3 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT,
+                     DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+                                 DAG.getConstant(2, DL, ContainerVT),
+                                 DAG.getUNDEF(ContainerVT), Mask, VL),
+                     Mask33, DAG.getUNDEF(ContainerVT), Mask, VL);
+  V = DAG.getNode(RISCVISD::ADD_VL, DL, ContainerVT, Tmp2, Tmp3,
+                  DAG.getUNDEF(ContainerVT), Mask, VL);
+
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  Tmp4 = DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+                     DAG.getConstant(4, DL, ContainerVT),
+                     DAG.getUNDEF(ContainerVT), Mask, VL);
+  Tmp5 = DAG.getNode(RISCVISD::ADD_VL, DL, ContainerVT, V, Tmp4,
+                     DAG.getUNDEF(ContainerVT), Mask, VL);
+  V = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, Tmp5, Mask0F,
+                  DAG.getUNDEF(ContainerVT), Mask, VL);
+
+  if (Len > 8) {
+    // v = (v * 0x01010101...) >> (Len - 8)
+    // 0x01010101... == (0x0F0F0F0F... & (0x0F0F0F0F... >> 3))
+    const APInt &Constant01 = APInt::getSplat(Len, APInt(8, 0x01));
+    SDValue Mask01 =
+        RISCVMatInt::getIntMatCost(
+            Constant01, ContainerVT.getScalarSizeInBits(), Subtarget) > 2
+            ?
DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, Mask0F, + DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, Mask0F, + DAG.getConstant(3, DL, ContainerVT), + DAG.getUNDEF(ContainerVT), Mask, VL), + DAG.getUNDEF(ContainerVT), Mask, VL) + : DAG.getConstant(Constant01, DL, ContainerVT); + V = DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, + DAG.getNode(RISCVISD::MUL_VL, DL, ContainerVT, V, Mask01, + DAG.getUNDEF(ContainerVT), Mask, VL), + DAG.getConstant(Len - 8, DL, ContainerVT), + DAG.getUNDEF(ContainerVT), Mask, VL); + } + + if (VT.isFixedLengthVector()) + V = convertFromScalableVector(VT, V, DAG, Subtarget); + return V; +} + SDValue RISCVTargetLowering::lowerFixedLengthVectorFCOPYSIGNToRVV( SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index c11b1464757c7..cc8a18d908810 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -959,6 +959,9 @@ class RISCVTargetLowering : public TargetLowering { SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerScalarCTPOP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVectorCTPOP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll index 455e6e54c9b39..1eaf91096336f 100644 --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -53,28 +53,77 @@ define i8 @test_cttz_i8(i8 %a) nounwind { ; RV32_NOZBB-NEXT: li a0, 8 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_cttz_i8: -; RV64NOZBB: # %bb.0: -; RV64NOZBB-NEXT: andi a1, a0, 255 -; RV64NOZBB-NEXT: beqz a1, .LBB0_2 -; RV64NOZBB-NEXT: # %bb.1: # %cond.false -; RV64NOZBB-NEXT: addi a1, a0, -1 -; RV64NOZBB-NEXT: not a0, a0 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: srli a1, a0, 1 -; RV64NOZBB-NEXT: andi a1, a1, 85 -; RV64NOZBB-NEXT: subw a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 51 -; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: andi a0, a0, 51 -; RV64NOZBB-NEXT: add a0, a1, a0 -; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a0, a0, 15 -; RV64NOZBB-NEXT: ret -; RV64NOZBB-NEXT: .LBB0_2: -; RV64NOZBB-NEXT: li a0, 8 -; RV64NOZBB-NEXT: ret +; RV64I-LABEL: test_cttz_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: beqz a1, .LBB0_2 +; RV64I-NEXT: # %bb.1: # %cond.false +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: andi a3, a0, 255 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: andi a0, a0, 85 +; RV64I-NEXT: sub a3, a3, a0 +; RV64I-NEXT: and a0, a3, a2 +; RV64I-NEXT: srli a3, a3, 2 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi 
sp, sp, 16 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB0_2: +; RV64I-NEXT: li a0, 8 +; RV64I-NEXT: ret +; +; RV64M-LABEL: test_cttz_i8: +; RV64M: # %bb.0: +; RV64M-NEXT: andi a1, a0, 255 +; RV64M-NEXT: beqz a1, .LBB0_2 +; RV64M-NEXT: # %bb.1: # %cond.false +; RV64M-NEXT: lui a1, 61681 +; RV64M-NEXT: addiw a1, a1, -241 +; RV64M-NEXT: slli a2, a1, 32 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: addi a3, a0, -1 +; RV64M-NEXT: not a0, a0 +; RV64M-NEXT: and a0, a0, a3 +; RV64M-NEXT: andi a3, a0, 255 +; RV64M-NEXT: srli a0, a0, 1 +; RV64M-NEXT: andi a0, a0, 85 +; RV64M-NEXT: sub a3, a3, a0 +; RV64M-NEXT: and a0, a3, a2 +; RV64M-NEXT: srli a3, a3, 2 +; RV64M-NEXT: and a2, a3, a2 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: and a0, a0, a1 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 +; RV64M-NEXT: mul a0, a0, a1 +; RV64M-NEXT: srli a0, a0, 56 +; RV64M-NEXT: ret +; RV64M-NEXT: .LBB0_2: +; RV64M-NEXT: li a0, 8 +; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_cttz_i8: ; RV32ZBB: # %bb.0: @@ -154,35 +203,83 @@ define i16 @test_cttz_i16(i16 %a) nounwind { ; RV32_NOZBB-NEXT: li a0, 16 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_cttz_i16: -; RV64NOZBB: # %bb.0: -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: beqz a1, .LBB1_2 -; RV64NOZBB-NEXT: # %bb.1: # %cond.false -; RV64NOZBB-NEXT: addi a1, a0, -1 -; RV64NOZBB-NEXT: not a0, a0 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: srli a1, a0, 1 -; RV64NOZBB-NEXT: lui a2, 5 -; RV64NOZBB-NEXT: addiw a2, a2, 1365 -; RV64NOZBB-NEXT: and a1, a1, a2 -; RV64NOZBB-NEXT: sub a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 3 -; RV64NOZBB-NEXT: addiw a1, a1, 819 -; RV64NOZBB-NEXT: and a2, a0, a1 -; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: add a0, a2, a0 -; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 15 -; RV64NOZBB-NEXT: slli a0, a0, 52 -; RV64NOZBB-NEXT: srli a0, a0, 60 -; RV64NOZBB-NEXT: add a0, a1, a0 -; RV64NOZBB-NEXT: ret -; RV64NOZBB-NEXT: .LBB1_2: -; RV64NOZBB-NEXT: li a0, 16 -; RV64NOZBB-NEXT: ret +; RV64I-LABEL: test_cttz_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: beqz a1, .LBB1_2 +; RV64I-NEXT: # %bb.1: # %cond.false +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 5 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srli a0, a0, 48 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB1_2: +; RV64I-NEXT: li a0, 16 +; RV64I-NEXT: ret +; +; RV64M-LABEL: test_cttz_i16: +; RV64M: # %bb.0: +; RV64M-NEXT: slli a1, a0, 48 +; RV64M-NEXT: beqz a1, .LBB1_2 +; RV64M-NEXT: # %bb.1: # %cond.false 
+; RV64M-NEXT: lui a1, 61681 +; RV64M-NEXT: addiw a1, a1, -241 +; RV64M-NEXT: slli a2, a1, 32 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: addi a3, a0, -1 +; RV64M-NEXT: not a0, a0 +; RV64M-NEXT: and a0, a0, a3 +; RV64M-NEXT: srli a3, a0, 1 +; RV64M-NEXT: lui a4, 5 +; RV64M-NEXT: addiw a4, a4, 1365 +; RV64M-NEXT: and a3, a3, a4 +; RV64M-NEXT: slli a0, a0, 48 +; RV64M-NEXT: srli a0, a0, 48 +; RV64M-NEXT: sub a0, a0, a3 +; RV64M-NEXT: and a3, a0, a2 +; RV64M-NEXT: srli a0, a0, 2 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: add a0, a3, a0 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: and a0, a0, a1 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 +; RV64M-NEXT: mul a0, a0, a1 +; RV64M-NEXT: srli a0, a0, 56 +; RV64M-NEXT: ret +; RV64M-NEXT: .LBB1_2: +; RV64M-NEXT: li a0, 16 +; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_cttz_i16: ; RV32ZBB: # %bb.0: @@ -422,16 +519,33 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI3_0) -; RV64I-NEXT: ld a1, %lo(.LCPI3_0)(a1) +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srli a0, a0, 58 -; RV64I-NEXT: lui a1, %hi(.LCPI3_1) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI3_1) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -475,16 +589,33 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV64M: # %bb.0: ; RV64M-NEXT: beqz a0, .LBB3_2 ; RV64M-NEXT: # %bb.1: # %cond.false -; RV64M-NEXT: lui a1, %hi(.LCPI3_0) -; RV64M-NEXT: ld a1, %lo(.LCPI3_0)(a1) -; RV64M-NEXT: neg a2, a0 +; RV64M-NEXT: lui a1, 61681 +; RV64M-NEXT: addiw a1, a1, -241 +; RV64M-NEXT: slli a2, a1, 32 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: addi a3, a0, -1 +; RV64M-NEXT: not a0, a0 +; RV64M-NEXT: and a0, a0, a3 +; RV64M-NEXT: srli a3, a0, 1 +; RV64M-NEXT: lui a4, 349525 +; RV64M-NEXT: addiw a4, a4, 1365 +; RV64M-NEXT: slli a5, a4, 32 +; RV64M-NEXT: add a4, a4, a5 +; RV64M-NEXT: and a3, a3, a4 +; RV64M-NEXT: sub a0, a0, a3 +; RV64M-NEXT: and a3, a0, a2 +; RV64M-NEXT: srli a0, a0, 2 ; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: add a0, a3, a0 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: and a0, a0, a1 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 ; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: srli a0, a0, 58 -; RV64M-NEXT: lui a1, %hi(.LCPI3_1) -; RV64M-NEXT: addi a1, a1, %lo(.LCPI3_1) -; RV64M-NEXT: add a0, a1, a0 -; RV64M-NEXT: lbu a0, 0(a0) +; RV64M-NEXT: srli a0, 
a0, 56 ; RV64M-NEXT: ret ; RV64M-NEXT: .LBB3_2: ; RV64M-NEXT: li a0, 64 @@ -565,22 +696,65 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind { ; RV32_NOZBB-NEXT: andi a0, a0, 15 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_cttz_i8_zero_undef: -; RV64NOZBB: # %bb.0: -; RV64NOZBB-NEXT: addi a1, a0, -1 -; RV64NOZBB-NEXT: not a0, a0 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: srli a1, a0, 1 -; RV64NOZBB-NEXT: andi a1, a1, 85 -; RV64NOZBB-NEXT: subw a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 51 -; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: andi a0, a0, 51 -; RV64NOZBB-NEXT: add a0, a1, a0 -; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a0, a0, 15 -; RV64NOZBB-NEXT: ret +; RV64I-LABEL: test_cttz_i8_zero_undef: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: andi a3, a0, 255 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: andi a0, a0, 85 +; RV64I-NEXT: sub a3, a3, a0 +; RV64I-NEXT: and a0, a3, a2 +; RV64I-NEXT: srli a3, a3, 2 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64M-LABEL: test_cttz_i8_zero_undef: +; RV64M: # %bb.0: +; RV64M-NEXT: lui a1, 61681 +; RV64M-NEXT: addiw a1, a1, -241 +; RV64M-NEXT: slli a2, a1, 32 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: addi a3, a0, -1 +; RV64M-NEXT: not a0, a0 +; RV64M-NEXT: and a0, a0, a3 +; RV64M-NEXT: andi a3, a0, 255 +; RV64M-NEXT: srli a0, a0, 1 +; RV64M-NEXT: andi a0, a0, 85 +; RV64M-NEXT: sub a3, a3, a0 +; RV64M-NEXT: and a0, a3, a2 +; RV64M-NEXT: srli a3, a3, 2 +; RV64M-NEXT: and a2, a3, a2 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: and a0, a0, a1 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 +; RV64M-NEXT: mul a0, a0, a1 +; RV64M-NEXT: srli a0, a0, 56 +; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_cttz_i8_zero_undef: ; RV32ZBB: # %bb.0: @@ -640,29 +814,71 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { ; RV32_NOZBB-NEXT: add a0, a1, a0 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_cttz_i16_zero_undef: -; RV64NOZBB: # %bb.0: -; RV64NOZBB-NEXT: addi a1, a0, -1 -; RV64NOZBB-NEXT: not a0, a0 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: srli a1, a0, 1 -; RV64NOZBB-NEXT: lui a2, 5 -; RV64NOZBB-NEXT: addiw a2, a2, 1365 -; RV64NOZBB-NEXT: and a1, a1, a2 -; RV64NOZBB-NEXT: sub a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 3 -; RV64NOZBB-NEXT: addiw a1, a1, 819 -; RV64NOZBB-NEXT: and a2, a0, a1 -; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: add a0, a2, a0 -; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 15 -; RV64NOZBB-NEXT: slli a0, a0, 52 -; RV64NOZBB-NEXT: srli a0, a0, 60 -; RV64NOZBB-NEXT: add a0, a1, a0 -; RV64NOZBB-NEXT: ret +; RV64I-LABEL: test_cttz_i16_zero_undef: +; RV64I: # %bb.0: +; 
RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 5 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srli a0, a0, 48 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64M-LABEL: test_cttz_i16_zero_undef: +; RV64M: # %bb.0: +; RV64M-NEXT: lui a1, 61681 +; RV64M-NEXT: addiw a1, a1, -241 +; RV64M-NEXT: slli a2, a1, 32 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: addi a3, a0, -1 +; RV64M-NEXT: not a0, a0 +; RV64M-NEXT: and a0, a0, a3 +; RV64M-NEXT: srli a3, a0, 1 +; RV64M-NEXT: lui a4, 5 +; RV64M-NEXT: addiw a4, a4, 1365 +; RV64M-NEXT: and a3, a3, a4 +; RV64M-NEXT: slli a0, a0, 48 +; RV64M-NEXT: srli a0, a0, 48 +; RV64M-NEXT: sub a0, a0, a3 +; RV64M-NEXT: and a3, a0, a2 +; RV64M-NEXT: srli a0, a0, 2 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: add a0, a3, a0 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: and a0, a0, a1 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 +; RV64M-NEXT: mul a0, a0, a1 +; RV64M-NEXT: srli a0, a0, 56 +; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_cttz_i16_zero_undef: ; RV32ZBB: # %bb.0: @@ -846,16 +1062,33 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI7_0) -; RV64I-NEXT: ld a1, %lo(.LCPI7_0)(a1) +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srli a0, a0, 58 -; RV64I-NEXT: lui a1, %hi(.LCPI7_1) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI7_1) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -889,16 +1122,33 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; ; RV64M-LABEL: test_cttz_i64_zero_undef: ; RV64M: # %bb.0: -; RV64M-NEXT: lui a1, %hi(.LCPI7_0) -; RV64M-NEXT: ld a1, %lo(.LCPI7_0)(a1) -; RV64M-NEXT: neg a2, 
a0 +; RV64M-NEXT: lui a1, 61681 +; RV64M-NEXT: addiw a1, a1, -241 +; RV64M-NEXT: slli a2, a1, 32 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: addi a3, a0, -1 +; RV64M-NEXT: not a0, a0 +; RV64M-NEXT: and a0, a0, a3 +; RV64M-NEXT: srli a3, a0, 1 +; RV64M-NEXT: lui a4, 349525 +; RV64M-NEXT: addiw a4, a4, 1365 +; RV64M-NEXT: slli a5, a4, 32 +; RV64M-NEXT: add a4, a4, a5 +; RV64M-NEXT: and a3, a3, a4 +; RV64M-NEXT: sub a0, a0, a3 +; RV64M-NEXT: and a3, a0, a2 +; RV64M-NEXT: srli a0, a0, 2 ; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: add a0, a3, a0 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: and a0, a0, a1 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 ; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: srli a0, a0, 58 -; RV64M-NEXT: lui a1, %hi(.LCPI7_1) -; RV64M-NEXT: addi a1, a1, %lo(.LCPI7_1) -; RV64M-NEXT: add a0, a1, a0 -; RV64M-NEXT: lbu a0, 0(a0) +; RV64M-NEXT: srli a0, a0, 56 ; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_cttz_i64_zero_undef: @@ -984,35 +1234,91 @@ define i8 @test_ctlz_i8(i8 %a) nounwind { ; RV32_NOZBB-NEXT: li a0, 8 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_ctlz_i8: -; RV64NOZBB: # %bb.0: -; RV64NOZBB-NEXT: andi a1, a0, 255 -; RV64NOZBB-NEXT: beqz a1, .LBB8_2 -; RV64NOZBB-NEXT: # %bb.1: # %cond.false -; RV64NOZBB-NEXT: slli a1, a0, 56 -; RV64NOZBB-NEXT: srli a1, a1, 57 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 56 -; RV64NOZBB-NEXT: srli a1, a1, 58 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 56 -; RV64NOZBB-NEXT: srli a1, a1, 60 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: not a0, a0 -; RV64NOZBB-NEXT: srli a1, a0, 1 -; RV64NOZBB-NEXT: andi a1, a1, 85 -; RV64NOZBB-NEXT: subw a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 51 -; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: andi a0, a0, 51 -; RV64NOZBB-NEXT: add a0, a1, a0 -; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a0, a0, 15 -; RV64NOZBB-NEXT: ret -; RV64NOZBB-NEXT: .LBB8_2: -; RV64NOZBB-NEXT: li a0, 8 -; RV64NOZBB-NEXT: ret +; RV64I-LABEL: test_ctlz_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: beqz a1, .LBB8_2 +; RV64I-NEXT: # %bb.1: # %cond.false +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srli a1, a1, 57 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srli a1, a1, 58 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srli a1, a1, 60 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: andi a0, a0, 85 +; RV64I-NEXT: sub a1, a1, a0 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: slli a2, a0, 32 +; RV64I-NEXT: add a2, a0, a2 +; RV64I-NEXT: slli a0, a2, 2 +; RV64I-NEXT: xor a0, a0, a2 +; RV64I-NEXT: and a3, a1, a0 +; RV64I-NEXT: srli a1, a1, 2 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a1, a2, 3 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB8_2: +; RV64I-NEXT: li a0, 8 +; RV64I-NEXT: ret +; +; RV64M-LABEL: test_ctlz_i8: +; RV64M: # %bb.0: +; RV64M-NEXT: andi a1, a0, 255 +; 
RV64M-NEXT: beqz a1, .LBB8_2 +; RV64M-NEXT: # %bb.1: # %cond.false +; RV64M-NEXT: slli a1, a0, 56 +; RV64M-NEXT: srli a1, a1, 57 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: slli a1, a0, 56 +; RV64M-NEXT: srli a1, a1, 58 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: slli a1, a0, 56 +; RV64M-NEXT: srli a1, a1, 60 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: not a0, a0 +; RV64M-NEXT: andi a1, a0, 255 +; RV64M-NEXT: srli a0, a0, 1 +; RV64M-NEXT: andi a0, a0, 85 +; RV64M-NEXT: sub a1, a1, a0 +; RV64M-NEXT: lui a0, 61681 +; RV64M-NEXT: addiw a0, a0, -241 +; RV64M-NEXT: slli a2, a0, 32 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: slli a2, a0, 2 +; RV64M-NEXT: xor a2, a2, a0 +; RV64M-NEXT: and a3, a1, a2 +; RV64M-NEXT: srli a1, a1, 2 +; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: add a1, a3, a1 +; RV64M-NEXT: srli a2, a1, 4 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: and a1, a1, a0 +; RV64M-NEXT: srli a2, a0, 3 +; RV64M-NEXT: and a0, a2, a0 +; RV64M-NEXT: mul a0, a1, a0 +; RV64M-NEXT: srli a0, a0, 56 +; RV64M-NEXT: ret +; RV64M-NEXT: .LBB8_2: +; RV64M-NEXT: li a0, 8 +; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_ctlz_i8: ; RV32ZBB: # %bb.0: @@ -1085,44 +1391,103 @@ define i16 @test_ctlz_i16(i16 %a) nounwind { ; RV32_NOZBB-NEXT: li a0, 16 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_ctlz_i16: -; RV64NOZBB: # %bb.0: -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: beqz a1, .LBB9_2 -; RV64NOZBB-NEXT: # %bb.1: # %cond.false -; RV64NOZBB-NEXT: srli a1, a1, 49 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: srli a1, a1, 50 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: srli a1, a1, 52 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: srli a1, a1, 56 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: not a0, a0 -; RV64NOZBB-NEXT: srli a1, a0, 1 -; RV64NOZBB-NEXT: lui a2, 5 -; RV64NOZBB-NEXT: addiw a2, a2, 1365 -; RV64NOZBB-NEXT: and a1, a1, a2 -; RV64NOZBB-NEXT: sub a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 3 -; RV64NOZBB-NEXT: addiw a1, a1, 819 -; RV64NOZBB-NEXT: and a2, a0, a1 -; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: add a0, a2, a0 -; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 15 -; RV64NOZBB-NEXT: slli a0, a0, 52 -; RV64NOZBB-NEXT: srli a0, a0, 60 -; RV64NOZBB-NEXT: add a0, a1, a0 -; RV64NOZBB-NEXT: ret -; RV64NOZBB-NEXT: .LBB9_2: -; RV64NOZBB-NEXT: li a0, 16 -; RV64NOZBB-NEXT: ret +; RV64I-LABEL: test_ctlz_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: beqz a1, .LBB9_2 +; RV64I-NEXT: # %bb.1: # %cond.false +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: srli a1, a1, 49 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srli a1, a1, 50 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srli a1, a1, 52 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srli a1, a1, 56 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: lui a1, 16 +; RV64I-NEXT: addiw a1, a1, -1 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: lui a2, 5 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: sub a1, a1, a0 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: slli a2, a0, 32 +; RV64I-NEXT: add a2, a0, a2 +; RV64I-NEXT: slli a0, a2, 2 +; RV64I-NEXT: xor a0, a0, a2 +; RV64I-NEXT: 
and a3, a1, a0 +; RV64I-NEXT: srli a1, a1, 2 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a1, a2, 3 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB9_2: +; RV64I-NEXT: li a0, 16 +; RV64I-NEXT: ret +; +; RV64M-LABEL: test_ctlz_i16: +; RV64M: # %bb.0: +; RV64M-NEXT: slli a1, a0, 48 +; RV64M-NEXT: beqz a1, .LBB9_2 +; RV64M-NEXT: # %bb.1: # %cond.false +; RV64M-NEXT: srli a1, a1, 49 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: slli a1, a0, 48 +; RV64M-NEXT: srli a1, a1, 50 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: slli a1, a0, 48 +; RV64M-NEXT: srli a1, a1, 52 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: slli a1, a0, 48 +; RV64M-NEXT: srli a1, a1, 56 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: not a0, a0 +; RV64M-NEXT: lui a1, 16 +; RV64M-NEXT: addiw a1, a1, -1 +; RV64M-NEXT: and a1, a0, a1 +; RV64M-NEXT: srli a0, a0, 1 +; RV64M-NEXT: lui a2, 5 +; RV64M-NEXT: addiw a2, a2, 1365 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: sub a1, a1, a0 +; RV64M-NEXT: lui a0, 61681 +; RV64M-NEXT: addiw a0, a0, -241 +; RV64M-NEXT: slli a2, a0, 32 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: slli a2, a0, 2 +; RV64M-NEXT: xor a2, a2, a0 +; RV64M-NEXT: and a3, a1, a2 +; RV64M-NEXT: srli a1, a1, 2 +; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: add a1, a3, a1 +; RV64M-NEXT: srli a2, a1, 4 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: and a1, a1, a0 +; RV64M-NEXT: srli a2, a0, 3 +; RV64M-NEXT: and a0, a2, a0 +; RV64M-NEXT: mul a0, a1, a0 +; RV64M-NEXT: srli a0, a0, 56 +; RV64M-NEXT: ret +; RV64M-NEXT: .LBB9_2: +; RV64M-NEXT: li a0, 16 +; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_ctlz_i16: ; RV32ZBB: # %bb.0: @@ -1222,22 +1587,26 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -1305,22 +1674,26 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; RV64M-NEXT: lui a2, 349525 ; RV64M-NEXT: addiw a2, a2, 1365 ; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: slli a0, a0, 32 +; RV64M-NEXT: srli a0, a0, 32 ; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: and a2, a0, a1 -; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: add a0, a2, a0 -; RV64M-NEXT: srli a1, a0, 4 -; 
RV64M-NEXT: add a0, a0, a1 ; RV64M-NEXT: lui a1, 61681 -; RV64M-NEXT: addi a1, a1, -241 +; RV64M-NEXT: addiw a1, a1, -241 +; RV64M-NEXT: slli a2, a1, 32 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: and a3, a0, a2 +; RV64M-NEXT: srli a0, a0, 2 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: add a0, a3, a0 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 ; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: lui a1, 4112 -; RV64M-NEXT: addi a1, a1, 257 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 ; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: srliw a0, a0, 24 +; RV64M-NEXT: srli a0, a0, 56 ; RV64M-NEXT: ret ; RV64M-NEXT: .LBB10_2: ; RV64M-NEXT: li a0, 32 @@ -1466,25 +1839,21 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: add a2, a2, a3 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -1583,25 +1952,21 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV64M-NEXT: add a2, a2, a3 ; RV64M-NEXT: and a1, a1, a2 ; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 -; RV64M-NEXT: and a2, a0, a1 -; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: add a0, a2, a0 -; RV64M-NEXT: srli a1, a0, 4 -; RV64M-NEXT: add a0, a0, a1 ; RV64M-NEXT: lui a1, 61681 ; RV64M-NEXT: addiw a1, a1, -241 ; RV64M-NEXT: slli a2, a1, 32 ; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: and a3, a0, a2 +; RV64M-NEXT: srli a0, a0, 2 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: add a0, a3, a0 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 ; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: lui a1, 4112 -; RV64M-NEXT: addiw a1, a1, 257 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 ; RV64M-NEXT: mul a0, a0, a1 ; RV64M-NEXT: srli a0, a0, 56 ; RV64M-NEXT: ret @@ -1673,29 +2038,79 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind { ; RV32_NOZBB-NEXT: andi a0, a0, 15 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_ctlz_i8_zero_undef: -; RV64NOZBB: # %bb.0: -; RV64NOZBB-NEXT: slli a1, a0, 56 -; RV64NOZBB-NEXT: srli a1, a1, 57 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 56 -; RV64NOZBB-NEXT: srli a1, a1, 58 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 56 -; RV64NOZBB-NEXT: srli a1, a1, 60 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: not a0, a0 -; RV64NOZBB-NEXT: srli a1, a0, 1 
-; RV64NOZBB-NEXT: andi a1, a1, 85 -; RV64NOZBB-NEXT: subw a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 51 -; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: andi a0, a0, 51 -; RV64NOZBB-NEXT: add a0, a1, a0 -; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a0, a0, 15 -; RV64NOZBB-NEXT: ret +; RV64I-LABEL: test_ctlz_i8_zero_undef: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srli a1, a1, 57 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srli a1, a1, 58 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srli a1, a1, 60 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: andi a0, a0, 85 +; RV64I-NEXT: sub a1, a1, a0 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: slli a2, a0, 32 +; RV64I-NEXT: add a2, a0, a2 +; RV64I-NEXT: slli a0, a2, 2 +; RV64I-NEXT: xor a0, a0, a2 +; RV64I-NEXT: and a3, a1, a0 +; RV64I-NEXT: srli a1, a1, 2 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a1, a2, 3 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64M-LABEL: test_ctlz_i8_zero_undef: +; RV64M: # %bb.0: +; RV64M-NEXT: slli a1, a0, 56 +; RV64M-NEXT: srli a1, a1, 57 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: slli a1, a0, 56 +; RV64M-NEXT: srli a1, a1, 58 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: slli a1, a0, 56 +; RV64M-NEXT: srli a1, a1, 60 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: not a0, a0 +; RV64M-NEXT: andi a1, a0, 255 +; RV64M-NEXT: srli a0, a0, 1 +; RV64M-NEXT: andi a0, a0, 85 +; RV64M-NEXT: sub a1, a1, a0 +; RV64M-NEXT: lui a0, 61681 +; RV64M-NEXT: addiw a0, a0, -241 +; RV64M-NEXT: slli a2, a0, 32 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: slli a2, a0, 2 +; RV64M-NEXT: xor a2, a2, a0 +; RV64M-NEXT: and a3, a1, a2 +; RV64M-NEXT: srli a1, a1, 2 +; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: add a1, a3, a1 +; RV64M-NEXT: srli a2, a1, 4 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: and a1, a1, a0 +; RV64M-NEXT: srli a2, a0, 3 +; RV64M-NEXT: and a0, a2, a0 +; RV64M-NEXT: mul a0, a1, a0 +; RV64M-NEXT: srli a0, a0, 56 +; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_ctlz_i8_zero_undef: ; RV32ZBB: # %bb.0: @@ -1763,39 +2178,93 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { ; RV32_NOZBB-NEXT: add a0, a1, a0 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef: -; RV64NOZBB: # %bb.0: -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: srli a1, a1, 49 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: srli a1, a1, 50 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: srli a1, a1, 52 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: slli a1, a0, 48 -; RV64NOZBB-NEXT: srli a1, a1, 56 -; RV64NOZBB-NEXT: or a0, a0, a1 -; RV64NOZBB-NEXT: not a0, a0 -; RV64NOZBB-NEXT: srli a1, a0, 1 -; RV64NOZBB-NEXT: lui a2, 5 -; RV64NOZBB-NEXT: addiw a2, a2, 1365 -; RV64NOZBB-NEXT: and a1, a1, a2 -; RV64NOZBB-NEXT: sub a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 3 -; RV64NOZBB-NEXT: addiw a1, a1, 819 -; RV64NOZBB-NEXT: and a2, a0, a1 -; RV64NOZBB-NEXT: srli a0, a0, 2 -; 
RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: add a0, a2, a0 -; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 15 -; RV64NOZBB-NEXT: slli a0, a0, 52 -; RV64NOZBB-NEXT: srli a0, a0, 60 -; RV64NOZBB-NEXT: add a0, a1, a0 -; RV64NOZBB-NEXT: ret +; RV64I-LABEL: test_ctlz_i16_zero_undef: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srli a1, a1, 49 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srli a1, a1, 50 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srli a1, a1, 52 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srli a1, a1, 56 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: lui a1, 16 +; RV64I-NEXT: addiw a1, a1, -1 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: lui a2, 5 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: sub a1, a1, a0 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: slli a2, a0, 32 +; RV64I-NEXT: add a2, a0, a2 +; RV64I-NEXT: slli a0, a2, 2 +; RV64I-NEXT: xor a0, a0, a2 +; RV64I-NEXT: and a3, a1, a0 +; RV64I-NEXT: srli a1, a1, 2 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a1, a2, 3 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64M-LABEL: test_ctlz_i16_zero_undef: +; RV64M: # %bb.0: +; RV64M-NEXT: slli a1, a0, 48 +; RV64M-NEXT: srli a1, a1, 49 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: slli a1, a0, 48 +; RV64M-NEXT: srli a1, a1, 50 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: slli a1, a0, 48 +; RV64M-NEXT: srli a1, a1, 52 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: slli a1, a0, 48 +; RV64M-NEXT: srli a1, a1, 56 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: not a0, a0 +; RV64M-NEXT: lui a1, 16 +; RV64M-NEXT: addiw a1, a1, -1 +; RV64M-NEXT: and a1, a0, a1 +; RV64M-NEXT: srli a0, a0, 1 +; RV64M-NEXT: lui a2, 5 +; RV64M-NEXT: addiw a2, a2, 1365 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: sub a1, a1, a0 +; RV64M-NEXT: lui a0, 61681 +; RV64M-NEXT: addiw a0, a0, -241 +; RV64M-NEXT: slli a2, a0, 32 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: slli a2, a0, 2 +; RV64M-NEXT: xor a2, a2, a0 +; RV64M-NEXT: and a3, a1, a2 +; RV64M-NEXT: srli a1, a1, 2 +; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: add a1, a3, a1 +; RV64M-NEXT: srli a2, a1, 4 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: and a1, a1, a0 +; RV64M-NEXT: srli a2, a0, 3 +; RV64M-NEXT: and a0, a2, a0 +; RV64M-NEXT: mul a0, a1, a0 +; RV64M-NEXT: srli a0, a0, 56 +; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_ctlz_i16_zero_undef: ; RV32ZBB: # %bb.0: @@ -1887,22 +2356,26 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: 
slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -1959,22 +2432,26 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV64M-NEXT: lui a2, 349525 ; RV64M-NEXT: addiw a2, a2, 1365 ; RV64M-NEXT: and a1, a1, a2 +; RV64M-NEXT: slli a0, a0, 32 +; RV64M-NEXT: srli a0, a0, 32 ; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: and a2, a0, a1 -; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: add a0, a2, a0 -; RV64M-NEXT: srli a1, a0, 4 -; RV64M-NEXT: add a0, a0, a1 ; RV64M-NEXT: lui a1, 61681 -; RV64M-NEXT: addi a1, a1, -241 +; RV64M-NEXT: addiw a1, a1, -241 +; RV64M-NEXT: slli a2, a1, 32 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: and a3, a0, a2 +; RV64M-NEXT: srli a0, a0, 2 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: add a0, a3, a0 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 ; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: lui a1, 4112 -; RV64M-NEXT: addi a1, a1, 257 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 ; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: srliw a0, a0, 24 +; RV64M-NEXT: srli a0, a0, 56 ; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_ctlz_i32_zero_undef: @@ -2115,25 +2592,21 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV64I-NEXT: add a2, a2, a3 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -2227,25 +2700,21 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV64M-NEXT: add a2, a2, a3 ; RV64M-NEXT: and a1, a1, a2 ; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 -; RV64M-NEXT: and a2, a0, a1 -; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: add a0, a2, a0 -; RV64M-NEXT: srli a1, a0, 4 -; RV64M-NEXT: add a0, a0, a1 ; RV64M-NEXT: lui a1, 61681 ; RV64M-NEXT: addiw a1, a1, -241 ; RV64M-NEXT: slli a2, a1, 32 ; 
RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: and a3, a0, a2 +; RV64M-NEXT: srli a0, a0, 2 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: add a0, a3, a0 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 ; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: lui a1, 4112 -; RV64M-NEXT: addiw a1, a1, 257 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 ; RV64M-NEXT: mul a0, a0, a1 ; RV64M-NEXT: srli a0, a0, 56 ; RV64M-NEXT: ret @@ -2304,19 +2773,59 @@ define i8 @test_ctpop_i8(i8 %a) nounwind { ; RV32_NOZBB-NEXT: andi a0, a0, 15 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_ctpop_i8: -; RV64NOZBB: # %bb.0: -; RV64NOZBB-NEXT: srli a1, a0, 1 -; RV64NOZBB-NEXT: andi a1, a1, 85 -; RV64NOZBB-NEXT: subw a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 51 -; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: andi a0, a0, 51 -; RV64NOZBB-NEXT: add a0, a1, a0 -; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a0, a0, 15 -; RV64NOZBB-NEXT: ret +; RV64I-LABEL: test_ctpop_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: andi a3, a0, 255 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: andi a0, a0, 85 +; RV64I-NEXT: sub a3, a3, a0 +; RV64I-NEXT: srli a0, a3, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64M-LABEL: test_ctpop_i8: +; RV64M: # %bb.0: +; RV64M-NEXT: lui a1, 61681 +; RV64M-NEXT: addiw a1, a1, -241 +; RV64M-NEXT: slli a2, a1, 32 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: andi a3, a0, 255 +; RV64M-NEXT: srli a0, a0, 1 +; RV64M-NEXT: andi a0, a0, 85 +; RV64M-NEXT: sub a3, a3, a0 +; RV64M-NEXT: srli a0, a3, 2 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: and a2, a3, a2 +; RV64M-NEXT: add a0, a2, a0 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: and a0, a0, a1 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 +; RV64M-NEXT: mul a0, a0, a1 +; RV64M-NEXT: srli a0, a0, 56 +; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_ctpop_i8: ; RV32ZBB: # %bb.0: @@ -2346,16 +2855,31 @@ define i8 @test_ctpop_i8(i8 %a) nounwind { ; ; RV64XTHEADBB-LABEL: test_ctpop_i8: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: srli a1, a0, 1 -; RV64XTHEADBB-NEXT: andi a1, a1, 85 -; RV64XTHEADBB-NEXT: subw a0, a0, a1 -; RV64XTHEADBB-NEXT: andi a1, a0, 51 -; RV64XTHEADBB-NEXT: srli a0, a0, 2 -; RV64XTHEADBB-NEXT: andi a0, a0, 51 -; RV64XTHEADBB-NEXT: add a0, a1, a0 -; RV64XTHEADBB-NEXT: srli a1, a0, 4 -; RV64XTHEADBB-NEXT: add a0, a0, a1 -; RV64XTHEADBB-NEXT: andi a0, a0, 15 +; RV64XTHEADBB-NEXT: addi sp, sp, -16 +; RV64XTHEADBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64XTHEADBB-NEXT: lui a1, 61681 +; RV64XTHEADBB-NEXT: addiw a1, a1, -241 +; RV64XTHEADBB-NEXT: slli a2, a1, 32 +; RV64XTHEADBB-NEXT: add a1, a1, a2 +; RV64XTHEADBB-NEXT: slli a2, a1, 2 +; RV64XTHEADBB-NEXT: 
xor a2, a2, a1 +; RV64XTHEADBB-NEXT: andi a3, a0, 255 +; RV64XTHEADBB-NEXT: srli a0, a0, 1 +; RV64XTHEADBB-NEXT: andi a0, a0, 85 +; RV64XTHEADBB-NEXT: sub a3, a3, a0 +; RV64XTHEADBB-NEXT: srli a0, a3, 2 +; RV64XTHEADBB-NEXT: and a0, a0, a2 +; RV64XTHEADBB-NEXT: and a2, a3, a2 +; RV64XTHEADBB-NEXT: add a0, a2, a0 +; RV64XTHEADBB-NEXT: srli a2, a0, 4 +; RV64XTHEADBB-NEXT: add a0, a0, a2 +; RV64XTHEADBB-NEXT: and a0, a0, a1 +; RV64XTHEADBB-NEXT: srli a2, a1, 3 +; RV64XTHEADBB-NEXT: and a1, a2, a1 +; RV64XTHEADBB-NEXT: call __muldi3 +; RV64XTHEADBB-NEXT: srli a0, a0, 56 +; RV64XTHEADBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64XTHEADBB-NEXT: addi sp, sp, 16 ; RV64XTHEADBB-NEXT: ret %1 = call i8 @llvm.ctpop.i8(i8 %a) ret i8 %1 @@ -2383,26 +2907,65 @@ define i16 @test_ctpop_i16(i16 %a) nounwind { ; RV32_NOZBB-NEXT: add a0, a1, a0 ; RV32_NOZBB-NEXT: ret ; -; RV64NOZBB-LABEL: test_ctpop_i16: -; RV64NOZBB: # %bb.0: -; RV64NOZBB-NEXT: srli a1, a0, 1 -; RV64NOZBB-NEXT: lui a2, 5 -; RV64NOZBB-NEXT: addiw a2, a2, 1365 -; RV64NOZBB-NEXT: and a1, a1, a2 -; RV64NOZBB-NEXT: sub a0, a0, a1 -; RV64NOZBB-NEXT: lui a1, 3 -; RV64NOZBB-NEXT: addiw a1, a1, 819 -; RV64NOZBB-NEXT: and a2, a0, a1 -; RV64NOZBB-NEXT: srli a0, a0, 2 -; RV64NOZBB-NEXT: and a0, a0, a1 -; RV64NOZBB-NEXT: add a0, a2, a0 -; RV64NOZBB-NEXT: srli a1, a0, 4 -; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 15 -; RV64NOZBB-NEXT: slli a0, a0, 52 -; RV64NOZBB-NEXT: srli a0, a0, 60 -; RV64NOZBB-NEXT: add a0, a1, a0 -; RV64NOZBB-NEXT: ret +; RV64I-LABEL: test_ctpop_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 5 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srli a0, a0, 48 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 2 +; RV64I-NEXT: and a3, a3, a2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a0, a3 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64M-LABEL: test_ctpop_i16: +; RV64M: # %bb.0: +; RV64M-NEXT: lui a1, 61681 +; RV64M-NEXT: addiw a1, a1, -241 +; RV64M-NEXT: slli a2, a1, 32 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: srli a3, a0, 1 +; RV64M-NEXT: lui a4, 5 +; RV64M-NEXT: addiw a4, a4, 1365 +; RV64M-NEXT: and a3, a3, a4 +; RV64M-NEXT: slli a0, a0, 48 +; RV64M-NEXT: srli a0, a0, 48 +; RV64M-NEXT: sub a0, a0, a3 +; RV64M-NEXT: srli a3, a0, 2 +; RV64M-NEXT: and a3, a3, a2 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: add a0, a0, a3 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 +; RV64M-NEXT: and a0, a0, a1 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 +; RV64M-NEXT: mul a0, a0, a1 +; RV64M-NEXT: srli a0, a0, 56 +; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_ctpop_i16: ; RV32ZBB: # %bb.0: @@ -2439,23 +3002,33 @@ define i16 @test_ctpop_i16(i16 %a) nounwind { ; ; RV64XTHEADBB-LABEL: test_ctpop_i16: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: srli a1, a0, 1 -; RV64XTHEADBB-NEXT: lui a2, 5 -; 
RV64XTHEADBB-NEXT: addiw a2, a2, 1365 -; RV64XTHEADBB-NEXT: and a1, a1, a2 -; RV64XTHEADBB-NEXT: sub a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 3 -; RV64XTHEADBB-NEXT: addiw a1, a1, 819 -; RV64XTHEADBB-NEXT: and a2, a0, a1 -; RV64XTHEADBB-NEXT: srli a0, a0, 2 +; RV64XTHEADBB-NEXT: addi sp, sp, -16 +; RV64XTHEADBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64XTHEADBB-NEXT: lui a1, 61681 +; RV64XTHEADBB-NEXT: addiw a1, a1, -241 +; RV64XTHEADBB-NEXT: slli a2, a1, 32 +; RV64XTHEADBB-NEXT: add a1, a1, a2 +; RV64XTHEADBB-NEXT: slli a2, a1, 2 +; RV64XTHEADBB-NEXT: xor a2, a2, a1 +; RV64XTHEADBB-NEXT: srli a3, a0, 1 +; RV64XTHEADBB-NEXT: lui a4, 5 +; RV64XTHEADBB-NEXT: addiw a4, a4, 1365 +; RV64XTHEADBB-NEXT: and a3, a3, a4 +; RV64XTHEADBB-NEXT: th.extu a0, a0, 15, 0 +; RV64XTHEADBB-NEXT: sub a0, a0, a3 +; RV64XTHEADBB-NEXT: srli a3, a0, 2 +; RV64XTHEADBB-NEXT: and a3, a3, a2 +; RV64XTHEADBB-NEXT: and a0, a0, a2 +; RV64XTHEADBB-NEXT: add a0, a0, a3 +; RV64XTHEADBB-NEXT: srli a2, a0, 4 +; RV64XTHEADBB-NEXT: add a0, a0, a2 ; RV64XTHEADBB-NEXT: and a0, a0, a1 -; RV64XTHEADBB-NEXT: add a0, a2, a0 -; RV64XTHEADBB-NEXT: srli a1, a0, 4 -; RV64XTHEADBB-NEXT: add a0, a0, a1 -; RV64XTHEADBB-NEXT: andi a1, a0, 15 -; RV64XTHEADBB-NEXT: slli a0, a0, 52 -; RV64XTHEADBB-NEXT: srli a0, a0, 60 -; RV64XTHEADBB-NEXT: add a0, a1, a0 +; RV64XTHEADBB-NEXT: srli a2, a1, 3 +; RV64XTHEADBB-NEXT: and a1, a2, a1 +; RV64XTHEADBB-NEXT: call __muldi3 +; RV64XTHEADBB-NEXT: srli a0, a0, 56 +; RV64XTHEADBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64XTHEADBB-NEXT: addi sp, sp, 16 ; RV64XTHEADBB-NEXT: ret %1 = call i16 @llvm.ctpop.i16(i16 %a) ret i16 %1 @@ -2494,26 +3067,30 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 2 +; RV64I-NEXT: and a3, a3, a2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a0, a3 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -2544,26 +3121,30 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; ; RV64M-LABEL: test_ctpop_i32: ; RV64M: # %bb.0: -; RV64M-NEXT: srli a1, a0, 1 -; RV64M-NEXT: lui a2, 349525 -; RV64M-NEXT: addiw a2, a2, 1365 -; RV64M-NEXT: and a1, a1, a2 -; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: and a2, a0, a1 -; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, 
a1 -; RV64M-NEXT: add a0, a2, a0 -; RV64M-NEXT: srli a1, a0, 4 -; RV64M-NEXT: add a0, a0, a1 ; RV64M-NEXT: lui a1, 61681 -; RV64M-NEXT: addi a1, a1, -241 +; RV64M-NEXT: addiw a1, a1, -241 +; RV64M-NEXT: slli a2, a1, 32 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: srli a3, a0, 1 +; RV64M-NEXT: lui a4, 349525 +; RV64M-NEXT: addiw a4, a4, 1365 +; RV64M-NEXT: and a3, a3, a4 +; RV64M-NEXT: slli a0, a0, 32 +; RV64M-NEXT: srli a0, a0, 32 +; RV64M-NEXT: sub a0, a0, a3 +; RV64M-NEXT: srli a3, a0, 2 +; RV64M-NEXT: and a3, a3, a2 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: add a0, a0, a3 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 ; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: lui a1, 4112 -; RV64M-NEXT: addi a1, a1, 257 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 ; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: srliw a0, a0, 24 +; RV64M-NEXT: srli a0, a0, 56 ; RV64M-NEXT: ret ; ; RV32ZBB-LABEL: test_ctpop_i32: @@ -2608,26 +3189,29 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: addi sp, sp, -16 ; RV64XTHEADBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64XTHEADBB-NEXT: srli a1, a0, 1 -; RV64XTHEADBB-NEXT: lui a2, 349525 -; RV64XTHEADBB-NEXT: addiw a2, a2, 1365 -; RV64XTHEADBB-NEXT: and a1, a1, a2 -; RV64XTHEADBB-NEXT: sub a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 209715 -; RV64XTHEADBB-NEXT: addiw a1, a1, 819 -; RV64XTHEADBB-NEXT: and a2, a0, a1 -; RV64XTHEADBB-NEXT: srli a0, a0, 2 -; RV64XTHEADBB-NEXT: and a0, a0, a1 -; RV64XTHEADBB-NEXT: add a0, a2, a0 -; RV64XTHEADBB-NEXT: srli a1, a0, 4 -; RV64XTHEADBB-NEXT: add a0, a0, a1 ; RV64XTHEADBB-NEXT: lui a1, 61681 ; RV64XTHEADBB-NEXT: addiw a1, a1, -241 +; RV64XTHEADBB-NEXT: slli a2, a1, 32 +; RV64XTHEADBB-NEXT: add a1, a1, a2 +; RV64XTHEADBB-NEXT: slli a2, a1, 2 +; RV64XTHEADBB-NEXT: xor a2, a2, a1 +; RV64XTHEADBB-NEXT: srli a3, a0, 1 +; RV64XTHEADBB-NEXT: lui a4, 349525 +; RV64XTHEADBB-NEXT: addiw a4, a4, 1365 +; RV64XTHEADBB-NEXT: and a3, a3, a4 +; RV64XTHEADBB-NEXT: th.extu a0, a0, 31, 0 +; RV64XTHEADBB-NEXT: sub a0, a0, a3 +; RV64XTHEADBB-NEXT: srli a3, a0, 2 +; RV64XTHEADBB-NEXT: and a3, a3, a2 +; RV64XTHEADBB-NEXT: and a0, a0, a2 +; RV64XTHEADBB-NEXT: add a0, a0, a3 +; RV64XTHEADBB-NEXT: srli a2, a0, 4 +; RV64XTHEADBB-NEXT: add a0, a0, a2 ; RV64XTHEADBB-NEXT: and a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 4112 -; RV64XTHEADBB-NEXT: addiw a1, a1, 257 +; RV64XTHEADBB-NEXT: srli a2, a1, 3 +; RV64XTHEADBB-NEXT: and a1, a2, a1 ; RV64XTHEADBB-NEXT: call __muldi3 -; RV64XTHEADBB-NEXT: srliw a0, a0, 24 +; RV64XTHEADBB-NEXT: srli a0, a0, 56 ; RV64XTHEADBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64XTHEADBB-NEXT: addi sp, sp, 16 ; RV64XTHEADBB-NEXT: ret @@ -2697,32 +3281,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; 
RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -2769,32 +3349,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; ; RV64M-LABEL: test_ctpop_i64: ; RV64M: # %bb.0: -; RV64M-NEXT: srli a1, a0, 1 -; RV64M-NEXT: lui a2, 349525 -; RV64M-NEXT: addiw a2, a2, 1365 -; RV64M-NEXT: slli a3, a2, 32 -; RV64M-NEXT: add a2, a2, a3 -; RV64M-NEXT: and a1, a1, a2 -; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: lui a1, 209715 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 -; RV64M-NEXT: and a2, a0, a1 -; RV64M-NEXT: srli a0, a0, 2 -; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: add a0, a2, a0 -; RV64M-NEXT: srli a1, a0, 4 -; RV64M-NEXT: add a0, a0, a1 ; RV64M-NEXT: lui a1, 61681 ; RV64M-NEXT: addiw a1, a1, -241 ; RV64M-NEXT: slli a2, a1, 32 ; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: xor a2, a2, a1 +; RV64M-NEXT: srli a3, a0, 1 +; RV64M-NEXT: lui a4, 349525 +; RV64M-NEXT: addiw a4, a4, 1365 +; RV64M-NEXT: slli a5, a4, 32 +; RV64M-NEXT: add a4, a4, a5 +; RV64M-NEXT: and a3, a3, a4 +; RV64M-NEXT: sub a0, a0, a3 +; RV64M-NEXT: and a3, a0, a2 +; RV64M-NEXT: srli a0, a0, 2 +; RV64M-NEXT: and a0, a0, a2 +; RV64M-NEXT: add a0, a3, a0 +; RV64M-NEXT: srli a2, a0, 4 +; RV64M-NEXT: add a0, a0, a2 ; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: lui a1, 4112 -; RV64M-NEXT: addiw a1, a1, 257 -; RV64M-NEXT: slli a2, a1, 32 -; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: srli a2, a1, 3 +; RV64M-NEXT: and a1, a2, a1 ; RV64M-NEXT: mul a0, a0, a1 ; RV64M-NEXT: srli a0, a0, 56 ; RV64M-NEXT: ret @@ -2873,32 +3449,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: addi sp, sp, -16 ; RV64XTHEADBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64XTHEADBB-NEXT: srli a1, a0, 1 -; RV64XTHEADBB-NEXT: lui a2, 349525 -; RV64XTHEADBB-NEXT: addiw a2, a2, 1365 -; RV64XTHEADBB-NEXT: slli a3, a2, 32 -; RV64XTHEADBB-NEXT: add a2, a2, a3 -; RV64XTHEADBB-NEXT: and a1, a1, a2 -; RV64XTHEADBB-NEXT: sub a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 209715 -; RV64XTHEADBB-NEXT: addiw a1, a1, 819 -; RV64XTHEADBB-NEXT: slli a2, a1, 32 -; RV64XTHEADBB-NEXT: add a1, a1, a2 -; RV64XTHEADBB-NEXT: and a2, a0, a1 -; RV64XTHEADBB-NEXT: srli a0, a0, 2 -; RV64XTHEADBB-NEXT: and a0, a0, a1 -; RV64XTHEADBB-NEXT: add a0, a2, a0 -; RV64XTHEADBB-NEXT: srli a1, a0, 4 -; RV64XTHEADBB-NEXT: add a0, a0, a1 ; RV64XTHEADBB-NEXT: lui a1, 61681 ; RV64XTHEADBB-NEXT: addiw a1, a1, -241 ; RV64XTHEADBB-NEXT: slli a2, a1, 32 ; RV64XTHEADBB-NEXT: add a1, a1, a2 +; RV64XTHEADBB-NEXT: slli a2, a1, 2 +; RV64XTHEADBB-NEXT: xor a2, a2, a1 +; RV64XTHEADBB-NEXT: srli a3, a0, 1 +; RV64XTHEADBB-NEXT: lui a4, 349525 +; RV64XTHEADBB-NEXT: addiw a4, a4, 1365 +; RV64XTHEADBB-NEXT: slli a5, a4, 32 +; RV64XTHEADBB-NEXT: add a4, a4, a5 +; 
RV64XTHEADBB-NEXT: and a3, a3, a4 +; RV64XTHEADBB-NEXT: sub a0, a0, a3 +; RV64XTHEADBB-NEXT: and a3, a0, a2 +; RV64XTHEADBB-NEXT: srli a0, a0, 2 +; RV64XTHEADBB-NEXT: and a0, a0, a2 +; RV64XTHEADBB-NEXT: add a0, a3, a0 +; RV64XTHEADBB-NEXT: srli a2, a0, 4 +; RV64XTHEADBB-NEXT: add a0, a0, a2 ; RV64XTHEADBB-NEXT: and a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 4112 -; RV64XTHEADBB-NEXT: addiw a1, a1, 257 -; RV64XTHEADBB-NEXT: slli a2, a1, 32 -; RV64XTHEADBB-NEXT: add a1, a1, a2 +; RV64XTHEADBB-NEXT: srli a2, a1, 3 +; RV64XTHEADBB-NEXT: and a1, a2, a1 ; RV64XTHEADBB-NEXT: call __muldi3 ; RV64XTHEADBB-NEXT: srli a0, a0, 56 ; RV64XTHEADBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll index adf614435b31d..007bbf79d2363 100644 --- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll +++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll @@ -85,24 +85,36 @@ define signext i32 @ctz_dereferencing_pointer(ptr %b) nounwind { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: ld s0, 0(a0) -; RV64I-NEXT: neg a0, s0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI0_0) -; RV64I-NEXT: ld a1, %lo(.LCPI0_0)(a1) +; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: srli a0, a0, 58 -; RV64I-NEXT: lui a1, %hi(.LCPI0_1) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI0_1) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: seqz a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: andi a0, a0, 63 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -548,24 +560,35 @@ define signext i32 @ctz4(i64 %b) nounwind { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: neg a0, a0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI6_0) -; RV64I-NEXT: ld a1, %lo(.LCPI6_0)(a1) +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; 
RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: srli a0, a0, 58 -; RV64I-NEXT: lui a1, %hi(.LCPI6_1) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI6_1) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: seqz a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: andi a0, a0, 63 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -712,25 +735,21 @@ define signext i32 @ctlz(i64 %b) nounwind { ; RV64I-NEXT: add a2, a2, a3 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: srli a0, a0, 58 diff --git a/llvm/test/CodeGen/RISCV/pr56457.ll b/llvm/test/CodeGen/RISCV/pr56457.ll index ba08aa838bf99..bb7239862de1e 100644 --- a/llvm/test/CodeGen/RISCV/pr56457.ll +++ b/llvm/test/CodeGen/RISCV/pr56457.ll @@ -28,25 +28,21 @@ define i15 @foo(i15 %x) nounwind { ; CHECK-NEXT: slli a0, a0, 49 ; CHECK-NEXT: srli a0, a0, 49 ; CHECK-NEXT: sub a0, a0, a1 -; CHECK-NEXT: lui a1, 209715 -; CHECK-NEXT: addiw a1, a1, 819 -; CHECK-NEXT: slli a2, a1, 32 -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: and a2, a0, a1 -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: add a0, a2, a0 -; CHECK-NEXT: srli a1, a0, 4 -; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: lui a1, 61681 ; CHECK-NEXT: addiw a1, a1, -241 ; CHECK-NEXT: slli a2, a1, 32 ; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a1, 2 +; CHECK-NEXT: xor a2, a2, a1 +; CHECK-NEXT: and a3, a0, a2 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: add a0, a3, a0 +; CHECK-NEXT: srli a2, a0, 4 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: lui a1, 4112 -; CHECK-NEXT: addiw a1, a1, 257 -; CHECK-NEXT: slli a2, a1, 32 -; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: srli a2, a1, 3 +; CHECK-NEXT: and a1, a2, a1 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: srli a0, a0, 56 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll index 73bfc6480b4d7..929cf7d35d674 100644 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll @@ -339,25 +339,21 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: add a2, a2, a3 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: sub a0, 
a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -591,16 +587,33 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI10_0) -; RV64I-NEXT: ld a1, %lo(.LCPI10_0)(a1) +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srli a0, a0, 58 -; RV64I-NEXT: lui a1, %hi(.LCPI10_1) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI10_1) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll index 7feef4dad4116..268488b14d806 100644 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll @@ -329,25 +329,21 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: add a2, a2, a3 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, 
a1, 32 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -514,16 +510,33 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI10_0) -; RV64I-NEXT: ld a1, %lo(.LCPI10_0)(a1) +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srli a0, a0, 58 -; RV64I-NEXT: lui a1, %hi(.LCPI10_1) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI10_1) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -625,32 +638,28 @@ define i64 @ctpop_i64(i64 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll index 1f62ea9f56819..7684a904ae9ce 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll @@ -28,22 +28,26 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; 
RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -83,22 +87,26 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: j .LBB1_3 @@ -148,22 +156,26 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a1, a0, 24 +; RV64I-NEXT: srli a1, a0, 56 ; RV64I-NEXT: .LBB2_2: # %cond.end ; RV64I-NEXT: sub a0, s0, a1 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -208,22 +220,26 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; 
RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: xori a0, a0, 31 ; RV64I-NEXT: snez a1, s0 ; RV64I-NEXT: addi a1, a1, -1 @@ -275,22 +291,26 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -339,25 +359,21 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: add a2, a2, a3 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -548,16 +564,33 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi 
sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI10_0) -; RV64I-NEXT: ld a1, %lo(.LCPI10_0)(a1) +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srli a0, a0, 58 -; RV64I-NEXT: lui a1, %hi(.LCPI10_1) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI10_1) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index 2269d8d04c9cb..b8d4d9706dc0a 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -28,22 +28,26 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -81,22 +85,26 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; 
RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: j .LBB1_3 @@ -144,22 +152,26 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a1, a0, 24 +; RV64I-NEXT: srli a1, a0, 56 ; RV64I-NEXT: .LBB2_2: # %cond.end ; RV64I-NEXT: sub a0, s0, a1 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -202,22 +214,26 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: xori a0, a0, 31 ; RV64I-NEXT: snez a1, s0 ; RV64I-NEXT: addi a1, a1, -1 @@ -267,22 +283,26 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, 
a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -329,25 +349,21 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: add a2, a2, a3 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -514,16 +530,33 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: addi a3, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI10_0) -; RV64I-NEXT: ld a1, %lo(.LCPI10_0)(a1) +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srli a0, a0, 58 -; RV64I-NEXT: lui a1, %hi(.LCPI10_1) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI10_1) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -546,26 +579,30 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: slli a3, a2, 32 +; 
RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a2, 2 +; RV64I-NEXT: xor a3, a3, a2 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: and a0, a0, a4 +; RV64I-NEXT: sub a1, a1, a0 +; RV64I-NEXT: srli a0, a1, 2 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a1, a2, 3 +; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -659,27 +696,29 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, 0(a0) -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lwu a0, 0(a0) ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: srli a3, a0, 2 +; RV64I-NEXT: and a3, a3, a2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a0, a3 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -707,41 +746,48 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind { ; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s5, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli s3, a1, 32 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: slli a3, a2, 32 +; RV64I-NEXT: add s4, a2, a3 +; RV64I-NEXT: slli a2, s4, 2 +; RV64I-NEXT: xor s5, a2, s4 +; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw s3, a2, 1365 -; RV64I-NEXT: and a1, a1, s3 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw s4, a1, 819 -; RV64I-NEXT: and a1, a0, s4 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, s4 +; RV64I-NEXT: addiw s6, a2, 1365 +; RV64I-NEXT: and a0, a0, s6 +; RV64I-NEXT: sub a1, a1, a0 +; RV64I-NEXT: srli a0, a1, 2 +; RV64I-NEXT: and a0, a0, s5 +; RV64I-NEXT: and a1, a1, s5 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; 
RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw s5, a1, -241 -; RV64I-NEXT: and a0, a0, s5 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw s1, a1, 257 +; RV64I-NEXT: and a0, a0, s4 +; RV64I-NEXT: srli a1, s4, 3 +; RV64I-NEXT: and s1, a1, s4 ; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw s2, a0, 24 -; RV64I-NEXT: srli a0, s0, 1 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: sub s0, s0, a0 -; RV64I-NEXT: and a0, s0, s4 -; RV64I-NEXT: srli s0, s0, 2 -; RV64I-NEXT: and a1, s0, s4 +; RV64I-NEXT: srli s2, a0, 56 +; RV64I-NEXT: srli s0, s0, 1 +; RV64I-NEXT: and a0, s0, s6 +; RV64I-NEXT: sub a0, s3, a0 +; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: and a1, a1, s5 +; RV64I-NEXT: and a0, a0, s5 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: and a0, a0, s5 +; RV64I-NEXT: and a0, a0, s4 ; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a1, a0, 24 +; RV64I-NEXT: srli a1, a0, 56 ; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -750,6 +796,7 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind { ; RV64I-NEXT: ld s3, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s4, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s5, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 0(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; @@ -877,32 +924,28 @@ define i64 @ctpop_i64(i64 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add a2, a2, a3 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 2 +; RV64I-NEXT: xor a2, a2, a1 +; RV64I-NEXT: srli a3, a0, 1 +; RV64I-NEXT: lui a4, 349525 +; RV64I-NEXT: addiw a4, a4, 1365 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srli a2, a1, 3 +; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -1007,37 +1050,33 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s5, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add s3, a1, a2 +; RV64I-NEXT: slli a1, s3, 2 +; RV64I-NEXT: xor s4, a1, s3 ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add s3, a2, a3 -; 
RV64I-NEXT: and a1, a1, s3 +; RV64I-NEXT: add s5, a2, a3 +; RV64I-NEXT: and a1, a1, s5 ; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add s4, a1, a2 ; RV64I-NEXT: and a1, a0, s4 ; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: and a0, a0, s4 ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add s5, a1, a2 -; RV64I-NEXT: and a0, a0, s5 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw s1, a1, 257 -; RV64I-NEXT: slli a1, s1, 32 -; RV64I-NEXT: add s1, s1, a1 +; RV64I-NEXT: and a0, a0, s3 +; RV64I-NEXT: srli a1, s3, 3 +; RV64I-NEXT: and s1, a1, s3 ; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srli s2, a0, 56 ; RV64I-NEXT: srli a0, s0, 1 -; RV64I-NEXT: and a0, a0, s3 +; RV64I-NEXT: and a0, a0, s5 ; RV64I-NEXT: sub s0, s0, a0 ; RV64I-NEXT: and a0, s0, s4 ; RV64I-NEXT: srli s0, s0, 2 @@ -1045,7 +1084,7 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: and a0, a0, s5 +; RV64I-NEXT: and a0, a0, s3 ; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srli a1, a0, 56 diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll index fc94f8c2a5279..4b1bfa48a95d8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll @@ -1147,35 +1147,26 @@ define @ctlz_nxv1i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v9 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 +; RV32I-NEXT: vsll.vi v11, v10, 2 +; RV32I-NEXT: vxor.vv v11, v10, v11 +; RV32I-NEXT: vadd.vv v12, v11, v11 +; RV32I-NEXT: vxor.vv v12, v11, v12 +; RV32I-NEXT: vand.vv v9, v9, v12 ; RV32I-NEXT: vsub.vv v8, v8, v9 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v11 ; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vand.vv v8, v8, v11 +; RV32I-NEXT: vadd.vv v8, v9, v8 ; RV32I-NEXT: vsrl.vi v9, v8, 4 ; RV32I-NEXT: vadd.vv v8, v8, v9 -; RV32I-NEXT: lui a0, 61681 -; RV32I-NEXT: addi a0, a0, -241 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v9, v10, 3 +; RV32I-NEXT: vand.vv v9, v10, v9 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1199,32 +1190,27 @@ define @ctlz_nxv1i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: 
lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vmv.v.x v10, a0 +; RV64I-NEXT: vsll.vi v11, v10, 2 +; RV64I-NEXT: vxor.vx v11, v11, a0 +; RV64I-NEXT: vadd.vv v12, v11, v11 +; RV64I-NEXT: vxor.vv v12, v11, v12 +; RV64I-NEXT: vand.vv v9, v9, v12 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vand.vv v9, v8, v11 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vv v8, v8, v11 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v9, v10, 3 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vmul.vv v8, v8, v9 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1288,35 +1274,26 @@ define @ctlz_nxv2i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v10 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 +; RV32I-NEXT: vsll.vi v14, v12, 2 +; RV32I-NEXT: vxor.vv v14, v12, v14 +; RV32I-NEXT: vadd.vv v16, v14, v14 +; RV32I-NEXT: vxor.vv v16, v14, v16 +; RV32I-NEXT: vand.vv v10, v10, v16 ; RV32I-NEXT: vsub.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v14 ; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vand.vv v8, v8, v14 +; RV32I-NEXT: vadd.vv v8, v10, v8 ; RV32I-NEXT: vsrl.vi v10, v8, 4 ; RV32I-NEXT: vadd.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 61681 -; RV32I-NEXT: addi a0, a0, -241 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v10, v12, 3 +; RV32I-NEXT: vand.vv v10, v12, v10 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1340,32 +1317,27 @@ define @ctlz_nxv2i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vmv.v.x v12, a0 +; RV64I-NEXT: vsll.vi v14, v12, 2 +; RV64I-NEXT: vxor.vx v14, v14, a0 +; RV64I-NEXT: vadd.vv v16, v14, v14 +; 
RV64I-NEXT: vxor.vv v16, v14, v16 +; RV64I-NEXT: vand.vv v10, v10, v16 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vand.vv v10, v8, v14 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vv v8, v8, v14 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v10, v12, 3 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vmul.vv v8, v8, v10 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1429,35 +1401,26 @@ define @ctlz_nxv4i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v12 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v12, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v12, v12, v16 +; RV32I-NEXT: vsll.vi v20, v16, 2 +; RV32I-NEXT: vxor.vv v20, v16, v20 +; RV32I-NEXT: vadd.vv v24, v20, v20 +; RV32I-NEXT: vxor.vv v24, v20, v24 +; RV32I-NEXT: vand.vv v12, v12, v24 ; RV32I-NEXT: vsub.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v16, v8, v12 +; RV32I-NEXT: vand.vv v12, v8, v20 ; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vadd.vv v8, v16, v8 +; RV32I-NEXT: vand.vv v8, v8, v20 +; RV32I-NEXT: vadd.vv v8, v12, v8 ; RV32I-NEXT: vsrl.vi v12, v8, 4 ; RV32I-NEXT: vadd.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 61681 -; RV32I-NEXT: addi a0, a0, -241 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v12, v16, 3 +; RV32I-NEXT: vand.vv v12, v16, v12 ; RV32I-NEXT: vmul.vv v8, v8, v12 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1481,32 +1444,27 @@ define @ctlz_nxv4i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v12 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v12, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vmv.v.x v16, a0 +; RV64I-NEXT: vsll.vi v20, v16, 2 +; RV64I-NEXT: vxor.vx v20, v20, a0 +; RV64I-NEXT: vadd.vv v24, v20, v20 +; RV64I-NEXT: vxor.vv v24, v20, v24 +; RV64I-NEXT: vand.vv v12, v12, v24 ; RV64I-NEXT: vsub.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: 
vand.vv v12, v8, v20 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vv v8, v8, v20 ; RV64I-NEXT: vadd.vv v8, v12, v8 ; RV64I-NEXT: vsrl.vi v12, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v12, v16, 3 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vmul.vv v8, v8, v12 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1554,6 +1512,12 @@ declare @llvm.ctlz.nxv4i64(, i1) define @ctlz_nxv8i64( %va) { ; RV32I-LABEL: ctlz_nxv8i64: ; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: sub sp, sp, a0 +; RV32I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV32I-NEXT: vsrl.vi v16, v8, 1 ; RV32I-NEXT: vor.vv v8, v8, v16 @@ -1569,43 +1533,58 @@ define @ctlz_nxv8i64( %va) { ; RV32I-NEXT: vsrl.vx v16, v8, a0 ; RV32I-NEXT: vor.vv v8, v8, v16 ; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vsrl.vi v16, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v24, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v16, v16, v24 -; RV32I-NEXT: vsub.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v24, v8, v16 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vadd.vv v8, v24, v8 -; RV32I-NEXT: vsrl.vi v16, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v16 +; RV32I-NEXT: addi a0, sp, 16 +; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32I-NEXT: vsrl.vi v8, v8, 1 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: add a0, sp, a0 +; RV32I-NEXT: addi a0, a0, 16 +; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 +; RV32I-NEXT: vmv.v.x v8, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vmul.vv v8, v8, v16 +; RV32I-NEXT: vsll.vi v0, v8, 2 +; RV32I-NEXT: vxor.vv v0, v8, v0 +; RV32I-NEXT: vadd.vv v24, v0, v0 +; RV32I-NEXT: vxor.vv v24, v0, v24 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: add a0, sp, a0 +; RV32I-NEXT: addi a0, a0, 16 +; RV32I-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32I-NEXT: vand.vv v24, v16, v24 +; RV32I-NEXT: addi a0, sp, 16 +; RV32I-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32I-NEXT: vsub.vv v16, v16, v24 +; RV32I-NEXT: vand.vv v24, v16, v0 +; RV32I-NEXT: vsrl.vi v16, v16, 2 +; RV32I-NEXT: vand.vv v16, v16, v0 +; RV32I-NEXT: vadd.vv v16, v24, v16 +; RV32I-NEXT: vsrl.vi v24, v16, 4 +; 
RV32I-NEXT: vadd.vv v16, v16, v24 +; RV32I-NEXT: vand.vv v16, v16, v8 +; RV32I-NEXT: vsrl.vi v24, v8, 3 +; RV32I-NEXT: vand.vv v8, v8, v24 +; RV32I-NEXT: vmul.vv v8, v16, v8 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: add sp, sp, a0 +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: ctlz_nxv8i64: ; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: csrr a0, vlenb +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: sub sp, sp, a0 +; RV64I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV64I-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV64I-NEXT: vsrl.vi v16, v8, 1 ; RV64I-NEXT: vor.vv v8, v8, v16 @@ -1620,36 +1599,49 @@ define @ctlz_nxv8i64( %va) { ; RV64I-NEXT: li a0, 32 ; RV64I-NEXT: vsrl.vx v16, v8, a0 ; RV64I-NEXT: vor.vv v8, v8, v16 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vsrl.vi v16, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v16, a0 -; RV64I-NEXT: vsub.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v8, a0 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: vadd.vv v8, v16, v8 -; RV64I-NEXT: vsrl.vi v16, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v16 +; RV64I-NEXT: vnot.v v16, v8 ; RV64I-NEXT: lui a0, 61681 ; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: vmv.v.x v8, a0 +; RV64I-NEXT: addi a1, sp, 16 +; RV64I-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64I-NEXT: vsll.vi v24, v8, 2 +; RV64I-NEXT: vxor.vx v24, v24, a0 +; RV64I-NEXT: vadd.vv v0, v24, v24 +; RV64I-NEXT: vxor.vv v0, v24, v0 +; RV64I-NEXT: csrr a1, vlenb +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a1, sp, a1 +; RV64I-NEXT: addi a1, a1, 16 +; RV64I-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill +; RV64I-NEXT: vsrl.vi v0, v16, 1 +; RV64I-NEXT: csrr a1, vlenb +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a1, sp, a1 +; RV64I-NEXT: addi a1, a1, 16 +; RV64I-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64I-NEXT: vand.vv v0, v0, v8 +; RV64I-NEXT: vsub.vv v16, v16, v0 +; RV64I-NEXT: vand.vv v0, v16, v24 +; RV64I-NEXT: vsrl.vi v16, v16, 2 +; RV64I-NEXT: vand.vv v16, v16, v24 +; RV64I-NEXT: vadd.vv v16, v0, v16 +; RV64I-NEXT: vsrl.vi v24, v16, 4 +; RV64I-NEXT: vadd.vv v16, v16, v24 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: addi a1, sp, 16 +; RV64I-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64I-NEXT: vsrl.vi v8, v8, 3 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vmul.vv v8, v16, v8 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: csrr a0, vlenb +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: add sp, sp, a0 +; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; CHECK-F-LABEL: ctlz_nxv8i64: @@ -2753,35 +2745,26 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v9 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 ; 
RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 +; RV32I-NEXT: vsll.vi v11, v10, 2 +; RV32I-NEXT: vxor.vv v11, v10, v11 +; RV32I-NEXT: vadd.vv v12, v11, v11 +; RV32I-NEXT: vxor.vv v12, v11, v12 +; RV32I-NEXT: vand.vv v9, v9, v12 ; RV32I-NEXT: vsub.vv v8, v8, v9 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v11 ; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vand.vv v8, v8, v11 +; RV32I-NEXT: vadd.vv v8, v9, v8 ; RV32I-NEXT: vsrl.vi v9, v8, 4 ; RV32I-NEXT: vadd.vv v8, v8, v9 -; RV32I-NEXT: lui a0, 61681 -; RV32I-NEXT: addi a0, a0, -241 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v9, v10, 3 +; RV32I-NEXT: vand.vv v9, v10, v9 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -2805,32 +2788,27 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vmv.v.x v10, a0 +; RV64I-NEXT: vsll.vi v11, v10, 2 +; RV64I-NEXT: vxor.vx v11, v11, a0 +; RV64I-NEXT: vadd.vv v12, v11, v11 +; RV64I-NEXT: vxor.vv v12, v11, v12 +; RV64I-NEXT: vand.vv v9, v9, v12 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vand.vv v9, v8, v11 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vv v8, v8, v11 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v9, v10, 3 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vmul.vv v8, v8, v9 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -2889,35 +2867,26 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v10 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 +; RV32I-NEXT: vsll.vi v14, v12, 2 +; RV32I-NEXT: vxor.vv v14, v12, v14 +; RV32I-NEXT: vadd.vv v16, v14, v14 +; RV32I-NEXT: vxor.vv 
v16, v14, v16 +; RV32I-NEXT: vand.vv v10, v10, v16 ; RV32I-NEXT: vsub.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v14 ; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vand.vv v8, v8, v14 +; RV32I-NEXT: vadd.vv v8, v10, v8 ; RV32I-NEXT: vsrl.vi v10, v8, 4 ; RV32I-NEXT: vadd.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 61681 -; RV32I-NEXT: addi a0, a0, -241 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v10, v12, 3 +; RV32I-NEXT: vand.vv v10, v12, v10 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -2941,32 +2910,27 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vmv.v.x v12, a0 +; RV64I-NEXT: vsll.vi v14, v12, 2 +; RV64I-NEXT: vxor.vx v14, v14, a0 +; RV64I-NEXT: vadd.vv v16, v14, v14 +; RV64I-NEXT: vxor.vv v16, v14, v16 +; RV64I-NEXT: vand.vv v10, v10, v16 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vand.vv v10, v8, v14 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vv v8, v8, v14 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v10, v12, 3 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vmul.vv v8, v8, v10 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -3025,35 +2989,26 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; RV32I-NEXT: vor.vv v8, v8, v12 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v12, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v12, v12, v16 +; RV32I-NEXT: vsll.vi v20, v16, 2 +; RV32I-NEXT: vxor.vv v20, v16, v20 +; RV32I-NEXT: vadd.vv v24, v20, v20 +; RV32I-NEXT: vxor.vv v24, v20, v24 +; RV32I-NEXT: vand.vv v12, v12, v24 ; RV32I-NEXT: vsub.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, 
zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v16, v8, v12 +; RV32I-NEXT: vand.vv v12, v8, v20 ; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vadd.vv v8, v16, v8 +; RV32I-NEXT: vand.vv v8, v8, v20 +; RV32I-NEXT: vadd.vv v8, v12, v8 ; RV32I-NEXT: vsrl.vi v12, v8, 4 ; RV32I-NEXT: vadd.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 61681 -; RV32I-NEXT: addi a0, a0, -241 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v12, v16, 3 +; RV32I-NEXT: vand.vv v12, v16, v12 ; RV32I-NEXT: vmul.vv v8, v8, v12 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -3077,32 +3032,27 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; RV64I-NEXT: vor.vv v8, v8, v12 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v12, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vmv.v.x v16, a0 +; RV64I-NEXT: vsll.vi v20, v16, 2 +; RV64I-NEXT: vxor.vx v20, v20, a0 +; RV64I-NEXT: vadd.vv v24, v20, v20 +; RV64I-NEXT: vxor.vv v24, v20, v24 +; RV64I-NEXT: vand.vv v12, v12, v24 ; RV64I-NEXT: vsub.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vand.vv v12, v8, v20 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vv v8, v8, v20 ; RV64I-NEXT: vadd.vv v8, v12, v8 ; RV64I-NEXT: vsrl.vi v12, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v12, v16, 3 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vmul.vv v8, v8, v12 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -3145,6 +3095,12 @@ define @ctlz_zero_undef_nxv4i64( %va) { define @ctlz_zero_undef_nxv8i64( %va) { ; RV32I-LABEL: ctlz_zero_undef_nxv8i64: ; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: sub sp, sp, a0 +; RV32I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV32I-NEXT: vsrl.vi v16, v8, 1 ; RV32I-NEXT: vor.vv v8, v8, v16 @@ -3160,43 +3116,58 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; RV32I-NEXT: vsrl.vx v16, v8, a0 ; RV32I-NEXT: vor.vv v8, v8, v16 ; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vsrl.vi v16, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v24, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v16, v16, v24 -; RV32I-NEXT: vsub.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 
819 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v24, v8, v16 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vadd.vv v8, v24, v8 -; RV32I-NEXT: vsrl.vi v16, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v16 +; RV32I-NEXT: addi a0, sp, 16 +; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32I-NEXT: vsrl.vi v8, v8, 1 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: add a0, sp, a0 +; RV32I-NEXT: addi a0, a0, 16 +; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 +; RV32I-NEXT: vmv.v.x v8, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vmul.vv v8, v8, v16 +; RV32I-NEXT: vsll.vi v0, v8, 2 +; RV32I-NEXT: vxor.vv v0, v8, v0 +; RV32I-NEXT: vadd.vv v24, v0, v0 +; RV32I-NEXT: vxor.vv v24, v0, v24 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: add a0, sp, a0 +; RV32I-NEXT: addi a0, a0, 16 +; RV32I-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32I-NEXT: vand.vv v24, v16, v24 +; RV32I-NEXT: addi a0, sp, 16 +; RV32I-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32I-NEXT: vsub.vv v16, v16, v24 +; RV32I-NEXT: vand.vv v24, v16, v0 +; RV32I-NEXT: vsrl.vi v16, v16, 2 +; RV32I-NEXT: vand.vv v16, v16, v0 +; RV32I-NEXT: vadd.vv v16, v24, v16 +; RV32I-NEXT: vsrl.vi v24, v16, 4 +; RV32I-NEXT: vadd.vv v16, v16, v24 +; RV32I-NEXT: vand.vv v16, v16, v8 +; RV32I-NEXT: vsrl.vi v24, v8, 3 +; RV32I-NEXT: vand.vv v8, v8, v24 +; RV32I-NEXT: vmul.vv v8, v16, v8 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: add sp, sp, a0 +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: ctlz_zero_undef_nxv8i64: ; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: csrr a0, vlenb +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: sub sp, sp, a0 +; RV64I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV64I-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV64I-NEXT: vsrl.vi v16, v8, 1 ; RV64I-NEXT: vor.vv v8, v8, v16 @@ -3211,36 +3182,49 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; RV64I-NEXT: li a0, 32 ; RV64I-NEXT: vsrl.vx v16, v8, a0 ; RV64I-NEXT: vor.vv v8, v8, v16 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vsrl.vi v16, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v16, a0 -; RV64I-NEXT: vsub.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v8, a0 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: vadd.vv v8, v16, v8 -; RV64I-NEXT: vsrl.vi v16, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v16 +; RV64I-NEXT: vnot.v v16, v8 ; RV64I-NEXT: lui a0, 61681 ; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: vmv.v.x v8, a0 
+; RV64I-NEXT: addi a1, sp, 16 +; RV64I-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64I-NEXT: vsll.vi v24, v8, 2 +; RV64I-NEXT: vxor.vx v24, v24, a0 +; RV64I-NEXT: vadd.vv v0, v24, v24 +; RV64I-NEXT: vxor.vv v0, v24, v0 +; RV64I-NEXT: csrr a1, vlenb +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a1, sp, a1 +; RV64I-NEXT: addi a1, a1, 16 +; RV64I-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill +; RV64I-NEXT: vsrl.vi v0, v16, 1 +; RV64I-NEXT: csrr a1, vlenb +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: add a1, sp, a1 +; RV64I-NEXT: addi a1, a1, 16 +; RV64I-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64I-NEXT: vand.vv v0, v0, v8 +; RV64I-NEXT: vsub.vv v16, v16, v0 +; RV64I-NEXT: vand.vv v0, v16, v24 +; RV64I-NEXT: vsrl.vi v16, v16, 2 +; RV64I-NEXT: vand.vv v16, v16, v24 +; RV64I-NEXT: vadd.vv v16, v0, v16 +; RV64I-NEXT: vsrl.vi v24, v16, 4 +; RV64I-NEXT: vadd.vv v16, v16, v24 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: addi a1, sp, 16 +; RV64I-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64I-NEXT: vsrl.vi v8, v8, 3 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vmul.vv v8, v16, v8 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: csrr a0, vlenb +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: add sp, sp, a0 +; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll index c310274d68508..a33e7b54288d8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll @@ -675,37 +675,27 @@ declare @llvm.ctpop.nxv16i32() define @ctpop_nxv1i64( %va) { ; RV32-LABEL: ctpop_nxv1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2 +; RV32-NEXT: vxor.vv v10, v9, v10 +; RV32-NEXT: vadd.vv v11, v10, v10 +; RV32-NEXT: vxor.vv v11, v10, v11 +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vand.vv v11, v12, v11 +; RV32-NEXT: vsub.vv v8, v8, v11 +; RV32-NEXT: vand.vv v11, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v11, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vmul.vv 
v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -713,34 +703,29 @@ define @ctpop_nxv1i64( %va) { ; ; RV64-LABEL: ctpop_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 ; RV64-NEXT: slli a1, a0, 32 ; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsll.vi v10, v9, 2 +; RV64-NEXT: vxor.vx v10, v10, a0 +; RV64-NEXT: vadd.vv v11, v10, v10 +; RV64-NEXT: vxor.vv v11, v10, v11 +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vand.vv v11, v12, v11 +; RV64-NEXT: vsub.vv v8, v8, v11 +; RV64-NEXT: vand.vv v11, v8, v10 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: vadd.vv v8, v11, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v9, v9, 3 +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vmul.vv v8, v8, v9 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -758,37 +743,27 @@ declare @llvm.ctpop.nxv1i64() define @ctpop_nxv2i64( %va) { ; RV32-LABEL: ctpop_nxv2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2 +; RV32-NEXT: vxor.vv v12, v10, v12 +; RV32-NEXT: vadd.vv v14, v12, v12 +; RV32-NEXT: vxor.vv v14, v12, v14 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v14, v16, v14 +; RV32-NEXT: vsub.vv v8, v8, v14 +; RV32-NEXT: vand.vv v14, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v14, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: 
li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -796,34 +771,29 @@ define @ctpop_nxv2i64( %va) { ; ; RV64-LABEL: ctpop_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 ; RV64-NEXT: slli a1, a0, 32 ; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vsll.vi v12, v10, 2 +; RV64-NEXT: vxor.vx v12, v12, a0 +; RV64-NEXT: vadd.vv v14, v12, v12 +; RV64-NEXT: vxor.vv v14, v12, v14 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vv v14, v16, v14 +; RV64-NEXT: vsub.vv v8, v8, v14 +; RV64-NEXT: vand.vv v14, v8, v12 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vadd.vv v8, v14, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v10, v10, 3 +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vmul.vv v8, v8, v10 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -841,37 +811,27 @@ declare @llvm.ctpop.nxv2i64() define @ctpop_nxv4i64( %va) { ; RV32-LABEL: ctpop_nxv4i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32-NEXT: vsll.vi v16, v12, 2 +; RV32-NEXT: vxor.vv v16, v12, v16 +; RV32-NEXT: vadd.vv v20, v16, v16 +; RV32-NEXT: vxor.vv v20, v16, v20 +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vand.vv v20, v24, v20 +; RV32-NEXT: vsub.vv v8, v8, v20 +; RV32-NEXT: vand.vv v20, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v20, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v16, v12, 3 +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 
56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -879,34 +839,29 @@ define @ctpop_nxv4i64( %va) { ; ; RV64-LABEL: ctpop_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 ; RV64-NEXT: slli a1, a0, 32 ; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vsll.vi v16, v12, 2 +; RV64-NEXT: vxor.vx v16, v16, a0 +; RV64-NEXT: vadd.vv v20, v16, v16 +; RV64-NEXT: vxor.vv v20, v16, v20 +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vv v20, v24, v20 +; RV64-NEXT: vsub.vv v8, v8, v20 +; RV64-NEXT: vand.vv v20, v8, v16 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vadd.vv v8, v20, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v12, v12, 3 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -924,74 +879,107 @@ declare @llvm.ctpop.nxv4i64() define @ctpop_nxv8i64( %va) { ; RV32-LABEL: ctpop_nxv8i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v24, v16, 2 +; RV32-NEXT: vxor.vv v24, v16, v24 +; RV32-NEXT: vadd.vv v0, v24, v24 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; 
RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a0 -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: ctpop_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 ; RV64-NEXT: slli a1, a0, 32 ; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a0 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vi v24, v16, 2 +; RV64-NEXT: vxor.vx v24, v24, a0 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v0, v24, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v0, v8, 1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v0, v0, v16 +; RV64-NEXT: vsub.vv v8, v8, v0 +; RV64-NEXT: vand.vv v0, v8, v24 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v24 +; RV64-NEXT: vadd.vv v8, v0, v8 +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; 
CHECK-ZVBB-LABEL: ctpop_nxv8i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll index 2310f85b1fba9..fc7f50e404866 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll @@ -1221,37 +1221,27 @@ declare @llvm.vp.ctpop.nxv1i64(, @vp_ctpop_nxv1i64( %va, %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_nxv1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2, v0.t +; RV32-NEXT: vxor.vv v10, v9, v10, v0.t +; RV32-NEXT: vsll.vi v11, v10, 1, v0.t +; RV32-NEXT: vxor.vv v11, v10, v11, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vand.vv v11, v12, v11, v0.t +; RV32-NEXT: vsub.vv v8, v8, v11, v0.t +; RV32-NEXT: vand.vv v11, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vadd.vv v8, v11, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3, v0.t +; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -1259,34 +1249,30 @@ define @vp_ctpop_nxv1i64( %va, @vp_ctpop_nxv1i64( %va, @vp_ctpop_nxv1i64_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_nxv1i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2 +; RV32-NEXT: vxor.vv v10, v9, v10 +; RV32-NEXT: vadd.vv v11, v10, v10 +; RV32-NEXT: vxor.vv v11, v10, v11 +; RV32-NEXT: vsrl.vi v12, v8, 1 +; 
RV32-NEXT: vand.vv v11, v12, v11 +; RV32-NEXT: vsub.vv v8, v8, v11 +; RV32-NEXT: vand.vv v11, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v11, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1341,34 +1317,30 @@ define @vp_ctpop_nxv1i64_unmasked( %va, i32 ; ; RV64-LABEL: vp_ctpop_nxv1i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsll.vi v10, v9, 2 +; RV64-NEXT: vxor.vx v10, v10, a1 +; RV64-NEXT: vadd.vv v11, v10, v10 +; RV64-NEXT: vxor.vv v11, v10, v11 +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vand.vv v11, v12, v11 +; RV64-NEXT: vsub.vv v8, v8, v11 +; RV64-NEXT: vand.vv v11, v8, v10 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: vadd.vv v8, v11, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v9, v9, 3 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vmul.vv v8, v8, v9 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1389,37 +1361,27 @@ declare @llvm.vp.ctpop.nxv2i64(, @vp_ctpop_nxv2i64( %va, %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_nxv2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli 
zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2, v0.t +; RV32-NEXT: vxor.vv v12, v10, v12, v0.t +; RV32-NEXT: vsll.vi v14, v12, 1, v0.t +; RV32-NEXT: vxor.vv v14, v12, v14, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vand.vv v14, v16, v14, v0.t +; RV32-NEXT: vsub.vv v8, v8, v14, v0.t +; RV32-NEXT: vand.vv v14, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vadd.vv v8, v14, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v12, v0.t ; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3, v0.t +; RV32-NEXT: vand.vv v10, v10, v12, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -1427,34 +1389,30 @@ define @vp_ctpop_nxv2i64( %va, @vp_ctpop_nxv2i64( %va, @vp_ctpop_nxv2i64_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_nxv2i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2 +; RV32-NEXT: vxor.vv v12, v10, v12 +; RV32-NEXT: vadd.vv v14, v12, v12 +; RV32-NEXT: vxor.vv v14, v12, v14 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v14, v16, v14 +; RV32-NEXT: vsub.vv v8, v8, v14 +; RV32-NEXT: vand.vv v14, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v14, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1509,34 +1457,30 @@ define @vp_ctpop_nxv2i64_unmasked( %va, i32 ; ; RV64-LABEL: vp_ctpop_nxv2i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, 
a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsll.vi v12, v10, 2 +; RV64-NEXT: vxor.vx v12, v12, a1 +; RV64-NEXT: vadd.vv v14, v12, v12 +; RV64-NEXT: vxor.vv v14, v12, v14 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vv v14, v16, v14 +; RV64-NEXT: vsub.vv v8, v8, v14 +; RV64-NEXT: vand.vv v14, v8, v12 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vadd.vv v8, v14, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v10, v10, 3 +; RV64-NEXT: vand.vx v10, v10, a1 +; RV64-NEXT: vmul.vv v8, v8, v10 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1557,37 +1501,27 @@ declare @llvm.vp.ctpop.nxv4i64(, @vp_ctpop_nxv4i64( %va, %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_nxv4i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vi v16, v12, 2, v0.t +; RV32-NEXT: vxor.vv v16, v12, v16, v0.t +; RV32-NEXT: vsll.vi v20, v16, 1, v0.t +; RV32-NEXT: vxor.vv v20, v16, v20, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vand.vv v20, v24, v20, v0.t +; RV32-NEXT: vsub.vv v8, v8, v20, v0.t +; RV32-NEXT: vand.vv v20, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v20, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v16, v12, 3, v0.t +; RV32-NEXT: vand.vv v12, v12, v16, v0.t ; RV32-NEXT: vmul.vv v8, v8, v12, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -1595,34 +1529,30 @@ define @vp_ctpop_nxv4i64( %va, @vp_ctpop_nxv4i64( %va, @vp_ctpop_nxv4i64_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_nxv4i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: 
vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vi v16, v12, 2 +; RV32-NEXT: vxor.vv v16, v12, v16 +; RV32-NEXT: vadd.vv v20, v16, v16 +; RV32-NEXT: vxor.vv v20, v16, v20 +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vand.vv v20, v24, v20 +; RV32-NEXT: vsub.vv v8, v8, v20 +; RV32-NEXT: vand.vv v20, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v20, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v16, v12, 3 +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1677,34 +1597,30 @@ define @vp_ctpop_nxv4i64_unmasked( %va, i32 ; ; RV64-LABEL: vp_ctpop_nxv4i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsll.vi v16, v12, 2 +; RV64-NEXT: vxor.vx v16, v16, a1 +; RV64-NEXT: vadd.vv v20, v16, v16 +; RV64-NEXT: vxor.vv v20, v16, v20 +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vv v20, v24, v20 +; RV64-NEXT: vsub.vv v8, v8, v20 +; RV64-NEXT: vand.vv v20, v8, v16 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vadd.vv v8, v20, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v12, v12, 3 +; RV64-NEXT: vand.vx v12, v12, a1 +; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1725,74 +1641,163 @@ declare @llvm.vp.ctpop.nxv7i64(, @vp_ctpop_nxv7i64( %va, %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_nxv7i64: ; 
RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 +; RV32-NEXT: vmv.v.x v8, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vsll.vi v24, v8, 2, v0.t +; RV32-NEXT: vxor.vv v24, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v24, 1, v0.t +; RV32-NEXT: vxor.vv v8, v24, v8, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 
56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_nxv7i64: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vsll.vi v24, v16, 2, v0.t +; RV64-NEXT: vxor.vx v8, v24, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: vxor.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 1, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v24, v24, v16, v0.t +; RV64-NEXT: vand.vv v16, v24, v8, v0.t +; RV64-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV64-NEXT: vand.vv v8, v24, v8, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: 
addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctpop_nxv7i64: @@ -1807,74 +1812,98 @@ define @vp_ctpop_nxv7i64( %va, @vp_ctpop_nxv7i64_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_nxv7i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v24, v16, 2 +; RV32-NEXT: vxor.vv v24, v16, v24 +; RV32-NEXT: vadd.vv v0, v24, v24 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_nxv7i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 
0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v24, v16, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v0, v24, v0 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vv v16, v16, v0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vv v16, v8, v24 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctpop_nxv7i64_unmasked: @@ -1893,74 +1922,163 @@ declare @llvm.vp.ctpop.nxv8i64(, @vp_ctpop_nxv8i64( %va, %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_nxv8i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 +; RV32-NEXT: vmv.v.x v8, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vsll.vi v24, v8, 2, v0.t +; RV32-NEXT: vxor.vv v24, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v24, 1, v0.t +; RV32-NEXT: vxor.vv v8, v24, v8, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; 
RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_nxv8i64: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw 
a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vsll.vi v24, v16, 2, v0.t +; RV64-NEXT: vxor.vx v8, v24, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: vxor.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 1, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v24, v24, v16, v0.t +; RV64-NEXT: vand.vv v16, v24, v8, v0.t +; RV64-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV64-NEXT: vand.vv v8, v24, v8, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i64: @@ -1975,74 +2093,98 @@ define @vp_ctpop_nxv8i64( %va, @vp_ctpop_nxv8i64_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_nxv8i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; 
RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v24, v16, 2 +; RV32-NEXT: vxor.vv v24, v16, v24 +; RV32-NEXT: vadd.vv v0, v24, v24 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_nxv8i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v24, v16, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v0, v24, v0 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vv v16, v16, v0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vv v16, v8, v24 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, 
a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctpop_nxv8i64_unmasked: @@ -2064,20 +2206,19 @@ define @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64_unmasked( %va, ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: sub a2, a0, a1 ; RV32-NEXT: sltu a3, a0, a2 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a2, a3, a2 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v16, 1 -; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v0, a3 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vsub.vv v24, v16, v24 -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: addi a3, a3, 819 -; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v0, a3 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v24, v0 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vadd.vv v24, v16, v24 -; RV32-NEXT: vsrl.vi v16, v24, 4 -; RV32-NEXT: vadd.vv v16, v24, v16 ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: addi a3, a3, -241 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v8, v24, v0 +; RV32-NEXT: vadd.vv v0, v8, v8 +; RV32-NEXT: vxor.vv v16, v8, v0 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v0, v0, 1 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v0, v16 +; RV32-NEXT: vand.vv v0, v16, v8 
+; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: lui a3, 4112 -; RV32-NEXT: addi a3, a3, 257 -; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vmul.vv v16, v16, v24 -; RV32-NEXT: li a2, 56 -; RV32-NEXT: vsrl.vx v16, v16, a2 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: bltu a0, a1, .LBB47_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a1 ; RV32-NEXT: .LBB47_2: ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vsll.vi v8, v24, 2 +; RV32-NEXT: vxor.vv v8, v24, v8 +; RV32-NEXT: vadd.vv v0, v8, v8 +; RV32-NEXT: vxor.vv v16, v8, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vsub.vv v24, v8, v24 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v0, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v0 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vadd.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: vsub.vv v16, v0, v16 +; RV32-NEXT: vand.vv v0, v16, v8 +; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_nxv16i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: mv a2, a0 -; RV64-NEXT: bltu a0, a1, .LBB47_2 +; RV64-NEXT: li a2, 24 +; 
RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: sub a1, a0, a2 +; RV64-NEXT: sltu a3, a0, a1 +; RV64-NEXT: addi a3, a3, -1 +; RV64-NEXT: and a3, a3, a1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v0, v8, 2 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v24, v0, v0 +; RV64-NEXT: vxor.vv v16, v0, v24 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vv v16, v24, v16 +; RV64-NEXT: vsub.vv v16, v8, v16 +; RV64-NEXT: vand.vv v24, v16, v0 +; RV64-NEXT: vsrl.vi v16, v16, 2 +; RV64-NEXT: vand.vv v16, v16, v0 +; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vadd.vv v16, v16, v24 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vmv8r.v v8, v24 +; RV64-NEXT: vsrl.vi v24, v24, 3 +; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vmul.vv v16, v16, v24 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: vsrl.vx v16, v16, a3 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: bltu a0, a2, .LBB47_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a2, a1 +; RV64-NEXT: mv a0, a2 ; RV64-NEXT: .LBB47_2: -; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v24, v24, a2 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v8, a3 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4 -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5 -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6 -; RV64-NEXT: sub a1, a0, a1 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a2 -; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx v24, v16, a3 +; RV64-NEXT: vsll.vi v24, v8, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: vadd.vv v0, v24, v24 +; 
RV64-NEXT: vxor.vv v16, v24, v0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v0, v8, 1 +; RV64-NEXT: vand.vv v16, v0, v16 +; RV64-NEXT: vsub.vv v16, v8, v16 +; RV64-NEXT: vand.vv v0, v16, v24 ; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a3 -; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vand.vv v16, v16, v24 +; RV64-NEXT: vadd.vv v16, v0, v16 ; RV64-NEXT: vsrl.vi v24, v16, 4 ; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a4 -; RV64-NEXT: vmul.vx v16, v16, a5 -; RV64-NEXT: vsrl.vx v16, v16, a6 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v8, v8, 3 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vmul.vv v8, v16, v8 +; RV64-NEXT: vsrl.vx v8, v8, a3 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64_unmasked: diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll index b14cde25aa85b..6ee63fc9a12e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll @@ -1135,41 +1135,31 @@ declare @llvm.cttz.nxv16i32(, i1) define @cttz_nxv1i64( %va) { ; RV32I-LABEL: cttz_nxv1i64: ; RV32I: # %bb.0: -; RV32I-NEXT: li a0, 1 -; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV32I-NEXT: vsub.vx v9, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32I-NEXT: vsll.vi v10, v9, 2 +; RV32I-NEXT: vxor.vv v10, v9, v10 +; RV32I-NEXT: vadd.vv v11, v10, v10 +; RV32I-NEXT: vxor.vv v11, v10, v11 +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsub.vx v12, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vand.vv v11, v12, v11 +; RV32I-NEXT: vsub.vv v8, v8, v11 +; RV32I-NEXT: vand.vv v11, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vadd.vv v8, v11, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma 
+; RV32I-NEXT: vsrl.vi v10, v9, 3 +; RV32I-NEXT: vand.vv v9, v9, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1177,38 +1167,33 @@ define @cttz_nxv1i64( %va) { ; ; RV64I-LABEL: cttz_nxv1i64: ; RV64I: # %bb.0: -; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV64I-NEXT: vsub.vx v9, v8, a0 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v9 -; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v9, a0 -; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v8, a0 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: vadd.vv v8, v9, v8 -; RV64I-NEXT: vsrl.vi v9, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v9 ; RV64I-NEXT: lui a0, 61681 ; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV64I-NEXT: vmv.v.x v9, a0 +; RV64I-NEXT: vsll.vi v10, v9, 2 +; RV64I-NEXT: vxor.vx v10, v10, a0 +; RV64I-NEXT: vadd.vv v11, v10, v10 +; RV64I-NEXT: vxor.vv v11, v10, v11 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: vsub.vx v12, v8, a1 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vand.vv v11, v12, v11 +; RV64I-NEXT: vsub.vv v8, v8, v11 +; RV64I-NEXT: vand.vv v11, v8, v10 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: vadd.vv v8, v11, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v9, v9, 3 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vmul.vv v8, v8, v9 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1298,41 +1283,31 @@ declare @llvm.cttz.nxv1i64(, i1) define @cttz_nxv2i64( %va) { ; RV32I-LABEL: cttz_nxv2i64: ; RV32I: # %bb.0: -; RV32I-NEXT: li a0, 1 -; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV32I-NEXT: vsub.vx v10, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32I-NEXT: vsll.vi v12, v10, 2 +; RV32I-NEXT: vxor.vv v12, v10, v12 +; RV32I-NEXT: vadd.vv v14, v12, v12 +; RV32I-NEXT: vxor.vv v14, v12, v14 +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsub.vx v16, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; 
RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vand.vv v14, v16, v14 +; RV32I-NEXT: vsub.vv v8, v8, v14 +; RV32I-NEXT: vand.vv v14, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vadd.vv v8, v14, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 ; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32I-NEXT: vsrl.vi v12, v10, 3 +; RV32I-NEXT: vand.vv v10, v10, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1340,38 +1315,33 @@ define @cttz_nxv2i64( %va) { ; ; RV64I-LABEL: cttz_nxv2i64: ; RV64I: # %bb.0: -; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV64I-NEXT: vsub.vx v10, v8, a0 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v10 -; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v10, a0 -; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v8, a0 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: vadd.vv v8, v10, v8 -; RV64I-NEXT: vsrl.vi v10, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v10 ; RV64I-NEXT: lui a0, 61681 ; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64I-NEXT: vmv.v.x v10, a0 +; RV64I-NEXT: vsll.vi v12, v10, 2 +; RV64I-NEXT: vxor.vx v12, v12, a0 +; RV64I-NEXT: vadd.vv v14, v12, v12 +; RV64I-NEXT: vxor.vv v14, v12, v14 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: vsub.vx v16, v8, a1 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vand.vv v14, v16, v14 +; RV64I-NEXT: vsub.vv v8, v8, v14 +; RV64I-NEXT: vand.vv v14, v8, v12 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: vadd.vv v8, v14, v8 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v10, v10, 3 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vmul.vv v8, v8, v10 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1461,41 +1431,31 @@ declare @llvm.cttz.nxv2i64(, i1) define @cttz_nxv4i64( %va) { ; RV32I-LABEL: cttz_nxv4i64: ; RV32I: # %bb.0: -; RV32I-NEXT: li a0, 1 -; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV32I-NEXT: vsub.vx v12, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vsrl.vi v12, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v12, v12, v16 -; RV32I-NEXT: vsub.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v16, v8, v12 -; 
RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vadd.vv v8, v16, v8 -; RV32I-NEXT: vsrl.vi v12, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v12 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vsll.vi v16, v12, 2 +; RV32I-NEXT: vxor.vv v16, v12, v16 +; RV32I-NEXT: vadd.vv v20, v16, v16 +; RV32I-NEXT: vxor.vv v20, v16, v20 +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsub.vx v24, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v24 +; RV32I-NEXT: vsrl.vi v24, v8, 1 +; RV32I-NEXT: vand.vv v20, v24, v20 +; RV32I-NEXT: vsub.vv v8, v8, v20 +; RV32I-NEXT: vand.vv v20, v8, v16 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vadd.vv v8, v20, v8 +; RV32I-NEXT: vsrl.vi v16, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v16 ; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vsrl.vi v16, v12, 3 +; RV32I-NEXT: vand.vv v12, v12, v16 ; RV32I-NEXT: vmul.vv v8, v8, v12 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -1503,38 +1463,33 @@ define @cttz_nxv4i64( %va) { ; ; RV64I-LABEL: cttz_nxv4i64: ; RV64I: # %bb.0: -; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV64I-NEXT: vsub.vx v12, v8, a0 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v12 -; RV64I-NEXT: vsrl.vi v12, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v12, a0 -; RV64I-NEXT: vsub.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v8, a0 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: vadd.vv v8, v12, v8 -; RV64I-NEXT: vsrl.vi v12, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v12 ; RV64I-NEXT: lui a0, 61681 ; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV64I-NEXT: vmv.v.x v12, a0 +; RV64I-NEXT: vsll.vi v16, v12, 2 +; RV64I-NEXT: vxor.vx v16, v16, a0 +; RV64I-NEXT: vadd.vv v20, v16, v16 +; RV64I-NEXT: vxor.vv v20, v16, v20 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: vsub.vx v24, v8, a1 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v24 +; RV64I-NEXT: vsrl.vi v24, v8, 1 +; RV64I-NEXT: vand.vv v20, v24, v20 +; RV64I-NEXT: vsub.vv v8, v8, v20 +; RV64I-NEXT: vand.vv v20, v8, v16 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vv v8, v8, v16 +; RV64I-NEXT: vadd.vv v8, v20, v8 +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v16 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v12, v12, 3 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vmul.vv v8, v8, v12 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -1624,82 +1579,105 @@ declare @llvm.cttz.nxv4i64(, i1) define @cttz_nxv8i64( %va) { ; RV32I-LABEL: cttz_nxv8i64: ; RV32I: # %bb.0: -; RV32I-NEXT: li a0, 1 -; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32I-NEXT: vsub.vx v16, v8, a0 -; 
RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vsrl.vi v16, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v24, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v16, v16, v24 -; RV32I-NEXT: vsub.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v24, v8, v16 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vadd.vv v8, v24, v8 -; RV32I-NEXT: vsrl.vi v16, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v16 +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: sub sp, sp, a0 +; RV32I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32I-NEXT: vsll.vi v24, v16, 2 +; RV32I-NEXT: vxor.vv v24, v16, v24 +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsub.vx v0, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v0 +; RV32I-NEXT: addi a0, sp, 16 +; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32I-NEXT: vadd.vv v0, v24, v24 +; RV32I-NEXT: vxor.vv v0, v24, v0 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: add a0, sp, a0 +; RV32I-NEXT: addi a0, a0, 16 +; RV32I-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32I-NEXT: vsrl.vi v0, v8, 1 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: add a0, sp, a0 +; RV32I-NEXT: addi a0, a0, 16 +; RV32I-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32I-NEXT: vand.vv v0, v0, v8 +; RV32I-NEXT: addi a0, sp, 16 +; RV32I-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32I-NEXT: vsub.vv v8, v8, v0 +; RV32I-NEXT: vand.vv v0, v8, v24 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v24 +; RV32I-NEXT: vadd.vv v8, v0, v8 +; RV32I-NEXT: vsrl.vi v24, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v24 ; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32I-NEXT: vsrl.vi v24, v16, 3 +; RV32I-NEXT: vand.vv v16, v16, v24 ; RV32I-NEXT: vmul.vv v8, v8, v16 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: add sp, sp, a0 +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: cttz_nxv8i64: ; RV64I: # %bb.0: -; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: csrr a0, vlenb +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: sub sp, sp, a0 +; RV64I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64I-NEXT: vsub.vx v16, v8, a0 +; RV64I-NEXT: vmv.v.x v16, a0 +; RV64I-NEXT: addi a1, sp, 16 +; RV64I-NEXT: vs8r.v v16, (a1) # 
Unknown-size Folded Spill +; RV64I-NEXT: vsll.vi v24, v16, 2 +; RV64I-NEXT: vxor.vx v24, v24, a0 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: vsub.vx v0, v8, a1 ; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v16 +; RV64I-NEXT: vand.vv v8, v8, v0 +; RV64I-NEXT: vadd.vv v0, v24, v24 +; RV64I-NEXT: vxor.vv v0, v24, v0 ; RV64I-NEXT: vsrl.vi v16, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vand.vv v16, v16, v0 ; RV64I-NEXT: vsub.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v8, a0 +; RV64I-NEXT: vand.vv v16, v8, v24 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vv v8, v8, v24 ; RV64I-NEXT: vadd.vv v8, v16, v8 ; RV64I-NEXT: vsrl.vi v16, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: addi a1, sp, 16 +; RV64I-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64I-NEXT: vsrl.vi v16, v16, 3 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vmul.vv v8, v8, v16 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: csrr a0, vlenb +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: add sp, sp, a0 +; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32F-LABEL: cttz_nxv8i64: @@ -2813,41 +2791,31 @@ define @cttz_zero_undef_nxv16i32( %va) { define @cttz_zero_undef_nxv1i64( %va) { ; RV32I-LABEL: cttz_zero_undef_nxv1i64: ; RV32I: # %bb.0: -; RV32I-NEXT: li a0, 1 -; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV32I-NEXT: vsub.vx v9, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32I-NEXT: vsll.vi v10, v9, 2 +; RV32I-NEXT: vxor.vv v10, v9, v10 +; RV32I-NEXT: vadd.vv v11, v10, v10 +; RV32I-NEXT: vxor.vv v11, v10, v11 +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsub.vx v12, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vand.vv v11, v12, v11 +; RV32I-NEXT: vsub.vv v8, v8, v11 +; RV32I-NEXT: vand.vv v11, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vadd.vv v8, v11, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vand.vv v8, v8, v9 -; 
RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32I-NEXT: vsrl.vi v10, v9, 3 +; RV32I-NEXT: vand.vv v9, v9, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -2855,38 +2823,33 @@ define @cttz_zero_undef_nxv1i64( %va) { ; ; RV64I-LABEL: cttz_zero_undef_nxv1i64: ; RV64I: # %bb.0: -; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; RV64I-NEXT: vsub.vx v9, v8, a0 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v9 -; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v9, a0 -; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v9, v8, a0 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: vadd.vv v8, v9, v8 -; RV64I-NEXT: vsrl.vi v9, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v9 ; RV64I-NEXT: lui a0, 61681 ; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV64I-NEXT: vmv.v.x v9, a0 +; RV64I-NEXT: vsll.vi v10, v9, 2 +; RV64I-NEXT: vxor.vx v10, v10, a0 +; RV64I-NEXT: vadd.vv v11, v10, v10 +; RV64I-NEXT: vxor.vv v11, v10, v11 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: vsub.vx v12, v8, a1 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vand.vv v11, v12, v11 +; RV64I-NEXT: vsub.vv v8, v8, v11 +; RV64I-NEXT: vand.vv v11, v8, v10 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: vadd.vv v8, v11, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v9, v9, 3 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vmul.vv v8, v8, v9 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -2933,41 +2896,31 @@ define @cttz_zero_undef_nxv1i64( %va) { define @cttz_zero_undef_nxv2i64( %va) { ; RV32I-LABEL: cttz_zero_undef_nxv2i64: ; RV32I: # %bb.0: -; RV32I-NEXT: li a0, 1 -; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV32I-NEXT: vsub.vx v10, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; 
RV32I-NEXT: vsll.vi v12, v10, 2 +; RV32I-NEXT: vxor.vv v12, v10, v12 +; RV32I-NEXT: vadd.vv v14, v12, v12 +; RV32I-NEXT: vxor.vv v14, v12, v14 +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsub.vx v16, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vand.vv v14, v16, v14 +; RV32I-NEXT: vsub.vv v8, v8, v14 +; RV32I-NEXT: vand.vv v14, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vadd.vv v8, v14, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 ; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32I-NEXT: vsrl.vi v12, v10, 3 +; RV32I-NEXT: vand.vv v10, v10, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -2975,38 +2928,33 @@ define @cttz_zero_undef_nxv2i64( %va) { ; ; RV64I-LABEL: cttz_zero_undef_nxv2i64: ; RV64I: # %bb.0: -; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma -; RV64I-NEXT: vsub.vx v10, v8, a0 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v10 -; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v10, a0 -; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v10, v8, a0 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: vadd.vv v8, v10, v8 -; RV64I-NEXT: vsrl.vi v10, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v10 ; RV64I-NEXT: lui a0, 61681 ; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64I-NEXT: vmv.v.x v10, a0 +; RV64I-NEXT: vsll.vi v12, v10, 2 +; RV64I-NEXT: vxor.vx v12, v12, a0 +; RV64I-NEXT: vadd.vv v14, v12, v12 +; RV64I-NEXT: vxor.vv v14, v12, v14 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: vsub.vx v16, v8, a1 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vand.vv v14, v16, v14 +; RV64I-NEXT: vsub.vv v8, v8, v14 +; RV64I-NEXT: vand.vv v14, v8, v12 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: vadd.vv v8, v14, v8 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v10, v10, 3 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vmul.vv v8, v8, v10 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -3053,41 +3001,31 @@ define @cttz_zero_undef_nxv2i64( %va) { define @cttz_zero_undef_nxv4i64( %va) { ; RV32I-LABEL: cttz_zero_undef_nxv4i64: ; RV32I: # %bb.0: -; RV32I-NEXT: li a0, 1 -; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV32I-NEXT: vsub.vx v12, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vsrl.vi v12, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: 
vand.vv v12, v12, v16 -; RV32I-NEXT: vsub.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32I-NEXT: vand.vv v16, v8, v12 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: vadd.vv v8, v16, v8 -; RV32I-NEXT: vsrl.vi v12, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v12 ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV32I-NEXT: vmv.v.x v12, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vsll.vi v16, v12, 2 +; RV32I-NEXT: vxor.vv v16, v12, v16 +; RV32I-NEXT: vadd.vv v20, v16, v16 +; RV32I-NEXT: vxor.vv v20, v16, v20 +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsub.vx v24, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v24 +; RV32I-NEXT: vsrl.vi v24, v8, 1 +; RV32I-NEXT: vand.vv v20, v24, v20 +; RV32I-NEXT: vsub.vv v8, v8, v20 +; RV32I-NEXT: vand.vv v20, v8, v16 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vadd.vv v8, v20, v8 +; RV32I-NEXT: vsrl.vi v16, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v16 ; RV32I-NEXT: vand.vv v8, v8, v12 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV32I-NEXT: vmv.v.x v12, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32I-NEXT: vsrl.vi v16, v12, 3 +; RV32I-NEXT: vand.vv v12, v12, v16 ; RV32I-NEXT: vmul.vv v8, v8, v12 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 @@ -3095,38 +3033,33 @@ define @cttz_zero_undef_nxv4i64( %va) { ; ; RV64I-LABEL: cttz_zero_undef_nxv4i64: ; RV64I: # %bb.0: -; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma -; RV64I-NEXT: vsub.vx v12, v8, a0 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v12 -; RV64I-NEXT: vsrl.vi v12, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v12, a0 -; RV64I-NEXT: vsub.vv v8, v8, v12 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v12, v8, a0 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: vadd.vv v8, v12, v8 -; RV64I-NEXT: vsrl.vi v12, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v12 ; RV64I-NEXT: lui a0, 61681 ; RV64I-NEXT: addiw a0, a0, -241 ; RV64I-NEXT: slli a1, a0, 32 ; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV64I-NEXT: vmv.v.x v12, a0 +; RV64I-NEXT: vsll.vi v16, v12, 2 +; RV64I-NEXT: vxor.vx v16, v16, a0 +; RV64I-NEXT: vadd.vv v20, v16, v16 +; RV64I-NEXT: vxor.vv v20, v16, v20 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: vsub.vx v24, v8, a1 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v24 +; RV64I-NEXT: vsrl.vi v24, v8, 1 +; RV64I-NEXT: vand.vv v20, v24, v20 +; RV64I-NEXT: vsub.vv v8, v8, v20 +; RV64I-NEXT: vand.vv v20, v8, v16 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vv v8, v8, v16 +; RV64I-NEXT: vadd.vv v8, v20, v8 +; RV64I-NEXT: vsrl.vi v16, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v16 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v12, v12, 3 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vmul.vv v8, v8, v12 ; 
RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 ; RV64I-NEXT: ret @@ -3173,82 +3106,105 @@ define @cttz_zero_undef_nxv4i64( %va) { define @cttz_zero_undef_nxv8i64( %va) { ; RV32I-LABEL: cttz_zero_undef_nxv8i64: ; RV32I: # %bb.0: -; RV32I-NEXT: li a0, 1 -; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV32I-NEXT: vsub.vx v16, v8, a0 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vsrl.vi v16, v8, 1 -; RV32I-NEXT: lui a0, 349525 -; RV32I-NEXT: addi a0, a0, 1365 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v24, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v16, v16, v24 -; RV32I-NEXT: vsub.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi a0, a0, 819 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32I-NEXT: vand.vv v24, v8, v16 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: vadd.vv v8, v24, v8 -; RV32I-NEXT: vsrl.vi v16, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v16 +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: sub sp, sp, a0 +; RV32I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32I-NEXT: lui a0, 61681 ; RV32I-NEXT: addi a0, a0, -241 ; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32I-NEXT: vmv.v.x v16, a0 ; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32I-NEXT: vsll.vi v24, v16, 2 +; RV32I-NEXT: vxor.vv v24, v16, v24 +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: vsub.vx v0, v8, a0 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v0 +; RV32I-NEXT: addi a0, sp, 16 +; RV32I-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32I-NEXT: vadd.vv v0, v24, v24 +; RV32I-NEXT: vxor.vv v0, v24, v0 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: add a0, sp, a0 +; RV32I-NEXT: addi a0, a0, 16 +; RV32I-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32I-NEXT: vsrl.vi v0, v8, 1 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: add a0, sp, a0 +; RV32I-NEXT: addi a0, a0, 16 +; RV32I-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32I-NEXT: vand.vv v0, v0, v8 +; RV32I-NEXT: addi a0, sp, 16 +; RV32I-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32I-NEXT: vsub.vv v8, v8, v0 +; RV32I-NEXT: vand.vv v0, v8, v24 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v24 +; RV32I-NEXT: vadd.vv v8, v0, v8 +; RV32I-NEXT: vsrl.vi v24, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v24 ; RV32I-NEXT: vand.vv v8, v8, v16 -; RV32I-NEXT: lui a0, 4112 -; RV32I-NEXT: addi a0, a0, 257 -; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32I-NEXT: vmv.v.x v16, a0 -; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32I-NEXT: vsrl.vi v24, v16, 3 +; RV32I-NEXT: vand.vv v16, v16, v24 ; RV32I-NEXT: vmul.vv v8, v8, v16 ; RV32I-NEXT: li a0, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a0 +; RV32I-NEXT: csrr a0, vlenb +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: add sp, sp, a0 +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: cttz_zero_undef_nxv8i64: ; RV64I: # %bb.0: -; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: csrr a0, vlenb +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: sub sp, sp, a0 +; RV64I-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 
0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64I-NEXT: vsub.vx v16, v8, a0 +; RV64I-NEXT: vmv.v.x v16, a0 +; RV64I-NEXT: addi a1, sp, 16 +; RV64I-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64I-NEXT: vsll.vi v24, v16, 2 +; RV64I-NEXT: vxor.vx v24, v24, a0 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: vsub.vx v0, v8, a1 ; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v16 +; RV64I-NEXT: vand.vv v8, v8, v0 +; RV64I-NEXT: vadd.vv v0, v24, v24 +; RV64I-NEXT: vxor.vv v0, v24, v0 ; RV64I-NEXT: vsrl.vi v16, v8, 1 -; RV64I-NEXT: lui a0, 349525 -; RV64I-NEXT: addiw a0, a0, 1365 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vand.vv v16, v16, v0 ; RV64I-NEXT: vsub.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 209715 -; RV64I-NEXT: addiw a0, a0, 819 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vand.vx v16, v8, a0 +; RV64I-NEXT: vand.vv v16, v8, v24 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vand.vv v8, v8, v24 ; RV64I-NEXT: vadd.vv v8, v16, v8 ; RV64I-NEXT: vsrl.vi v16, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v16 -; RV64I-NEXT: lui a0, 61681 -; RV64I-NEXT: addiw a0, a0, -241 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: vand.vx v8, v8, a0 -; RV64I-NEXT: lui a0, 4112 -; RV64I-NEXT: addiw a0, a0, 257 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: addi a1, sp, 16 +; RV64I-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64I-NEXT: vsrl.vi v16, v16, 3 +; RV64I-NEXT: vand.vx v16, v16, a0 +; RV64I-NEXT: vmul.vv v8, v8, v16 ; RV64I-NEXT: li a0, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a0 +; RV64I-NEXT: csrr a0, vlenb +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: add sp, sp, a0 +; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; CHECK-F-LABEL: cttz_zero_undef_nxv8i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll index 145ce6e917f96..071b76899e752 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll @@ -1365,41 +1365,31 @@ declare @llvm.vp.cttz.nxv1i64(, i1 immarg, define @vp_cttz_nxv1i64( %va, %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv1i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, 
a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2, v0.t +; RV32-NEXT: vxor.vv v10, v9, v10, v0.t +; RV32-NEXT: vsll.vi v11, v10, 1, v0.t +; RV32-NEXT: vxor.vv v11, v10, v11, v0.t +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v12, v8, a0, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vand.vv v11, v12, v11, v0.t +; RV32-NEXT: vsub.vv v8, v8, v11, v0.t +; RV32-NEXT: vand.vv v11, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vadd.vv v8, v11, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3, v0.t +; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -1407,38 +1397,34 @@ define @vp_cttz_nxv1i64( %va, @vp_cttz_nxv1i64( %va, @vp_cttz_nxv1i64_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv1i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2 +; RV32-NEXT: vxor.vv v10, v9, v10 +; RV32-NEXT: vadd.vv v11, v10, v10 +; RV32-NEXT: vxor.vv v11, v10, v11 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v12, v8, a0 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vand.vv v11, v12, v11 +; RV32-NEXT: vsub.vv v8, v8, v11 +; RV32-NEXT: vand.vv v11, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v11, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1497,38 +1473,34 @@ define @vp_cttz_nxv1i64_unmasked( %va, i32 ; ; RV64-LABEL: vp_cttz_nxv1i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, 
a1 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vsll.vi v10, v9, 2 +; RV64-NEXT: vxor.vx v10, v10, a1 +; RV64-NEXT: vadd.vv v11, v10, v10 +; RV64-NEXT: vxor.vv v11, v10, v11 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v12, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vand.vv v11, v12, v11 +; RV64-NEXT: vsub.vv v8, v8, v11 +; RV64-NEXT: vand.vv v11, v8, v10 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: vadd.vv v8, v11, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v9, v9, 3 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vmul.vv v8, v8, v9 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1549,41 +1521,31 @@ declare @llvm.vp.cttz.nxv2i64(, i1 immarg, define @vp_cttz_nxv2i64( %va, %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv2i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2, v0.t +; RV32-NEXT: vxor.vv v12, v10, v12, v0.t +; RV32-NEXT: vsll.vi v14, v12, 1, v0.t +; RV32-NEXT: vxor.vv v14, v12, v14, v0.t +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v16, v8, a0, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vand.vv v14, v16, v14, v0.t +; RV32-NEXT: vsub.vv v8, v8, v14, v0.t +; RV32-NEXT: vand.vv v14, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vadd.vv v8, v14, v8, v0.t +; RV32-NEXT: 
vsrl.vi v12, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v12, v0.t ; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3, v0.t +; RV32-NEXT: vand.vv v10, v10, v12, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -1591,38 +1553,34 @@ define @vp_cttz_nxv2i64( %va, @vp_cttz_nxv2i64( %va, @vp_cttz_nxv2i64_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv2i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2 +; RV32-NEXT: vxor.vv v12, v10, v12 +; RV32-NEXT: vadd.vv v14, v12, v12 +; RV32-NEXT: vxor.vv v14, v12, v14 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v16, v8, a0 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v14, v16, v14 +; RV32-NEXT: vsub.vv v8, v8, v14 +; RV32-NEXT: vand.vv v14, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v14, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1681,38 +1629,34 @@ define @vp_cttz_nxv2i64_unmasked( %va, i32 ; ; RV64-LABEL: vp_cttz_nxv2i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: vsll.vi v12, v10, 2 +; RV64-NEXT: vxor.vx v12, v12, a1 +; RV64-NEXT: vadd.vv v14, v12, v12 +; RV64-NEXT: vxor.vv v14, v12, v14 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v16, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 
209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vv v14, v16, v14 +; RV64-NEXT: vsub.vv v8, v8, v14 +; RV64-NEXT: vand.vv v14, v8, v12 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vadd.vv v8, v14, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v10, v10, 3 +; RV64-NEXT: vand.vx v10, v10, a1 +; RV64-NEXT: vmul.vv v8, v8, v10 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1733,41 +1677,31 @@ declare @llvm.vp.cttz.nxv4i64(, i1 immarg, define @vp_cttz_nxv4i64( %va, %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv4i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vi v16, v12, 2, v0.t +; RV32-NEXT: vxor.vv v16, v12, v16, v0.t +; RV32-NEXT: vsll.vi v20, v16, 1, v0.t +; RV32-NEXT: vxor.vv v20, v16, v20, v0.t +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v24, v8, a0, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vand.vv v20, v24, v20, v0.t +; RV32-NEXT: vsub.vv v8, v8, v20, v0.t +; RV32-NEXT: vand.vv v20, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v20, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v16, v12, 3, v0.t +; RV32-NEXT: vand.vv v12, v12, v16, v0.t ; RV32-NEXT: vmul.vv v8, v8, v12, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -1775,38 +1709,34 @@ define @vp_cttz_nxv4i64( %va, @vp_cttz_nxv4i64( %va, @vp_cttz_nxv4i64_unmasked( 
%va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv4i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vi v16, v12, 2 +; RV32-NEXT: vxor.vv v16, v12, v16 +; RV32-NEXT: vadd.vv v20, v16, v16 +; RV32-NEXT: vxor.vv v20, v16, v20 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v24, v8, a0 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vand.vv v20, v24, v20 +; RV32-NEXT: vsub.vv v8, v8, v20 +; RV32-NEXT: vand.vv v20, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v20, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v16, v12, 3 +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1865,38 +1785,34 @@ define @vp_cttz_nxv4i64_unmasked( %va, i32 ; ; RV64-LABEL: vp_cttz_nxv4i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a1 +; RV64-NEXT: vsll.vi v16, v12, 2 +; RV64-NEXT: vxor.vx v16, v16, a1 +; RV64-NEXT: vadd.vv v20, v16, v16 +; RV64-NEXT: vxor.vv v20, v16, v20 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v24, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vv v20, v24, v20 +; RV64-NEXT: vsub.vv v8, v8, v20 +; RV64-NEXT: vand.vv v20, v8, v16 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: 
add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vadd.vv v8, v20, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v12, v12, 3 +; RV64-NEXT: vand.vx v12, v12, a1 +; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1917,82 +1833,182 @@ declare @llvm.vp.cttz.nxv7i64(, i1 immarg, define @vp_cttz_nxv7i64( %va, %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv7i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vsll.vi v24, v8, 2, v0.t +; RV32-NEXT: vxor.vv v24, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v24, 1, v0.t +; RV32-NEXT: vxor.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: li a0, 1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v16, v8, a0, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; 
RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv7i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 2, v0.t +; RV64-NEXT: vxor.vx v24, v16, a1, v0.t +; RV64-NEXT: vsll.vi v16, v24, 1, v0.t +; RV64-NEXT: vxor.vv v8, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: li a0, 1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vx v16, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, 
(a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vv v16, v8, v24, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v24, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_cttz_nxv7i64: @@ -2007,82 +2023,106 @@ define @vp_cttz_nxv7i64( %va, @vp_cttz_nxv7i64_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv7i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v24, v16, 2 +; RV32-NEXT: vxor.vv v24, v16, v24 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v0, v8, a0 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: 
vand.vv v8, v8, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vadd.vv v0, v24, v24 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv7i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vsll.vi v24, v16, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v0, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v0, v24, v0 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vv v16, v16, v0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vv v16, v8, v24 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 
+; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_cttz_nxv7i64_unmasked: @@ -2101,82 +2141,182 @@ declare @llvm.vp.cttz.nxv8i64(, i1 immarg, define @vp_cttz_nxv8i64( %va, %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv8i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vsll.vi v24, v8, 2, v0.t +; RV32-NEXT: vxor.vv v24, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v24, 1, v0.t +; RV32-NEXT: vxor.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: li a0, 1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v16, v8, a0, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; 
RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 2, v0.t +; RV64-NEXT: vxor.vx v24, v16, a1, v0.t +; RV64-NEXT: vsll.vi v16, v24, 1, v0.t +; RV64-NEXT: vxor.vv v8, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: li a0, 1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vx v16, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, 
vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vv v16, v8, v24, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v24, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i64: @@ -2191,82 +2331,106 @@ define @vp_cttz_nxv8i64( %va, @vp_cttz_nxv8i64_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv8i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v24, v16, 2 +; RV32-NEXT: vxor.vv v24, v16, v24 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v0, v8, a0 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vadd.vv v0, v24, v24 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; 
RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv8i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vsll.vi v24, v16, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v0, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v0, v24, v0 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vv v16, v16, v0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vv v16, v8, v24 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, 
sp, 16 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_cttz_nxv8i64_unmasked: @@ -2288,13 +2452,19 @@ define @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill @@ -2669,62 +2985,53 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: sub a2, a0, a1 ; RV32-NEXT: sltu a3, a0, a2 ; RV32-NEXT: addi a3, a3, -1 -; RV32-NEXT: and a3, a3, a2 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: addi a3, a3, -241 +; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: li a2, 1 -; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v16, a2 +; RV32-NEXT: vsub.vx v0, v16, a2 ; RV32-NEXT: vnot.v v16, v16 -; RV32-NEXT: vand.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v24, v8, 1 -; RV32-NEXT: lui a4, 349525 -; RV32-NEXT: addi a4, a4, 1365 -; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a4 -; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vsub.vv v8, v8, v24 -; RV32-NEXT: lui a4, 209715 -; RV32-NEXT: addi a4, a4, 819 -; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v0, a4 -; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v0 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v24 -; RV32-NEXT: lui a4, 61681 -; RV32-NEXT: addi a4, a4, -241 -; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a4 -; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: lui a4, 4112 -; RV32-NEXT: addi a4, a4, 257 -; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 -; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v16, v0 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: vadd.vv v16, v0, v0 +; RV32-NEXT: vxor.vv v16, v0, v16 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vmul.vv v16, v16, v8 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: csrr a3, vlenb +; 
RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16 +; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a3, 56 -; RV32-NEXT: vsrl.vx v8, v16, a3 -; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vsrl.vx v8, v8, a3 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: bltu a0, a1, .LBB47_2 ; RV32-NEXT: # %bb.1: @@ -2732,45 +3039,52 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: .LBB47_2: ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v16, v24, a2 -; RV32-NEXT: vnot.v v24, v24 -; RV32-NEXT: vand.vv v16, v24, v16 -; RV32-NEXT: vsrl.vi v24, v16, 1 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v8, v16, a2 +; RV32-NEXT: vnot.v v0, v16 +; RV32-NEXT: vand.vv v16, v0, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: vadd.vv v8, v0, v0 +; RV32-NEXT: vxor.vv v8, v0, v8 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8 -; RV32-NEXT: vsub.vv v16, v16, v24 -; RV32-NEXT: vand.vv v24, v16, v0 -; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vand.vv v16, v16, v0 -; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: vsrl.vi v24, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v24 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v16, 1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v16, v8 +; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v8, v8, a3 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v16, v8 -; RV32-NEXT: vsrl.vx v8, v8, a3 -; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size 
Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 @@ -2778,65 +3092,109 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; ; RV64-LABEL: vp_cttz_nxv16i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: sub a2, a0, a1 -; RV64-NEXT: sltu a3, a0, a2 +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: sub a1, a0, a2 +; RV64-NEXT: sltu a3, a0, a1 ; RV64-NEXT: addi a3, a3, -1 -; RV64-NEXT: and a3, a3, a2 -; RV64-NEXT: li a2, 1 +; RV64-NEXT: and a3, a3, a1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v16, a2 +; RV64-NEXT: li a3, 1 +; RV64-NEXT: vsub.vx v0, v16, a3 ; RV64-NEXT: vnot.v v16, v16 -; RV64-NEXT: vand.vv v16, v16, v24 -; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: lui a3, 349525 -; RV64-NEXT: addiw a3, a3, 1365 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v24, a3 -; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: lui a4, 209715 -; RV64-NEXT: addiw a4, a4, 819 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v24, v16, a4 -; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a4 -; RV64-NEXT: vadd.vv v16, v24, v16 -; RV64-NEXT: vsrl.vi v24, v16, 4 -; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: lui a5, 61681 -; RV64-NEXT: addiw a5, a5, -241 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vand.vx v16, v16, a5 -; RV64-NEXT: lui a6, 4112 -; RV64-NEXT: addiw a6, a6, 257 -; RV64-NEXT: slli a7, a6, 32 -; RV64-NEXT: add a6, a6, a7 -; RV64-NEXT: vmul.vx v16, v16, a6 -; RV64-NEXT: li a7, 56 -; RV64-NEXT: vsrl.vx v16, v16, a7 -; RV64-NEXT: bltu a0, a1, .LBB47_2 +; RV64-NEXT: vand.vv v16, v16, v0 +; RV64-NEXT: vsll.vi v0, v8, 2 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v24, v0, v0 +; RV64-NEXT: vxor.vv v24, v0, v24 +; RV64-NEXT: vsrl.vi v8, v16, 1 +; RV64-NEXT: vand.vv v8, v8, v24 +; RV64-NEXT: vsub.vv v8, v16, v8 +; RV64-NEXT: vand.vv v16, v8, v0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV64-NEXT: vmv8r.v v0, v16 +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: li a4, 56 +; RV64-NEXT: vsrl.vx v8, v8, a4 +; RV64-NEXT: addi a5, sp, 16 +; RV64-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill +; 
RV64-NEXT: bltu a0, a2, .LBB47_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a1 +; RV64-NEXT: mv a0, a2 ; RV64-NEXT: .LBB47_2: ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v8, a2 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v24 -; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: vand.vx v24, v24, a3 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: vand.vx v24, v8, a4 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vx v8, v16, a3 +; RV64-NEXT: vnot.v v24, v16 +; RV64-NEXT: vand.vv v8, v24, v8 +; RV64-NEXT: vsll.vi v24, v0, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v16, v24, v0 +; RV64-NEXT: vsrl.vi v0, v8, 1 +; RV64-NEXT: vand.vv v16, v0, v16 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vv v16, v8, v24 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a4 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: vand.vx v8, v8, a5 -; RV64-NEXT: vmul.vx v8, v8, a6 -; RV64-NEXT: vsrl.vx v8, v8, a7 +; RV64-NEXT: vand.vv v8, v8, v24 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsrl.vx v8, v8, a4 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i64_unmasked: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll index 36f22bd3259cf..8ab3ee6206de7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll @@ -913,35 +913,26 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: vor.vv v8, v8, v9, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t +; RV32-NEXT: vsll.vi v11, v10, 2, v0.t +; RV32-NEXT: vxor.vv v11, v10, v11, v0.t +; RV32-NEXT: vsll.vi v12, v11, 1, v0.t +; RV32-NEXT: vxor.vv v12, v11, v12, v0.t +; RV32-NEXT: vand.vv v9, v9, v12, v0.t ; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v8, v11, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v11, v0.t +; RV32-NEXT: vadd.vv v8, v9, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 4, 
v0.t ; RV32-NEXT: vadd.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v9, v10, 3, v0.t +; RV32-NEXT: vand.vv v9, v10, v9, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -960,37 +951,34 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v9, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0, v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v9, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v9, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vsll.vi v11, v10, 2, v0.t +; RV64-NEXT: vxor.vx v11, v11, a1, v0.t +; RV64-NEXT: vsll.vi v12, v11, 1, v0.t +; RV64-NEXT: vxor.vv v12, v11, v12, v0.t +; RV64-NEXT: vand.vv v9, v9, v12, v0.t ; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t +; RV64-NEXT: vand.vv v9, v8, v11, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v11, v0.t ; RV64-NEXT: vadd.vv v8, v9, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v9, v10, 3, v0.t +; RV64-NEXT: vand.vx v9, v9, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v9, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -1017,35 +1005,26 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsll.vi v11, v10, 2 +; RV32-NEXT: vxor.vv v11, v10, v11 +; RV32-NEXT: vadd.vv v12, v11, v11 +; RV32-NEXT: vxor.vv v12, v11, v12 +; RV32-NEXT: vand.vv v9, v9, v12 ; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 
4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v11 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vadd.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v9, v10, 3 +; RV32-NEXT: vand.vv v9, v10, v9 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1064,37 +1043,34 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v9, v8, a1 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vsll.vi v11, v10, 2 +; RV64-NEXT: vxor.vx v11, v11, a1 +; RV64-NEXT: vadd.vv v12, v11, v11 +; RV64-NEXT: vxor.vv v12, v11, v12 +; RV64-NEXT: vand.vv v9, v9, v12 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vv v9, v8, v11 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v11 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v9, v10, 3 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vmul.vv v8, v8, v9 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1125,35 +1101,26 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: vor.vv v8, v8, v10, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t +; RV32-NEXT: vsll.vi v14, v12, 2, v0.t +; RV32-NEXT: vxor.vv v14, v12, v14, v0.t +; RV32-NEXT: vsll.vi v16, v14, 1, v0.t +; 
RV32-NEXT: vxor.vv v16, v14, v16, v0.t +; RV32-NEXT: vand.vv v10, v10, v16, v0.t ; RV32-NEXT: vsub.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v8, v14, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v14, v0.t +; RV32-NEXT: vadd.vv v8, v10, v8, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v10, v12, 3, v0.t +; RV32-NEXT: vand.vv v10, v12, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -1172,37 +1139,34 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0, v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v10, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vsll.vi v14, v12, 2, v0.t +; RV64-NEXT: vxor.vx v14, v14, a1, v0.t +; RV64-NEXT: vsll.vi v16, v14, 1, v0.t +; RV64-NEXT: vxor.vv v16, v14, v16, v0.t +; RV64-NEXT: vand.vv v10, v10, v16, v0.t ; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vand.vv v10, v8, v14, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v14, v0.t ; RV64-NEXT: vadd.vv v8, v10, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v10, v12, 3, v0.t +; RV64-NEXT: vand.vx v10, v10, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v10, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -1229,35 +1193,26 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vnot.v v8, v8 ; 
RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 +; RV32-NEXT: vsll.vi v14, v12, 2 +; RV32-NEXT: vxor.vv v14, v12, v14 +; RV32-NEXT: vadd.vv v16, v14, v14 +; RV32-NEXT: vxor.vv v16, v14, v16 +; RV32-NEXT: vand.vv v10, v10, v16 ; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v14 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vadd.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v10, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v10, v12, 3 +; RV32-NEXT: vand.vv v10, v12, v10 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1276,37 +1231,34 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v10, v8, a1 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vsll.vi v14, v12, 2 +; RV64-NEXT: vxor.vx v14, v14, a1 +; RV64-NEXT: vadd.vv v16, v14, v14 +; RV64-NEXT: vxor.vv v16, v14, v16 +; RV64-NEXT: vand.vv v10, v10, v16 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vv v10, v8, v14 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v14 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v10, v12, 3 +; RV64-NEXT: vand.vx v10, v10, a1 +; RV64-NEXT: vmul.vv v8, v8, v10 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; 
RV64-NEXT: ret @@ -1335,38 +1287,29 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vnot.v v12, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v12, 1, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v12, v0.t +; RV32-NEXT: vsll.vi v20, v8, 2, v0.t +; RV32-NEXT: vxor.vv v20, v8, v20, v0.t +; RV32-NEXT: vsll.vi v24, v20, 1, v0.t +; RV32-NEXT: vxor.vv v24, v20, v24, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vsub.vv v12, v12, v16, v0.t +; RV32-NEXT: vand.vv v16, v12, v20, v0.t +; RV32-NEXT: vsrl.vi v12, v12, 2, v0.t +; RV32-NEXT: vand.vv v12, v12, v20, v0.t +; RV32-NEXT: vadd.vv v12, v16, v12, v0.t +; RV32-NEXT: vsrl.vi v16, v12, 4, v0.t +; RV32-NEXT: vadd.vv v12, v12, v16, v0.t +; RV32-NEXT: vand.vv v12, v12, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v12, v8, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: ret @@ -1384,37 +1327,34 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0, v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v12, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 
-; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vnot.v v12, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v12, 1, v0.t +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vsll.vi v20, v8, 2, v0.t +; RV64-NEXT: vxor.vx v20, v20, a1, v0.t +; RV64-NEXT: vsll.vi v24, v20, 1, v0.t +; RV64-NEXT: vxor.vv v24, v20, v24, v0.t +; RV64-NEXT: vand.vv v16, v16, v24, v0.t +; RV64-NEXT: vsub.vv v12, v12, v16, v0.t +; RV64-NEXT: vand.vv v16, v12, v20, v0.t +; RV64-NEXT: vsrl.vi v12, v12, 2, v0.t +; RV64-NEXT: vand.vv v12, v12, v20, v0.t +; RV64-NEXT: vadd.vv v12, v16, v12, v0.t +; RV64-NEXT: vsrl.vi v16, v12, 4, v0.t +; RV64-NEXT: vadd.vv v12, v12, v16, v0.t +; RV64-NEXT: vand.vx v12, v12, a1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 3, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vmul.vv v8, v12, v8, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -1441,35 +1381,26 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsll.vi v20, v16, 2 +; RV32-NEXT: vxor.vv v20, v16, v20 +; RV32-NEXT: vadd.vv v24, v20, v20 +; RV32-NEXT: vxor.vv v24, v20, v24 +; RV32-NEXT: vand.vv v12, v12, v24 ; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v20 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v20 +; RV32-NEXT: vadd.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v12, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v12, v16, 3 +; RV32-NEXT: vand.vv v12, v16, v12 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1488,37 +1419,34 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v12, v8, a1 ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: lui a1, 61681 
+; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vsll.vi v20, v16, 2 +; RV64-NEXT: vxor.vx v20, v20, a1 +; RV64-NEXT: vadd.vv v24, v20, v20 +; RV64-NEXT: vxor.vv v24, v20, v24 +; RV64-NEXT: vand.vv v12, v12, v24 ; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vv v12, v8, v20 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v20 ; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v12, v16, 3 +; RV64-NEXT: vand.vx v12, v12, a1 +; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1533,24 +1461,13 @@ declare <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32) define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -1566,40 +1483,80 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 
; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsll.vi v24, v8, 1, v0.t +; RV32-NEXT: vxor.vv v16, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v24, v16, v0.t +; RV32-NEXT: vand.vv v24, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 2, v0.t +; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v15i64: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t @@ -1611,39 +1568,65 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; 
RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v8, v8, 2, v0.t +; RV64-NEXT: vxor.vx v8, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: vxor.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v24, v16, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v16, v24, v16, v0.t +; RV64-NEXT: vand.vv v24, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: vadd.vv v8, v24, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl) ret <15 x i64> %v @@ -1652,24 +1635,12 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v15i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: addi sp, 
sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v16 @@ -1685,40 +1656,58 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsll.vi v0, v8, 2 +; RV32-NEXT: vxor.vv v0, v8, v0 +; RV32-NEXT: vadd.vv v24, v0, v0 +; RV32-NEXT: vxor.vv v24, v0, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v16, v24 +; RV32-NEXT: vand.vv v24, v16, v0 +; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: vsrl.vi v24, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v24 +; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: vsrl.vi v24, v8, 3 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vmul.vv v8, v16, v8 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v15i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v16 @@ -1730,39 +1719,54 @@ define <15 x i64> 
@vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v16, v8, a1 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vnot.v v16, v8 +; RV64-NEXT: vsrl.vi v8, v16, 1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v0, v8, 2 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v24, v0, v0 +; RV64-NEXT: vxor.vv v24, v0, v24 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v24, v8, v24 +; RV64-NEXT: vsub.vv v16, v16, v24 +; RV64-NEXT: vand.vv v24, v16, v0 +; RV64-NEXT: vsrl.vi v16, v16, 2 +; RV64-NEXT: vand.vv v16, v16, v0 +; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vadd.vv v16, v16, v24 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v8, v8, 3 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vmul.vv v8, v16, v8 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -1775,24 +1779,13 @@ declare <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32) define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: 
lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -1808,40 +1801,80 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: addi sp, sp, 32 -; RV32-NEXT: ret -; -; RV64-LABEL: vp_ctlz_v16i64: +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsll.vi v24, v8, 1, v0.t +; RV32-NEXT: vxor.vv v16, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v24, v16, v0.t +; RV32-NEXT: vand.vv v24, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 2, v0.t +; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, 
a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: li a0, 56 +; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_ctlz_v16i64: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t @@ -1853,39 +1886,65 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v8, v8, 2, v0.t +; RV64-NEXT: vxor.vx v8, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: vxor.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v24, v16, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v16, v24, v16, v0.t +; RV64-NEXT: vand.vv v24, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: vadd.vv v8, v24, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: 
vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl) ret <16 x i64> %v @@ -1894,24 +1953,12 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v16i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v16 @@ -1927,40 +1974,58 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, 
(a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsll.vi v0, v8, 2 +; RV32-NEXT: vxor.vv v0, v8, v0 +; RV32-NEXT: vadd.vv v24, v0, v0 +; RV32-NEXT: vxor.vv v24, v0, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v16, v24 +; RV32-NEXT: vand.vv v24, v16, v0 +; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: vsrl.vi v24, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v24 +; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: vsrl.vi v24, v8, 3 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vmul.vv v8, v16, v8 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v16i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v16 @@ -1972,39 +2037,54 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v16, v8, a1 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vnot.v v16, v8 +; RV64-NEXT: vsrl.vi v8, v16, 1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; 
RV64-NEXT: vsll.vi v0, v8, 2 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v24, v0, v0 +; RV64-NEXT: vxor.vv v24, v0, v24 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v24, v8, v24 +; RV64-NEXT: vsub.vv v16, v16, v24 +; RV64-NEXT: vand.vv v24, v16, v0 +; RV64-NEXT: vsrl.vi v16, v16, 2 +; RV64-NEXT: vand.vv v16, v16, v0 +; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vadd.vv v16, v16, v24 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v8, v8, 3 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vmul.vv v8, v16, v8 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -2017,39 +2097,24 @@ declare <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32) define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v32i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: li a1, 16 ; RV32-NEXT: vslidedown.vi v24, v0, 2 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a3, .LBB34_2 +; RV32-NEXT: bltu a0, a1, .LBB34_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB34_2: @@ -2069,113 +2134,114 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: 
vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: addi a3, a3, -241 +; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsll.vi v16, v8, 1, v0.t +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi 
a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 40 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 @@ -2184,9 +2250,10 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a0, a0, a3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t @@ 
-2201,81 +2268,119 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsll.vi v16, v8, 1, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vsub.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; 
RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v32i64: @@ -2283,23 +2388,25 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: li a2, 48 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: li a2, 16 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB34_2 +; RV64-NEXT: mv a3, a0 +; RV64-NEXT: bltu a0, a1, .LBB34_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: .LBB34_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t @@ -2310,81 +2417,238 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v16, 
v8, 1, v0.t -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v16, a2, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t -; RV64-NEXT: addi a7, sp, 16 -; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: li a5, 40 +; RV64-NEXT: mul a4, a4, a5 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v8, v8, 2, v0.t +; RV64-NEXT: vxor.vx v8, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vxor.vv v8, v8, v16, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v16, v16, v8, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v16, v8, v0.t +; 
RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 40 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t +; RV64-NEXT: li a3, 56 +; RV64-NEXT: vsrl.vx v8, v8, a3, v0.t +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV64-NEXT: addi a4, a0, -16 +; RV64-NEXT: sltu a0, a0, a4 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a4 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: li a4, 24 +; RV64-NEXT: mul a0, a0, a4 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vor.vv v16, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v8, v16, 2, v0.t +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV64-NEXT: vor.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 8, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 40 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsll.vi v16, v16, 2, v0.t +; RV64-NEXT: vxor.vx v16, v16, a1, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vi v8, v16, 1, v0.t +; RV64-NEXT: vxor.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # 
Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 40 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a3, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 48 +; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -2395,160 +2659,170 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 
0x22 # sp + 48 + 32 * vlenb -; RV32-NEXT: vmv8r.v v24, v16 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: li a2, 16 -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: mv a1, a0 ; RV32-NEXT: bltu a0, a2, .LBB35_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v16, v8, a2 -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: addi a3, a3, -241 +; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: vadd.vv v16, v0, v0 +; RV32-NEXT: vxor.vv v16, v0, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a3), zero -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v0 ; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, 
ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 1 -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vsub.vv v8, v8, v24 -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v0 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v16, v8, v16 +; RV32-NEXT: vsrl.vi v0, v16, 2 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vi v0, v16, 8 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vi v0, v16, 16 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vx v0, v16, a2 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vnot.v v8, v16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: vadd.vv v16, v0, v0 +; RV32-NEXT: vxor.vv v16, v0, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: 
vsub.vv v8, v8, v16 +; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v32i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB35_2 -; RV64-NEXT: # %bb.1: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; RV64-NEXT: li a1, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: bltu a0, a1, .LBB35_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB35_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vsrl.vi v24, v8, 2 @@ -2559,69 +2833,117 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vsrl.vi v24, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsrl.vx v24, v8, a1 +; RV64-NEXT: li a3, 32 +; RV64-NEXT: vsrl.vx v24, v8, a3 ; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v24, v24, a2 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v8, a3 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v0, v16, 2 +; RV64-NEXT: vmv8r.v v24, v16 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v16, v0, v0 +; RV64-NEXT: vxor.vv v16, v0, v16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vl8r.v v8, (a2) # 
Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vv v16, v8, v0 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4 -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5 -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6 -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v16, v24, 3 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsrl.vx v8, v8, a2 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV64-NEXT: addi a4, a0, -16 +; RV64-NEXT: sltu a0, a0, a4 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 +; RV64-NEXT: and a0, a0, a4 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: vsrl.vi v24, v16, 2 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: vsrl.vi v24, v16, 4 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: vsrl.vi v24, v16, 8 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: vsrl.vi v24, v16, 16 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: vsrl.vx v24, v16, a1 -; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a4, 24 +; RV64-NEXT: mul a0, a0, a4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v16, v8, v16 +; RV64-NEXT: vsrl.vi v0, v16, 2 +; RV64-NEXT: vor.vv v16, v16, v0 +; RV64-NEXT: vsrl.vi v0, v16, 4 +; RV64-NEXT: vor.vv v16, v16, v0 +; RV64-NEXT: vsrl.vi v0, v16, 8 +; RV64-NEXT: vor.vv v16, v16, v0 +; RV64-NEXT: vsrl.vi v0, v16, 16 +; RV64-NEXT: vor.vv v16, v16, v0 +; RV64-NEXT: vsrl.vx v0, v16, a3 +; RV64-NEXT: vor.vv v16, v16, v0 ; RV64-NEXT: vnot.v v16, v16 +; RV64-NEXT: vsll.vi v0, v24, 2 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v24, v0, v0 +; RV64-NEXT: vxor.vv v8, v0, v24 ; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a2 -; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx v24, v16, a3 -; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a3 -; RV64-NEXT: vadd.vv v16, v24, v16 -; RV64-NEXT: vsrl.vi v24, v16, 4 -; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a4 -; RV64-NEXT: vmul.vx v16, v16, a5 -; RV64-NEXT: vsrl.vx v16, v16, a6 +; RV64-NEXT: vand.vv v8, v24, v8 +; RV64-NEXT: vsub.vv v8, v16, v8 +; RV64-NEXT: vand.vv v16, v8, v0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; 
RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsrl.vx v16, v8, a2 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer @@ -3512,35 +3834,26 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe ; RV32-NEXT: vor.vv v8, v8, v9, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t +; RV32-NEXT: vsll.vi v11, v10, 2, v0.t +; RV32-NEXT: vxor.vv v11, v10, v11, v0.t +; RV32-NEXT: vsll.vi v12, v11, 1, v0.t +; RV32-NEXT: vxor.vv v12, v11, v12, v0.t +; RV32-NEXT: vand.vv v9, v9, v12, v0.t ; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v8, v11, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v11, v0.t +; RV32-NEXT: vadd.vv v8, v9, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v9, v10, 3, v0.t +; RV32-NEXT: vand.vv v9, v10, v9, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -3559,37 +3872,34 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe ; RV64-NEXT: vor.vv v8, v8, v9, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0, v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v9, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v9, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vsll.vi v11, v10, 2, v0.t +; RV64-NEXT: vxor.vx v11, v11, a1, v0.t 
+; RV64-NEXT: vsll.vi v12, v11, 1, v0.t +; RV64-NEXT: vxor.vv v12, v11, v12, v0.t +; RV64-NEXT: vand.vv v9, v9, v12, v0.t ; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t +; RV64-NEXT: vand.vv v9, v8, v11, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v11, v0.t ; RV64-NEXT: vadd.vv v8, v9, v8, v0.t ; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v9, v10, 3, v0.t +; RV64-NEXT: vand.vx v9, v9, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v9, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -3616,35 +3926,26 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsll.vi v11, v10, 2 +; RV32-NEXT: vxor.vv v11, v10, v11 +; RV32-NEXT: vadd.vv v12, v11, v11 +; RV32-NEXT: vxor.vv v12, v11, v12 +; RV32-NEXT: vand.vv v9, v9, v12 ; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v11 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vadd.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v9, v10, 3 +; RV32-NEXT: vand.vv v9, v10, v9 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3663,37 +3964,34 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v9, v8, a1 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 
+; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vsll.vi v11, v10, 2 +; RV64-NEXT: vxor.vx v11, v11, a1 +; RV64-NEXT: vadd.vv v12, v11, v11 +; RV64-NEXT: vxor.vv v12, v11, v12 +; RV64-NEXT: vand.vv v9, v9, v12 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vv v9, v8, v11 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v11 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v9, v10, 3 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vmul.vv v8, v8, v9 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3722,35 +4020,26 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe ; RV32-NEXT: vor.vv v8, v8, v10, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t +; RV32-NEXT: vsll.vi v14, v12, 2, v0.t +; RV32-NEXT: vxor.vv v14, v12, v14, v0.t +; RV32-NEXT: vsll.vi v16, v14, 1, v0.t +; RV32-NEXT: vxor.vv v16, v14, v16, v0.t +; RV32-NEXT: vand.vv v10, v10, v16, v0.t ; RV32-NEXT: vsub.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v8, v14, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v14, v0.t +; RV32-NEXT: vadd.vv v8, v10, v8, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v10, v12, 3, v0.t +; RV32-NEXT: vand.vv v10, v12, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -3769,37 +4058,34 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0, v0.t +; RV64-NEXT: li a1, 
32 +; RV64-NEXT: vsrl.vx v10, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vsll.vi v14, v12, 2, v0.t +; RV64-NEXT: vxor.vx v14, v14, a1, v0.t +; RV64-NEXT: vsll.vi v16, v14, 1, v0.t +; RV64-NEXT: vxor.vv v16, v14, v16, v0.t +; RV64-NEXT: vand.vv v10, v10, v16, v0.t ; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vand.vv v10, v8, v14, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v14, v0.t ; RV64-NEXT: vadd.vv v8, v10, v8, v0.t ; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v10, v12, 3, v0.t +; RV64-NEXT: vand.vx v10, v10, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v10, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -3826,35 +4112,26 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 +; RV32-NEXT: vsll.vi v14, v12, 2 +; RV32-NEXT: vxor.vv v14, v12, v14 +; RV32-NEXT: vadd.vv v16, v14, v14 +; RV32-NEXT: vxor.vv v16, v14, v16 +; RV32-NEXT: vand.vv v10, v10, v16 ; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v14 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vadd.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v10, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v10, v12, 3 +; RV32-NEXT: vand.vv v10, v12, v10 ; RV32-NEXT: vmul.vv v8, v8, 
v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3873,37 +4150,34 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v10, v8, a1 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vsll.vi v14, v12, 2 +; RV64-NEXT: vxor.vx v14, v14, a1 +; RV64-NEXT: vadd.vv v16, v14, v14 +; RV64-NEXT: vxor.vv v16, v14, v16 +; RV64-NEXT: vand.vv v10, v10, v16 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vv v10, v8, v14 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v14 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v10, v12, 3 +; RV64-NEXT: vand.vx v10, v10, a1 +; RV64-NEXT: vmul.vv v8, v8, v10 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3930,38 +4204,29 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vnot.v v12, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v12, 1, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v12, v0.t +; 
RV32-NEXT: vsll.vi v20, v8, 2, v0.t +; RV32-NEXT: vxor.vv v20, v8, v20, v0.t +; RV32-NEXT: vsll.vi v24, v20, 1, v0.t +; RV32-NEXT: vxor.vv v24, v20, v24, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vsub.vv v12, v12, v16, v0.t +; RV32-NEXT: vand.vv v16, v12, v20, v0.t +; RV32-NEXT: vsrl.vi v12, v12, 2, v0.t +; RV32-NEXT: vand.vv v12, v12, v20, v0.t +; RV32-NEXT: vadd.vv v12, v16, v12, v0.t +; RV32-NEXT: vsrl.vi v16, v12, 4, v0.t +; RV32-NEXT: vadd.vv v12, v12, v16, v0.t +; RV32-NEXT: vand.vv v12, v12, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v12, v8, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV32-NEXT: ret @@ -3979,37 +4244,34 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe ; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: vsrl.vi v12, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0, v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v12, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vnot.v v12, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v12, 1, v0.t +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vsll.vi v20, v8, 2, v0.t +; RV64-NEXT: vxor.vx v20, v20, a1, v0.t +; RV64-NEXT: vsll.vi v24, v20, 1, v0.t +; RV64-NEXT: vxor.vv v24, v20, v24, v0.t +; RV64-NEXT: vand.vv v16, v16, v24, v0.t +; RV64-NEXT: vsub.vv v12, v12, v16, v0.t +; RV64-NEXT: vand.vv v16, v12, v20, v0.t +; RV64-NEXT: vsrl.vi v12, v12, 2, v0.t +; RV64-NEXT: vand.vv v12, v12, v20, v0.t +; RV64-NEXT: vadd.vv v12, v16, v12, v0.t +; RV64-NEXT: vsrl.vi v16, v12, 4, v0.t +; RV64-NEXT: vadd.vv v12, v12, v16, v0.t +; RV64-NEXT: vand.vx v12, v12, a1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 3, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vmul.vv v8, v12, v8, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -4036,35 +4298,26 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsll.vi v20, v16, 2 +; RV32-NEXT: vxor.vv v20, v16, v20 +; RV32-NEXT: vadd.vv v24, v20, v20 +; RV32-NEXT: vxor.vv v24, v20, v24 +; RV32-NEXT: vand.vv v12, v12, v24 ; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v20 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v20 +; RV32-NEXT: vadd.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v12, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v12, v16, 3 +; RV32-NEXT: vand.vv v12, v16, v12 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -4083,37 +4336,34 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v12, v8, a1 ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vsll.vi v20, v16, 2 +; RV64-NEXT: vxor.vx v20, v20, a1 +; RV64-NEXT: vadd.vv v24, v20, v20 +; RV64-NEXT: vxor.vv v24, v20, v24 +; RV64-NEXT: vand.vv v12, v12, v24 ; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vv v12, v8, v20 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v20 ; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v12, v16, 3 +; RV64-NEXT: vand.vx v12, v12, a1 +; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -4126,24 +4376,13 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v15i64: ; RV32: # 
%bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -4159,40 +4398,80 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsll.vi v24, v8, 1, v0.t +; RV32-NEXT: vxor.vv v16, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v24, v16, v0.t +; RV32-NEXT: vand.vv v24, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 2, v0.t +; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi 
a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v15i64: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t @@ -4204,39 +4483,65 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v8, v8, 2, v0.t +; RV64-NEXT: vxor.vx v8, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: vxor.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v24, v16, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v16, v24, v16, v0.t +; 
RV64-NEXT: vand.vv v24, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: vadd.vv v8, v24, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl) ret <15 x i64> %v @@ -4245,24 +4550,12 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v15i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v16 @@ -4278,40 +4571,58 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero 
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsll.vi v0, v8, 2 +; RV32-NEXT: vxor.vv v0, v8, v0 +; RV32-NEXT: vadd.vv v24, v0, v0 +; RV32-NEXT: vxor.vv v24, v0, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v16, v24 +; RV32-NEXT: vand.vv v24, v16, v0 +; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: vsrl.vi v24, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v24 +; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: vsrl.vi v24, v8, 3 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vmul.vv v8, v16, v8 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v15i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v16 @@ -4323,39 +4634,54 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v16, v8, a1 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vnot.v v16, v8 +; RV64-NEXT: vsrl.vi v8, v16, 1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; 
RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v0, v8, 2 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v24, v0, v0 +; RV64-NEXT: vxor.vv v24, v0, v24 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v24, v8, v24 +; RV64-NEXT: vsub.vv v16, v16, v24 +; RV64-NEXT: vand.vv v24, v16, v0 +; RV64-NEXT: vsrl.vi v16, v16, 2 +; RV64-NEXT: vand.vv v16, v16, v0 +; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vadd.vv v16, v16, v24 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v8, v8, 3 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vmul.vv v8, v16, v8 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -4366,24 +4692,13 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -4399,40 +4714,80 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 
+; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsll.vi v24, v8, 1, v0.t +; RV32-NEXT: vxor.vv v16, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v24, v16, v0.t +; RV32-NEXT: vand.vv v24, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 2, v0.t +; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v16i64: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t @@ -4444,39 +4799,65 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, 
a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v8, v8, 2, v0.t +; RV64-NEXT: vxor.vx v8, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: vxor.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v24, v16, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v16, v24, v16, v0.t +; RV64-NEXT: vand.vv v24, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: vadd.vv v8, v24, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl) ret <16 x i64> %v @@ -4485,24 +4866,12 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v16i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; 
RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v16 @@ -4518,40 +4887,58 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsll.vi v0, v8, 2 +; RV32-NEXT: vxor.vv v0, v8, v0 +; RV32-NEXT: vadd.vv v24, v0, v0 +; RV32-NEXT: vxor.vv v24, v0, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v16, v24 +; RV32-NEXT: vand.vv v24, v16, v0 +; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: vsrl.vi v24, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v24 +; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: vsrl.vi v24, v8, 3 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vmul.vv v8, v16, v8 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v16i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV64-NEXT: vsetvli zero, a0, e64, m8, 
ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v16 @@ -4563,39 +4950,54 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsrl.vx v16, v8, a1 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vnot.v v16, v8 +; RV64-NEXT: vsrl.vi v8, v16, 1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v0, v8, 2 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v24, v0, v0 +; RV64-NEXT: vxor.vv v24, v0, v24 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v24, v8, v24 +; RV64-NEXT: vsub.vv v16, v16, v24 +; RV64-NEXT: vand.vv v24, v16, v0 +; RV64-NEXT: vsrl.vi v16, v16, 2 +; RV64-NEXT: vand.vv v16, v16, v0 +; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vadd.vv v16, v16, v24 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v8, v8, 3 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vmul.vv v8, v16, v8 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -4606,39 +5008,24 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v32i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: 
.cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: li a1, 16 ; RV32-NEXT: vslidedown.vi v24, v0, 2 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a3, .LBB70_2 +; RV32-NEXT: bltu a0, a1, .LBB70_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB70_2: @@ -4658,113 +5045,114 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: addi a3, a3, -241 +; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, 
(a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsll.vi v16, v8, 1, v0.t +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, 
v0.t -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 40 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 @@ -4773,9 +5161,10 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a0, a0, a3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t @@ -4790,81 +5179,119 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsll.vi v16, v8, 1, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: 
add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vsub.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 48 +; 
RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v32i64: @@ -4872,23 +5299,25 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: li a2, 48 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: li a2, 16 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB70_2 +; RV64-NEXT: mv a3, a0 +; RV64-NEXT: bltu a0, a1, .LBB70_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: .LBB70_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t @@ -4899,81 +5328,238 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v16, a2, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: li a5, 40 +; RV64-NEXT: mul a4, a4, a5 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v8, v8, 2, v0.t +; RV64-NEXT: vxor.vx v8, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vxor.vv v8, v8, v16, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; 
RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v16, v16, v8, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v16, v8, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t -; RV64-NEXT: addi a7, sp, 16 -; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 40 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t +; RV64-NEXT: li a3, 56 +; RV64-NEXT: vsrl.vx v8, v8, a3, v0.t +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV64-NEXT: addi a4, a0, -16 +; RV64-NEXT: sltu a0, a0, a4 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 +; RV64-NEXT: and a0, a0, a4 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: li a4, 24 +; RV64-NEXT: mul a0, a0, a4 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vor.vv v16, v8, v16, v0.t -; RV64-NEXT: vsrl.vi 
v8, v16, 2, v0.t +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV64-NEXT: vor.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 8, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 40 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsll.vi v16, v16, 2, v0.t +; RV64-NEXT: vxor.vx v16, v16, a1, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vi v8, v16, 1, v0.t +; RV64-NEXT: vxor.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # 
Unknown-size Folded Reload ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 40 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a3, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 48 +; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -4984,160 +5570,170 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb -; RV32-NEXT: vmv8r.v v24, v16 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: li a2, 16 -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: mv a1, a0 ; RV32-NEXT: bltu a0, a2, .LBB71_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB71_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v16, v8, a2 -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; 
RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: lui a3, 61681 +; RV32-NEXT: addi a3, a3, -241 +; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: vadd.vv v16, v0, v0 +; RV32-NEXT: vxor.vv v16, v0, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a3), zero -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v0 ; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 1 -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vi v24, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv 
v24, v24, v16 -; RV32-NEXT: vsub.vv v8, v8, v24 -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v0 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v16, v8, v16 +; RV32-NEXT: vsrl.vi v0, v16, 2 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vi v0, v16, 8 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vi v0, v16, 16 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vx v0, v16, a2 +; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vnot.v v8, v16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: vadd.vv v16, v0, v0 +; RV32-NEXT: vxor.vv v16, v0, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16 +; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v32i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB71_2 -; RV64-NEXT: # %bb.1: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; RV64-NEXT: li a1, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: bltu a0, a1, .LBB71_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB71_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV64-NEXT: vsrl.vi v24, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vsrl.vi v24, v8, 2 @@ 
-5148,69 +5744,117 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vsrl.vi v24, v8, 16 ; RV64-NEXT: vor.vv v8, v8, v24 -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsrl.vx v24, v8, a1 +; RV64-NEXT: li a3, 32 +; RV64-NEXT: vsrl.vx v24, v8, a3 ; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v24, v24, a2 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v8, a3 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a4, a1, 32 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: vsetvli a4, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v0, v16, 2 +; RV64-NEXT: vmv8r.v v24, v16 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v16, v0, v0 +; RV64-NEXT: vxor.vv v16, v0, v16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vv v16, v8, v0 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4 -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5 -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6 -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v16, v24, 3 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsrl.vx v8, v8, a2 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV64-NEXT: addi a4, a0, -16 +; RV64-NEXT: sltu a0, a0, a4 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 +; RV64-NEXT: and a0, a0, a4 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: vsrl.vi v24, v16, 2 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: vsrl.vi v24, v16, 4 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: vsrl.vi v24, v16, 8 -; RV64-NEXT: vor.vv 
v16, v16, v24 -; RV64-NEXT: vsrl.vi v24, v16, 16 -; RV64-NEXT: vor.vv v16, v16, v24 -; RV64-NEXT: vsrl.vx v24, v16, a1 -; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a4, 24 +; RV64-NEXT: mul a0, a0, a4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v16, v8, v16 +; RV64-NEXT: vsrl.vi v0, v16, 2 +; RV64-NEXT: vor.vv v16, v16, v0 +; RV64-NEXT: vsrl.vi v0, v16, 4 +; RV64-NEXT: vor.vv v16, v16, v0 +; RV64-NEXT: vsrl.vi v0, v16, 8 +; RV64-NEXT: vor.vv v16, v16, v0 +; RV64-NEXT: vsrl.vi v0, v16, 16 +; RV64-NEXT: vor.vv v16, v16, v0 +; RV64-NEXT: vsrl.vx v0, v16, a3 +; RV64-NEXT: vor.vv v16, v16, v0 ; RV64-NEXT: vnot.v v16, v16 +; RV64-NEXT: vsll.vi v0, v24, 2 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v24, v0, v0 +; RV64-NEXT: vxor.vv v8, v0, v24 ; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a2 -; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx v24, v16, a3 -; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a3 -; RV64-NEXT: vadd.vv v16, v24, v16 -; RV64-NEXT: vsrl.vi v24, v16, 4 -; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a4 -; RV64-NEXT: vmul.vx v16, v16, a5 -; RV64-NEXT: vsrl.vx v16, v16, a6 +; RV64-NEXT: vand.vv v8, v24, v8 +; RV64-NEXT: vsub.vv v8, v16, v8 +; RV64-NEXT: vand.vv v16, v8, v0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsrl.vx v16, v8, a2 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll index 277146cc1403e..33ac13ffc2cac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -262,35 +262,26 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind { ; RV32I-NEXT: vor.vv v8, v8, v9 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: lui a1, 349525 -; RV32I-NEXT: addi a1, a1, 1365 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 +; RV32I-NEXT: vsll.vi v11, v10, 2 +; RV32I-NEXT: vxor.vv v11, v10, v11 +; RV32I-NEXT: vadd.vv v12, v11, v11 +; RV32I-NEXT: vxor.vv v12, v11, v12 +; RV32I-NEXT: vand.vv v9, v9, v12 ; RV32I-NEXT: vsub.vv v8, v8, v9 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli 
zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v11 ; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vand.vv v8, v8, v11 +; RV32I-NEXT: vadd.vv v8, v9, v8 ; RV32I-NEXT: vsrl.vi v9, v8, 4 ; RV32I-NEXT: vadd.vv v8, v8, v9 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v9, v10, 3 +; RV32I-NEXT: vand.vv v9, v10, v9 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -316,32 +307,29 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v9, a1 +; RV64I-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64I-NEXT: vmv.v.x v10, a1 +; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64I-NEXT: vsll.vi v11, v10, 2 +; RV64I-NEXT: vxor.vx v11, v11, a1 +; RV64I-NEXT: vadd.vv v12, v11, v11 +; RV64I-NEXT: vxor.vv v12, v11, v12 +; RV64I-NEXT: vand.vv v9, v9, v12 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v8, a1 +; RV64I-NEXT: vand.vv v9, v8, v11 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vv v8, v8, v11 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vsrl.vi v9, v10, 3 +; RV64I-NEXT: vand.vx v9, v9, a1 +; RV64I-NEXT: vmul.vv v8, v8, v9 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -671,35 +659,26 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind { ; RV32I-NEXT: vor.vv v8, v8, v10 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: lui a1, 349525 -; RV32I-NEXT: addi a1, a1, 1365 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v12, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 +; RV32I-NEXT: vsll.vi v14, v12, 2 +; RV32I-NEXT: vxor.vv v14, v12, v14 +; RV32I-NEXT: vadd.vv v16, v14, v14 +; RV32I-NEXT: vxor.vv v16, v14, v16 +; RV32I-NEXT: vand.vv v10, v10, v16 ; RV32I-NEXT: vsub.vv v8, v8, v10 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v14 ; 
RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vand.vv v8, v8, v14 +; RV32I-NEXT: vadd.vv v8, v10, v8 ; RV32I-NEXT: vsrl.vi v10, v8, 4 ; RV32I-NEXT: vadd.vv v8, v8, v10 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v10, v12, 3 +; RV32I-NEXT: vand.vv v10, v12, v10 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -725,32 +704,29 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v10, a1 +; RV64I-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64I-NEXT: vmv.v.x v12, a1 +; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64I-NEXT: vsll.vi v14, v12, 2 +; RV64I-NEXT: vxor.vx v14, v14, a1 +; RV64I-NEXT: vadd.vv v16, v14, v14 +; RV64I-NEXT: vxor.vv v16, v14, v16 +; RV64I-NEXT: vand.vv v10, v10, v16 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v8, a1 +; RV64I-NEXT: vand.vv v10, v8, v14 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vv v8, v8, v14 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vsrl.vi v10, v12, 3 +; RV64I-NEXT: vand.vx v10, v10, a1 +; RV64I-NEXT: vmul.vv v8, v8, v10 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -1061,35 +1037,26 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV32I-NEXT: vor.vv v8, v8, v9 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: lui a1, 349525 -; RV32I-NEXT: addi a1, a1, 1365 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 +; RV32I-NEXT: vsll.vi v11, v10, 2 +; RV32I-NEXT: vxor.vv v11, v10, v11 +; RV32I-NEXT: vadd.vv v12, v11, v11 +; RV32I-NEXT: vxor.vv v12, v11, v12 +; RV32I-NEXT: vand.vv v9, v9, v12 ; RV32I-NEXT: vsub.vv v8, v8, v9 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 +; RV32I-NEXT: vand.vv v9, v8, v11 ; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; 
RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vand.vv v8, v8, v11 +; RV32I-NEXT: vadd.vv v8, v9, v8 ; RV32I-NEXT: vsrl.vi v9, v8, 4 ; RV32I-NEXT: vadd.vv v8, v8, v9 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v9, v10, 3 +; RV32I-NEXT: vand.vv v9, v10, v9 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -1115,32 +1082,29 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vor.vv v8, v8, v9 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v9, a1 +; RV64I-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64I-NEXT: vmv.v.x v10, a1 +; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64I-NEXT: vsll.vi v11, v10, 2 +; RV64I-NEXT: vxor.vx v11, v11, a1 +; RV64I-NEXT: vadd.vv v12, v11, v11 +; RV64I-NEXT: vxor.vv v12, v11, v12 +; RV64I-NEXT: vand.vv v9, v9, v12 ; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v8, a1 +; RV64I-NEXT: vand.vv v9, v8, v11 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vv v8, v8, v11 ; RV64I-NEXT: vadd.vv v8, v9, v8 ; RV64I-NEXT: vsrl.vi v9, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vsrl.vi v9, v10, 3 +; RV64I-NEXT: vand.vx v9, v9, a1 +; RV64I-NEXT: vmul.vv v8, v8, v9 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -1446,35 +1410,26 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV32I-NEXT: vor.vv v8, v8, v10 ; RV32I-NEXT: vnot.v v8, v8 ; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: lui a1, 349525 -; RV32I-NEXT: addi a1, a1, 1365 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v12, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 +; RV32I-NEXT: vsll.vi v14, v12, 2 +; RV32I-NEXT: vxor.vv v14, v12, v14 +; RV32I-NEXT: vadd.vv v16, v14, v14 +; RV32I-NEXT: vxor.vv v16, v14, v16 +; RV32I-NEXT: vand.vv v10, v10, v16 ; RV32I-NEXT: vsub.vv v8, v8, v10 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 +; RV32I-NEXT: vand.vv v10, v8, v14 ; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vand.vv v8, v8, v14 +; 
RV32I-NEXT: vadd.vv v8, v10, v8 ; RV32I-NEXT: vsrl.vi v10, v8, 4 ; RV32I-NEXT: vadd.vv v8, v8, v10 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v10, v12, 3 +; RV32I-NEXT: vand.vv v10, v12, v10 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -1500,32 +1455,29 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV64I-NEXT: vor.vv v8, v8, v10 ; RV64I-NEXT: vnot.v v8, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v10, a1 +; RV64I-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64I-NEXT: vmv.v.x v12, a1 +; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64I-NEXT: vsll.vi v14, v12, 2 +; RV64I-NEXT: vxor.vx v14, v14, a1 +; RV64I-NEXT: vadd.vv v16, v14, v14 +; RV64I-NEXT: vxor.vv v16, v14, v16 +; RV64I-NEXT: vand.vv v10, v10, v16 ; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v8, a1 +; RV64I-NEXT: vand.vv v10, v8, v14 ; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 +; RV64I-NEXT: vand.vv v8, v8, v14 ; RV64I-NEXT: vadd.vv v8, v10, v8 ; RV64I-NEXT: vsrl.vi v10, v8, 4 ; RV64I-NEXT: vadd.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vsrl.vi v10, v12, 3 +; RV64I-NEXT: vand.vx v10, v10, a1 +; RV64I-NEXT: vmul.vv v8, v8, v10 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll index c4b22955f84c4..7e1e94cbed375 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll @@ -681,37 +681,27 @@ declare <2 x i64> @llvm.vp.ctpop.v2i64(<2 x i64>, <2 x i1>, i32) define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, 
v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2, v0.t +; RV32-NEXT: vxor.vv v10, v9, v10, v0.t +; RV32-NEXT: vsll.vi v11, v10, 1, v0.t +; RV32-NEXT: vxor.vv v11, v10, v11, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vand.vv v11, v12, v11, v0.t +; RV32-NEXT: vsub.vv v8, v8, v11, v0.t +; RV32-NEXT: vand.vv v11, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vadd.vv v8, v11, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3, v0.t +; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -719,34 +709,30 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctpop_v2i64: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t +; RV64-NEXT: vsll.vi v10, v9, 2, v0.t +; RV64-NEXT: vxor.vx v10, v10, a1, v0.t +; RV64-NEXT: vsll.vi v11, v10, 1, v0.t +; RV64-NEXT: vxor.vv v11, v10, v11, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV64-NEXT: vand.vv v11, v12, v11, v0.t +; RV64-NEXT: vsub.vv v8, v8, v11, v0.t +; RV64-NEXT: vand.vv v11, v8, v10, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v10, v0.t +; RV64-NEXT: vadd.vv v8, v11, v8, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v9, v9, 3, v0.t +; RV64-NEXT: vand.vx v9, v9, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v9, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -757,37 +743,27 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v2i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi 
v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2 +; RV32-NEXT: vxor.vv v10, v9, v10 +; RV32-NEXT: vadd.vv v11, v10, v10 +; RV32-NEXT: vxor.vv v11, v10, v11 +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vand.vv v11, v12, v11 +; RV32-NEXT: vsub.vv v8, v8, v11 +; RV32-NEXT: vand.vv v11, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v11, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -795,34 +771,30 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctpop_v2i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsll.vi v10, v9, 2 +; RV64-NEXT: vxor.vx v10, v10, a1 +; RV64-NEXT: vadd.vv v11, v10, v10 +; RV64-NEXT: vxor.vv v11, v10, v11 +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vand.vv v11, v12, v11 +; RV64-NEXT: vsub.vv v8, v8, v11 +; RV64-NEXT: vand.vv v11, v8, v10 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: vadd.vv v8, v11, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v9, v9, 3 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vmul.vv v8, v8, v9 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ 
-837,37 +809,27 @@ declare <4 x i64> @llvm.vp.ctpop.v4i64(<4 x i64>, <4 x i1>, i32) define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v4i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2, v0.t +; RV32-NEXT: vxor.vv v12, v10, v12, v0.t +; RV32-NEXT: vsll.vi v14, v12, 1, v0.t +; RV32-NEXT: vxor.vv v14, v12, v14, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vand.vv v14, v16, v14, v0.t +; RV32-NEXT: vsub.vv v8, v8, v14, v0.t +; RV32-NEXT: vand.vv v14, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vadd.vv v8, v14, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v12, v0.t ; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3, v0.t +; RV32-NEXT: vand.vv v10, v10, v12, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -875,34 +837,30 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctpop_v4i64: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vsll.vi v12, v10, 2, v0.t +; RV64-NEXT: vxor.vx v12, v12, a1, v0.t +; RV64-NEXT: vsll.vi v14, v12, 1, v0.t +; RV64-NEXT: vxor.vv v14, v12, v14, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: vand.vv v14, v16, v14, v0.t +; RV64-NEXT: vsub.vv v8, v8, v14, v0.t +; RV64-NEXT: vand.vv v14, v8, v12, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, 
a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v12, v0.t +; RV64-NEXT: vadd.vv v8, v14, v8, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v10, v10, 3, v0.t +; RV64-NEXT: vand.vx v10, v10, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v10, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -913,37 +871,27 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v4i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2 +; RV32-NEXT: vxor.vv v12, v10, v12 +; RV32-NEXT: vadd.vv v14, v12, v12 +; RV32-NEXT: vxor.vv v14, v12, v14 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v14, v16, v14 +; RV32-NEXT: vsub.vv v8, v8, v14 +; RV32-NEXT: vand.vv v14, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v14, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -951,34 +899,30 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctpop_v4i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsll.vi v12, v10, 2 +; RV64-NEXT: vxor.vx v12, v12, a1 +; RV64-NEXT: vadd.vv v14, v12, v12 +; RV64-NEXT: vxor.vv v14, 
v12, v14 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vv v14, v16, v14 +; RV64-NEXT: vsub.vv v8, v8, v14 +; RV64-NEXT: vand.vv v14, v8, v12 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vadd.vv v8, v14, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v10, v10, 3 +; RV64-NEXT: vand.vx v10, v10, a1 +; RV64-NEXT: vmul.vv v8, v8, v10 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -993,37 +937,27 @@ declare <8 x i64> @llvm.vp.ctpop.v8i64(<8 x i64>, <8 x i1>, i32) define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vi v16, v12, 2, v0.t +; RV32-NEXT: vxor.vv v16, v12, v16, v0.t +; RV32-NEXT: vsll.vi v20, v16, 1, v0.t +; RV32-NEXT: vxor.vv v20, v16, v20, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vand.vv v20, v24, v20, v0.t +; RV32-NEXT: vsub.vv v8, v8, v20, v0.t +; RV32-NEXT: vand.vv v20, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v20, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v16, v12, 3, v0.t +; RV32-NEXT: vand.vv v12, v12, v16, v0.t ; RV32-NEXT: vmul.vv v8, v8, v12, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -1031,34 +965,30 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctpop_v8i64: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t 
-; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vsll.vi v16, v12, 2, v0.t +; RV64-NEXT: vxor.vx v16, v16, a1, v0.t +; RV64-NEXT: vsll.vi v20, v16, 1, v0.t +; RV64-NEXT: vxor.vv v20, v16, v20, v0.t +; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV64-NEXT: vand.vv v20, v24, v20, v0.t +; RV64-NEXT: vsub.vv v8, v8, v20, v0.t +; RV64-NEXT: vand.vv v20, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: vadd.vv v8, v20, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v12, v12, 3, v0.t +; RV64-NEXT: vand.vx v12, v12, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v12, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -1069,37 +999,27 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v8i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vi v16, v12, 2 +; RV32-NEXT: vxor.vv v16, v12, v16 +; RV32-NEXT: vadd.vv v20, v16, v16 +; RV32-NEXT: vxor.vv v20, v16, v20 +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vand.vv v20, v24, v20 +; RV32-NEXT: vsub.vv v8, v8, v20 +; RV32-NEXT: vand.vv v20, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v20, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v16, v12, 3 +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vmul.vv v8, v8, 
v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1107,34 +1027,30 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctpop_v8i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsll.vi v16, v12, 2 +; RV64-NEXT: vxor.vx v16, v16, a1 +; RV64-NEXT: vadd.vv v20, v16, v16 +; RV64-NEXT: vxor.vv v20, v16, v20 +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vv v20, v24, v20 +; RV64-NEXT: vsub.vv v8, v8, v20 +; RV64-NEXT: vand.vv v20, v8, v16 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vadd.vv v8, v20, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v12, v12, 3 +; RV64-NEXT: vand.vx v12, v12, a1 +; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1149,89 +1065,163 @@ declare <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64>, <15 x i1>, i32) define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v24, v8, 2, v0.t +; RV32-NEXT: vxor.vv v24, v8, v24, v0.t 
+; RV32-NEXT: vsll.vi v8, v24, 1, v0.t +; RV32-NEXT: vxor.vv v8, v24, v8, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_v15i64: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, 
a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vsll.vi v24, v16, 2, v0.t +; RV64-NEXT: vxor.vx v8, v24, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: vxor.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 1, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v24, v24, v16, v0.t +; RV64-NEXT: vand.vv v16, v24, v8, v0.t +; RV64-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV64-NEXT: vand.vv v8, v24, v8, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v = call <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl) ret <15 x i64> %v @@ -1240,91 +1230,100 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v15i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: 
sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsll.vi v24, v16, 2 +; RV32-NEXT: vxor.vv v24, v16, v24 +; RV32-NEXT: vadd.vv v0, v24, v24 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 ; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_v15i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; 
RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v24, v16, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v0, v24, v0 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vv v16, v16, v0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vv v16, v8, v24 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret - %head = insertelement <15 x i1> poison, i1 true, i32 0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer %v = call <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl) ret <15 x i64> %v @@ -1335,89 +1334,163 @@ declare <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64>, <16 x i1>, i32) define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v24, v8, 2, v0.t +; RV32-NEXT: vxor.vv v24, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v24, 1, v0.t +; RV32-NEXT: vxor.vv v8, v24, 
v8, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_v16i64: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; 
RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vsll.vi v24, v16, 2, v0.t +; RV64-NEXT: vxor.vx v8, v24, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: vxor.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 1, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v24, v24, v16, v0.t +; RV64-NEXT: vand.vv v16, v24, v8, v0.t +; RV64-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV64-NEXT: vand.vv v8, v24, v8, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64> %va, <16 x i1> %m, i32 %evl) ret <16 x i64> %v @@ -1426,89 +1499,98 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v16i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, 
-16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsll.vi v24, v16, 2 +; RV32-NEXT: vxor.vv v24, v16, v24 +; RV32-NEXT: vadd.vv v0, v24, v24 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 ; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_v16i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v24, v16, 
2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v0, v24, v0 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vv v16, v16, v0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vv v16, v8, v24 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -1521,117 +1603,151 @@ declare <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64>, <32 x i1>, i32) define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v32i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: li a2, 16 -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: vslidedown.vi v7, v0, 2 ; RV32-NEXT: mv a1, a0 ; RV32-NEXT: bltu a0, a2, .LBB34_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB34_2: -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi 
v16, v8, 1, v0.t -; RV32-NEXT: addi a2, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v8, (a2), zero -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: lui a2, 61681 +; RV32-NEXT: addi a2, a2, -241 +; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v24, v8, v16, v0.t -; RV32-NEXT: addi a2, sp, 32 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a2), zero -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v24, v8, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t -; RV32-NEXT: vadd.vv v16, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a2), zero -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v16, v8, 1, v0.t +; RV32-NEXT: vxor.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli 
a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v16, 3, v0.t ; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: addi a2, a0, -16 ; RV32-NEXT: sltu a0, a0, a2 @@ -1639,52 +1755,102 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v16, v16, v8, v0.t +; RV32-NEXT: vsll.vi v8, v16, 1, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; 
RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v8, v0.t -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v16, 3, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_v32i64: @@ -1692,81 +1858,263 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: li a2, 48 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: slli a1, a1, 5 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: li a2, 16 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB34_2 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: bltu a0, a1, .LBB34_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB34_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v16, v16, a1, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a2, 209715 -; RV64-NEXT: addiw a2, a2, 819 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v8, a2, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a2, v0.t +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a3, a1, 32 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 40 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v8, v8, 2, v0.t +; RV64-NEXT: vxor.vx v8, v8, a1, v0.t +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: vxor.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v 
v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v16, v16, v8, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v16, v8, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a3, 61681 -; RV64-NEXT: addiw a3, a3, -241 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v8, v8, a3, v0.t -; RV64-NEXT: lui a4, 4112 -; RV64-NEXT: addiw a4, a4, 257 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vmul.vx v8, v8, a4, v0.t -; RV64-NEXT: li a5, 56 -; RV64-NEXT: vsrl.vx v8, v8, a5, v0.t -; RV64-NEXT: addi a6, sp, 16 -; RV64-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill -; RV64-NEXT: addi a6, a0, -16 -; RV64-NEXT: sltu a0, a0, a6 +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 40 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV64-NEXT: vand.vx v8, v16, a1, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vmul.vv v16, v16, v8, v0.t +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsrl.vx v16, v16, a2, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 24 +; RV64-NEXT: mul 
a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: addi a3, a0, -16 +; RV64-NEXT: sltu a0, a0, a3 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a6 +; RV64-NEXT: and a0, a0, a3 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 40 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsll.vi v16, v8, 2, v0.t +; RV64-NEXT: vxor.vx v16, v16, a1, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vi v8, v16, 1, v0.t +; RV64-NEXT: vxor.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v8, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t -; RV64-NEXT: vsub.vv v16, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v16, a2, v0.t -; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t -; RV64-NEXT: vmul.vx v8, v8, a4, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a5, v0.t ; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vv 
v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 40 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 48 +; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -1777,190 +2125,223 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 40 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: li a2, 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a2, 16 -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: mv a1, a0 ; RV32-NEXT: bltu a0, a2, .LBB35_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: +; RV32-NEXT: lui a2, 61681 +; RV32-NEXT: addi a2, a2, -241 +; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a2, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a2), zero -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: addi a2, sp, 32 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), zero -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: 
vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v16, v24, v0 +; RV32-NEXT: vadd.vv v0, v16, v16 +; RV32-NEXT: vxor.vv v8, v16, v0 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v0, v8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v0, v8 +; RV32-NEXT: vand.vv v0, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v0 -; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v0, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a2), zero -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v8, v16, a1 +; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: addi a2, a0, -16 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v16, v24, 2 +; RV32-NEXT: vxor.vv v16, v24, v16 +; RV32-NEXT: vadd.vv v0, v16, v16 +; RV32-NEXT: vxor.vv v8, v16, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a0, a0, a2 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v16, v8, v16 -; RV32-NEXT: vand.vv v8, v16, v0 -; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vand.vv v16, v16, v0 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v0, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size 
Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v0, v8 +; RV32-NEXT: vand.vv v0, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_v32i64_unmasked: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV64-NEXT: li a2, 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: mv a1, a0 ; RV64-NEXT: bltu a0, a2, .LBB35_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB35_2: -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v24, v24, a1 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a2, 209715 -; RV64-NEXT: addiw a2, a2, 819 +; RV64-NEXT: lui a2, 61681 +; RV64-NEXT: addiw a2, a2, -241 ; RV64-NEXT: slli a3, a2, 32 ; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v24, v8, a2 +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a2 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsll.vi v0, v8, 2 +; RV64-NEXT: vxor.vx v0, v0, a2 +; RV64-NEXT: vadd.vv v24, v0, v0 +; RV64-NEXT: vxor.vv v8, v0, v24 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v24, v16, 1 +; RV64-NEXT: vand.vv v8, v24, v8 +; RV64-NEXT: vsub.vv v8, v16, v8 +; RV64-NEXT: vand.vv v24, v8, v0 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: vand.vv v8, v8, v0 ; RV64-NEXT: vadd.vv v8, v24, v8 ; RV64-NEXT: vsrl.vi v24, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a3, 61681 -; RV64-NEXT: addiw a3, a3, -241 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: lui a4, 4112 -; RV64-NEXT: addiw a4, a4, 257 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vmul.vx v8, v8, a4 -; RV64-NEXT: li a5, 56 -; RV64-NEXT: vsrl.vx v8, v8, a5 -; RV64-NEXT: addi a6, a0, -16 -; RV64-NEXT: sltu a0, a0, a6 
+; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v24, v16, 3 +; RV64-NEXT: vand.vx v24, v24, a2 +; RV64-NEXT: vmul.vv v8, v8, v24 +; RV64-NEXT: li a1, 56 +; RV64-NEXT: vsrl.vx v8, v8, a1 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: addi a3, a0, -16 +; RV64-NEXT: sltu a0, a0, a3 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a6 +; RV64-NEXT: and a0, a0, a3 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a1 -; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx v24, v16, a2 -; RV64-NEXT: vsrl.vi v16, v16, 2 +; RV64-NEXT: vsll.vi v24, v16, 2 +; RV64-NEXT: vxor.vx v24, v24, a2 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v8, v24, v0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v0, v16, 1 +; RV64-NEXT: vand.vv v8, v0, v8 +; RV64-NEXT: vsub.vv v8, v16, v8 +; RV64-NEXT: vand.vv v0, v8, v24 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v24 +; RV64-NEXT: vadd.vv v8, v0, v8 +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 +; RV64-NEXT: vand.vx v8, v8, a2 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 ; RV64-NEXT: vand.vx v16, v16, a2 -; RV64-NEXT: vadd.vv v16, v24, v16 -; RV64-NEXT: vsrl.vi v24, v16, 4 -; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a3 -; RV64-NEXT: vmul.vx v16, v16, a4 -; RV64-NEXT: vsrl.vx v16, v16, a5 +; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsrl.vx v16, v8, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll index b5114bbe49189..5cdcaf226d856 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll @@ -129,36 +129,27 @@ define void @ctpop_v2i64(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: vand.vv v9, v10, v9 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: 
vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2 +; RV32-NEXT: vxor.vv v10, v9, v10 +; RV32-NEXT: vadd.vv v11, v10, v10 +; RV32-NEXT: vxor.vv v11, v10, v11 +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vand.vv v11, v12, v11 +; RV32-NEXT: vsub.vv v8, v8, v11 +; RV32-NEXT: vand.vv v11, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v11, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1 @@ -169,33 +160,30 @@ define void @ctpop_v2i64(ptr %x, ptr %y) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v9, v9, a1 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a1, 209715 -; RV64-NEXT: addiw a1, a1, 819 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v9, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a1, 61681 ; RV64-NEXT: addiw a1, a1, -241 ; RV64-NEXT: slli a2, a1, 32 ; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a1 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vsll.vi v10, v9, 2 +; RV64-NEXT: vxor.vx v10, v10, a1 +; RV64-NEXT: vadd.vv v11, v10, v10 +; RV64-NEXT: vxor.vv v11, v10, v11 +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vand.vv v11, v12, v11 +; RV64-NEXT: vsub.vv v8, v8, v11 +; RV64-NEXT: vand.vv v11, v8, v10 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: vadd.vv v8, v11, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: lui a1, 4112 -; RV64-NEXT: addiw a1, a1, 257 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vmul.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v9, v9, 3 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vmul.vv v8, v8, v9 ; RV64-NEXT: li a1, 56 ; RV64-NEXT: vsrl.vx v8, v8, a1 ; RV64-NEXT: vse64.v v8, (a0) @@ -435,36 +423,27 @@ define void @ctpop_v4i64(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: vand.vv v10, v12, v10 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 
; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2 +; RV32-NEXT: vxor.vv v12, v10, v12 +; RV32-NEXT: vadd.vv v14, v12, v12 +; RV32-NEXT: vxor.vv v14, v12, v14 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v14, v16, v14 +; RV32-NEXT: vsub.vv v8, v8, v14 +; RV32-NEXT: vand.vv v14, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v14, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1 @@ -475,33 +454,30 @@ define void @ctpop_v4i64(ptr %x, ptr %y) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v10, v10, a1 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a1, 209715 -; RV64-NEXT: addiw a1, a1, 819 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vand.vx v10, v8, a1 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a1, 61681 ; RV64-NEXT: addiw a1, a1, -241 ; RV64-NEXT: slli a2, a1, 32 ; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vsll.vi v12, v10, 2 +; RV64-NEXT: vxor.vx v12, v12, a1 +; RV64-NEXT: vadd.vv v14, v12, v12 +; RV64-NEXT: vxor.vv v14, v12, v14 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vv v14, v16, v14 +; RV64-NEXT: vsub.vv v8, v8, v14 +; RV64-NEXT: vand.vv v14, v8, v12 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vadd.vv v8, v14, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 ; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: lui a1, 4112 -; RV64-NEXT: addiw a1, a1, 257 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vmul.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v10, v10, 3 +; RV64-NEXT: vand.vx v10, v10, a1 +; RV64-NEXT: vmul.vv v8, v8, v10 ; RV64-NEXT: li a1, 56 ; RV64-NEXT: vsrl.vx v8, v8, a1 ; RV64-NEXT: vse64.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index 49f6ffd691292..4a17552fc137c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -777,41 +777,31 @@ declare <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64>, i1 immarg, <2 x i1>, i32) define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: 
lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2, v0.t +; RV32-NEXT: vxor.vv v10, v9, v10, v0.t +; RV32-NEXT: vsll.vi v11, v10, 1, v0.t +; RV32-NEXT: vxor.vv v11, v10, v11, v0.t +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v12, v8, a0, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vand.vv v11, v12, v11, v0.t +; RV32-NEXT: vsub.vv v8, v8, v11, v0.t +; RV32-NEXT: vand.vv v11, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vadd.vv v8, v11, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3, v0.t +; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -819,38 +809,34 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; ; RV64-LABEL: vp_cttz_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t +; RV64-NEXT: vsll.vi v10, v9, 2, v0.t +; RV64-NEXT: vxor.vx v10, v10, a1, v0.t +; RV64-NEXT: vsll.vi v11, v10, 1, v0.t +; RV64-NEXT: vxor.vv v11, v10, v11, v0.t +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v12, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v12, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV64-NEXT: vand.vv v11, v12, v11, v0.t +; RV64-NEXT: vsub.vv v8, v8, v11, v0.t +; RV64-NEXT: vand.vv v11, v8, v10, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, 
a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v10, v0.t +; RV64-NEXT: vadd.vv v8, v11, v8, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v9, v9, 3, v0.t +; RV64-NEXT: vand.vx v9, v9, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v9, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -861,41 +847,31 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v2i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2 +; RV32-NEXT: vxor.vv v10, v9, v10 +; RV32-NEXT: vadd.vv v11, v10, v10 +; RV32-NEXT: vxor.vv v11, v10, v11 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v12, v8, a0 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vand.vv v11, v12, v11 +; RV32-NEXT: vsub.vv v8, v8, v11 +; RV32-NEXT: vand.vv v11, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v11, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -903,38 +879,34 @@ define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_cttz_v2i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vsll.vi v10, v9, 2 +; RV64-NEXT: vxor.vx v10, v10, a1 +; RV64-NEXT: vadd.vv v11, v10, v10 +; RV64-NEXT: vxor.vv v11, v10, v11 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v12, v8, a0 ; RV64-NEXT: 
vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vand.vv v11, v12, v11 +; RV64-NEXT: vsub.vv v8, v8, v11 +; RV64-NEXT: vand.vv v11, v8, v10 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: vadd.vv v8, v11, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v9, v9, 3 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vmul.vv v8, v8, v9 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -949,41 +921,31 @@ declare <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64>, i1 immarg, <4 x i1>, i32) define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v4i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2, v0.t +; RV32-NEXT: vxor.vv v12, v10, v12, v0.t +; RV32-NEXT: vsll.vi v14, v12, 1, v0.t +; RV32-NEXT: vxor.vv v14, v12, v14, v0.t +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v16, v8, a0, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vand.vv v14, v16, v14, v0.t +; RV32-NEXT: vsub.vv v8, v8, v14, v0.t +; RV32-NEXT: vand.vv v14, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vadd.vv v8, v14, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v12, v0.t ; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, 
e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3, v0.t +; RV32-NEXT: vand.vv v10, v10, v12, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -991,38 +953,34 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; ; RV64-LABEL: vp_cttz_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a1, v0.t +; RV64-NEXT: vsll.vi v12, v10, 2, v0.t +; RV64-NEXT: vxor.vx v12, v12, a1, v0.t +; RV64-NEXT: vsll.vi v14, v12, 1, v0.t +; RV64-NEXT: vxor.vv v14, v12, v14, v0.t +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v16, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: vand.vv v14, v16, v14, v0.t +; RV64-NEXT: vsub.vv v8, v8, v14, v0.t +; RV64-NEXT: vand.vv v14, v8, v12, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v12, v0.t +; RV64-NEXT: vadd.vv v8, v14, v8, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v10, v10, 3, v0.t +; RV64-NEXT: vand.vx v10, v10, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v10, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -1033,41 +991,31 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v4i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, 
v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2 +; RV32-NEXT: vxor.vv v12, v10, v12 +; RV32-NEXT: vadd.vv v14, v12, v12 +; RV32-NEXT: vxor.vv v14, v12, v14 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v16, v8, a0 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v14, v16, v14 +; RV32-NEXT: vsub.vv v8, v8, v14 +; RV32-NEXT: vand.vv v14, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v14, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1075,38 +1023,34 @@ define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_cttz_v4i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: vsll.vi v12, v10, 2 +; RV64-NEXT: vxor.vx v12, v12, a1 +; RV64-NEXT: vadd.vv v14, v12, v12 +; RV64-NEXT: vxor.vv v14, v12, v14 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v16, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vv v14, v16, v14 +; RV64-NEXT: vsub.vv v8, v8, v14 +; RV64-NEXT: vand.vv v14, v8, v12 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vadd.vv v8, v14, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v10, v10, 3 +; RV64-NEXT: vand.vx v10, v10, a1 +; RV64-NEXT: vmul.vv v8, v8, v10 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1121,41 +1065,31 @@ declare <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64>, i1 immarg, <8 x i1>, i32) define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v8i64: ; RV32: # %bb.0: -; 
RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vi v16, v12, 2, v0.t +; RV32-NEXT: vxor.vv v16, v12, v16, v0.t +; RV32-NEXT: vsll.vi v20, v16, 1, v0.t +; RV32-NEXT: vxor.vv v20, v16, v20, v0.t +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v24, v8, a0, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vand.vv v20, v24, v20, v0.t +; RV32-NEXT: vsub.vv v8, v8, v20, v0.t +; RV32-NEXT: vand.vv v20, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v20, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v16, v12, 3, v0.t +; RV32-NEXT: vand.vv v12, v12, v16, v0.t ; RV32-NEXT: vmul.vv v8, v8, v12, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -1163,38 +1097,34 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; ; RV64-LABEL: vp_cttz_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v12, 2, v0.t +; RV64-NEXT: vxor.vx v16, v16, a1, v0.t +; RV64-NEXT: vsll.vi v20, v16, 1, v0.t +; RV64-NEXT: vxor.vv v20, v16, v20, v0.t +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v24, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v24, v0.t +; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV64-NEXT: vand.vv v20, v24, v20, v0.t +; RV64-NEXT: vsub.vv v8, v8, v20, v0.t +; RV64-NEXT: 
vand.vv v20, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: vadd.vv v8, v20, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v12, v12, 3, v0.t +; RV64-NEXT: vand.vx v12, v12, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v12, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -1205,41 +1135,31 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v8i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vi v16, v12, 2 +; RV32-NEXT: vxor.vv v16, v12, v16 +; RV32-NEXT: vadd.vv v20, v16, v16 +; RV32-NEXT: vxor.vv v20, v16, v20 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v24, v8, a0 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vand.vv v20, v24, v20 +; RV32-NEXT: vsub.vv v8, v8, v20 +; RV32-NEXT: vand.vv v20, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v20, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v16, v12, 3 +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1247,38 +1167,34 @@ define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_cttz_v8i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: 
vmv.v.x v12, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a1 +; RV64-NEXT: vsll.vi v16, v12, 2 +; RV64-NEXT: vxor.vx v16, v16, a1 +; RV64-NEXT: vadd.vv v20, v16, v16 +; RV64-NEXT: vxor.vv v20, v16, v20 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v24, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vv v20, v24, v20 +; RV64-NEXT: vsub.vv v8, v8, v20 +; RV64-NEXT: vand.vv v20, v8, v16 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vadd.vv v8, v20, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v12, v12, 3 +; RV64-NEXT: vand.vx v12, v12, a1 +; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -1293,196 +1209,290 @@ declare <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32) define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vsll.vi v24, v8, 2, v0.t +; RV32-NEXT: vxor.vv v24, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v24, 1, v0.t +; RV32-NEXT: vxor.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; 
RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: li a0, 1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v16, v8, a0, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v15i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: 
vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 2, v0.t +; RV64-NEXT: vxor.vx v24, v16, a1, v0.t +; RV64-NEXT: vsll.vi v16, v24, 1, v0.t +; RV64-NEXT: vxor.vv v8, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: li a0, 1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vx v16, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV64-NEXT: ret - %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl) - ret <15 x i64> %v -} - +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vv v16, v8, v24, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV64-NEXT: vand.vv v8, v8, v24, v0.t +; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, 
(a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t +; RV64-NEXT: li a0, 56 +; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl) + ret <15 x i64> %v +} + define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v15i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 +; RV32-NEXT: vsll.vi v24, v16, 2 +; RV32-NEXT: vxor.vv v24, v16, v24 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v0, v8, a0 ; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vadd.vv v0, v24, v24 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 ; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: mv a1, sp -; 
RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v15i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vsll.vi v24, v16, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v0, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v0, v24, v0 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vv v16, v16, v0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vv v16, v8, v24 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -1495,97 +1505,182 @@ declare <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32) define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, -16 +; 
RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vsll.vi v24, v8, 2, v0.t +; RV32-NEXT: vxor.vv v24, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v24, 1, v0.t +; RV32-NEXT: vxor.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: li a0, 1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v16, v8, a0, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi 
v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v16i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 2, v0.t +; RV64-NEXT: vxor.vx v24, v16, a1, v0.t +; RV64-NEXT: vsll.vi v16, v24, 1, v0.t +; RV64-NEXT: vxor.vv v8, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: li a0, 1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vx v16, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 
-; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vv v16, v8, v24, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v24, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl) ret <16 x i64> %v @@ -1594,97 +1689,106 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v16i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 +; RV32-NEXT: vsll.vi v24, v16, 2 +; RV32-NEXT: vxor.vv v24, v16, v24 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v0, v8, a0 ; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vadd.vv v0, v24, v24 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, 
(a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 ; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v16i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vsll.vi v24, v16, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v0, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v0, v24, v0 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vv v16, v16, v0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vv v16, v8, v24 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: 
vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -1697,155 +1801,139 @@ declare <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32) define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v32i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: li a2, 16 ; RV32-NEXT: vslidedown.vi v24, v0, 2 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a3, .LBB34_2 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB34_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB34_2: +; RV32-NEXT: lui a2, 61681 +; RV32-NEXT: addi a2, a2, -241 +; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 40 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v16, v8, 1, v0.t +; RV32-NEXT: vxor.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 
16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v8, v16, a1, v0.t +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vnot.v v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 
-; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 40 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 @@ -1854,88 +1942,116 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: mul a0, a0, a3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; 
RV32-NEXT: vxor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v16, v8, 1, v0.t +; RV32-NEXT: vxor.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v8, v16, a1, v0.t +; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vnot.v v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, 
v16, v0.t +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v32i64: @@ -1943,14 +2059,21 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: li a2, 48 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: li a1, 16 ; RV64-NEXT: vslidedown.vi v24, v0, 2 @@ -1959,72 +2082,231 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB34_2: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a3, a1, 32 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 40 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsll.vi v8, v8, 2, v0.t +; RV64-NEXT: vxor.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 5 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: vxor.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: li a2, 1 +; 
RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vx v8, v16, a2, v0.t +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vnot.v v8, v16, v0.t +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vand.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t -; RV64-NEXT: addi a7, sp, 16 -; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 40 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV64-NEXT: vand.vx v8, v16, a1, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: 
add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vmul.vv v16, v16, v8, v0.t +; RV64-NEXT: li a3, 56 +; RV64-NEXT: vsrl.vx v16, v16, a3, v0.t +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: addi a4, a0, -16 +; RV64-NEXT: sltu a0, a0, a4 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 +; RV64-NEXT: and a0, a0, a4 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a4, 40 +; RV64-NEXT: mul a0, a0, a4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsll.vi v16, v8, 2, v0.t +; RV64-NEXT: vxor.vx v16, v16, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vi v8, v16, 1, v0.t +; RV64-NEXT: vxor.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a4, 24 +; RV64-NEXT: mul a0, a0, a4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: vsub.vx v16, v8, a2, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vnot.v v16, v8, v0.t +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a3, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; 
RV64-NEXT: vsrl.vi v8, v16, 2, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 40 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a3, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 48 +; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -2035,193 +2317,233 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb -; RV32-NEXT: vmv8r.v v24, v16 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: li a2, 16 -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: mv a1, a0 ; RV32-NEXT: bltu a0, a2, .LBB35_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: -; RV32-NEXT: li a2, 1 +; RV32-NEXT: lui a2, 61681 +; RV32-NEXT: addi a2, a2, -241 +; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a2 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsub.vx v0, v8, a1 ; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size 
Folded Spill +; RV32-NEXT: vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: vadd.vv v8, v0, v0 +; RV32-NEXT: vxor.vv v8, v0, v8 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v16, 1 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v0 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a3), zero -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v16, v8 ; RV32-NEXT: vand.vv v16, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v0 ; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v24, a2 -; RV32-NEXT: vnot.v v24, v24 -; RV32-NEXT: vand.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vsub.vv v8, v8, v24 -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v0 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v16, v8, a1 +; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: 
vand.vv v8, v0, v16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: vadd.vv v16, v0, v0 +; RV32-NEXT: vxor.vv v16, v0, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16 +; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vsrl.vx v16, v8, a2 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v32i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB35_2 -; RV64-NEXT: # %bb.1: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV64-NEXT: li a1, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: bltu a0, a1, .LBB35_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB35_2: -; RV64-NEXT: li a2, 1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v8, a2 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v24 -; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 ; RV64-NEXT: slli a3, a1, 32 ; RV64-NEXT: add a1, a1, a3 -; RV64-NEXT: vand.vx v24, v24, a1 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v8, a3 +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: li 
a2, 1 +; RV64-NEXT: vsub.vx v0, v8, a2 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vsll.vi v0, v16, 2 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v24, v0, v0 +; RV64-NEXT: vxor.vv v24, v0, v24 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vv v16, v16, v24 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vv v16, v8, v0 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4 -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5 -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6 -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v0, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v0, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: vsrl.vx v8, v8, a3 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV64-NEXT: addi a4, a0, -16 +; RV64-NEXT: sltu a0, a0, a4 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 +; RV64-NEXT: and a0, a0, a4 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v16, a2 -; RV64-NEXT: vnot.v v16, v16 -; RV64-NEXT: vand.vv v16, v16, v24 -; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a1 -; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx v24, v16, a3 -; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a3 -; RV64-NEXT: vadd.vv v16, v24, v16 -; RV64-NEXT: vsrl.vi v24, v16, 4 -; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a4 -; RV64-NEXT: vmul.vx v16, v16, a5 -; RV64-NEXT: vsrl.vx v16, v16, a6 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vx v16, v8, a2 +; RV64-NEXT: vnot.v v24, v8 +; RV64-NEXT: vand.vv v16, v24, v16 +; RV64-NEXT: vsll.vi v24, v0, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v8, v24, v0 +; RV64-NEXT: vsrl.vi v0, v16, 1 +; RV64-NEXT: vand.vv v8, v0, v8 +; RV64-NEXT: vsub.vv v8, v16, v8 +; RV64-NEXT: vand.vv v16, v8, v24 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v24 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsrl.vx v16, v8, a3 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 
16 ; RV64-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer @@ -2976,41 +3298,31 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2, v0.t +; RV32-NEXT: vxor.vv v10, v9, v10, v0.t +; RV32-NEXT: vsll.vi v11, v10, 1, v0.t +; RV32-NEXT: vxor.vv v11, v10, v11, v0.t +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v12, v8, a0, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vand.vv v11, v12, v11, v0.t +; RV32-NEXT: vsub.vv v8, v8, v11, v0.t +; RV32-NEXT: vand.vv v11, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vadd.vv v8, v11, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3, v0.t +; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -3018,38 +3330,34 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe ; ; RV64-LABEL: vp_cttz_zero_undef_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t +; RV64-NEXT: vsll.vi v10, v9, 2, v0.t +; RV64-NEXT: vxor.vx v10, v10, a1, v0.t +; RV64-NEXT: vsll.vi v11, v10, 1, v0.t +; RV64-NEXT: vxor.vv v11, v10, v11, v0.t +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v12, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, 
v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v12, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV64-NEXT: vand.vv v11, v12, v11, v0.t +; RV64-NEXT: vsub.vv v8, v8, v11, v0.t +; RV64-NEXT: vand.vv v11, v8, v10, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v10, v0.t +; RV64-NEXT: vadd.vv v8, v11, v8, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v9, v9, 3, v0.t +; RV64-NEXT: vand.vx v9, v9, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v9, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -3060,41 +3368,31 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v2i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vi v10, v9, 2 +; RV32-NEXT: vxor.vv v10, v9, v10 +; RV32-NEXT: vadd.vv v11, v10, v10 +; RV32-NEXT: vxor.vv v11, v10, v11 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v12, v8, a0 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vand.vv v11, v12, v11 +; RV32-NEXT: vsub.vv v8, v8, v11 +; RV32-NEXT: vand.vv v11, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v11, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v10, v9, 3 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: 
vsrl.vx v8, v8, a0 @@ -3102,38 +3400,34 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % ; ; RV64-LABEL: vp_cttz_zero_undef_v2i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vsll.vi v10, v9, 2 +; RV64-NEXT: vxor.vx v10, v10, a1 +; RV64-NEXT: vadd.vv v11, v10, v10 +; RV64-NEXT: vxor.vv v11, v10, v11 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v12, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vand.vv v11, v12, v11 +; RV64-NEXT: vsub.vv v8, v8, v11 +; RV64-NEXT: vand.vv v11, v8, v10 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: vadd.vv v8, v11, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v9, v9, 3 +; RV64-NEXT: vand.vx v9, v9, a1 +; RV64-NEXT: vmul.vv v8, v8, v9 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3146,41 +3440,31 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext % define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v4i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2, v0.t +; RV32-NEXT: 
vxor.vv v12, v10, v12, v0.t +; RV32-NEXT: vsll.vi v14, v12, 1, v0.t +; RV32-NEXT: vxor.vv v14, v12, v14, v0.t +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v16, v8, a0, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vand.vv v14, v16, v14, v0.t +; RV32-NEXT: vsub.vv v8, v8, v14, v0.t +; RV32-NEXT: vand.vv v14, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vadd.vv v8, v14, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v12, v0.t ; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3, v0.t +; RV32-NEXT: vand.vv v10, v10, v12, v0.t ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -3188,38 +3472,34 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe ; ; RV64-LABEL: vp_cttz_zero_undef_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a1, v0.t +; RV64-NEXT: vsll.vi v12, v10, 2, v0.t +; RV64-NEXT: vxor.vx v12, v12, a1, v0.t +; RV64-NEXT: vsll.vi v14, v12, 1, v0.t +; RV64-NEXT: vxor.vv v14, v12, v14, v0.t +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v16, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: vand.vv v14, v16, v14, v0.t +; RV64-NEXT: vsub.vv v8, v8, v14, v0.t +; RV64-NEXT: vand.vv v14, v8, v12, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v12, v0.t +; RV64-NEXT: vadd.vv v8, v14, v8, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v10, v10, 3, v0.t +; RV64-NEXT: vand.vx v10, v10, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v10, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -3230,41 +3510,31 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v4i64_unmasked: ; RV32: # %bb.0: -; 
RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vi v12, v10, 2 +; RV32-NEXT: vxor.vv v12, v10, v12 +; RV32-NEXT: vadd.vv v14, v12, v12 +; RV32-NEXT: vxor.vv v14, v12, v14 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v16, v8, a0 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v14, v16, v14 +; RV32-NEXT: vsub.vv v8, v8, v14 +; RV32-NEXT: vand.vv v14, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v14, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vi v12, v10, 3 +; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3272,38 +3542,34 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % ; ; RV64-LABEL: vp_cttz_zero_undef_v4i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: vsll.vi v12, v10, 2 +; RV64-NEXT: vxor.vx v12, v12, a1 +; RV64-NEXT: vadd.vv v14, v12, v12 +; RV64-NEXT: vxor.vv v14, v12, v14 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v16, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vv v14, v16, v14 +; RV64-NEXT: vsub.vv v8, v8, v14 +; RV64-NEXT: vand.vv v14, v8, v12 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; 
RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vadd.vv v8, v14, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v10, v10, 3 +; RV64-NEXT: vand.vx v10, v10, a1 +; RV64-NEXT: vmul.vv v8, v8, v10 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3316,41 +3582,31 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext % define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vi v16, v12, 2, v0.t +; RV32-NEXT: vxor.vv v16, v12, v16, v0.t +; RV32-NEXT: vsll.vi v20, v16, 1, v0.t +; RV32-NEXT: vxor.vv v20, v16, v20, v0.t +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v24, v8, a0, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vand.vv v20, v24, v20, v0.t +; RV32-NEXT: vsub.vv v8, v8, v20, v0.t +; RV32-NEXT: vand.vv v20, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v20, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v16, v12, 3, v0.t +; RV32-NEXT: vand.vv v12, v12, v16, v0.t ; RV32-NEXT: vmul.vv v8, v8, v12, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t @@ -3358,38 +3614,34 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe ; ; RV64-LABEL: vp_cttz_zero_undef_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v12, 2, v0.t +; RV64-NEXT: 
vxor.vx v16, v16, a1, v0.t +; RV64-NEXT: vsll.vi v20, v16, 1, v0.t +; RV64-NEXT: vxor.vv v20, v16, v20, v0.t +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v24, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v24, v0.t +; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV64-NEXT: vand.vv v20, v24, v20, v0.t +; RV64-NEXT: vsub.vv v8, v8, v20, v0.t +; RV64-NEXT: vand.vv v20, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: vadd.vv v8, v20, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV64-NEXT: vadd.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v12, v12, 3, v0.t +; RV64-NEXT: vand.vx v12, v12, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v12, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t ; RV64-NEXT: ret @@ -3400,41 +3652,31 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v8i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1 -; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vi v16, v12, 2 +; RV32-NEXT: vxor.vv v16, v12, v16 +; RV32-NEXT: vadd.vv v20, v16, v16 +; RV32-NEXT: vxor.vv v20, v16, v20 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v24, v8, a0 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vand.vv v20, v24, v20 +; RV32-NEXT: vsub.vv v8, v8, v20 +; RV32-NEXT: vand.vv v20, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, 
v16 +; RV32-NEXT: vadd.vv v8, v20, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vi v16, v12, 3 +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3442,38 +3684,34 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % ; ; RV64-LABEL: vp_cttz_zero_undef_v8i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a1 +; RV64-NEXT: vsll.vi v16, v12, 2 +; RV64-NEXT: vxor.vx v16, v16, a1 +; RV64-NEXT: vadd.vv v20, v16, v16 +; RV64-NEXT: vxor.vv v20, v16, v20 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v24, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vv v20, v24, v20 +; RV64-NEXT: vsub.vv v8, v8, v20 +; RV64-NEXT: vand.vv v20, v8, v16 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vadd.vv v8, v20, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vsrl.vi v12, v12, 3 +; RV64-NEXT: vand.vx v12, v12, a1 +; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret @@ -3486,97 +3724,182 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: 
addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vsll.vi v24, v8, 2, v0.t +; RV32-NEXT: vxor.vv v24, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v24, 1, v0.t +; RV32-NEXT: vxor.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: li a0, 1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v16, v8, a0, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; 
RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v15i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 2, v0.t +; RV64-NEXT: vxor.vx v24, v16, a1, v0.t +; RV64-NEXT: vsll.vi v16, v24, 1, v0.t +; RV64-NEXT: vxor.vv v8, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: li a0, 1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vx v16, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vv v16, v8, v24, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v24, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 
61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl) ret <15 x i64> %v @@ -3585,97 +3908,106 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v15i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 +; RV32-NEXT: vsll.vi v24, v16, 2 +; RV32-NEXT: vxor.vv v24, v16, v24 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v0, v8, a0 ; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vadd.vv v0, v24, v24 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # 
Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 ; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v15i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vsll.vi v24, v16, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v0, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v0, v24, v0 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vv v16, v16, v0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vand.vv v16, v8, v24 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x 
i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -3686,97 +4018,182 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vsll.vi v24, v8, 2, v0.t +; RV32-NEXT: vxor.vv v24, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v24, 1, v0.t +; RV32-NEXT: vxor.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: li a0, 1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v16, v8, a0, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, 
ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v16i64: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v8, 2, v0.t +; RV64-NEXT: vxor.vx v24, v16, a1, v0.t +; RV64-NEXT: vsll.vi v16, v24, 1, v0.t +; RV64-NEXT: vxor.vv v8, v24, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: li a0, 1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vx v16, v8, a0, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; 
RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vand.vv v16, v8, v24, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vv v8, v8, v24, v0.t ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl) ret <16 x i64> %v @@ -3785,97 +4202,106 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v16i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1 +; RV32-NEXT: vsll.vi v24, v16, 2 +; RV32-NEXT: vxor.vv v24, v16, v24 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: vsub.vx v0, v8, a0 ; RV32-NEXT: vnot.v v8, v8 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 
-; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vadd.vv v0, v24, v24 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v24 ; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: mv a1, sp -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v24, v16, 3 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v16i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vsll.vi v24, v16, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsub.vx v0, v8, a0 ; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v0, v24, v0 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vand.vv v16, v16, v0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v16, v8, a0 +; 
RV64-NEXT: vand.vv v16, v8, v24 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: li a0, 56 ; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -3886,155 +4312,139 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v32i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: li a2, 16 ; RV32-NEXT: vslidedown.vi v24, v0, 2 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a3, .LBB70_2 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB70_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB70_2: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; 
RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: lui a2, 61681 +; RV32-NEXT: addi a2, a2, -241 +; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 40 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v16, v8, 1, v0.t +; RV32-NEXT: vxor.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 1 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v8, v16, a1, v0.t +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vnot.v v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, 
e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 40 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: 
vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 @@ -4043,88 +4453,116 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: mul a0, a0, a3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: vsll.vi v8, v16, 2, v0.t +; RV32-NEXT: vxor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v16, v8, 1, v0.t +; RV32-NEXT: vxor.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v8, v16, a1, v0.t +; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vnot.v v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size 
Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v32i64: @@ -4132,14 +4570,21 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: li a2, 48 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: li a1, 16 ; RV64-NEXT: vslidedown.vi v24, v0, 2 @@ -4148,72 +4593,231 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB70_2: -; RV64-NEXT: li a1, 1 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 +; RV64-NEXT: slli a3, a1, 32 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: csrr 
a3, vlenb +; RV64-NEXT: li a4, 40 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsll.vi v8, v8, 2, v0.t +; RV64-NEXT: vxor.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 5 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vi v16, v8, 1, v0.t +; RV64-NEXT: vxor.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: li a2, 1 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vx v8, v16, a2, v0.t +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vnot.v v8, v16, v0.t +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vand.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a2, 349525 -; RV64-NEXT: addiw a2, a2, 1365 -; RV64-NEXT: slli a3, a2, 32 -; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v16, v8, a3, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; 
RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t -; RV64-NEXT: addi a7, sp, 16 -; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 40 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v8, 3, v0.t +; RV64-NEXT: vand.vx v8, v16, a1, v0.t +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vmul.vv v16, v16, v8, v0.t +; RV64-NEXT: li a3, 56 +; RV64-NEXT: vsrl.vx v16, v16, a3, v0.t +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV64-NEXT: addi a4, a0, -16 +; RV64-NEXT: sltu a0, a0, a4 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 +; RV64-NEXT: and a0, a0, a4 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vmv1r.v v0, v24 ; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a4, 40 +; RV64-NEXT: mul a0, a0, a4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsll.vi v16, v8, 2, v0.t +; RV64-NEXT: vxor.vx v16, v16, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsll.vi v8, v16, 1, v0.t +; RV64-NEXT: vxor.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a4, 24 +; RV64-NEXT: mul a0, a0, a4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: vsub.vx v16, v8, a2, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vnot.v v16, v8, v0.t +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a3, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t +; RV64-NEXT: csrr a0, vlenb +; 
RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v16, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vv v16, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vi v8, v16, 2, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vv v8, v16, v8, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t -; RV64-NEXT: vsrl.vx v16, v8, a6, v0.t -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vand.vx v8, v8, a1, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a2, 40 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vmul.vv v8, v8, v16, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a3, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 48 +; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -4224,193 +4828,233 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb -; RV32-NEXT: vmv8r.v v24, v16 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 36(sp) -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 
0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: li a2, 16 -; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: mv a1, a0 ; RV32-NEXT: bltu a0, a2, .LBB71_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB71_2: -; RV32-NEXT: li a2, 1 +; RV32-NEXT: lui a2, 61681 +; RV32-NEXT: addi a2, a2, -241 +; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a2 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsub.vx v0, v8, a1 ; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: vadd.vv v8, v0, v0 +; RV32-NEXT: vxor.vv v8, v0, v8 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v16, 1 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v0 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a3), zero -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v16, v8 ; RV32-NEXT: vand.vv v16, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v0 ; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 
; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v24, a2 -; RV32-NEXT: vnot.v v24, v24 -; RV32-NEXT: vand.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16 -; RV32-NEXT: vsub.vv v8, v8, v24 -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v0 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v16, v8, a1 +; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: vand.vv v8, v0, v16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vi v0, v24, 2 +; RV32-NEXT: vxor.vv v0, v24, v0 +; RV32-NEXT: vadd.vv v16, v0, v0 +; RV32-NEXT: vxor.vv v16, v0, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16 +; RV32-NEXT: vand.vv v16, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v16, v24, 3 +; RV32-NEXT: vand.vv v16, v24, v16 ; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vsrl.vx v16, v8, a2 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v32i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 16 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB71_2 -; RV64-NEXT: # %bb.1: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV64-NEXT: li a1, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: bltu a0, a1, .LBB71_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB71_2: -; RV64-NEXT: li a2, 1 -; RV64-NEXT: 
vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v8, a2 -; RV64-NEXT: vnot.v v8, v8 -; RV64-NEXT: vand.vv v8, v8, v24 -; RV64-NEXT: vsrl.vi v24, v8, 1 -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: lui a1, 61681 +; RV64-NEXT: addiw a1, a1, -241 ; RV64-NEXT: slli a3, a1, 32 ; RV64-NEXT: add a1, a1, a3 -; RV64-NEXT: vand.vx v24, v24, a1 -; RV64-NEXT: vsub.vv v8, v8, v24 -; RV64-NEXT: lui a3, 209715 -; RV64-NEXT: addiw a3, a3, 819 -; RV64-NEXT: slli a4, a3, 32 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: vand.vx v24, v8, a3 +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: li a2, 1 +; RV64-NEXT: vsub.vx v0, v8, a2 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vsll.vi v0, v16, 2 +; RV64-NEXT: vxor.vx v0, v0, a1 +; RV64-NEXT: vadd.vv v24, v0, v0 +; RV64-NEXT: vxor.vv v24, v0, v24 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vv v16, v16, v24 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vv v16, v8, v0 ; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vadd.vv v8, v24, v8 -; RV64-NEXT: vsrl.vi v24, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v24 -; RV64-NEXT: lui a4, 61681 -; RV64-NEXT: addiw a4, a4, -241 -; RV64-NEXT: slli a5, a4, 32 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: vand.vx v8, v8, a4 -; RV64-NEXT: lui a5, 4112 -; RV64-NEXT: addiw a5, a5, 257 -; RV64-NEXT: slli a6, a5, 32 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: vmul.vx v8, v8, a5 -; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6 -; RV64-NEXT: addi a7, a0, -16 -; RV64-NEXT: sltu a0, a0, a7 +; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v0, (a3) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v0, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: vsrl.vx v8, v8, a3 +; RV64-NEXT: addi a4, sp, 16 +; RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV64-NEXT: addi a4, a0, -16 +; RV64-NEXT: sltu a0, a0, a4 ; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: and a0, a0, a7 +; RV64-NEXT: and a0, a0, a4 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v16, a2 -; RV64-NEXT: vnot.v v16, v16 -; RV64-NEXT: vand.vv v16, v16, v24 -; RV64-NEXT: vsrl.vi v24, v16, 1 -; RV64-NEXT: vand.vx v24, v24, a1 -; RV64-NEXT: vsub.vv v16, v16, v24 -; RV64-NEXT: vand.vx v24, v16, a3 -; RV64-NEXT: vsrl.vi v16, v16, 2 -; RV64-NEXT: vand.vx v16, v16, a3 -; RV64-NEXT: vadd.vv v16, v24, v16 -; RV64-NEXT: vsrl.vi v24, v16, 4 -; RV64-NEXT: vadd.vv v16, v16, v24 -; RV64-NEXT: vand.vx v16, v16, a4 -; RV64-NEXT: vmul.vx v16, v16, a5 -; RV64-NEXT: vsrl.vx v16, v16, a6 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsub.vx v16, v8, a2 +; RV64-NEXT: vnot.v v24, v8 +; RV64-NEXT: vand.vv v16, v24, v16 +; RV64-NEXT: vsll.vi v24, v0, 2 +; RV64-NEXT: vxor.vx v24, v24, a1 +; RV64-NEXT: vadd.vv v0, v24, v24 +; RV64-NEXT: vxor.vv v8, 
v24, v0 +; RV64-NEXT: vsrl.vi v0, v16, 1 +; RV64-NEXT: vand.vv v8, v0, v8 +; RV64-NEXT: vsub.vv v8, v16, v8 +; RV64-NEXT: vand.vv v16, v8, v24 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vv v8, v8, v24 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsrl.vi v16, v16, 3 +; RV64-NEXT: vand.vx v16, v16, a1 +; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vsrl.vx v16, v8, a3 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll index 8c8da6d1e0031..c778fab049005 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -250,40 +250,31 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: li a1, 1 -; RV32I-NEXT: vsub.vx v9, v8, a1 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: lui a1, 349525 -; RV32I-NEXT: addi a1, a1, 1365 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32I-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vsll.vi v10, v9, 2 +; RV32I-NEXT: vxor.vv v10, v9, v10 +; RV32I-NEXT: vadd.vv v11, v10, v10 +; RV32I-NEXT: vxor.vv v11, v10, v11 +; RV32I-NEXT: li a1, 1 +; RV32I-NEXT: vsub.vx v12, v8, a1 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vand.vv v11, v12, v11 +; RV32I-NEXT: vsub.vv v8, v8, v11 +; RV32I-NEXT: vand.vv v11, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vadd.vv v8, v11, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vsrl.vi v10, v9, 3 +; RV32I-NEXT: vand.vv v9, v9, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -294,37 +285,34 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind 
{ ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) -; RV64I-NEXT: li a1, 1 -; RV64I-NEXT: vsub.vx v9, v8, a1 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v9 -; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v9, a1 -; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v8, a1 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: vadd.vv v8, v9, v8 -; RV64I-NEXT: vsrl.vi v9, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v9 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64I-NEXT: vmv.v.x v9, a1 +; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64I-NEXT: vsll.vi v10, v9, 2 +; RV64I-NEXT: vxor.vx v10, v10, a1 +; RV64I-NEXT: vadd.vv v11, v10, v10 +; RV64I-NEXT: vxor.vv v11, v10, v11 +; RV64I-NEXT: li a2, 1 +; RV64I-NEXT: vsub.vx v12, v8, a2 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vand.vv v11, v12, v11 +; RV64I-NEXT: vsub.vv v8, v8, v11 +; RV64I-NEXT: vand.vv v11, v8, v10 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: vadd.vv v8, v11, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 ; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vsrl.vi v9, v9, 3 +; RV64I-NEXT: vand.vx v9, v9, a1 +; RV64I-NEXT: vmul.vv v8, v8, v9 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -651,40 +639,31 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: li a1, 1 -; RV32I-NEXT: vsub.vx v10, v8, a1 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: lui a1, 349525 -; RV32I-NEXT: addi a1, a1, 1365 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v12, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32I-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vsll.vi v12, v10, 2 +; RV32I-NEXT: vxor.vv v12, v10, v12 +; RV32I-NEXT: vadd.vv v14, v12, v12 +; RV32I-NEXT: vxor.vv v14, v12, v14 +; RV32I-NEXT: li a1, 1 +; RV32I-NEXT: vsub.vx v16, v8, a1 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vand.vv v14, v16, v14 +; 
RV32I-NEXT: vsub.vv v8, v8, v14 +; RV32I-NEXT: vand.vv v14, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vadd.vv v8, v14, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 ; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vsrl.vi v12, v10, 3 +; RV32I-NEXT: vand.vv v10, v10, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -695,37 +674,34 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) -; RV64I-NEXT: li a1, 1 -; RV64I-NEXT: vsub.vx v10, v8, a1 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v10 -; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v10, a1 -; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v8, a1 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: vadd.vv v8, v10, v8 -; RV64I-NEXT: vsrl.vi v10, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v10 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64I-NEXT: vmv.v.x v10, a1 +; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64I-NEXT: vsll.vi v12, v10, 2 +; RV64I-NEXT: vxor.vx v12, v12, a1 +; RV64I-NEXT: vadd.vv v14, v12, v12 +; RV64I-NEXT: vxor.vv v14, v12, v14 +; RV64I-NEXT: li a2, 1 +; RV64I-NEXT: vsub.vx v16, v8, a2 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vand.vv v14, v16, v14 +; RV64I-NEXT: vsub.vv v8, v8, v14 +; RV64I-NEXT: vand.vv v14, v8, v12 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: vadd.vv v8, v14, v8 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 ; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vsrl.vi v10, v10, 3 +; RV64I-NEXT: vand.vx v10, v10, a1 +; RV64I-NEXT: vmul.vv v8, v8, v10 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -1029,40 +1005,31 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: li a1, 1 -; RV32I-NEXT: vsub.vx v9, v8, a1 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vsrl.vi v9, v8, 1 -; RV32I-NEXT: lui a1, 349525 -; RV32I-NEXT: addi a1, a1, 1365 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v9, v9, v10 -; RV32I-NEXT: vsub.vv v8, v8, v9 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32I-NEXT: vand.vv v10, v8, v9 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; 
RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: vadd.vv v8, v10, v8 -; RV32I-NEXT: vsrl.vi v9, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v9 ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32I-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32I-NEXT: vmv.v.x v9, a1 ; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vsll.vi v10, v9, 2 +; RV32I-NEXT: vxor.vv v10, v9, v10 +; RV32I-NEXT: vadd.vv v11, v10, v10 +; RV32I-NEXT: vxor.vv v11, v10, v11 +; RV32I-NEXT: li a1, 1 +; RV32I-NEXT: vsub.vx v12, v8, a1 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vand.vv v11, v12, v11 +; RV32I-NEXT: vsub.vv v8, v8, v11 +; RV32I-NEXT: vand.vv v11, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vadd.vv v8, v11, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: vand.vv v8, v8, v9 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32I-NEXT: vmv.v.x v9, a1 -; RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32I-NEXT: vsrl.vi v10, v9, 3 +; RV32I-NEXT: vand.vv v9, v9, v10 ; RV32I-NEXT: vmul.vv v8, v8, v9 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -1073,37 +1040,34 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) -; RV64I-NEXT: li a1, 1 -; RV64I-NEXT: vsub.vx v9, v8, a1 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v9 -; RV64I-NEXT: vsrl.vi v9, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v9, a1 -; RV64I-NEXT: vsub.vv v8, v8, v9 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v9, v8, a1 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: vadd.vv v8, v9, v8 -; RV64I-NEXT: vsrl.vi v9, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v9 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; RV64I-NEXT: vmv.v.x v9, a1 +; RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64I-NEXT: vsll.vi v10, v9, 2 +; RV64I-NEXT: vxor.vx v10, v10, a1 +; RV64I-NEXT: vadd.vv v11, v10, v10 +; RV64I-NEXT: vxor.vv v11, v10, v11 +; RV64I-NEXT: li a2, 1 +; RV64I-NEXT: vsub.vx v12, v8, a2 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vand.vv v11, v12, v11 +; RV64I-NEXT: vsub.vv v8, v8, v11 +; RV64I-NEXT: vand.vv v11, v8, v10 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: vadd.vv v8, v11, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 ; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vmul.vx v8, v8, a1 +; RV64I-NEXT: vsrl.vi v9, v9, 3 +; RV64I-NEXT: vand.vx v9, v9, a1 +; RV64I-NEXT: vmul.vv v8, v8, v9 ; RV64I-NEXT: li a1, 56 ; RV64I-NEXT: vsrl.vx v8, v8, a1 ; RV64I-NEXT: vse64.v v8, (a0) @@ -1400,40 +1364,31 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; 
RV32I-NEXT: vle64.v v8, (a0) -; RV32I-NEXT: li a1, 1 -; RV32I-NEXT: vsub.vx v10, v8, a1 -; RV32I-NEXT: vnot.v v8, v8 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vsrl.vi v10, v8, 1 -; RV32I-NEXT: lui a1, 349525 -; RV32I-NEXT: addi a1, a1, 1365 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v12, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v10, v10, v12 -; RV32I-NEXT: vsub.vv v8, v8, v10 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32I-NEXT: vand.vv v12, v8, v10 -; RV32I-NEXT: vsrl.vi v8, v8, 2 -; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: vadd.vv v8, v12, v8 -; RV32I-NEXT: vsrl.vi v10, v8, 4 -; RV32I-NEXT: vadd.vv v8, v8, v10 ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32I-NEXT: vsetvli a2, zero, e32, m2, ta, ma ; RV32I-NEXT: vmv.v.x v10, a1 ; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vsll.vi v12, v10, 2 +; RV32I-NEXT: vxor.vv v12, v10, v12 +; RV32I-NEXT: vadd.vv v14, v12, v12 +; RV32I-NEXT: vxor.vv v14, v12, v14 +; RV32I-NEXT: li a1, 1 +; RV32I-NEXT: vsub.vx v16, v8, a1 +; RV32I-NEXT: vnot.v v8, v8 +; RV32I-NEXT: vand.vv v8, v8, v16 +; RV32I-NEXT: vsrl.vi v16, v8, 1 +; RV32I-NEXT: vand.vv v14, v16, v14 +; RV32I-NEXT: vsub.vv v8, v8, v14 +; RV32I-NEXT: vand.vv v14, v8, v12 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vadd.vv v8, v14, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 ; RV32I-NEXT: vand.vv v8, v8, v10 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32I-NEXT: vmv.v.x v10, a1 -; RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32I-NEXT: vsrl.vi v12, v10, 3 +; RV32I-NEXT: vand.vv v10, v10, v12 ; RV32I-NEXT: vmul.vv v8, v8, v10 ; RV32I-NEXT: li a1, 56 ; RV32I-NEXT: vsrl.vx v8, v8, a1 @@ -1444,37 +1399,34 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64I-NEXT: vle64.v v8, (a0) -; RV64I-NEXT: li a1, 1 -; RV64I-NEXT: vsub.vx v10, v8, a1 -; RV64I-NEXT: vnot.v v8, v8 -; RV64I-NEXT: vand.vv v8, v8, v10 -; RV64I-NEXT: vsrl.vi v10, v8, 1 -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v10, a1 -; RV64I-NEXT: vsub.vv v8, v8, v10 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: vand.vx v10, v8, a1 -; RV64I-NEXT: vsrl.vi v8, v8, 2 -; RV64I-NEXT: vand.vx v8, v8, a1 -; RV64I-NEXT: vadd.vv v8, v10, v8 -; RV64I-NEXT: vsrl.vi v10, v8, 4 -; RV64I-NEXT: vadd.vv v8, v8, v10 ; RV64I-NEXT: lui a1, 61681 ; RV64I-NEXT: addiw a1, a1, -241 ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; RV64I-NEXT: vmv.v.x v10, a1 +; RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64I-NEXT: vsll.vi v12, v10, 2 +; RV64I-NEXT: vxor.vx v12, v12, a1 +; RV64I-NEXT: vadd.vv v14, v12, v12 +; RV64I-NEXT: vxor.vv v14, v12, v14 +; RV64I-NEXT: li a2, 1 +; RV64I-NEXT: vsub.vx v16, v8, a2 +; RV64I-NEXT: vnot.v v8, v8 +; RV64I-NEXT: vand.vv v8, v8, v16 +; RV64I-NEXT: vsrl.vi v16, v8, 1 +; RV64I-NEXT: vand.vv v14, v16, v14 +; RV64I-NEXT: vsub.vv v8, v8, v14 +; 
RV64I-NEXT: vand.vv v14, v8, v12
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: vadd.vv v8, v14, v8
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
; RV64I-NEXT: vand.vx v8, v8, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw a1, a1, 257
-; RV64I-NEXT: slli a2, a1, 32
-; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: vsrl.vi v10, v10, 3
+; RV64I-NEXT: vand.vx v10, v10, a1
+; RV64I-NEXT: vmul.vv v8, v8, v10
; RV64I-NEXT: li a1, 56
; RV64I-NEXT: vsrl.vx v8, v8, a1
; RV64I-NEXT: vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index f707cb31e3ece..0b788878a601e 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -176,30 +176,34 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: sraw a0, a0, a1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw s0, a1, 1365
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw s1, a1, 819
; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addi s2, a1, -241
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addi s3, a1, 257
+; RV64I-NEXT: addiw s0, a1, -241
+; RV64I-NEXT: slli a1, s0, 32
+; RV64I-NEXT: add s0, s0, a1
+; RV64I-NEXT: slli s1, s0, 2
+; RV64I-NEXT: xor s1, s1, s0
+; RV64I-NEXT: lui a1, 349525
+; RV64I-NEXT: addiw s2, a1, 1365
+; RV64I-NEXT: srli a1, s0, 3
+; RV64I-NEXT: and s3, a1, s0
; RV64I-NEXT: .LBB4_1: # %bb2
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: call bar
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: srli a0, a0, 1
-; RV64I-NEXT: and a0, a0, s0
-; RV64I-NEXT: sub a0, a1, a0
-; RV64I-NEXT: and a2, a0, s1
-; RV64I-NEXT: srli a0, a0, 2
-; RV64I-NEXT: and a0, a0, s1
-; RV64I-NEXT: add a0, a2, a0
+; RV64I-NEXT: and a0, a0, s2
+; RV64I-NEXT: slli a2, a1, 32
+; RV64I-NEXT: srli a2, a2, 32
+; RV64I-NEXT: sub a2, a2, a0
+; RV64I-NEXT: and a0, a2, s1
+; RV64I-NEXT: srli a2, a2, 2
+; RV64I-NEXT: and a2, a2, s1
+; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: srli a2, a0, 4
; RV64I-NEXT: add a0, a0, a2
-; RV64I-NEXT: and a0, a0, s2
+; RV64I-NEXT: and a0, a0, s0
; RV64I-NEXT: mul a0, a0, s3
-; RV64I-NEXT: srliw a0, a0, 24
+; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: bnez a1, .LBB4_1
; RV64I-NEXT: # %bb.2: # %bb7
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
@@ -323,27 +327,23 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: sraw a0, a0, a1
-; RV64I-NEXT: lui a1, 349525
-; RV64I-NEXT: addiw s0, a1, 1365
+; RV64I-NEXT: lui a1, 61681
+; RV64I-NEXT: addiw s0, a1, -241
; RV64I-NEXT: slli a1, s0, 32
; RV64I-NEXT: add s0, s0, a1
-; RV64I-NEXT: lui a1, 209715
-; RV64I-NEXT: addiw s1, a1, 819
-; RV64I-NEXT: slli a1, s1, 32
-; RV64I-NEXT: add s1, s1, a1
-; RV64I-NEXT: lui a1, 61681
-; RV64I-NEXT: addiw s2, a1, -241
+; RV64I-NEXT: slli s1, s0, 2
+; RV64I-NEXT: xor s1, s1, s0
+; RV64I-NEXT: lui a1, 349525
+; RV64I-NEXT: addiw s2, a1, 1365
; RV64I-NEXT: slli a1, s2, 32
; RV64I-NEXT: add s2, s2, a1
-; RV64I-NEXT: lui a1, 4112
-; RV64I-NEXT: addiw s3, a1, 257
-; RV64I-NEXT: slli a1, s3, 32
-; RV64I-NEXT: add s3, s3, a1
+; RV64I-NEXT: srli a1, s0, 3
+; RV64I-NEXT: and s3, a1, s0
; RV64I-NEXT: .LBB6_1: # %bb2
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: call foo
; RV64I-NEXT: srli a1, a0, 1
-; RV64I-NEXT: and a1, a1, s0
+; RV64I-NEXT: and a1, a1, s2
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: and a1, a0, s1
; RV64I-NEXT: srli a0, a0, 2
@@ -351,7 +351,7 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: srli a1, a0, 4
; RV64I-NEXT: add a0, a0, a1
-; RV64I-NEXT: and a0, a0, s2
+; RV64I-NEXT: and a0, a0, s0
; RV64I-NEXT: mul a0, a0, s3
; RV64I-NEXT: srli a0, a0, 56
; RV64I-NEXT: bnez a0, .LBB6_1