diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index eb3c9b0defccb..e36204c536c0d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -2982,21 +2982,21 @@ multiclass VPseudoVFWALU_WV_WF_RM {
 multiclass VPseudoVMRG_VM_XM_IM {
   foreach m = MxList in {
     defvar mx = m.MX;
-    def "_VVM" # "_" # m.MX:
-      VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
-                               m.vrclass, m.vrclass, m>,
-      SchedBinary<"WriteVIMergeV", "ReadVIMergeV", "ReadVIMergeV", mx,
-                  forcePassthruRead=true>;
-    def "_VXM" # "_" # m.MX:
-      VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
-                               m.vrclass, GPR, m>,
-      SchedBinary<"WriteVIMergeX", "ReadVIMergeV", "ReadVIMergeX", mx,
-                  forcePassthruRead=true>;
-    def "_VIM" # "_" # m.MX:
-      VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
-                               m.vrclass, simm5, m>,
-      SchedUnary<"WriteVIMergeI", "ReadVIMergeV", mx,
-                 forcePassthruRead=true>;
+    def "_VVM"#"_"#m.MX : VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
+                                                   GetVRegNoV0<m.vrclass>.R,
+                                                   GetVRegNoV0<m.vrclass>.R, m>,
+                          SchedBinary<"WriteVIMergeV", "ReadVIMergeV", "ReadVIMergeV", mx,
+                                      forcePassthruRead = true>;
+    def "_VXM"#"_"#m.MX
+        : VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
+                                   GetVRegNoV0<m.vrclass>.R, GPR, m>,
+          SchedBinary<"WriteVIMergeX", "ReadVIMergeV", "ReadVIMergeX", mx,
+                      forcePassthruRead = true>;
+    def "_VIM"#"_"#m.MX
+        : VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
+                                   GetVRegNoV0<m.vrclass>.R, simm5, m>,
+          SchedUnary<"WriteVIMergeI", "ReadVIMergeV", mx,
+                     forcePassthruRead = true>;
   }
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index e1ff243bb1a47..5acb7f5bcd56a 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -73,7 +73,7 @@ class RISCVVectorPeephole : public MachineFunctionPass {
   bool isAllOnesMask(const MachineInstr *MaskDef) const;
   std::optional getConstant(const MachineOperand &VL) const;
   bool ensureDominates(const MachineOperand &Use, MachineInstr &Src) const;
-  bool isKnownSameDefs(Register A, Register B) const;
+  Register lookThruCopies(Register Reg) const;
 };
 
 } // namespace
@@ -387,23 +387,18 @@ bool RISCVVectorPeephole::convertAllOnesVMergeToVMv(MachineInstr &MI) const {
   return true;
 }
 
-bool RISCVVectorPeephole::isKnownSameDefs(Register A, Register B) const {
-  if (A.isPhysical() || B.isPhysical())
-    return false;
-
-  auto LookThruVirtRegCopies = [this](Register Reg) {
-    while (MachineInstr *Def = MRI->getUniqueVRegDef(Reg)) {
-      if (!Def->isFullCopy())
-        break;
-      Register Src = Def->getOperand(1).getReg();
-      if (!Src.isVirtual())
-        break;
-      Reg = Src;
-    }
-    return Reg;
-  };
-
-  return LookThruVirtRegCopies(A) == LookThruVirtRegCopies(B);
+// If \p Reg is defined by one or more COPYs of virtual registers, traverses
+// the chain and returns the root non-COPY source.
+Register RISCVVectorPeephole::lookThruCopies(Register Reg) const {
+  while (MachineInstr *Def = MRI->getUniqueVRegDef(Reg)) {
+    if (!Def->isFullCopy())
+      break;
+    Register Src = Def->getOperand(1).getReg();
+    if (!Src.isVirtual())
+      break;
+    Reg = Src;
+  }
+  return Reg;
 }
 
 /// If a PseudoVMERGE_VVM's true operand is a masked pseudo and both have the
@@ -428,10 +423,11 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) {
   if (!TrueMaskedInfo || !hasSameEEW(MI, *True))
     return false;
 
-  const MachineOperand &TrueMask =
-      True->getOperand(TrueMaskedInfo->MaskOpIdx + True->getNumExplicitDefs());
-  const MachineOperand &MIMask = MI.getOperand(4);
-  if (!isKnownSameDefs(TrueMask.getReg(), MIMask.getReg()))
+  Register TrueMaskReg = lookThruCopies(
+      True->getOperand(TrueMaskedInfo->MaskOpIdx + True->getNumExplicitDefs())
+          .getReg());
+  Register MIMaskReg = lookThruCopies(MI.getOperand(4).getReg());
+  if (!TrueMaskReg.isVirtual() || TrueMaskReg != MIMaskReg)
     return false;
 
   // Masked off lanes past TrueVL will come from False, and converting to vmv
@@ -717,9 +713,9 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const {
   if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VMERGE_VVM)
     return false;
 
-  Register PassthruReg = MI.getOperand(1).getReg();
-  Register FalseReg = MI.getOperand(2).getReg();
-  Register TrueReg = MI.getOperand(3).getReg();
+  Register PassthruReg = lookThruCopies(MI.getOperand(1).getReg());
+  Register FalseReg = lookThruCopies(MI.getOperand(2).getReg());
+  Register TrueReg = lookThruCopies(MI.getOperand(3).getReg());
   if (!TrueReg.isVirtual() || !MRI->hasOneUse(TrueReg))
     return false;
   MachineInstr &True = *MRI->getUniqueVRegDef(TrueReg);
@@ -740,16 +736,17 @@ bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const {
 
   // We require that either passthru and false are the same, or that passthru
   // is undefined.
-  if (PassthruReg && !isKnownSameDefs(PassthruReg, FalseReg))
+  if (PassthruReg && !(PassthruReg.isVirtual() && PassthruReg == FalseReg))
     return false;
 
   std::optional> NeedsCommute;
 
   // If True has a passthru operand then it needs to be the same as vmerge's
   // False, since False will be used for the result's passthru operand.
-  Register TruePassthru = True.getOperand(True.getNumExplicitDefs()).getReg();
+  Register TruePassthru =
+      lookThruCopies(True.getOperand(True.getNumExplicitDefs()).getReg());
   if (RISCVII::isFirstDefTiedToFirstUse(True.getDesc()) && TruePassthru &&
-      !isKnownSameDefs(TruePassthru, FalseReg)) {
+      !(TruePassthru.isVirtual() && TruePassthru == FalseReg)) {
     // If True's passthru != False, check if it uses False in another operand
     // and try to commute it.
int OtherIdx = True.findRegisterUseOperandIdx(FalseReg, TRI); diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir index f8061462c6220..ada76a43639d7 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir @@ -11,7 +11,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv1i8 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] @@ -19,7 +19,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv1i8 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] @@ -40,7 +40,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv4i8 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -48,7 +48,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv4i8 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -69,7 +69,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv16i8 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV32I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]] @@ -77,7 +77,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv16i8 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV64I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]] @@ -98,7 +98,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv64i8 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 
*/ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] @@ -106,7 +106,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv64i8 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] @@ -127,7 +127,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv2i16 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -135,7 +135,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv2i16 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -156,7 +156,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv8i16 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV32I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]] @@ -164,7 +164,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv8i16 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV64I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]] @@ -185,7 +185,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv32i16 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]] @@ -193,7 +193,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv32i16 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]] @@ -214,7 +214,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv2i32 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = 
IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV32I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]] @@ -222,7 +222,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv2i32 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV64I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]] @@ -243,7 +243,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv8i32 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV32I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]] @@ -251,7 +251,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv8i32 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV64I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]] @@ -272,7 +272,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv1i64 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */ ; RV32I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]] @@ -280,7 +280,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv1i64 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */ ; RV64I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]] @@ -301,7 +301,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv4i64 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */ ; RV32I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]] @@ -309,7 +309,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv4i64 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */ ; RV64I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]] diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll 
b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll index 2d4fce68f9545..96252f070a580 100644 --- a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll @@ -311,10 +311,10 @@ define i32 @test_nxv128i1( %x) { ; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v6, a0 +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v7, a1 -; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v7, a0 ; CHECK-NEXT: vslidedown.vx v5, v6, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir index 31e79e58f44c5..aba75ffe29d33 100644 --- a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir +++ b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir @@ -43,7 +43,7 @@ body: | %2:gpr = COPY $x11 %1:gpr = COPY $x10 %3:vr = COPY $v8 - %17:vr = PseudoVSLL_VI_M1 undef $noreg, %3, 5, 1, 6 /* e64 */, 0 + %17:vrnov0 = PseudoVSLL_VI_M1 undef $noreg, %3, 5, 1, 6 /* e64 */, 0 %22:vr = PseudoVMSNE_VI_M1 %3, 0, 1, 6 /* e64 */ %23:vmv0 = COPY %22 %25:vrnov0 = PseudoVMERGE_VIM_M1 undef $noreg, %17, -1, %23, 1, 6 /* e64 */ diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 5567310bb2a61..9b35860904f11 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -530,290 +530,267 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 100 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 100 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 84 * vlenb ; RV32-NEXT: addi a4, a1, 128 ; RV32-NEXT: addi a5, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 -; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI27_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI27_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) -; RV32-NEXT: vmv.s.x v0, a3 +; RV32-NEXT: lui a5, 12291 +; RV32-NEXT: vmv.s.x v3, a3 ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a6, 76 +; RV32-NEXT: mul a1, a1, a6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: addi a6, a6, 3 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vslideup.vi v16, v24, 4 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v8, v24, 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a5, 76 -; RV32-NEXT: mul a1, a1, a5 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a5, 92 -; RV32-NEXT: mul a1, a1, a5 +; RV32-NEXT: li a6, 60 +; RV32-NEXT: mul a1, a1, a6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; 
RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: vmv1r.v v30, v0 +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vslideup.vi v16, v8, 10, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a5, 72 -; RV32-NEXT: mul a1, a1, a5 +; RV32-NEXT: li a6, 56 +; RV32-NEXT: mul a1, a1, a6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v16, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v8, (a4) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 84 +; RV32-NEXT: li a4, 68 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: addi a5, a5, 3 +; RV32-NEXT: vmv.s.x v0, a5 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v28, (a7) -; RV32-NEXT: vmv.s.x v0, a6 +; RV32-NEXT: vslideup.vi v28, v24, 2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a4, 76 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 84 +; RV32-NEXT: li a4, 68 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v0, v16, v28 +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 52 +; RV32-NEXT: li a4, 44 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; RV32-NEXT: vslideup.vi v8, v24, 2 -; RV32-NEXT: vmv1r.v v0, v30 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 92 +; RV32-NEXT: li a4, 60 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vslideup.vi v8, v16, 8, v0.t +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vslideup.vi v28, v8, 8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 60 +; RV32-NEXT: li a4, 52 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI27_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI27_1) -; RV32-NEXT: lui t2, 3 -; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI27_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI27_3) -; RV32-NEXT: lui t0, 786624 -; RV32-NEXT: li a5, 48 -; RV32-NEXT: lui a6, 768 -; RV32-NEXT: addi a7, a7, 12 -; RV32-NEXT: vmv.s.x v0, a7 -; RV32-NEXT: addi t2, t2, 3 -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: li t3, 84 -; RV32-NEXT: mul a7, a7, t3 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vl8r.v v16, (a7) # vscale x 64-byte Folded Reload -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: slli a7, a7, 6 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, 
ma -; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: slli a7, a7, 5 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill -; RV32-NEXT: vmv.s.x v0, t2 -; RV32-NEXT: addi a7, t1, 48 -; RV32-NEXT: csrr t1, vlenb -; RV32-NEXT: li t2, 92 -; RV32-NEXT: mul t1, t1, t2 -; RV32-NEXT: add t1, sp, t1 -; RV32-NEXT: addi t1, t1, 16 -; RV32-NEXT: vl8r.v v24, (t1) # vscale x 64-byte Folded Reload -; RV32-NEXT: csrr t1, vlenb +; RV32-NEXT: vs4r.v v28, (a1) # vscale x 32-byte Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI27_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI27_0) +; RV32-NEXT: lui a6, 49164 +; RV32-NEXT: lui t1, 3 +; RV32-NEXT: lui t0, 196656 +; RV32-NEXT: lui a7, 786624 +; RV32-NEXT: li a4, 48 +; RV32-NEXT: lui a5, 768 +; RV32-NEXT: addi a6, a6, 12 +; RV32-NEXT: vmv.s.x v0, a6 +; RV32-NEXT: addi t1, t1, 3 +; RV32-NEXT: csrr a6, vlenb ; RV32-NEXT: li t2, 76 -; RV32-NEXT: mul t1, t1, t2 -; RV32-NEXT: add t1, sp, t1 -; RV32-NEXT: addi t1, t1, 16 -; RV32-NEXT: vl8r.v v8, (t1) # vscale x 64-byte Folded Reload +; RV32-NEXT: mul a6, a6, t2 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vl8r.v v16, (a6) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li t2, 68 +; RV32-NEXT: mul a6, a6, t2 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vl8r.v v8, (a6) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li t2, 28 +; RV32-NEXT: mul a6, a6, t2 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vs8r.v v8, (a6) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv.s.x v0, t1 +; RV32-NEXT: addi a6, t0, 48 +; RV32-NEXT: csrr t0, vlenb +; RV32-NEXT: li t1, 60 +; RV32-NEXT: mul t0, t0, t1 +; RV32-NEXT: add t0, sp, t0 +; RV32-NEXT: addi t0, t0, 16 +; RV32-NEXT: vl8r.v v8, (t0) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 -; RV32-NEXT: csrr t1, vlenb -; RV32-NEXT: li t2, 44 -; RV32-NEXT: mul t1, t1, t2 -; RV32-NEXT: add t1, sp, t1 -; RV32-NEXT: addi t1, t1, 16 -; RV32-NEXT: vs4r.v v8, (t1) # vscale x 32-byte Folded Spill -; RV32-NEXT: vmv.s.x v0, a7 +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV32-NEXT: csrr t0, vlenb +; RV32-NEXT: li t1, 36 +; RV32-NEXT: mul t0, t0, t1 +; RV32-NEXT: add t0, sp, t0 +; RV32-NEXT: addi t0, t0, 16 +; RV32-NEXT: vs4r.v v8, (t0) # vscale x 32-byte Folded Spill +; RV32-NEXT: vmv.s.x v0, a6 ; RV32-NEXT: addi a3, a3, 12 -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: slli a7, a7, 6 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vl8r.v v24, (a7) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: slli a7, a7, 4 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill -; RV32-NEXT: vmv8r.v v16, v24 -; RV32-NEXT: vmv.s.x v0, a3 -; RV32-NEXT: addi a3, t0, 192 -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: li t0, 92 -; RV32-NEXT: mul a7, a7, t0 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vl8r.v v24, (a7) # vscale x 64-byte Folded Reload -; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: csrr a6, vlenb ; RV32-NEXT: li t0, 76 -; RV32-NEXT: mul a7, a7, t0 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 
16 -; RV32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: li t0, 48 -; RV32-NEXT: mul a7, a7, t0 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vs4r.v v8, (a7) # vscale x 32-byte Folded Spill -; RV32-NEXT: vmv.s.x v0, a3 -; RV32-NEXT: li a3, 192 -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: li t0, 84 -; RV32-NEXT: mul a7, a7, t0 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload +; RV32-NEXT: mul a6, a6, t0 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vl8r.v v16, (a6) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li t0, 68 +; RV32-NEXT: mul a6, a6, t0 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vl8r.v v8, (a6) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: li t0, 24 -; RV32-NEXT: mul a7, a7, t0 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill -; RV32-NEXT: vmv.s.x v0, a5 -; RV32-NEXT: addi a5, a6, 768 ; RV32-NEXT: csrr a6, vlenb -; RV32-NEXT: li a7, 92 -; RV32-NEXT: mul a6, a6, a7 +; RV32-NEXT: li t0, 20 +; RV32-NEXT: mul a6, a6, t0 ; RV32-NEXT: add a6, sp, a6 ; RV32-NEXT: addi a6, a6, 16 -; RV32-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload +; RV32-NEXT: vs8r.v v8, (a6) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv.s.x v0, a3 +; RV32-NEXT: addi a3, a7, 192 ; RV32-NEXT: csrr a6, vlenb -; RV32-NEXT: li a7, 76 +; RV32-NEXT: li a7, 60 ; RV32-NEXT: mul a6, a6, a7 ; RV32-NEXT: add a6, sp, a6 ; RV32-NEXT: addi a6, a6, 16 ; RV32-NEXT: vl8r.v v8, (a6) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 ; RV32-NEXT: csrr a6, vlenb ; RV32-NEXT: li a7, 40 ; RV32-NEXT: mul a6, a6, a7 ; RV32-NEXT: add a6, sp, a6 ; RV32-NEXT: addi a6, a6, 16 ; RV32-NEXT: vs4r.v v8, (a6) # vscale x 32-byte Folded Spill -; RV32-NEXT: vmv.s.x v0, a5 -; RV32-NEXT: vle16.v v6, (a1) -; RV32-NEXT: vle16.v v2, (a4) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 84 -; RV32-NEXT: mul a1, a1, a4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmv.s.x v0, a3 +; RV32-NEXT: li a3, 192 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li a7, 76 +; RV32-NEXT: mul a6, a6, a7 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vl8r.v v16, (a6) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li a7, 68 +; RV32-NEXT: mul a6, a6, a7 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vl8r.v v8, (a6) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: slli a6, a6, 3 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vs8r.v v8, (a6) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv.s.x v0, a4 +; RV32-NEXT: addi a4, a5, 768 +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: li a6, 60 +; RV32-NEXT: mul a5, a5, a6 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload +; RV32-NEXT: 
vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs4r.v v8, (a5) # vscale x 32-byte Folded Spill +; RV32-NEXT: vmv.s.x v0, a4 +; RV32-NEXT: vle16.v v2, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a4, 76 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: vmv.s.x v0, a3 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a4, 68 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v24, v8, v6 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 92 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv.s.x v0, a3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 76 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 92 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a3, 44 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI27_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI27_2) +; RV32-NEXT: lui a1, %hi(.LCPI27_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI27_1) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v3, (a1) +; RV32-NEXT: vle16.v v30, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 84 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -821,179 +798,191 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 72 +; RV32-NEXT: li a2, 76 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vs8r.v v8, (a1) # 
vscale x 64-byte Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI27_3) +; RV32-NEXT: addi a1, a1, %lo(.LCPI27_3) +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle16.v v28, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 52 +; RV32-NEXT: li a2, 28 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v28, v16 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v8, v30 +; RV32-NEXT: lui a1, %hi(.LCPI27_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI27_2) +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vle16.v v20, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 72 +; RV32-NEXT: li a2, 20 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v28, (a1) # vscale x 32-byte Folded Spill -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v8, v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 60 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload -; RV32-NEXT: vmv.v.v v20, v16 +; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v12, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 60 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # vscale x 32-byte Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 44 +; RV32-NEXT: li a2, 52 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # vscale x 32-byte Folded Reload -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v20, v16, v3 -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v24 +; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vmv.v.v v12, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 52 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a2, 36 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # vscale x 32-byte Folded Spill +; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v24, v12, v20 +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v24, v8 ; RV32-NEXT: lui a1, %hi(.LCPI27_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI27_4) ; RV32-NEXT: lui a2, %hi(.LCPI27_5) ; RV32-NEXT: addi a2, a2, %lo(.LCPI27_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v24, (a2) +; RV32-NEXT: vle16.v v28, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 84 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi 
a1, a1, 16 -; RV32-NEXT: vs1r.v v16, (a1) # vscale x 8-byte Folded Spill +; RV32-NEXT: vle16.v v1, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI27_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI27_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: vle16.v v2, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 76 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs2r.v v16, (a1) # vscale x 16-byte Folded Spill +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v0, v24 +; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v28, v12, v1 +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v28, v8 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v16, v8, v2 +; RV32-NEXT: lui a1, %hi(.LCPI27_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI27_6) +; RV32-NEXT: lui a2, %hi(.LCPI27_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI27_8) +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: lui a1, %hi(.LCPI27_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI27_9) +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: li a3, 44 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vs2r.v v10, (a1) # vscale x 16-byte Folded Spill +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vle16.v v9, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 84 +; RV32-NEXT: li a2, 68 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v7, (a1) # vscale x 8-byte Folded Reload -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v24, v20, v7 +; RV32-NEXT: vs1r.v v9, (a1) # vscale x 8-byte Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vrgatherei16.vv v20, v12, v8 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v16 +; RV32-NEXT: vmv.v.v v20, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a2, 76 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 76 +; RV32-NEXT: li a2, 44 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v28, (a1) # vscale x 16-byte Folded Reload +; RV32-NEXT: vl2r.v v16, (a1) # vscale x 16-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v0, v28 -; RV32-NEXT: lui a1, %hi(.LCPI27_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI27_6) -; RV32-NEXT: lui a2, %hi(.LCPI27_8) -; 
RV32-NEXT: addi a2, a2, %lo(.LCPI27_8) -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI27_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI27_9) -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v6, (a1) -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vle16.v v5, (a2) +; RV32-NEXT: vrgatherei16.vv v8, v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 60 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload -; RV32-NEXT: vrgatherei16.vv v0, v20, v4 -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v0, v16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v8, v6 +; RV32-NEXT: vl4r.v v16, (a1) # vscale x 32-byte Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 92 +; RV32-NEXT: li a2, 68 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vl1r.v v7, (a1) # vscale x 8-byte Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v12, v5 +; RV32-NEXT: vrgatherei16.vv v12, v16, v7 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v8, v16 +; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: vse32.v v12, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v0, (a1) +; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 192 -; RV32-NEXT: vse32.v v24, (a1) +; RV32-NEXT: vse32.v v28, (a1) ; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 6 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl4r.v v8, (a2) # vscale x 32-byte Folded Reload -; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: vse32.v v24, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 60 +; RV32-NEXT: li a3, 52 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # vscale x 32-byte Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 72 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 100 +; RV32-NEXT: li a1, 84 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 @@ -1013,60 +1002,48 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a1) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 53 +; RV64-NEXT: li a3, 85 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; RV64-NEXT: addi a2, a1, 128 -; RV64-NEXT: addi a3, a1, 256 -; RV64-NEXT: li a4, 128 +; RV64-NEXT: addi a1, a1, 256 +; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: li a3, 128 ; RV64-NEXT: lui a1, 1 -; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI27_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI27_0) -; RV64-NEXT: vmv.s.x v0, a4 -; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: li a5, 61 -; RV64-NEXT: mul a4, a4, a5 -; 
RV64-NEXT: add a4, sp, a4 -; RV64-NEXT: addi a4, a4, 16 -; RV64-NEXT: vs1r.v v0, (a4) # vscale x 8-byte Folded Spill -; RV64-NEXT: addi a4, a1, 65 +; RV64-NEXT: vmv.s.x v3, a3 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vslideup.vi v24, v8, 2 ; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma ; RV64-NEXT: vslidedown.vi v16, v8, 8 -; RV64-NEXT: csrr a5, vlenb -; RV64-NEXT: li a6, 77 -; RV64-NEXT: mul a5, a5, a6 -; RV64-NEXT: add a5, sp, a5 -; RV64-NEXT: addi a5, a5, 16 -; RV64-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill -; RV64-NEXT: csrr a5, vlenb -; RV64-NEXT: li a6, 77 -; RV64-NEXT: mul a5, a5, a6 -; RV64-NEXT: add a5, sp, a5 -; RV64-NEXT: addi a5, a5, 16 -; RV64-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 45 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; RV64-NEXT: vmv1r.v v0, v3 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v24, v16, 5, v0.t -; RV64-NEXT: csrr a5, vlenb -; RV64-NEXT: li a6, 73 -; RV64-NEXT: mul a5, a5, a6 -; RV64-NEXT: add a5, sp, a5 -; RV64-NEXT: addi a5, a5, 16 -; RV64-NEXT: vs4r.v v24, (a5) # vscale x 32-byte Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 73 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs4r.v v24, (a3) # vscale x 32-byte Folded Spill ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v24, (a2) +; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a5, 85 -; RV64-NEXT: mul a2, a2, a5 +; RV64-NEXT: li a3, 77 +; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: vle16.v v12, (a3) -; RV64-NEXT: vmv.s.x v0, a4 +; RV64-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; RV64-NEXT: addi a2, a1, 65 +; RV64-NEXT: vmv.s.x v0, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vslideup.vi v12, v8, 1 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: li a3, 85 ; RV64-NEXT: mul a2, a2, a3 @@ -1074,35 +1051,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 53 +; RV64-NEXT: li a3, 77 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; RV64-NEXT: vmerge.vvm v24, v24, v16, v0 -; RV64-NEXT: vrgatherei16.vv v0, v24, v12 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: li a3, 37 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v0, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vslideup.vi v12, v8, 1 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 61 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl1r.v v7, (a2) # vscale x 8-byte Folded Reload -; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; RV64-NEXT: vmv1r.v v0, v3 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 77 +; RV64-NEXT: li a3, 45 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi 
a2, a2, 16 -; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; RV64-NEXT: vslideup.vi v12, v24, 4, v0.t +; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vslideup.vi v12, v16, 4, v0.t ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: li a3, 69 ; RV64-NEXT: mul a2, a2, a3 @@ -1115,17 +1085,23 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a2, a2, 130 ; RV64-NEXT: vmv.s.x v0, a2 ; RV64-NEXT: addi a2, a3, 260 -; RV64-NEXT: vmv8r.v v24, v16 ; RV64-NEXT: csrr a3, vlenb ; RV64-NEXT: li a5, 85 ; RV64-NEXT: mul a3, a3, a5 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a5, 77 +; RV64-NEXT: mul a3, a3, a5 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 ; RV64-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: li a5, 21 +; RV64-NEXT: mul a3, a3, a5 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 16 ; RV64-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill @@ -1137,6 +1113,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 77 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 ; RV64-NEXT: csrr a2, vlenb @@ -1147,21 +1129,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; RV64-NEXT: vmv1r.v v0, v2 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 45 +; RV64-NEXT: li a3, 53 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v8, 5, v0.t -; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vmv1r.v v0, v3 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 77 +; RV64-NEXT: li a3, 45 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; RV64-NEXT: vrgather.vi v12, v24, 4, v0.t +; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; RV64-NEXT: vrgather.vi v12, v16, 4, v0.t ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: slli a3, a2, 6 ; RV64-NEXT: add a2, a3, a2 @@ -1171,84 +1153,65 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vslidedown.vi v12, v8, 1 ; RV64-NEXT: vmv1r.v v0, v2 ; RV64-NEXT: vslideup.vi v12, v8, 4, v0.t -; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: vrgather.vi v12, v24, 5, v0.t +; RV64-NEXT: vmv1r.v v0, v3 +; RV64-NEXT: vmv4r.v v8, v16 +; RV64-NEXT: vrgather.vi v12, v16, 5, v0.t ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 25 -; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: slli a3, a2, 4 +; RV64-NEXT: add a2, a3, a2 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vs4r.v v12, (a2) # vscale x 32-byte Folded Spill ; RV64-NEXT: lui a2, 8 ; RV64-NEXT: addi a2, a2, 520 ; 
RV64-NEXT: vmv.s.x v0, a2 -; RV64-NEXT: vslideup.vi v12, v24, 6 +; RV64-NEXT: vslideup.vi v4, v16, 6 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: li a3, 85 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 53 +; RV64-NEXT: li a3, 77 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 4 +; RV64-NEXT: slli a3, a2, 3 ; RV64-NEXT: add a2, a3, a2 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 77 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmv1r.v v0, v3 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI27_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI27_1) -; RV64-NEXT: li a3, 192 -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vle16.v v6, (a2) -; RV64-NEXT: vmv.s.x v0, a3 +; RV64-NEXT: vslideup.vi v4, v8, 1, v0.t +; RV64-NEXT: li a2, 192 +; RV64-NEXT: vmv.s.x v0, a2 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vs1r.v v0, (a2) # vscale x 8-byte Folded Spill ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 45 +; RV64-NEXT: li a3, 53 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vrgather.vi v28, v16, 2 -; RV64-NEXT: vmerge.vvm v16, v28, v12, v0 +; RV64-NEXT: vrgather.vi v12, v16, 2 +; RV64-NEXT: vmerge.vvm v12, v12, v4, v0 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: li a3, 61 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs4r.v v16, (a2) # vscale x 32-byte Folded Spill -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v16, v6 -; RV64-NEXT: addi a2, sp, 16 -; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI27_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI27_2) +; RV64-NEXT: vs4r.v v12, (a2) # vscale x 32-byte Folded Spill +; RV64-NEXT: lui a2, %hi(.LCPI27_0) +; RV64-NEXT: addi a2, a2, %lo(.LCPI27_0) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1259,41 +1222,77 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a3, a3, 16 ; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 53 +; RV64-NEXT: li a4, 77 ; RV64-NEXT: mul a3, a3, a4 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 16 ; RV64-NEXT: vl8r.v v16, (a3) # vscale x 64-byte 
Folded Reload -; RV64-NEXT: vmerge.vvm v8, v24, v16, v0 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 3 -; RV64-NEXT: add a3, sp, a3 -; RV64-NEXT: addi a3, a3, 16 -; RV64-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: vle16.v v6, (a2) -; RV64-NEXT: li a1, 64 -; RV64-NEXT: vmerge.vvm v8, v24, v16, v0 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 85 -; RV64-NEXT: mul a2, a2, a3 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill ; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: vle16.v v12, (a2) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 29 +; RV64-NEXT: li a2, 85 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 77 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; RV64-NEXT: vrgatherei16.vv v24, v16, v6 +; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 85 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: lui a1, %hi(.LCPI27_1) +; RV64-NEXT: addi a1, a1, %lo(.LCPI27_1) +; RV64-NEXT: vle16.v v24, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 37 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vrgatherei16.vv v0, v16, v12 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: lui a1, %hi(.LCPI27_2) +; RV64-NEXT: addi a1, a1, %lo(.LCPI27_2) +; RV64-NEXT: vle16.v v12, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vrgatherei16.vv v0, v16, v24 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 37 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: li a1, 64 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vrgatherei16.vv v24, v16, v12 ; RV64-NEXT: vmv4r.v v28, v8 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v28, v8, 5, v0.t @@ -1304,13 +1303,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 +; RV64-NEXT: li a2, 77 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload +; 
RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v8, v0 +; RV64-NEXT: vmv.v.v v8, v16 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 73 ; RV64-NEXT: mul a1, a1, a2 @@ -1323,7 +1322,11 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload -; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 37 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; RV64-NEXT: vmv.v.v v8, v16 ; RV64-NEXT: csrr a1, vlenb @@ -1335,62 +1338,59 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: lui a1, %hi(.LCPI27_3) ; RV64-NEXT: addi a1, a1, %lo(.LCPI27_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vle16.v v20, (a1) +; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: lui a1, %hi(.LCPI27_4) ; RV64-NEXT: addi a1, a1, %lo(.LCPI27_4) -; RV64-NEXT: vle16.v v8, (a1) +; RV64-NEXT: vle16.v v10, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs2r.v v8, (a1) # vscale x 16-byte Folded Spill +; RV64-NEXT: vs2r.v v10, (a1) # vscale x 16-byte Folded Spill ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a2, a1, 6 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload +; RV64-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v8, v24 +; RV64-NEXT: vmv.v.v v12, v24 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a2, a1, 6 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; RV64-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: slli a2, a1, 3 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v0, v8, v20 +; RV64-NEXT: vrgatherei16.vv v0, v16, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload +; RV64-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v12, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmv.v.v v20, v0 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload +; RV64-NEXT: vl2r.v v16, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; 
RV64-NEXT: vrgatherei16.vv v0, v16, v8 +; RV64-NEXT: vrgatherei16.vv v0, v8, v16 ; RV64-NEXT: lui a1, %hi(.LCPI27_5) ; RV64-NEXT: addi a1, a1, %lo(.LCPI27_5) -; RV64-NEXT: vle16.v v20, (a1) +; RV64-NEXT: vle16.v v12, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 ; RV64-NEXT: mul a1, a1, a2 @@ -1406,7 +1406,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -1414,7 +1414,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vrgather.vi v8, v0, 3 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl1r.v v0, (a1) # vscale x 8-byte Folded Reload @@ -1426,7 +1426,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v0, v20 +; RV64-NEXT: vrgatherei16.vv v24, v0, v12 ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v8, v24 ; RV64-NEXT: addi a1, a0, 320 @@ -1441,7 +1441,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl4r.v v8, (a2) # vscale x 32-byte Folded Reload ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 192 -; RV64-NEXT: vse64.v v12, (a1) +; RV64-NEXT: vse64.v v20, (a1) ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: slli a3, a2, 6 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index ffbf1c7a548e1..1ccc52be36215 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -1874,57 +1874,77 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: addi a2, a0, 128 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v24, (a2) ; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmfeq.vv v7, v8, v8 +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0 ; CHECK-NEXT: csrr a0, 
vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfmin.vv v24, v16, v24 +; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfmin.vv v24, v8, v24 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vmfeq.vv v7, v8, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfmin.vv v16, v8, v16 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfmin.vv v16, v16, v8 ; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 @@ -1943,7 +1963,10 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: .LBB121_3: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -2257,56 +2280,76 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * 
vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v24, (a1) ; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: vle64.v v8, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vle64.v v16, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; RV32-NEXT: vmfeq.vv v0, v24, v24 -; RV32-NEXT: vmfeq.vv v7, v16, v16 -; RV32-NEXT: vmerge.vvm v8, v24, v16, v0 +; RV32-NEXT: vmfeq.vv v7, v8, v8 +; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; RV32-NEXT: vle64.v v8, (a1) +; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmerge.vvm v16, v24, v8, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vmerge.vvm v16, v16, v24, v0 -; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vfmin.vv v24, v16, v24 +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vfmin.vv v24, v8, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmfeq.vv v0, v8, v8 +; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vmfeq.vv v0, v16, v16 -; RV32-NEXT: vmfeq.vv v7, v8, v8 +; RV32-NEXT: vmfeq.vv v7, v16, v16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vmerge.vvm v16, v16, v8, v0 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmerge.vvm v16, v16, v8, v0 +; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vfmin.vv v16, v8, v16 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vfmin.vv v16, v16, v8 ; RV32-NEXT: vmfeq.vv v0, v16, v16 ; RV32-NEXT: vmfeq.vv v7, v24, v24 ; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 @@ -2325,7 +2368,10 
@@ define double @vreduce_fminimum_v64f64(ptr %x) { ; RV32-NEXT: vfmv.f.s fa0, v8 ; RV32-NEXT: .LBB133_3: ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -2337,56 +2383,76 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a1, a1, a2 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v24, (a1) ; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: vle64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 256 -; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vle64.v v16, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; RV64-NEXT: vmfeq.vv v0, v24, v24 -; RV64-NEXT: vmfeq.vv v7, v16, v16 -; RV64-NEXT: vmerge.vvm v8, v24, v16, v0 +; RV64-NEXT: vmfeq.vv v7, v8, v8 +; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vmerge.vvm v16, v24, v8, v0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 -; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vfmin.vv v24, v16, v24 +; RV64-NEXT: vmerge.vvm v8, v8, v24, v0 ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vfmin.vv v24, v8, v24 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmfeq.vv v0, v8, v8 +; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vmfeq.vv v0, v16, v16 -; RV64-NEXT: vmfeq.vv v7, v8, v8 +; RV64-NEXT: vmfeq.vv v7, v16, v16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vmerge.vvm v16, v16, v8, v0 -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded 
Spill ; RV64-NEXT: vmv1r.v v0, v7 ; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmerge.vvm v16, v16, v8, v0 +; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vfmin.vv v16, v8, v16 +; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vfmin.vv v16, v16, v8 ; RV64-NEXT: vmfeq.vv v0, v16, v16 ; RV64-NEXT: vmfeq.vv v7, v24, v24 ; RV64-NEXT: vmerge.vvm v8, v16, v24, v0 @@ -2406,7 +2472,10 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; RV64-NEXT: vfmv.f.s fa0, v8 ; RV64-NEXT: .LBB133_3: ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 @@ -2702,57 +2771,77 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: addi a2, a0, 128 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v24, (a2) ; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmfeq.vv v7, v8, v8 +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfmax.vv v24, v16, v24 +; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfmax.vv v24, v8, v24 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v 
v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vmfeq.vv v7, v8, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfmax.vv v16, v8, v16 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfmax.vv v16, v16, v8 ; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 @@ -2771,7 +2860,10 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: .LBB149_3: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -3085,56 +3177,76 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v24, (a1) ; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: vle64.v v8, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vle64.v v16, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; RV32-NEXT: vmfeq.vv v0, v24, v24 -; RV32-NEXT: vmfeq.vv v7, v16, v16 -; RV32-NEXT: vmerge.vvm v8, v24, v16, v0 +; RV32-NEXT: vmfeq.vv v7, v8, v8 +; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) # vscale x 
64-byte Folded Spill -; RV32-NEXT: vle64.v v8, (a1) +; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmerge.vvm v16, v24, v8, v0 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; RV32-NEXT: vmv1r.v v0, v7 -; RV32-NEXT: vmerge.vvm v16, v16, v24, v0 -; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vfmax.vv v24, v16, v24 +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vfmax.vv v24, v8, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmfeq.vv v0, v8, v8 +; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vmfeq.vv v0, v16, v16 -; RV32-NEXT: vmfeq.vv v7, v8, v8 +; RV32-NEXT: vmfeq.vv v7, v16, v16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vmerge.vvm v16, v16, v8, v0 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmerge.vvm v16, v16, v8, v0 +; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vfmax.vv v16, v8, v16 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vfmax.vv v16, v16, v8 ; RV32-NEXT: vmfeq.vv v0, v16, v16 ; RV32-NEXT: vmfeq.vv v7, v24, v24 ; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 @@ -3153,7 +3265,10 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; RV32-NEXT: vfmv.f.s fa0, v8 ; RV32-NEXT: .LBB161_3: ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -3165,56 +3280,76 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a1, a1, a2 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # 
sp + 16 + 24 * vlenb ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v24, (a1) ; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: vle64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 256 -; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vle64.v v16, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; RV64-NEXT: vmfeq.vv v0, v24, v24 -; RV64-NEXT: vmfeq.vv v7, v16, v16 -; RV64-NEXT: vmerge.vvm v8, v24, v16, v0 +; RV64-NEXT: vmfeq.vv v7, v8, v8 +; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vmerge.vvm v16, v24, v8, v0 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 -; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vfmax.vv v24, v16, v24 +; RV64-NEXT: vmerge.vvm v8, v8, v24, v0 ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vfmax.vv v24, v8, v24 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmfeq.vv v0, v8, v8 +; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vmfeq.vv v0, v16, v16 -; RV64-NEXT: vmfeq.vv v7, v8, v8 +; RV64-NEXT: vmfeq.vv v7, v16, v16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vmerge.vvm v16, v16, v8, v0 -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; RV64-NEXT: vmv1r.v v0, v7 ; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmerge.vvm v16, v16, v8, v0 +; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 -; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vfmax.vv v16, v8, v16 +; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vfmax.vv v16, v16, v8 ; RV64-NEXT: vmfeq.vv v0, v16, v16 ; RV64-NEXT: vmfeq.vv v7, v24, v24 ; RV64-NEXT: vmerge.vvm v8, v16, v24, v0 @@ -3234,7 +3369,10 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; RV64-NEXT: vfmv.f.s fa0, v8 ; RV64-NEXT: .LBB161_3: ; RV64-NEXT: 
csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll index 25a4eb74eeba7..fd70f95ed53c6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll @@ -121,155 +121,68 @@ define @vfmax_nxv16bf16_vv( %a, @vfmax_nxv32bf16_vv( %a, %b) nounwind { -; ZVFH-LABEL: vfmax_nxv32bf16_vv: -; ZVFH: # %bb.0: -; ZVFH-NEXT: addi sp, sp, -16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: mv a1, a0 -; ZVFH-NEXT: slli a0, a0, 1 -; ZVFH-NEXT: add a0, a0, a1 -; ZVFH-NEXT: sub sp, sp, a0 -; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFH-NEXT: vmv8r.v v24, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: vmv8r.v v0, v8 -; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24 -; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v0 -; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v3, v16, v16 -; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: vmv1r.v v0, v3 -; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0 -; ZVFH-NEXT: addi a0, sp, 16 -; ZVFH-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12 -; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v4 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: addi a0, sp, 16 -; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFH-NEXT: vfmax.vv v16, v0, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v7, v24, v24 -; ZVFH-NEXT: vmerge.vvm v16, v8, v24, v0 -; ZVFH-NEXT: vmv1r.v v0, v7 -; ZVFH-NEXT: vmerge.vvm v8, v24, v8, v0 -; ZVFH-NEXT: vfmax.vv v16, v8, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24 -; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: mv a1, a0 -; ZVFH-NEXT: slli a0, a0, 1 -; ZVFH-NEXT: add a0, a0, a1 -; ZVFH-NEXT: add sp, sp, a0 -; ZVFH-NEXT: addi sp, sp, 16 -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfmax_nxv32bf16_vv: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 24 -; ZVFHMIN-NEXT: mul a0, a0, a1 -; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: 
vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv8r.v v24, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vmv8r.v v0, v8 -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24 -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16 -; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vmv1r.v v0, v3 -; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v8, v0 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v4 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v16, v0, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 -; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0 -; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 24 -; ZVFHMIN-NEXT: mul a0, a0, a1 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: ret +; CHECK-LABEL: vfmax_nxv32bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv8r.v v0, v16 +; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v3, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vmv1r.v v0, v3 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v 
v24, v4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vfmax.vv v16, v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret %v = call @llvm.maximum.nxv32bf16( %a, %b) ret %v } @@ -444,54 +357,45 @@ define @vfmax_nxv32f16_vv( %a, @vfmax_nxv32f16_vv( %a, @vfmax_nxv32f16_vv( %a, @vfmin_nxv16bf16_vv( %a, @vfmin_nxv32bf16_vv( %a, %b) nounwind { -; ZVFH-LABEL: vfmin_nxv32bf16_vv: -; ZVFH: # %bb.0: -; ZVFH-NEXT: addi sp, sp, -16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: mv a1, a0 -; ZVFH-NEXT: slli a0, a0, 1 -; ZVFH-NEXT: add a0, a0, a1 -; ZVFH-NEXT: sub sp, sp, a0 -; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFH-NEXT: vmv8r.v v24, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: vmv8r.v v0, v8 -; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v24 -; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v0 -; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v3, v16, v16 -; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: vmv1r.v v0, v3 -; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0 -; ZVFH-NEXT: addi a0, sp, 16 -; ZVFH-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12 -; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v4 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: addi a0, sp, 16 -; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFH-NEXT: vfmin.vv v16, v0, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 
64-byte Folded Spill -; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v7, v24, v24 -; ZVFH-NEXT: vmerge.vvm v16, v8, v24, v0 -; ZVFH-NEXT: vmv1r.v v0, v7 -; ZVFH-NEXT: vmerge.vvm v8, v24, v8, v0 -; ZVFH-NEXT: vfmin.vv v16, v8, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24 -; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: mv a1, a0 -; ZVFH-NEXT: slli a0, a0, 1 -; ZVFH-NEXT: add a0, a0, a1 -; ZVFH-NEXT: add sp, sp, a0 -; ZVFH-NEXT: addi sp, sp, 16 -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfmin_nxv32bf16_vv: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 24 -; ZVFHMIN-NEXT: mul a0, a0, a1 -; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vmv8r.v v24, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vmv8r.v v0, v8 -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v24 -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16 -; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vmv1r.v v0, v3 -; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v8, v0 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12 -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v4 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v16, v0, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 -; ZVFHMIN-NEXT: vmerge.vvm v16, v8, v24, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v8, v0 -; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 24 -; ZVFHMIN-NEXT: mul a0, a0, a1 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: addi 
sp, sp, 16 -; ZVFHMIN-NEXT: ret +; CHECK-LABEL: vfmin_nxv32bf16_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vmv8r.v v0, v16 +; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v3, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vmv1r.v v0, v3 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vfmin.vv v16, v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret %v = call @llvm.minimum.nxv32bf16( %a, %b) ret %v } @@ -444,54 +357,45 @@ define @vfmin_nxv32f16_vv( %a, @vfmin_nxv32f16_vv( %a, @vfmin_nxv32f16_vv( %a, %x, i64 %y) { ; CHECK-NEXT: add a0, a2, a0 ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: vmerge.vim v24, v16, 1, v0 -; CHECK-NEXT: vs8r.v v24, (a1) ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 +; CHECK-NEXT: vs8r.v v24, (a1) ; CHECK-NEXT: vs8r.v v8, (a2) ; CHECK-NEXT: lbu a0, 0(a0) ; CHECK-NEXT: addi sp, s0, -80 diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir index a7eaf39793236..c73c2004834db 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir @@ -10,13 +10,13 @@ body: | ; CHECK-LABEL: name: undef_passthru ; CHECK: liveins: $x1, $v8, $v9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %false:vr = COPY $v8 - ; CHECK-NEXT: %true:vr = COPY $v9 + ; CHECK-NEXT: %false:vrnov0 = COPY $v8 + ; CHECK-NEXT: %true:vrnov0 = COPY $v9 ; CHECK-NEXT: %avl:gprnox0 = COPY $x1 ; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 /* e8 */ ; 
CHECK-NEXT: $v0 = COPY %mask - %false:vr = COPY $v8 - %true:vr = COPY $v9 + %false:vrnov0 = COPY $v8 + %true:vrnov0 = COPY $v9 %avl:gprnox0 = COPY $x1 %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 $v0 = COPY %mask @@ -31,15 +31,15 @@ body: | ; CHECK: liveins: $x1, $v8, $v9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %pt:vr = COPY $v8 - ; CHECK-NEXT: %false:vr = COPY $noreg - ; CHECK-NEXT: %true:vr = COPY $v9 + ; CHECK-NEXT: %false:vrnov0 = COPY $noreg + ; CHECK-NEXT: %true:vrnov0 = COPY $v9 ; CHECK-NEXT: %avl:gprnox0 = COPY $x1 ; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 /* e8 */ ; CHECK-NEXT: $v0 = COPY %mask ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, %avl, 5 /* e32 */, 0 /* tu, mu */ %pt:vrnov0 = COPY $v8 - %false:vr = COPY $noreg - %true:vr = COPY $v9 + %false:vrnov0 = COPY $noreg + %true:vrnov0 = COPY $v9 %avl:gprnox0 = COPY $x1 %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 $v0 = COPY %mask @@ -53,15 +53,15 @@ body: | ; CHECK-LABEL: name: equal_passthru_false ; CHECK: liveins: $x1, $v8, $v9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %false:vr = COPY $v8 + ; CHECK-NEXT: %false:vrnov0 = COPY $v8 ; CHECK-NEXT: %pt:vr = COPY $v8 - ; CHECK-NEXT: %true:vr = COPY $v9 + ; CHECK-NEXT: %true:vrnov0 = COPY $v9 ; CHECK-NEXT: %avl:gprnox0 = COPY $x1 ; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 /* e8 */ ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, %avl, 5 /* e32 */, 0 /* tu, mu */ - %false:vr = COPY $v8 + %false:vrnov0 = COPY $v8 %pt:vrnov0 = COPY $v8 - %true:vr = COPY $v9 + %true:vrnov0 = COPY $v9 %avl:gprnox0 = COPY $x1 %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl, 5 @@ -136,7 +136,7 @@ body: | ; CHECK-NEXT: %false:vrnov0 = COPY $v8 ; CHECK-NEXT: %mask:vmv0 = COPY $v0 ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 1 /* ta, mu */ - %false:vr = COPY $v8 + %false:vrnov0 = COPY $v8 %mask:vmv0 = COPY $v0 %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */ %x:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */ @@ -150,7 +150,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $v8, $v0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %false:vr = COPY $v8 + ; CHECK-NEXT: %false:vrnov0 = COPY $v8 ; CHECK-NEXT: %mask:vmv0 = COPY $v0 ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */ ; CHECK-NEXT: {{ $}} @@ -158,7 +158,7 @@ body: | ; CHECK-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */ bb.0: liveins: $v8, $v0 - %false:vr = COPY $v8 + %false:vrnov0 = COPY $v8 %mask:vmv0 = COPY $v0 %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */ bb.1: @@ -174,14 +174,14 @@ body: | ; CHECK: liveins: $v8, $v9, $v0, $x8, $x9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %pt:vrnov0 = COPY $v8 - ; CHECK-NEXT: %false:vr = COPY $v9 + ; CHECK-NEXT: %false:vrnov0 = COPY $v9 ; CHECK-NEXT: %mask:vmv0 = COPY $v0 ; CHECK-NEXT: %avl1:gprnox0 = COPY $x8 ; CHECK-NEXT: %avl2:gprnox0 = COPY $x9 ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */ ; CHECK-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */ %pt:vrnov0 = COPY $v8 - %false:vr = COPY $v9 + %false:vrnov0 = COPY $v9 %mask:vmv0 = COPY $v0 %avl1:gprnox0 = COPY $x8 %avl2:gprnox0 = COPY $x9 @@ -203,7 +203,7 @@ 
body: | ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 1, 5 /* e32 */, 3 /* ta, ma */ ; CHECK-NEXT: [[PseudoVMV_V_V_M1_:%[0-9]+]]:vr = PseudoVMV_V_V_M1 %pt, %true, 1, 5 /* e32 */, 0 /* tu, mu */ %pt:vrnov0 = COPY $v8 - %false:vr = COPY $v9 + %false:vrnov0 = COPY $v9 %mask:vmv0 = COPY $v0 %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 2, 5 /* e32 */, 3 /* ta, ma */ %5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 1, 5 /* e32 */ diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll index acd9519bb5a8e..5be32cc35fe37 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll @@ -867,8 +867,9 @@ define void @test_dag_loop() { ; CHECK-NEXT: vmseq.vv v0, v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, mu +; CHECK-NEXT: vsetivli zero, 1, e16, m8, tu, mu ; CHECK-NEXT: vle16.v v8, (zero), v0.t +; CHECK-NEXT: vsetivli zero, 0, e16, m8, ta, ma ; CHECK-NEXT: vse16.v v8, (zero) ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll index e3f43cd904198..cc389236df3ff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll @@ -516,15 +516,15 @@ define @splice_nxv64i1_offset_negone( %a, < ; NOVLDEP-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; NOVLDEP-NEXT: vmv1r.v v9, v0 ; NOVLDEP-NEXT: vmv1r.v v0, v8 -; NOVLDEP-NEXT: vmv.v.i v24, 0 +; NOVLDEP-NEXT: vmv.v.i v16, 0 ; NOVLDEP-NEXT: csrr a0, vlenb -; NOVLDEP-NEXT: vmerge.vim v16, v24, 1, v0 +; NOVLDEP-NEXT: vmerge.vim v24, v16, 1, v0 ; NOVLDEP-NEXT: vmv1r.v v0, v9 -; NOVLDEP-NEXT: vmerge.vim v8, v24, 1, v0 +; NOVLDEP-NEXT: vmerge.vim v8, v16, 1, v0 ; NOVLDEP-NEXT: slli a0, a0, 3 ; NOVLDEP-NEXT: addi a0, a0, -1 ; NOVLDEP-NEXT: vslidedown.vx v8, v8, a0 -; NOVLDEP-NEXT: vslideup.vi v8, v16, 1 +; NOVLDEP-NEXT: vslideup.vi v8, v24, 1 ; NOVLDEP-NEXT: vand.vi v8, v8, 1 ; NOVLDEP-NEXT: vmsne.vi v0, v8, 0 ; NOVLDEP-NEXT: ret @@ -534,17 +534,17 @@ define @splice_nxv64i1_offset_negone( %a, < ; VLDEP-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; VLDEP-NEXT: vmv1r.v v9, v0 ; VLDEP-NEXT: vmv1r.v v0, v8 -; VLDEP-NEXT: vmv.v.i v24, 0 +; VLDEP-NEXT: vmv.v.i v16, 0 ; VLDEP-NEXT: csrr a0, vlenb -; VLDEP-NEXT: vmerge.vim v16, v24, 1, v0 +; VLDEP-NEXT: vmerge.vim v24, v16, 1, v0 ; VLDEP-NEXT: vmv1r.v v0, v9 -; VLDEP-NEXT: vmerge.vim v8, v24, 1, v0 +; VLDEP-NEXT: vmerge.vim v8, v16, 1, v0 ; VLDEP-NEXT: slli a0, a0, 3 ; VLDEP-NEXT: addi a0, a0, -1 ; VLDEP-NEXT: vsetivli zero, 1, e8, m8, ta, ma ; VLDEP-NEXT: vslidedown.vx v8, v8, a0 ; VLDEP-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; VLDEP-NEXT: vslideup.vi v8, v16, 1 +; VLDEP-NEXT: vslideup.vi v8, v24, 1 ; VLDEP-NEXT: vand.vi v8, v8, 1 ; VLDEP-NEXT: vmsne.vi v0, v8, 0 ; VLDEP-NEXT: ret @@ -558,16 +558,16 @@ define @splice_nxv64i1_offset_max( %a,