diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 6d9c3569550fb..6298c7d5e9ef5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -163,8 +163,6 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() { CurDAG->setRoot(Dummy.getValue()); - MadeChange |= doPeepholeMergeVVMFold(); - // After we're done with everything else, convert IMPLICIT_DEF // passthru operands to NoRegister. This is required to workaround // an optimization deficiency in MachineCSE. This really should @@ -4092,218 +4090,6 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) { return true; } -static bool IsVMerge(SDNode *N) { - return RISCV::getRVVMCOpcode(N->getMachineOpcode()) == RISCV::VMERGE_VVM; -} - -// Try to fold away VMERGE_VVM instructions into their true operands: -// -// %true = PseudoVADD_VV ... -// %x = PseudoVMERGE_VVM %false, %false, %true, %mask -// -> -// %x = PseudoVADD_VV_MASK %false, ..., %mask -// -// We can only fold if vmerge's passthru operand, vmerge's false operand and -// %true's passthru operand (if it has one) are the same. This is because we -// have to consolidate them into one passthru operand in the result. -// -// If %true is masked, then we can use its mask instead of vmerge's if vmerge's -// mask is all ones. -// -// The resulting VL is the minimum of the two VLs. -// -// The resulting policy is the effective policy the vmerge would have had, -// i.e. whether or not it's passthru operand was implicit-def. -bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { - SDValue Passthru, False, True, VL, Mask; - assert(IsVMerge(N)); - Passthru = N->getOperand(0); - False = N->getOperand(1); - True = N->getOperand(2); - Mask = N->getOperand(3); - VL = N->getOperand(4); - - // If the EEW of True is different from vmerge's SEW, then we can't fold. - if (True.getSimpleValueType() != N->getSimpleValueType(0)) - return false; - - // We require that either passthru and false are the same, or that passthru - // is undefined. - if (Passthru != False && !isImplicitDef(Passthru)) - return false; - - assert(True.getResNo() == 0 && - "Expect True is the first output of an instruction."); - - // Need N is the exactly one using True. - if (!True.hasOneUse()) - return false; - - if (!True.isMachineOpcode()) - return false; - - unsigned TrueOpc = True.getMachineOpcode(); - const MCInstrDesc &TrueMCID = TII->get(TrueOpc); - uint64_t TrueTSFlags = TrueMCID.TSFlags; - bool HasTiedDest = RISCVII::isFirstDefTiedToFirstUse(TrueMCID); - - const RISCV::RISCVMaskedPseudoInfo *Info = - RISCV::lookupMaskedIntrinsicByUnmasked(TrueOpc); - if (!Info) - return false; - - // If True has a passthru operand then it needs to be the same as vmerge's - // False, since False will be used for the result's passthru operand. - if (HasTiedDest && !isImplicitDef(True->getOperand(0))) { - SDValue PassthruOpTrue = True->getOperand(0); - if (False != PassthruOpTrue) - return false; - } - - // Skip if True has side effect. - if (TII->get(TrueOpc).hasUnmodeledSideEffects()) - return false; - - unsigned TrueChainOpIdx = True.getNumOperands() - 1; - bool HasChainOp = - True.getOperand(TrueChainOpIdx).getValueType() == MVT::Other; - - if (HasChainOp) { - // Avoid creating cycles in the DAG. We must ensure that none of the other - // operands depend on True through it's Chain. 
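// Illustrative sketch (editor's addition, not part of the patch): because True
// has a single value use (the vmerge), the only way False/Mask/VL can depend on
// True is through True's chain result, e.g.
//
//   True:  v8, ch = PseudoVLE32_V ...        ; load with a chain output
//   False: v9, ch = PseudoVLE32_V ..., True:1 ; ordered after True via the chain
//   N = PseudoVMERGE_VVM False, False, True, Mask
//
// Folding N into a masked load that also takes False as its passthru would make
// the new node both a user of False and, through the inherited chain, one of
// False's predecessors, i.e. a cycle; the worklist walk below rejects that case.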
- SmallVector LoopWorklist; - SmallPtrSet Visited; - LoopWorklist.push_back(False.getNode()); - LoopWorklist.push_back(Mask.getNode()); - LoopWorklist.push_back(VL.getNode()); - if (SDNode::hasPredecessorHelper(True.getNode(), Visited, LoopWorklist)) - return false; - } - - // The vector policy operand may be present for masked intrinsics - bool HasVecPolicyOp = RISCVII::hasVecPolicyOp(TrueTSFlags); - unsigned TrueVLIndex = - True.getNumOperands() - HasVecPolicyOp - HasChainOp - 2; - SDValue TrueVL = True.getOperand(TrueVLIndex); - SDValue SEW = True.getOperand(TrueVLIndex + 1); - - auto GetMinVL = [](SDValue LHS, SDValue RHS) { - if (LHS == RHS) - return LHS; - if (isAllOnesConstant(LHS)) - return RHS; - if (isAllOnesConstant(RHS)) - return LHS; - auto *CLHS = dyn_cast(LHS); - auto *CRHS = dyn_cast(RHS); - if (!CLHS || !CRHS) - return SDValue(); - return CLHS->getZExtValue() <= CRHS->getZExtValue() ? LHS : RHS; - }; - - // Because N and True must have the same passthru operand (or True's operand - // is implicit_def), the "effective" body is the minimum of their VLs. - SDValue OrigVL = VL; - VL = GetMinVL(TrueVL, VL); - if (!VL) - return false; - - // Some operations produce different elementwise results depending on the - // active elements, like viota.m or vredsum. This transformation is illegal - // for these if we change the active elements (i.e. mask or VL). - const MCInstrDesc &TrueBaseMCID = TII->get(RISCV::getRVVMCOpcode(TrueOpc)); - if (RISCVII::elementsDependOnVL(TrueBaseMCID.TSFlags) && (TrueVL != VL)) - return false; - if (RISCVII::elementsDependOnMask(TrueBaseMCID.TSFlags) && - (Mask && !usesAllOnesMask(Mask))) - return false; - - // Make sure it doesn't raise any observable fp exceptions, since changing the - // active elements will affect how fflags is set. - if (mayRaiseFPException(True.getNode()) && !True->getFlags().hasNoFPExcept()) - return false; - - SDLoc DL(N); - - unsigned MaskedOpc = Info->MaskedPseudo; -#ifndef NDEBUG - const MCInstrDesc &MaskedMCID = TII->get(MaskedOpc); - assert(RISCVII::hasVecPolicyOp(MaskedMCID.TSFlags) && - "Expected instructions with mask have policy operand."); - assert(MaskedMCID.getOperandConstraint(MaskedMCID.getNumDefs(), - MCOI::TIED_TO) == 0 && - "Expected instructions with mask have a tied dest."); -#endif - - // Use a tumu policy, relaxing it to tail agnostic provided that the passthru - // operand is undefined. - // - // However, if the VL became smaller than what the vmerge had originally, then - // elements past VL that were previously in the vmerge's body will have moved - // to the tail. In that case we always need to use tail undisturbed to - // preserve them. - bool MergeVLShrunk = VL != OrigVL; - uint64_t Policy = (isImplicitDef(Passthru) && !MergeVLShrunk) - ? RISCVVType::TAIL_AGNOSTIC - : /*TUMU*/ 0; - SDValue PolicyOp = - CurDAG->getTargetConstant(Policy, DL, Subtarget->getXLenVT()); - - - SmallVector Ops; - Ops.push_back(False); - - const bool HasRoundingMode = RISCVII::hasRoundModeOp(TrueTSFlags); - const unsigned NormalOpsEnd = TrueVLIndex - HasRoundingMode; - Ops.append(True->op_begin() + HasTiedDest, True->op_begin() + NormalOpsEnd); - - Ops.push_back(Mask); - - // For unmasked "VOp" with rounding mode operand, that is interfaces like - // (..., rm, vl) or (..., rm, vl, policy). - // Its masked version is (..., vm, rm, vl, policy). 
- // Check the rounding mode pseudo nodes under RISCVInstrInfoVPseudos.td - if (HasRoundingMode) - Ops.push_back(True->getOperand(TrueVLIndex - 1)); - - Ops.append({VL, SEW, PolicyOp}); - - // Result node should have chain operand of True. - if (HasChainOp) - Ops.push_back(True.getOperand(TrueChainOpIdx)); - - MachineSDNode *Result = - CurDAG->getMachineNode(MaskedOpc, DL, True->getVTList(), Ops); - Result->setFlags(True->getFlags()); - - if (!cast(True)->memoperands_empty()) - CurDAG->setNodeMemRefs(Result, cast(True)->memoperands()); - - // Replace vmerge.vvm node by Result. - ReplaceUses(SDValue(N, 0), SDValue(Result, 0)); - - // Replace another value of True. E.g. chain and VL. - for (unsigned Idx = 1; Idx < True->getNumValues(); ++Idx) - ReplaceUses(True.getValue(Idx), SDValue(Result, Idx)); - - return true; -} - -bool RISCVDAGToDAGISel::doPeepholeMergeVVMFold() { - bool MadeChange = false; - SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); - - while (Position != CurDAG->allnodes_begin()) { - SDNode *N = &*--Position; - if (N->use_empty() || !N->isMachineOpcode()) - continue; - - if (IsVMerge(N)) - MadeChange |= performCombineVMergeAndVOps(N); - } - return MadeChange; -} - /// If our passthru is an implicit_def, use noreg instead. This side /// steps issues with MachineCSE not being able to CSE expressions with /// IMPLICIT_DEF operands while preserving the semantic intent. See diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index 65c2220e25822..66d878f037446 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -200,7 +200,6 @@ class RISCVDAGToDAGISel : public SelectionDAGISel { private: bool doPeepholeSExtW(SDNode *Node); bool doPeepholeMaskedRVV(MachineSDNode *Node); - bool doPeepholeMergeVVMFold(); bool doPeepholeNoRegPassThru(); bool performCombineVMergeAndVOps(SDNode *N); bool selectImm64IfCheaper(int64_t Imm, int64_t OrigImm, SDValue N, diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index f7acd676461fb..07907298386c3 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -67,12 +67,13 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool convertSameMaskVMergeToVMv(MachineInstr &MI); bool foldUndefPassthruVMV_V_V(MachineInstr &MI); bool foldVMV_V_V(MachineInstr &MI); + bool foldVMergeToMask(MachineInstr &MI) const; bool hasSameEEW(const MachineInstr &User, const MachineInstr &Src) const; bool isAllOnesMask(const MachineInstr *MaskDef) const; std::optional getConstant(const MachineOperand &VL) const; bool ensureDominates(const MachineOperand &Use, MachineInstr &Src) const; - bool isKnownSameDefs(const MachineOperand &A, const MachineOperand &B) const; + bool isKnownSameDefs(Register A, Register B) const; }; } // namespace @@ -386,13 +387,23 @@ bool RISCVVectorPeephole::convertAllOnesVMergeToVMv(MachineInstr &MI) const { return true; } -bool RISCVVectorPeephole::isKnownSameDefs(const MachineOperand &A, - const MachineOperand &B) const { - if (A.getReg().isPhysical() || B.getReg().isPhysical()) +bool RISCVVectorPeephole::isKnownSameDefs(Register A, Register B) const { + if (A.isPhysical() || B.isPhysical()) return false; - return TRI->lookThruCopyLike(A.getReg(), MRI) == - TRI->lookThruCopyLike(B.getReg(), MRI); + auto LookThruVirtRegCopies = [this](Register Reg) { + while (MachineInstr *Def = MRI->getUniqueVRegDef(Reg)) { + if 
(!Def->isFullCopy()) + break; + Register Src = Def->getOperand(1).getReg(); + if (!Src.isVirtual()) + break; + Reg = Src; + } + return Reg; + }; + + return LookThruVirtRegCopies(A) == LookThruVirtRegCopies(B); } /// If a PseudoVMERGE_VVM's true operand is a masked pseudo and both have the @@ -420,7 +431,7 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) { const MachineOperand &TrueMask = True->getOperand(TrueMaskedInfo->MaskOpIdx + True->getNumExplicitDefs()); const MachineOperand &MIMask = MI.getOperand(4); - if (!isKnownSameDefs(TrueMask, MIMask)) + if (!isKnownSameDefs(TrueMask.getReg(), MIMask.getReg())) return false; // True's passthru needs to be equivalent to False @@ -669,6 +680,133 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { return true; } +/// Try to fold away VMERGE_VVM instructions into their operands: +/// +/// %true = PseudoVADD_VV ... +/// %x = PseudoVMERGE_VVM_M1 %false, %false, %true, %mask +/// -> +/// %x = PseudoVADD_VV_M1_MASK %false, ..., %mask +/// +/// We can only fold if vmerge's passthru operand, vmerge's false operand and +/// %true's passthru operand (if it has one) are the same. This is because we +/// have to consolidate them into one passthru operand in the result. +/// +/// If %true is masked, then we can use its mask instead of vmerge's if vmerge's +/// mask is all ones. +/// +/// The resulting VL is the minimum of the two VLs. +/// +/// The resulting policy is the effective policy the vmerge would have had, +/// i.e. whether or not it's passthru operand was implicit-def. +bool RISCVVectorPeephole::foldVMergeToMask(MachineInstr &MI) const { + if (RISCV::getRVVMCOpcode(MI.getOpcode()) != RISCV::VMERGE_VVM) + return false; + + Register PassthruReg = MI.getOperand(1).getReg(); + Register FalseReg = MI.getOperand(2).getReg(); + Register TrueReg = MI.getOperand(3).getReg(); + if (!TrueReg.isVirtual() || !MRI->hasOneUse(TrueReg)) + return false; + MachineInstr &True = *MRI->getUniqueVRegDef(TrueReg); + if (True.getParent() != MI.getParent()) + return false; + const MachineOperand &MaskOp = MI.getOperand(4); + MachineInstr *Mask = MRI->getUniqueVRegDef(MaskOp.getReg()); + assert(Mask); + + const RISCV::RISCVMaskedPseudoInfo *Info = + RISCV::lookupMaskedIntrinsicByUnmasked(True.getOpcode()); + if (!Info) + return false; + + // If the EEW of True is different from vmerge's SEW, then we can't fold. + if (!hasSameEEW(MI, True)) + return false; + + // We require that either passthru and false are the same, or that passthru + // is undefined. + if (PassthruReg && !isKnownSameDefs(PassthruReg, FalseReg)) + return false; + + // If True has a passthru operand then it needs to be the same as vmerge's + // False, since False will be used for the result's passthru operand. + Register TruePassthru = True.getOperand(True.getNumExplicitDefs()).getReg(); + if (RISCVII::isFirstDefTiedToFirstUse(True.getDesc()) && TruePassthru && + !isKnownSameDefs(TruePassthru, FalseReg)) + return false; + + // Make sure it doesn't raise any observable fp exceptions, since changing the + // active elements will affect how fflags is set. 
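  // Illustrative example (editor's addition, not from the patch): given
  //
  //   %t = PseudoVFDIV_VV %a, %b            ; unmasked, every body lane executes
  //   %x = PseudoVMERGE_VVM %f, %f, %t, %m
  //
  // a divide-by-zero in a lane where %m is 0 still sets fflags.DZ, but after
  // folding to the masked vfdiv that lane becomes inactive and the flag is no
  // longer raised. So the fold is only done when True cannot raise observable
  // FP exceptions, which is what the check below enforces.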
+ if (True.hasUnmodeledSideEffects() || True.mayRaiseFPException()) + return false; + + const MachineOperand &VMergeVL = + MI.getOperand(RISCVII::getVLOpNum(MI.getDesc())); + const MachineOperand &TrueVL = + True.getOperand(RISCVII::getVLOpNum(True.getDesc())); + + MachineOperand MinVL = MachineOperand::CreateImm(0); + if (RISCV::isVLKnownLE(TrueVL, VMergeVL)) + MinVL = TrueVL; + else if (RISCV::isVLKnownLE(VMergeVL, TrueVL)) + MinVL = VMergeVL; + else + return false; + + unsigned RVVTSFlags = + TII->get(RISCV::getRVVMCOpcode(True.getOpcode())).TSFlags; + if (RISCVII::elementsDependOnVL(RVVTSFlags) && !TrueVL.isIdenticalTo(MinVL)) + return false; + if (RISCVII::elementsDependOnMask(RVVTSFlags) && !isAllOnesMask(Mask)) + return false; + + // Use a tumu policy, relaxing it to tail agnostic provided that the passthru + // operand is undefined. + // + // However, if the VL became smaller than what the vmerge had originally, then + // elements past VL that were previously in the vmerge's body will have moved + // to the tail. In that case we always need to use tail undisturbed to + // preserve them. + uint64_t Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED; + if (!PassthruReg && RISCV::isVLKnownLE(VMergeVL, MinVL)) + Policy |= RISCVVType::TAIL_AGNOSTIC; + + assert(RISCVII::hasVecPolicyOp(True.getDesc().TSFlags) && + "Foldable unmasked pseudo should have a policy op already"); + + // Make sure the mask dominates True, otherwise move down True so it does. + // VL will always dominate since if it's a register they need to be the same. + if (!ensureDominates(MaskOp, True)) + return false; + + True.setDesc(TII->get(Info->MaskedPseudo)); + + // Insert the mask operand. + // TODO: Increment MaskOpIdx by number of explicit defs? + True.insert(True.operands_begin() + Info->MaskOpIdx + + True.getNumExplicitDefs(), + MachineOperand::CreateReg(MaskOp.getReg(), false)); + + // Update the passthru, AVL and policy. + True.getOperand(True.getNumExplicitDefs()).setReg(FalseReg); + True.removeOperand(RISCVII::getVLOpNum(True.getDesc())); + True.insert(True.operands_begin() + RISCVII::getVLOpNum(True.getDesc()), + MinVL); + True.getOperand(RISCVII::getVecPolicyOpNum(True.getDesc())).setImm(Policy); + + MRI->replaceRegWith(True.getOperand(0).getReg(), MI.getOperand(0).getReg()); + // Now that True is masked, constrain its operands from vr -> vrnov0. 
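  // Rationale, as an illustration (editor's note): a masked pseudo reads its
  // mask implicitly from v0, e.g.
  //
  //   vmv1r.v v0, v4              # the mask must be materialised in v0
  //   vadd.vv v8, v9, v10, v0.t   # data registers must stay clear of v0
  //
  // so each virtual data operand is re-constrained to whatever no-v0 class the
  // new masked MCInstrDesc demands, queried via getRegClassConstraint below.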
+ for (MachineOperand &MO : True.explicit_operands()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) + continue; + MRI->constrainRegClass( + MO.getReg(), True.getRegClassConstraint(MO.getOperandNo(), TII, TRI)); + } + MI.eraseFromParent(); + + return true; +} + bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -685,6 +823,9 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : make_early_inc_range(MBB)) + Changed |= foldVMergeToMask(MI); + for (MachineInstr &MI : make_early_inc_range(MBB)) { Changed |= convertToVLMAX(MI); Changed |= tryToReduceVL(MI); diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll index 5dc532273b770..0d8aff306252e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll @@ -313,12 +313,12 @@ define i32 @test_nxv128i1( %x) { ; CHECK-NEXT: vslidedown.vx v0, v6, a0 ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v7, a1 -; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v7, a0 ; CHECK-NEXT: vslidedown.vx v5, v6, a0 +; CHECK-NEXT: vslidedown.vx v4, v7, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v4 ; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmv1r.v v0, v5 ; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t @@ -364,9 +364,9 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: vmv1r.v v7, v9 ; CHECK-NEXT: vmv1r.v v5, v8 ; CHECK-NEXT: vmv1r.v v4, v0 -; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a2, a0 @@ -376,7 +376,7 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vmv1r.v v0, v5 -; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add a0, sp, a0 @@ -389,7 +389,7 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: vslidedown.vx v2, v5, a0 ; CHECK-NEXT: vmv.v.v v0, v3 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: mv a3, a2 @@ -399,41 +399,41 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 +; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v3, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 +; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 ; CHECK-NEXT: csrr a2, vlenb ; 
CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v2, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v16, v24, 1, v0 +; CHECK-NEXT: vmerge.vim v24, v8, 1, v0 ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v4, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 +; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v5, a1 -; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v24, v24, 1, v0 -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v6, a1 ; CHECK-NEXT: vslidedown.vx v5, v7, a1 +; CHECK-NEXT: vslidedown.vx v4, v6, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, mu -; CHECK-NEXT: vadd.vi v24, v24, 1, v0.t -; CHECK-NEXT: vmv1r.v v0, v5 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v4 ; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vmv1r.v v0, v5 +; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma @@ -443,7 +443,7 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: vslidedown.vx v0, v4, a1 ; CHECK-NEXT: vslidedown.vx v3, v5, a1 ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu -; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t +; CHECK-NEXT: vadd.vi v24, v24, 1, v0.t ; CHECK-NEXT: vmv1r.v v0, v3 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 @@ -451,7 +451,7 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vadd.vv v8, v8, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 @@ -492,16 +492,16 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vadd.vi v24, v24, 1, v0.t -; CHECK-NEXT: vadd.vv v0, v24, v8 +; CHECK-NEXT: vadd.vv v24, v24, v8 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vadd.vv v8, v8, v24 -; CHECK-NEXT: vadd.vv v16, v0, v16 +; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v24, v16 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vmv.s.x v16, zero ; CHECK-NEXT: vredsum.vs v8, v8, v16 @@ -537,17 +537,18 @@ entry: define i16 @test_narrow_nxv64i1( %x) { ; CHECK-LABEL: test_narrow_nxv64i1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; 
CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu -; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t -; CHECK-NEXT: vmv.s.x v16, zero -; CHECK-NEXT: vredsum.vs v8, v8, v16 +; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v16, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 564e95c43f68a..3c3e08d387faa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -1756,16 +1756,16 @@ define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float ; CHECK-LABEL: buildvec_v8f32_zvl256: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, mu -; CHECK-NEXT: vfmv.v.f v8, fa0 -; CHECK-NEXT: vfmv.v.f v9, fa4 +; CHECK-NEXT: vfmv.v.f v8, fa4 +; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vmv.v.i v0, 15 -; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 -; CHECK-NEXT: vfslide1down.vf v9, v9, fa5 -; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 -; CHECK-NEXT: vfslide1down.vf v9, v9, fa6 -; CHECK-NEXT: vfslide1down.vf v10, v8, fa3 -; CHECK-NEXT: vfslide1down.vf v8, v9, fa7 -; CHECK-NEXT: vslidedown.vi v8, v10, 4, v0.t +; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 +; CHECK-NEXT: vfslide1down.vf v9, v9, fa1 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa6 +; CHECK-NEXT: vfslide1down.vf v9, v9, fa2 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa7 +; CHECK-NEXT: vfslide1down.vf v9, v9, fa3 +; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t ; CHECK-NEXT: ret %v0 = insertelement <8 x float> poison, float %e0, i64 0 %v1 = insertelement <8 x float> %v0, float %e1, i64 1 @@ -1807,16 +1807,16 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d ; CHECK-LABEL: buildvec_v8f64_zvl512: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, mu -; CHECK-NEXT: vfmv.v.f v8, fa0 -; CHECK-NEXT: vfmv.v.f v9, fa4 +; CHECK-NEXT: vfmv.v.f v8, fa4 +; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vmv.v.i v0, 15 -; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 -; CHECK-NEXT: vfslide1down.vf v9, v9, fa5 -; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 -; CHECK-NEXT: vfslide1down.vf v9, v9, fa6 -; CHECK-NEXT: vfslide1down.vf v10, v8, fa3 -; CHECK-NEXT: vfslide1down.vf v8, v9, fa7 -; CHECK-NEXT: vslidedown.vi v8, v10, 4, v0.t +; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 +; CHECK-NEXT: vfslide1down.vf v9, v9, fa1 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa6 +; CHECK-NEXT: vfslide1down.vf v9, v9, fa2 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa7 +; CHECK-NEXT: vfslide1down.vf v9, v9, fa3 +; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t ; CHECK-NEXT: ret %v0 = insertelement <8 x double> poison, double %e0, i64 0 %v1 = insertelement <8 x double> %v0, double %e1, i64 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index f235540cc8ffb..d9bb007a10f71 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -1359,23 +1359,23 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV32-ONLY-NEXT: lbu s0, 14(a0) ; RV32-ONLY-NEXT: lbu a0, 15(a0) ; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV32-ONLY-NEXT: vmv.v.x v8, a1 -; RV32-ONLY-NEXT: vmv.v.x v9, t1 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 -; RV32-ONLY-NEXT: vslide1down.vx v9, 
v9, t2 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t3 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t4 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t5 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a6 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t6 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, s0 -; RV32-ONLY-NEXT: vslide1down.vx v10, v8, t0 -; RV32-ONLY-NEXT: vslide1down.vx v8, v9, a0 -; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV32-ONLY-NEXT: vmv.v.x v8, t1 +; RV32-ONLY-NEXT: vmv.v.x v9, a1 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t2 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a2 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t3 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a3 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t4 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a4 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t5 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a5 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t6 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a6 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, s0 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a7 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t0 +; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV32-ONLY-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-ONLY-NEXT: .cfi_restore s0 ; RV32-ONLY-NEXT: addi sp, sp, 16 @@ -1494,23 +1494,23 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV64V-ONLY-NEXT: lbu s0, 14(a0) ; RV64V-ONLY-NEXT: lbu a0, 15(a0) ; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64V-ONLY-NEXT: vmv.v.x v8, a1 -; RV64V-ONLY-NEXT: vmv.v.x v9, t1 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t2 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t3 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t4 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t5 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a6 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t6 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, s0 -; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, t0 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a0 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64V-ONLY-NEXT: vmv.v.x v8, t1 +; RV64V-ONLY-NEXT: vmv.v.x v9, a1 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t2 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a2 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t3 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a3 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t4 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a4 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t5 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a5 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t6 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a6 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, s0 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a7 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t0 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64V-ONLY-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64V-ONLY-NEXT: .cfi_restore s0 ; RV64V-ONLY-NEXT: addi sp, sp, 16 @@ -1631,23 +1631,23 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV64ZVE32-NEXT: lbu s0, 14(a0) ; RV64ZVE32-NEXT: lbu a0, 15(a0) ; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64ZVE32-NEXT: vmv.v.x 
v8, a1 -; RV64ZVE32-NEXT: vmv.v.x v9, t1 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t2 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t3 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t4 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t5 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t6 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, s0 -; RV64ZVE32-NEXT: vslide1down.vx v10, v8, t0 -; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a0 -; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64ZVE32-NEXT: vmv.v.x v8, t1 +; RV64ZVE32-NEXT: vmv.v.x v9, a1 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t2 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t3 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t4 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t5 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a5 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t6 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a6 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, s0 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t0 +; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64ZVE32-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64ZVE32-NEXT: .cfi_restore s0 ; RV64ZVE32-NEXT: addi sp, sp, 16 @@ -1733,23 +1733,23 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV32-ONLY-NEXT: lbu s0, 124(a0) ; RV32-ONLY-NEXT: lbu a0, 144(a0) ; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV32-ONLY-NEXT: vmv.v.x v8, a1 -; RV32-ONLY-NEXT: vmv.v.x v9, t1 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t5 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t6 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t3 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a6 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, s0 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t4 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a0 -; RV32-ONLY-NEXT: vslide1down.vx v10, v8, t0 -; RV32-ONLY-NEXT: vslide1down.vx v8, v9, t2 -; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV32-ONLY-NEXT: vmv.v.x v8, t1 +; RV32-ONLY-NEXT: vmv.v.x v9, a1 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t5 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a2 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t6 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a3 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t3 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a4 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, s0 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a6 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t4 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a7 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a5 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t2 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t0 +; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV32-ONLY-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-ONLY-NEXT: .cfi_restore s0 ; RV32-ONLY-NEXT: addi sp, sp, 16 @@ -1868,23 +1868,23 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV64V-ONLY-NEXT: lbu s0, 124(a0) ; RV64V-ONLY-NEXT: lbu a0, 
144(a0) ; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64V-ONLY-NEXT: vmv.v.x v8, a1 -; RV64V-ONLY-NEXT: vmv.v.x v9, t1 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t5 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t6 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t3 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a6 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, s0 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t4 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a5 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a0 -; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, t0 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, t2 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64V-ONLY-NEXT: vmv.v.x v8, t1 +; RV64V-ONLY-NEXT: vmv.v.x v9, a1 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t5 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a2 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t6 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a3 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t3 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a4 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, s0 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a6 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t4 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a7 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a5 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t2 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t0 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64V-ONLY-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64V-ONLY-NEXT: .cfi_restore s0 ; RV64V-ONLY-NEXT: addi sp, sp, 16 @@ -2013,23 +2013,23 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV64ZVE32-NEXT: lbu s0, 124(a0) ; RV64ZVE32-NEXT: lbu a0, 144(a0) ; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64ZVE32-NEXT: vmv.v.x v8, a1 -; RV64ZVE32-NEXT: vmv.v.x v9, t1 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t5 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t6 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t3 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, s0 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t4 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a0 -; RV64ZVE32-NEXT: vslide1down.vx v10, v8, t0 -; RV64ZVE32-NEXT: vslide1down.vx v8, v9, t2 -; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64ZVE32-NEXT: vmv.v.x v8, t1 +; RV64ZVE32-NEXT: vmv.v.x v9, a1 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t5 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t6 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t3 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, s0 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a6 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t4 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a5 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t2 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t0 +; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64ZVE32-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64ZVE32-NEXT: .cfi_restore s0 ; RV64ZVE32-NEXT: addi sp, sp, 16 @@ -2505,17 +2505,17 
@@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV32-ONLY-NEXT: lbu t0, 105(a0) ; RV32-ONLY-NEXT: lbu a0, 161(a0) ; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV32-ONLY-NEXT: vmv.v.x v8, a2 -; RV32-ONLY-NEXT: vmv.v.x v9, a6 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a7 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t0 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a1 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a0 -; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a5 -; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 4 -; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV32-ONLY-NEXT: vmv.v.x v8, a6 +; RV32-ONLY-NEXT: vmv.v.x v9, a2 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a3 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t0 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a4 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a1 +; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 4 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a5 +; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v16i8_undef_edges: @@ -2592,17 +2592,17 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV64V-ONLY-NEXT: lbu t0, 105(a0) ; RV64V-ONLY-NEXT: lbu a0, 161(a0) ; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64V-ONLY-NEXT: vmv.v.x v8, a2 -; RV64V-ONLY-NEXT: vmv.v.x v9, a6 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a7 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t0 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a1 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a0 -; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a5 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 4 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64V-ONLY-NEXT: vmv.v.x v8, a6 +; RV64V-ONLY-NEXT: vmv.v.x v9, a2 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a3 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t0 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a4 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a1 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 4 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a5 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v16i8_undef_edges: @@ -2679,17 +2679,17 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV64ZVE32-NEXT: lbu t0, 105(a0) ; RV64ZVE32-NEXT: lbu a0, 161(a0) ; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64ZVE32-NEXT: vmv.v.x v8, a2 -; RV64ZVE32-NEXT: vmv.v.x v9, a6 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a7 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t0 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a1 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a0 -; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a5 -; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 4 -; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64ZVE32-NEXT: vmv.v.x v8, a6 +; RV64ZVE32-NEXT: vmv.v.x v9, a2 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t0 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a1 +; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a5 +; 
RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64ZVE32-NEXT: ret %p4 = getelementptr i8, ptr %p, i32 31 %p5 = getelementptr i8, ptr %p, i32 44 @@ -2740,21 +2740,21 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV32-ONLY-NEXT: lbu t1, 144(a0) ; RV32-ONLY-NEXT: lbu a0, 154(a0) ; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV32-ONLY-NEXT: vmv.v.x v8, a1 -; RV32-ONLY-NEXT: vmv.v.x v9, a6 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a7 +; RV32-ONLY-NEXT: vmv.v.x v8, a6 +; RV32-ONLY-NEXT: vmv.v.x v9, a1 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a2 ; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 2 ; RV32-ONLY-NEXT: vslidedown.vi v9, v9, 2 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t0 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV32-ONLY-NEXT: vslidedown.vi v9, v9, 1 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t0 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a3 ; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 1 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t1 -; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a5 -; RV32-ONLY-NEXT: vslide1down.vx v8, v9, a0 -; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a4 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, t1 +; RV32-ONLY-NEXT: vslidedown.vi v9, v9, 1 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a5 +; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v16i8_loads_undef_scattered: @@ -2834,21 +2834,21 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV64V-ONLY-NEXT: lbu t1, 144(a0) ; RV64V-ONLY-NEXT: lbu a0, 154(a0) ; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64V-ONLY-NEXT: vmv.v.x v8, a1 -; RV64V-ONLY-NEXT: vmv.v.x v9, a6 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a7 +; RV64V-ONLY-NEXT: vmv.v.x v8, a6 +; RV64V-ONLY-NEXT: vmv.v.x v9, a1 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a2 ; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 2 ; RV64V-ONLY-NEXT: vslidedown.vi v9, v9, 2 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t0 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV64V-ONLY-NEXT: vslidedown.vi v9, v9, 1 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t0 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a3 ; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 1 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t1 -; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a5 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a0 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a4 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, t1 +; RV64V-ONLY-NEXT: vslidedown.vi v9, v9, 1 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a5 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v16i8_loads_undef_scattered: @@ -2930,21 +2930,21 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV64ZVE32-NEXT: lbu t1, 144(a0) ; RV64ZVE32-NEXT: lbu a0, 154(a0) ; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; RV64ZVE32-NEXT: vmv.v.x v8, a1 -; RV64ZVE32-NEXT: vmv.v.x v9, a6 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a7 +; RV64ZVE32-NEXT: vmv.v.x v8, a6 +; RV64ZVE32-NEXT: vmv.v.x v9, a1 +; RV64ZVE32-NEXT: 
vslide1down.vx v8, v8, a7 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a2 ; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t0 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t0 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a3 ; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t1 -; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a5 -; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a0 -; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, t1 +; RV64ZVE32-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a5 +; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 8, v0.t ; RV64ZVE32-NEXT: ret %p2 = getelementptr i8, ptr %p, i32 1 %p3 = getelementptr i8, ptr %p, i32 22 @@ -3002,16 +3002,16 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RV32-ONLY-LABEL: buildvec_v8i8_pack: ; RV32-ONLY: # %bb.0: ; RV32-ONLY-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-ONLY-NEXT: vmv.v.x v8, a0 -; RV32-ONLY-NEXT: vmv.v.x v9, a4 +; RV32-ONLY-NEXT: vmv.v.x v8, a4 +; RV32-ONLY-NEXT: vmv.v.x v9, a0 ; RV32-ONLY-NEXT: vmv.v.i v0, 15 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a1 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a5 -; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 -; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a6 -; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a3 -; RV32-ONLY-NEXT: vslide1down.vx v8, v9, a7 -; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a5 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a1 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a6 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a2 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a7 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a3 +; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v8i8_pack: @@ -3055,16 +3055,16 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RV64V-ONLY-LABEL: buildvec_v8i8_pack: ; RV64V-ONLY: # %bb.0: ; RV64V-ONLY-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64V-ONLY-NEXT: vmv.v.x v8, a0 -; RV64V-ONLY-NEXT: vmv.v.x v9, a4 +; RV64V-ONLY-NEXT: vmv.v.x v8, a4 +; RV64V-ONLY-NEXT: vmv.v.x v9, a0 ; RV64V-ONLY-NEXT: vmv.v.i v0, 15 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a1 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a5 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 -; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a6 -; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a3 -; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a7 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a5 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a1 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a6 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a2 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a7 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a3 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v8i8_pack: @@ -3110,16 +3110,16 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RV64ZVE32-LABEL: buildvec_v8i8_pack: ; RV64ZVE32: # %bb.0: ; RV64ZVE32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64ZVE32-NEXT: vmv.v.x v8, a0 -; RV64ZVE32-NEXT: vmv.v.x v9, a4 +; RV64ZVE32-NEXT: vmv.v.x v8, a4 +; 
RV64ZVE32-NEXT: vmv.v.x v9, a0 ; RV64ZVE32-NEXT: vmv.v.i v0, 15 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a1 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a5 -; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a6 -; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a7 -; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a1 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32-NEXT: ret %v1 = insertelement <8 x i8> poison, i8 %e1, i32 0 %v2 = insertelement <8 x i8> %v1, i8 %e2, i32 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll index c29ccd45528b8..3bfe41337a110 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll @@ -280,16 +280,16 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vslide1down.vx v9, v8, a0 -; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: vslide1down.vx v9, v8, a3 +; CHECK-NEXT: li a3, 1 ; CHECK-NEXT: vmv.v.i v0, 15 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: vslide1down.vx v9, v9, zero ; CHECK-NEXT: vslide1down.vx v8, v8, a3 -; CHECK-NEXT: vslide1down.vx v9, v9, a0 -; CHECK-NEXT: vslide1down.vx v8, v8, zero -; CHECK-NEXT: vslide1down.vx v9, v9, a1 -; CHECK-NEXT: vslide1down.vx v8, v8, a2 -; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t -; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vslide1down.vx v9, v9, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vi v9, v8, 4, v0.t +; CHECK-NEXT: vand.vi v8, v9, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret ; @@ -297,16 +297,16 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) { ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; ZVE32F-NEXT: vmv.v.x v8, a0 -; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 -; ZVE32F-NEXT: li a0, 1 +; ZVE32F-NEXT: vslide1down.vx v9, v8, a3 +; ZVE32F-NEXT: li a3, 1 ; ZVE32F-NEXT: vmv.v.i v0, 15 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; ZVE32F-NEXT: vslide1down.vx v9, v9, zero ; ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a0 -; ZVE32F-NEXT: vslide1down.vx v8, v8, zero -; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t -; ZVE32F-NEXT: vand.vi v8, v8, 1 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVE32F-NEXT: vand.vi v8, v9, 1 ; ZVE32F-NEXT: vmsne.vi v0, v8, 0 ; ZVE32F-NEXT: ret %1 = insertelement <8 x i1> poison, i1 %x, i32 0 @@ -325,16 +325,16 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vslide1down.vx v9, v8, a0 -; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: vslide1down.vx v9, v8, a3 +; CHECK-NEXT: li a3, 1 ; CHECK-NEXT: vmv.v.i v0, 15 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 +; CHECK-NEXT: vslide1down.vx v9, v9, zero ; CHECK-NEXT: vslide1down.vx v8, v8, a3 -; 
CHECK-NEXT: vslide1down.vx v9, v9, a0 -; CHECK-NEXT: vslide1down.vx v8, v8, zero -; CHECK-NEXT: vslide1down.vx v9, v9, a1 -; CHECK-NEXT: vslide1down.vx v8, v8, a2 -; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t -; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vslide1down.vx v9, v9, a2 +; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vslidedown.vi v9, v8, 4, v0.t +; CHECK-NEXT: vand.vi v8, v9, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret ; @@ -342,16 +342,16 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 % ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; ZVE32F-NEXT: vmv.v.x v8, a0 -; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 -; ZVE32F-NEXT: li a0, 1 +; ZVE32F-NEXT: vslide1down.vx v9, v8, a3 +; ZVE32F-NEXT: li a3, 1 ; ZVE32F-NEXT: vmv.v.i v0, 15 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; ZVE32F-NEXT: vslide1down.vx v9, v9, zero ; ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a0 -; ZVE32F-NEXT: vslide1down.vx v8, v8, zero -; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t -; ZVE32F-NEXT: vand.vi v8, v8, 1 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVE32F-NEXT: vand.vi v8, v9, 1 ; ZVE32F-NEXT: vmsne.vi v0, v8, 0 ; ZVE32F-NEXT: ret %1 = insertelement <8 x i1> poison, i1 %x, i32 0 @@ -371,14 +371,14 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vmv.v.i v0, 15 -; CHECK-NEXT: vslide1down.vx v9, v8, a0 -; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vslide1down.vx v9, v8, a1 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 -; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t -; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vslidedown.vi v9, v8, 4, v0.t +; CHECK-NEXT: vand.vi v8, v9, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret ; @@ -387,14 +387,14 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize { ; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; ZVE32F-NEXT: vmv.v.x v8, a0 ; ZVE32F-NEXT: vmv.v.i v0, 15 -; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; ZVE32F-NEXT: vslide1down.vx v9, v8, a1 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t -; ZVE32F-NEXT: vand.vi v8, v8, 1 +; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t +; ZVE32F-NEXT: vand.vi v8, v9, 1 ; ZVE32F-NEXT: vmsne.vi v0, v8, 0 ; ZVE32F-NEXT: ret %1 = insertelement <8 x i1> poison, i1 %x, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 533b8b6864ebc..67d55366674f3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -14055,45 +14055,45 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; RV32-NEXT: vmv.x.s t0, v8 ; RV32-NEXT: lbu t1, 0(a1) ; RV32-NEXT: lbu a1, 1(a1) -; RV32-NEXT: lbu t2, 0(a2) -; RV32-NEXT: lbu a2, 1(a2) ; RV32-NEXT: slli a0, a0, 
8 ; RV32-NEXT: or a0, a0, a6 -; RV32-NEXT: lbu a6, 0(a3) -; RV32-NEXT: lbu a3, 1(a3) +; RV32-NEXT: lbu a6, 0(a2) +; RV32-NEXT: lbu a2, 1(a2) ; RV32-NEXT: slli a1, a1, 8 ; RV32-NEXT: or a1, a1, t1 -; RV32-NEXT: lbu t1, 0(a4) -; RV32-NEXT: lbu a4, 1(a4) +; RV32-NEXT: lbu t1, 0(a3) +; RV32-NEXT: lbu a3, 1(a3) ; RV32-NEXT: slli a2, a2, 8 -; RV32-NEXT: or a2, a2, t2 -; RV32-NEXT: lbu t2, 0(a5) -; RV32-NEXT: lbu a5, 1(a5) +; RV32-NEXT: or a2, a2, a6 +; RV32-NEXT: lbu a6, 0(a4) +; RV32-NEXT: lbu a4, 1(a4) ; RV32-NEXT: slli a3, a3, 8 -; RV32-NEXT: or a3, a3, a6 +; RV32-NEXT: or a3, a3, t1 +; RV32-NEXT: lbu t1, 0(a5) +; RV32-NEXT: lbu a5, 1(a5) +; RV32-NEXT: slli a4, a4, 8 +; RV32-NEXT: or a4, a4, a6 ; RV32-NEXT: lbu a6, 0(a7) ; RV32-NEXT: lbu a7, 1(a7) -; RV32-NEXT: slli a4, a4, 8 -; RV32-NEXT: or a4, a4, t1 +; RV32-NEXT: slli a5, a5, 8 +; RV32-NEXT: or a5, a5, t1 ; RV32-NEXT: lbu t1, 0(t0) ; RV32-NEXT: lbu t0, 1(t0) -; RV32-NEXT: slli a5, a5, 8 -; RV32-NEXT: or a5, a5, t2 ; RV32-NEXT: slli a7, a7, 8 ; RV32-NEXT: or a6, a7, a6 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV32-NEXT: vmv.v.i v0, 15 ; RV32-NEXT: slli t0, t0, 8 ; RV32-NEXT: or a7, t0, t1 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: vmv.v.x v9, a4 -; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: vslide1down.vx v9, v9, a5 -; RV32-NEXT: vslide1down.vx v10, v8, a3 +; RV32-NEXT: vslide1down.vx v10, v8, a2 ; RV32-NEXT: vslide1down.vx v8, v9, a6 -; RV32-NEXT: vmv.v.i v0, 15 ; RV32-NEXT: vslide1down.vx v8, v8, a7 -; RV32-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV32-NEXT: vslide1down.vx v9, v10, a3 +; RV32-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_strided_unaligned: @@ -14215,15 +14215,15 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: slli a0, a0, 8 ; RV64ZVE32F-NEXT: or a0, a0, t1 -; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vmv.v.x v9, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vmv.v.x v8, a2 +; RV64ZVE32F-NEXT: vmv.v.x v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14258,15 +14258,15 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a0, 26(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vmv.v.x v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 -; RV64ZVE32F-NEXT: 
vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vmv.v.x v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14303,15 +14303,15 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) { ; RV64ZVE32F-NEXT: lh a0, 30(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vmv.v.x v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vmv.v.x v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14342,21 +14342,21 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a2, 26(a0) ; RV64ZVE32F-NEXT: lh a3, 28(a0) ; RV64ZVE32F-NEXT: lh a4, 30(a0) -; RV64ZVE32F-NEXT: lh a5, 16(a0) -; RV64ZVE32F-NEXT: lh a6, 18(a0) -; RV64ZVE32F-NEXT: lh a7, 20(a0) +; RV64ZVE32F-NEXT: lh a5, 20(a0) +; RV64ZVE32F-NEXT: lh a6, 16(a0) +; RV64ZVE32F-NEXT: lh a7, 18(a0) ; RV64ZVE32F-NEXT: lh a0, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a3 -; RV64ZVE32F-NEXT: vmv.v.x v9, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a0 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vmv.v.x v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14387,21 +14387,21 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a2, 22(a0) ; RV64ZVE32F-NEXT: lh a3, 28(a0) ; RV64ZVE32F-NEXT: lh a4, 30(a0) -; RV64ZVE32F-NEXT: lh a5, 
4(a0) -; RV64ZVE32F-NEXT: lh a6, 6(a0) -; RV64ZVE32F-NEXT: lh a7, 12(a0) +; RV64ZVE32F-NEXT: lh a5, 12(a0) +; RV64ZVE32F-NEXT: lh a6, 4(a0) +; RV64ZVE32F-NEXT: lh a7, 6(a0) ; RV64ZVE32F-NEXT: lh a0, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a3 -; RV64ZVE32F-NEXT: vmv.v.x v9, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a0 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vmv.v.x v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14437,15 +14437,15 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a5 -; RV64ZVE32F-NEXT: vmv.v.x v9, a1 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.v.x v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14484,15 +14484,15 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) { ; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a5 -; RV64ZVE32F-NEXT: vmv.v.x v9, a1 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vmv.v.x v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x 
i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14531,15 +14531,15 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned2(ptr %base) { ; RV64ZVE32F-NEXT: lh a0, 20(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vmv.v.x v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vmv.v.x v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a0 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a3 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a0 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14582,15 +14582,15 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a0, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vmv.v.x v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vmv.v.x v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14630,15 +14630,15 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) { ; RV64ZVE32F-NEXT: lh a0, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vmv.v.x v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vmv.v.x v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x 
i32> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14687,15 +14687,15 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { ; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vmv.v.x v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vmv.v.x v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) @@ -14735,15 +14735,15 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; RV64ZVE32F-NEXT: lh a0, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vmv.v.x v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t +; RV64ZVE32F-NEXT: vmv.v.x v8, a5 +; RV64ZVE32F-NEXT: vmv.v.x v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll index b6267bf481c85..0909c11078ff4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll @@ -1340,26 +1340,26 @@ define <4 x i64> @unzip2a_dual_v4i64(<4 x i64> %a, <4 x i64> %b) { ; ; ZVE32F-LABEL: unzip2a_dual_v4i64: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: ld a3, 0(a2) -; ZVE32F-NEXT: ld a2, 16(a2) -; ZVE32F-NEXT: ld a4, 0(a1) +; ZVE32F-NEXT: ld a3, 0(a1) ; ZVE32F-NEXT: ld a1, 16(a1) +; ZVE32F-NEXT: ld a4, 0(a2) +; ZVE32F-NEXT: ld a2, 16(a2) ; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu ; ZVE32F-NEXT: vmv.v.i v0, 15 -; ZVE32F-NEXT: srli a5, a2, 32 +; ZVE32F-NEXT: srli a5, a1, 32 ; ZVE32F-NEXT: srli a6, a3, 32 -; ZVE32F-NEXT: srli a7, a1, 32 +; ZVE32F-NEXT: srli a7, a2, 32 ; ZVE32F-NEXT: srli t0, a4, 32 ; ZVE32F-NEXT: vmv.v.x v8, a4 ; ZVE32F-NEXT: vmv.v.x v9, a3 ; ZVE32F-NEXT: 
vslide1down.vx v8, v8, t0 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a5 -; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVE32F-NEXT: vse32.v v9, (a0) +; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; ZVE32F-NEXT: vse32.v v8, (a0) ; ZVE32F-NEXT: ret ; ; ZIP-LABEL: unzip2a_dual_v4i64: @@ -1378,9 +1378,9 @@ entry: define <16 x i64> @unzip2a_dual_v16i64(<16 x i64> %a, <16 x i64> %b) { ; V-LABEL: unzip2a_dual_v16i64: ; V: # %bb.0: # %entry -; V-NEXT: lui a0, 5 ; V-NEXT: vsetivli zero, 16, e16, m1, ta, ma ; V-NEXT: vid.v v16 +; V-NEXT: lui a0, 5 ; V-NEXT: addi a0, a0, 1365 ; V-NEXT: vmv.s.x v20, a0 ; V-NEXT: li a0, -256 @@ -1526,26 +1526,26 @@ define <4 x i64> @unzip2a_dual_v4i64_exact(<4 x i64> %a, <4 x i64> %b) vscale_ra ; ; ZVE32F-LABEL: unzip2a_dual_v4i64_exact: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: ld a3, 0(a2) -; ZVE32F-NEXT: ld a2, 16(a2) -; ZVE32F-NEXT: ld a4, 0(a1) +; ZVE32F-NEXT: ld a3, 0(a1) ; ZVE32F-NEXT: ld a1, 16(a1) +; ZVE32F-NEXT: ld a4, 0(a2) +; ZVE32F-NEXT: ld a2, 16(a2) ; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu ; ZVE32F-NEXT: vmv.v.i v0, 15 -; ZVE32F-NEXT: srli a5, a2, 32 +; ZVE32F-NEXT: srli a5, a1, 32 ; ZVE32F-NEXT: srli a6, a3, 32 -; ZVE32F-NEXT: srli a7, a1, 32 +; ZVE32F-NEXT: srli a7, a2, 32 ; ZVE32F-NEXT: srli t0, a4, 32 ; ZVE32F-NEXT: vmv.v.x v8, a4 ; ZVE32F-NEXT: vmv.v.x v9, a3 ; ZVE32F-NEXT: vslide1down.vx v8, v8, t0 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a5 -; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVE32F-NEXT: vs1r.v v9, (a0) +; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; ZVE32F-NEXT: vs1r.v v8, (a0) ; ZVE32F-NEXT: ret ; ; ZIP-LABEL: unzip2a_dual_v4i64_exact: @@ -1574,26 +1574,26 @@ define <4 x i64> @unzip2a_dual_v4i64_exact_nf2(<4 x i64> %a, <4 x i64> %b) vscal ; ; ZVE32F-LABEL: unzip2a_dual_v4i64_exact_nf2: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: ld a3, 0(a2) -; ZVE32F-NEXT: ld a2, 16(a2) -; ZVE32F-NEXT: ld a4, 0(a1) +; ZVE32F-NEXT: ld a3, 0(a1) ; ZVE32F-NEXT: ld a1, 16(a1) +; ZVE32F-NEXT: ld a4, 0(a2) +; ZVE32F-NEXT: ld a2, 16(a2) ; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu ; ZVE32F-NEXT: vmv.v.i v0, 15 -; ZVE32F-NEXT: srli a5, a2, 32 +; ZVE32F-NEXT: srli a5, a1, 32 ; ZVE32F-NEXT: srli a6, a3, 32 -; ZVE32F-NEXT: srli a7, a1, 32 +; ZVE32F-NEXT: srli a7, a2, 32 ; ZVE32F-NEXT: srli t0, a4, 32 ; ZVE32F-NEXT: vmv.v.x v8, a4 ; ZVE32F-NEXT: vmv.v.x v9, a3 ; ZVE32F-NEXT: vslide1down.vx v8, v8, t0 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a5 -; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVE32F-NEXT: vse32.v v9, (a0) +; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; ZVE32F-NEXT: vse32.v v8, (a0) ; ZVE32F-NEXT: ret ; ; ZIP-LABEL: unzip2a_dual_v4i64_exact_nf2: @@ -1651,76 +1651,76 @@ define <16 x i64> @unzip2a_dual_v16i64_exact(<16 x i64> %a, <16 x i64> %b) vscal ; ; ZVE32F-LABEL: unzip2a_dual_v16i64_exact: ; ZVE32F: 
# %bb.0: # %entry -; ZVE32F-NEXT: ld a6, 0(a1) +; ZVE32F-NEXT: ld a5, 96(a2) +; ZVE32F-NEXT: ld a7, 0(a1) ; ZVE32F-NEXT: ld a4, 16(a1) -; ZVE32F-NEXT: ld a7, 32(a1) +; ZVE32F-NEXT: ld t0, 32(a1) ; ZVE32F-NEXT: ld a3, 48(a1) -; ZVE32F-NEXT: ld a5, 80(a1) -; ZVE32F-NEXT: ld t0, 96(a1) ; ZVE32F-NEXT: ld t1, 64(a1) +; ZVE32F-NEXT: ld a6, 80(a1) +; ZVE32F-NEXT: ld t2, 96(a1) ; ZVE32F-NEXT: ld a1, 112(a1) -; ZVE32F-NEXT: srli t2, a6, 32 ; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu -; ZVE32F-NEXT: vmv.v.x v8, t0 +; ZVE32F-NEXT: vmv.v.x v8, a7 +; ZVE32F-NEXT: srli a7, a7, 32 +; ZVE32F-NEXT: vmv.v.x v9, t0 ; ZVE32F-NEXT: srli t0, t0, 32 -; ZVE32F-NEXT: vmv.v.x v9, t1 -; ZVE32F-NEXT: srli t1, t1, 32 -; ZVE32F-NEXT: vmv.v.x v10, a6 -; ZVE32F-NEXT: vslide1down.vx v9, v9, t1 -; ZVE32F-NEXT: vslide1down.vx v8, v8, t0 -; ZVE32F-NEXT: vslide1down.vx v10, v10, t2 -; ZVE32F-NEXT: ld t1, 32(a2) -; ZVE32F-NEXT: ld t0, 16(a2) -; ZVE32F-NEXT: ld t2, 0(a2) -; ZVE32F-NEXT: ld a6, 48(a2) -; ZVE32F-NEXT: vmv.v.x v11, t1 +; ZVE32F-NEXT: vmv.v.x v10, t1 ; ZVE32F-NEXT: srli t1, t1, 32 -; ZVE32F-NEXT: vmv.v.x v12, t2 +; ZVE32F-NEXT: vmv.v.x v11, t2 ; ZVE32F-NEXT: srli t2, t2, 32 -; ZVE32F-NEXT: vmv.v.x v13, a7 -; ZVE32F-NEXT: srli a7, a7, 32 -; ZVE32F-NEXT: vslide1down.vx v13, v13, a7 -; ZVE32F-NEXT: vslide1down.vx v12, v12, t2 -; ZVE32F-NEXT: vslide1down.vx v11, v11, t1 -; ZVE32F-NEXT: ld a7, 64(a2) -; ZVE32F-NEXT: ld t1, 80(a2) -; ZVE32F-NEXT: ld t2, 96(a2) -; ZVE32F-NEXT: ld a2, 112(a2) -; ZVE32F-NEXT: vmv.v.x v14, a7 -; ZVE32F-NEXT: srli a7, a7, 32 -; ZVE32F-NEXT: vslide1down.vx v14, v14, a7 -; ZVE32F-NEXT: vmv.v.x v15, t2 -; ZVE32F-NEXT: srli a7, t2, 32 -; ZVE32F-NEXT: vslide1down.vx v15, v15, a7 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a5 +; ZVE32F-NEXT: vslide1down.vx v11, v11, t2 +; ZVE32F-NEXT: vslide1down.vx v10, v10, t1 +; ZVE32F-NEXT: vslide1down.vx v12, v9, t0 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; ZVE32F-NEXT: ld t0, 0(a2) +; ZVE32F-NEXT: ld t1, 16(a2) +; ZVE32F-NEXT: ld t2, 32(a2) +; ZVE32F-NEXT: ld a7, 48(a2) +; ZVE32F-NEXT: vmv.v.x v9, t0 +; ZVE32F-NEXT: srli t0, t0, 32 +; ZVE32F-NEXT: vmv.v.x v13, t2 +; ZVE32F-NEXT: srli t2, t2, 32 +; ZVE32F-NEXT: vslide1down.vx v13, v13, t2 +; ZVE32F-NEXT: vslide1down.vx v14, v9, t0 +; ZVE32F-NEXT: ld t0, 64(a2) +; ZVE32F-NEXT: ld t2, 112(a2) +; ZVE32F-NEXT: vmv.v.x v9, a5 ; ZVE32F-NEXT: srli a5, a5, 32 +; ZVE32F-NEXT: vslide1down.vx v15, v9, a5 +; ZVE32F-NEXT: ld a2, 80(a2) +; ZVE32F-NEXT: vmv.v.x v9, t0 +; ZVE32F-NEXT: srli a5, t0, 32 ; ZVE32F-NEXT: vslide1down.vx v16, v9, a5 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; ZVE32F-NEXT: vslide1down.vx v9, v11, a1 ; ZVE32F-NEXT: srli a1, a1, 32 -; ZVE32F-NEXT: vslide1down.vx v9, v8, a1 -; ZVE32F-NEXT: vslide1down.vx v8, v10, a4 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 +; ZVE32F-NEXT: vslide1down.vx v10, v10, a6 +; ZVE32F-NEXT: srli a1, a6, 32 +; ZVE32F-NEXT: vslide1down.vx v10, v10, a1 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; ZVE32F-NEXT: srli a4, a4, 32 -; ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; ZVE32F-NEXT: vslide1down.vx v8, v12, t0 -; ZVE32F-NEXT: srli a1, t0, 32 -; ZVE32F-NEXT: vslide1down.vx v12, v8, a1 +; ZVE32F-NEXT: vslide1down.vx v11, v8, a4 ; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslide1down.vx v8, v14, t1 ; ZVE32F-NEXT: srli a1, t1, 32 ; ZVE32F-NEXT: vslide1down.vx v14, v8, a1 -; ZVE32F-NEXT: vslidedown.vi v9, v16, 4, v0.t -; ZVE32F-NEXT: vslide1down.vx v8, v13, a3 +; ZVE32F-NEXT: vslidedown.vi v9, v10, 4, v0.t +; ZVE32F-NEXT: vslide1down.vx v8, v12, a3 ; ZVE32F-NEXT: srli a3, a3, 32 ; 
ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t -; ZVE32F-NEXT: vslide1down.vx v10, v11, a6 -; ZVE32F-NEXT: srli a1, a6, 32 +; ZVE32F-NEXT: vslidedown.vi v8, v11, 4, v0.t +; ZVE32F-NEXT: vslide1down.vx v10, v13, a7 +; ZVE32F-NEXT: srli a1, a7, 32 ; ZVE32F-NEXT: vslide1down.vx v10, v10, a1 -; ZVE32F-NEXT: vslidedown.vi v10, v12, 4, v0.t -; ZVE32F-NEXT: vslide1down.vx v11, v15, a2 +; ZVE32F-NEXT: vslidedown.vi v10, v14, 4, v0.t +; ZVE32F-NEXT: vslide1down.vx v11, v15, t2 +; ZVE32F-NEXT: srli a1, t2, 32 +; ZVE32F-NEXT: vslide1down.vx v11, v11, a1 +; ZVE32F-NEXT: vslide1down.vx v12, v16, a2 ; ZVE32F-NEXT: srli a2, a2, 32 -; ZVE32F-NEXT: vslide1down.vx v11, v11, a2 -; ZVE32F-NEXT: vslidedown.vi v11, v14, 4, v0.t +; ZVE32F-NEXT: vslide1down.vx v12, v12, a2 +; ZVE32F-NEXT: vslidedown.vi v11, v12, 4, v0.t ; ZVE32F-NEXT: vs4r.v v8, (a0) ; ZVE32F-NEXT: ret ; @@ -1751,26 +1751,26 @@ define <4 x i64> @unzip2b_dual_v4i64(<4 x i64> %a, <4 x i64> %b) { ; ; ZVE32F-LABEL: unzip2b_dual_v4i64: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: ld a3, 8(a2) -; ZVE32F-NEXT: ld a2, 24(a2) -; ZVE32F-NEXT: ld a4, 8(a1) +; ZVE32F-NEXT: ld a3, 8(a1) ; ZVE32F-NEXT: ld a1, 24(a1) +; ZVE32F-NEXT: ld a4, 8(a2) +; ZVE32F-NEXT: ld a2, 24(a2) ; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu ; ZVE32F-NEXT: vmv.v.i v0, 15 -; ZVE32F-NEXT: srli a5, a2, 32 +; ZVE32F-NEXT: srli a5, a1, 32 ; ZVE32F-NEXT: srli a6, a3, 32 -; ZVE32F-NEXT: srli a7, a1, 32 +; ZVE32F-NEXT: srli a7, a2, 32 ; ZVE32F-NEXT: srli t0, a4, 32 ; ZVE32F-NEXT: vmv.v.x v8, a4 ; ZVE32F-NEXT: vmv.v.x v9, a3 ; ZVE32F-NEXT: vslide1down.vx v8, v8, t0 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a5 -; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVE32F-NEXT: vse32.v v9, (a0) +; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; ZVE32F-NEXT: vse32.v v8, (a0) ; ZVE32F-NEXT: ret ; ; ZIP-LABEL: unzip2b_dual_v4i64: @@ -1802,26 +1802,26 @@ define <4 x i64> @unzip2b_dual_v4i64_exact(<4 x i64> %a, <4 x i64> %b) vscale_ra ; ; ZVE32F-LABEL: unzip2b_dual_v4i64_exact: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: ld a3, 8(a2) -; ZVE32F-NEXT: ld a2, 24(a2) -; ZVE32F-NEXT: ld a4, 8(a1) +; ZVE32F-NEXT: ld a3, 8(a1) ; ZVE32F-NEXT: ld a1, 24(a1) +; ZVE32F-NEXT: ld a4, 8(a2) +; ZVE32F-NEXT: ld a2, 24(a2) ; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu ; ZVE32F-NEXT: vmv.v.i v0, 15 -; ZVE32F-NEXT: srli a5, a2, 32 +; ZVE32F-NEXT: srli a5, a1, 32 ; ZVE32F-NEXT: srli a6, a3, 32 -; ZVE32F-NEXT: srli a7, a1, 32 +; ZVE32F-NEXT: srli a7, a2, 32 ; ZVE32F-NEXT: srli t0, a4, 32 ; ZVE32F-NEXT: vmv.v.x v8, a4 ; ZVE32F-NEXT: vmv.v.x v9, a3 ; ZVE32F-NEXT: vslide1down.vx v8, v8, t0 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a6 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a2 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a5 -; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t -; ZVE32F-NEXT: vs1r.v v9, (a0) +; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t +; ZVE32F-NEXT: vs1r.v v8, (a0) ; ZVE32F-NEXT: ret ; ; ZIP-LABEL: unzip2b_dual_v4i64_exact: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll index 
917613d5c786f..3718156971919 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll @@ -598,43 +598,54 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; ZIP-NEXT: addi sp, sp, -16 ; ZIP-NEXT: .cfi_def_cfa_offset 16 ; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: slli a0, a0, 5 +; ZIP-NEXT: li a1, 40 +; ZIP-NEXT: mul a0, a0, a1 ; ZIP-NEXT: sub sp, sp, a0 -; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: li a1, 24 -; ZIP-NEXT: mul a0, a0, a1 +; ZIP-NEXT: slli a0, a0, 5 ; ZIP-NEXT: add a0, sp, a0 ; ZIP-NEXT: addi a0, a0, 16 ; ZIP-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; ZIP-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; ZIP-NEXT: vslidedown.vi v24, v8, 16 ; ZIP-NEXT: li a0, 32 +; ZIP-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; ZIP-NEXT: vslidedown.vi v16, v8, 16 ; ZIP-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZIP-NEXT: ri.vzip2a.vv v16, v24, v0 +; ZIP-NEXT: ri.vzip2a.vv v8, v16, v0 ; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: li a2, 24 -; ZIP-NEXT: mul a1, a1, a2 +; ZIP-NEXT: slli a1, a1, 3 ; ZIP-NEXT: add a1, sp, a1 ; ZIP-NEXT: addi a1, a1, 16 -; ZIP-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; ZIP-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a1, a1, 5 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 16 +; ZIP-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; ZIP-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; ZIP-NEXT: vslidedown.vi v24, v24, 16 +; ZIP-NEXT: vslidedown.vi v16, v16, 16 ; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: slli a1, a1, 4 +; ZIP-NEXT: li a2, 24 +; ZIP-NEXT: mul a1, a1, a2 ; ZIP-NEXT: add a1, sp, a1 ; ZIP-NEXT: addi a1, a1, 16 -; ZIP-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; ZIP-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZIP-NEXT: lui a1, 699051 ; ZIP-NEXT: addi a1, a1, -1366 ; ZIP-NEXT: vmv.s.x v0, a1 ; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: slli a1, a1, 3 +; ZIP-NEXT: slli a1, a1, 4 ; ZIP-NEXT: add a1, sp, a1 ; ZIP-NEXT: addi a1, a1, 16 -; ZIP-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; ZIP-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: li a2, 24 +; ZIP-NEXT: mul a1, a1, a2 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 16 +; ZIP-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; ZIP-NEXT: csrr a1, vlenb ; ZIP-NEXT: slli a1, a1, 4 ; ZIP-NEXT: add a1, sp, a1 @@ -646,19 +657,21 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; ZIP-NEXT: addi a1, a1, 16 ; ZIP-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; ZIP-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; ZIP-NEXT: ri.vzip2a.vv v16, v8, v24, v0.t +; ZIP-NEXT: ri.vzip2a.vv v8, v24, v16, v0.t +; ZIP-NEXT: vmv.v.v v24, v8 ; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: li a1, 24 -; ZIP-NEXT: mul a0, a0, a1 +; ZIP-NEXT: slli a0, a0, 5 ; ZIP-NEXT: add a0, sp, a0 ; ZIP-NEXT: addi a0, a0, 16 -; ZIP-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZIP-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: vl8r.v 
v8, (a0) # vscale x 64-byte Folded Reload -; ZIP-NEXT: ri.vzip2a.vv v0, v8, v24 +; ZIP-NEXT: ri.vzip2a.vv v0, v8, v16 ; ZIP-NEXT: vmv.v.v v8, v0 +; ZIP-NEXT: vmv.v.v v16, v24 ; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: slli a0, a0, 5 +; ZIP-NEXT: li a1, 40 +; ZIP-NEXT: mul a0, a0, a1 ; ZIP-NEXT: add sp, sp, a0 ; ZIP-NEXT: .cfi_def_cfa sp, 16 ; ZIP-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll index 8676803e20e3b..f5216d82c81ba 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll @@ -195,15 +195,15 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) { ; RV32-NEXT: lui a0, %hi(.LCPI12_0) ; RV32-NEXT: addi a0, a0, %lo(.LCPI12_0) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vmv.v.i v16, -1 ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: lui a0, %hi(.LCPI12_1) ; RV32-NEXT: addi a0, a0, %lo(.LCPI12_1) ; RV32-NEXT: vle16.v v21, (a0) +; RV32-NEXT: vmv.v.i v16, -1 ; RV32-NEXT: li a0, 113 ; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vrgatherei16.vv v12, v16, v20 -; RV32-NEXT: vrgatherei16.vv v12, v8, v21, v0.t +; RV32-NEXT: vrgatherei16.vv v12, v16, v21 +; RV32-NEXT: vrgatherei16.vv v12, v8, v20, v0.t ; RV32-NEXT: vmv.v.v v8, v12 ; RV32-NEXT: ret ; @@ -227,12 +227,12 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) { define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) { ; RV32-LABEL: vrgather_shuffle_vx_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI13_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI13_0) -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vle16.v v16, (a0) ; RV32-NEXT: lui a0, %hi(.LCPI13_1) ; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1) +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32-NEXT: vle16.v v16, (a0) +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI13_0) ; RV32-NEXT: vle16.v v17, (a0) ; RV32-NEXT: li a0, 140 ; RV32-NEXT: vmv.s.x v0, a0 @@ -388,14 +388,13 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @splat_ve2_we0_ins_i0we4(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: splat_ve2_we0_ins_i0we4: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vrgather.vi v10, v8, 2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v11, 4 ; CHECK-NEXT: li a0, 67 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t +; CHECK-NEXT: vrgather.vi v10, v8, 2 +; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll deleted file mode 100644 index 0c058b562f53d..0000000000000 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll +++ /dev/null @@ -1,45 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc < %s -mtriple=riscv64 -mattr=+v -stop-after=finalize-isel | FileCheck %s - -declare @llvm.vp.merge.nxv2i32(, , , i32) -declare @llvm.vp.select.nxv2i32(, , , i32) -declare @llvm.vp.load.nxv2i32.p0(ptr, , i32) - -; Test result has chain output of true operand of merge.vvm. 
-define void @vpmerge_vpload_store( %passthru, ptr %p, %m, i32 zeroext %vl) { - ; CHECK-LABEL: name: vpmerge_vpload_store - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $v8, $x10, $v0, $x11 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x11 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x10 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]] - ; CHECK-NEXT: [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size from %ir.p, align 8) - ; CHECK-NEXT: PseudoVSE32_V_M1 killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]], -1, 5 /* e32 */ :: (store () into %ir.p) - ; CHECK-NEXT: PseudoRET - %a = call @llvm.vp.load.nxv2i32.p0(ptr %p, splat (i1 -1), i32 %vl) - %b = call @llvm.vp.merge.nxv2i32( %m, %a, %passthru, i32 %vl) - store %b, ptr %p - ret void -} - -define void @vpselect_vpload_store( %passthru, ptr %p, %m, i32 zeroext %vl) { - ; CHECK-LABEL: name: vpselect_vpload_store - ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $v8, $x10, $v0, $x11 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x11 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x10 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]] - ; CHECK-NEXT: [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 5 /* e32 */, 1 /* ta, mu */ :: (load unknown-size from %ir.p, align 8) - ; CHECK-NEXT: PseudoVSE32_V_M1 killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]], -1, 5 /* e32 */ :: (store () into %ir.p) - ; CHECK-NEXT: PseudoRET - %a = call @llvm.vp.load.nxv2i32.p0(ptr %p, splat (i1 -1), i32 %vl) - %b = call @llvm.vp.select.nxv2i32( %m, %a, %passthru, i32 %vl) - store %b, ptr %p - ret void -} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 4753ab915bdf3..5c04a09c9953b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -141,28 +141,26 @@ define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) { define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec) { ; V-LABEL: vector_deinterleave_v8i64_v16i64: ; V: # %bb.0: -; V-NEXT: li a0, 85 ; V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; V-NEXT: vmv.v.i v0, -16 ; V-NEXT: vid.v v16 ; V-NEXT: vsetivli zero, 8, e64, m8, ta, ma ; V-NEXT: vslidedown.vi v24, v8, 8 +; V-NEXT: li a0, 85 +; V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; V-NEXT: vmv.v.i v0, -16 ; V-NEXT: vmv.s.x v12, a0 ; V-NEXT: li a0, 170 -; V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; V-NEXT: vadd.vv v20, v16, v16 -; V-NEXT: vmv.s.x v21, a0 +; V-NEXT: vadd.vv v13, v16, v16 +; V-NEXT: vmv.s.x v20, a0 +; V-NEXT: vadd.vi v21, v13, -8 ; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; V-NEXT: vcompress.vm v16, v8, v12 ; V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; V-NEXT: vadd.vi v22, v20, -8 -; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; V-NEXT: vcompress.vm v12, v8, v21 -; V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; V-NEXT: vadd.vi v8, v20, -7 +; V-NEXT: vadd.vi v22, v13, -7 ; V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; V-NEXT: vrgatherei16.vv v16, v24, v22, v0.t -; V-NEXT: vrgatherei16.vv v12, v24, v8, v0.t +; V-NEXT: 
vcompress.vm v12, v8, v20 +; V-NEXT: vrgatherei16.vv v16, v24, v21, v0.t +; V-NEXT: vrgatherei16.vv v12, v24, v22, v0.t ; V-NEXT: vmv.v.v v8, v16 ; V-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir new file mode 100644 index 0000000000000..03204468dc14c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir @@ -0,0 +1,57 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v -run-pass=riscv-vector-peephole -verify-machineinstrs | FileCheck %s + +--- +name: vle32 +body: | + bb.0: + liveins: $x8, $v0, $v8 + ; CHECK-LABEL: name: vle32 + ; CHECK: liveins: $x8, $v0, $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %avl:gprnox0 = COPY $x8 + ; CHECK-NEXT: %passthru:vrnov0 = COPY $v8 + ; CHECK-NEXT: %mask:vmv0 = COPY $v0 + ; CHECK-NEXT: %y:vrnov0 = PseudoVLE32_V_M1_MASK %passthru, $noreg, %mask, %avl, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size, align 1) + %avl:gprnox0 = COPY $x8 + %passthru:vrnov0 = COPY $v8 + %x:vr = PseudoVLE32_V_M1 $noreg, $noreg, %avl, 5 /* e32 */, 2 /* tu, ma */ :: (load unknown-size) + %mask:vmv0 = COPY $v0 + %y:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %x, %mask, %avl, 5 /* e32 */ +... +--- +name: vle32_no_passthru +body: | + bb.0: + liveins: $x8, $v0, $v8 + ; CHECK-LABEL: name: vle32_no_passthru + ; CHECK: liveins: $x8, $v0, $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %avl:gprnox0 = COPY $x8 + ; CHECK-NEXT: %false:vrnov0 = COPY $v8 + ; CHECK-NEXT: %mask:vmv0 = COPY $v0 + ; CHECK-NEXT: %y:vrnov0 = PseudoVLE32_V_M1_MASK %false, $noreg, %mask, %avl, 5 /* e32 */, 1 /* ta, mu */ :: (load unknown-size, align 1) + %avl:gprnox0 = COPY $x8 + %false:vr = COPY $v8 + %x:vr = PseudoVLE32_V_M1 $noreg, $noreg, %avl, 5 /* e32 */, 2 /* tu, ma */ :: (load unknown-size) + %mask:vmv0 = COPY $v0 + %y:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %x, %mask, %avl, 5 /* e32 */ +... +--- +name: vle32_move_past_passthru +body: | + bb.0: + liveins: $x8, $v0, $v8 + ; CHECK-LABEL: name: vle32_move_past_passthru + ; CHECK: liveins: $x8, $v0, $v8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %avl:gprnox0 = COPY $x8 + ; CHECK-NEXT: %passthru:vrnov0 = COPY $v8 + ; CHECK-NEXT: %mask:vmv0 = COPY $v0 + ; CHECK-NEXT: %y:vrnov0 = PseudoVLE32_V_M1_MASK %passthru, $noreg, %mask, %avl, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size, align 1) + %avl:gprnox0 = COPY $x8 + %x:vr = PseudoVLE32_V_M1 $noreg, $noreg, %avl, 5 /* e32 */, 2 /* tu, ma */ :: (load unknown-size) + %passthru:vrnov0 = COPY $v8 + %mask:vmv0 = COPY $v0 + %y:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, %x, %mask, %avl, 5 /* e32 */ +... 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll index f94e46771f49c..c8b882b92b934 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll @@ -1540,12 +1540,11 @@ define @vwadd_vx_splat_zext_i1( %va, i16 %b) ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 -; RV64-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; RV64-NEXT: vmv.v.x v12, a0 -; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: li a0, 1 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: li a0, 1 ; RV64-NEXT: vwaddu.vx v8, v12, a0, v0.t ; RV64-NEXT: ret %zb = zext i16 %b to i32 @@ -1615,12 +1614,11 @@ define @vwadd_vx_splat_sext_i1( %va, i16 %b) ; RV64: # %bb.0: ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srai a0, a0, 48 -; RV64-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; RV64-NEXT: vmv.v.x v12, a0 -; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: li a0, 1 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: li a0, 1 ; RV64-NEXT: vwsub.vx v8, v12, a0, v0.t ; RV64-NEXT: ret %sb = sext i16 %b to i32