diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 5cc612e89162a..48a781b64de5e 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1508,6 +1508,9 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
   switch (I->getOpcode()) {
   default:
     return false;
+  case AArch64::PTRUE_C_B:
+  case AArch64::LD1B_2Z_IMM:
+  case AArch64::ST1B_2Z_IMM:
   case AArch64::STR_ZXI:
   case AArch64::STR_PXI:
   case AArch64::LDR_ZXI:
@@ -2781,6 +2784,16 @@ struct RegPairInfo {
 
 } // end anonymous namespace
 
+static unsigned findFreePredicateReg(const BitVector &SavedRegs) {
+  for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
+    if (SavedRegs.test(PReg)) { // Already marked for saving; reuse it.
+      unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0; // Pn -> PNn
+      return PNReg;
+    }
+  }
+  return AArch64::NoRegister;
+}
+
 static void computeCalleeSaveRegisterPairs(
     MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
     const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -2859,7 +2872,11 @@ static void computeCalleeSaveRegisterPairs(
           RPI.Reg2 = NextReg;
         break;
       case RegPairInfo::PPR:
+        break;
       case RegPairInfo::ZPR:
+        if (AFI->getPredicateRegForFillSpill() != 0)
+          if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
+            RPI.Reg2 = NextReg;
         break;
       }
     }
@@ -2897,14 +2914,13 @@ static void computeCalleeSaveRegisterPairs(
     if (NeedsWinCFI &&
         RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
       RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
-
     int Scale = RPI.getScale();
 
     int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
     assert(OffsetPre % Scale == 0);
 
     if (RPI.isScalable())
-      ScalableByteOffset += StackFillDir * Scale;
+      ScalableByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
     else
       ByteOffset += StackFillDir * (RPI.isPaired() ?
2 * Scale : Scale);
 
@@ -2915,9 +2931,6 @@ static void computeCalleeSaveRegisterPairs(
          (IsWindows && RPI.Reg2 == AArch64::LR)))
       ByteOffset += StackFillDir * 8;
 
-    assert(!(RPI.isScalable() && RPI.isPaired()) &&
-           "Paired spill/fill instructions don't exist for SVE vectors");
-
     // Round up size of non-pair to pair size if we need to pad the
     // callee-save area to ensure 16-byte alignment.
     if (NeedGapToAlignStack && !NeedsWinCFI &&
@@ -3004,6 +3017,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     }
     return true;
   }
+  bool PTrueCreated = false;
   for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
     unsigned Reg1 = RPI.Reg1;
     unsigned Reg2 = RPI.Reg2;
@@ -3038,10 +3052,10 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       Alignment = Align(16);
       break;
     case RegPairInfo::ZPR:
-      StrOpc = AArch64::STR_ZXI;
-      Size = 16;
-      Alignment = Align(16);
-      break;
+      StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
+      Size = 16;
+      Alignment = Align(16);
+      break;
     case RegPairInfo::PPR:
       StrOpc = AArch64::STR_PXI;
       Size = 2;
@@ -3065,33 +3079,79 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       std::swap(Reg1, Reg2);
       std::swap(FrameIdxReg1, FrameIdxReg2);
     }
-    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
-    if (!MRI.isReserved(Reg1))
-      MBB.addLiveIn(Reg1);
-    if (RPI.isPaired()) {
+
+    if (RPI.isPaired() && RPI.isScalable()) {
+      const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+      AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+      unsigned PnReg = AFI->getPredicateRegForFillSpill();
+      assert(((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && PnReg != 0) &&
+             "Expects SVE2.1 or SME2 target and a predicate register");
+#ifdef EXPENSIVE_CHECKS
+      auto IsPPR = [](const RegPairInfo &c) {
+        return c.Type == RegPairInfo::PPR;
+      };
+      auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
+      auto IsZPR = [](const RegPairInfo &c) {
+        return c.Type == RegPairInfo::ZPR;
+      };
+      auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
+      assert(!(PPRBegin < ZPRBegin) &&
+             "Expected callee save predicate to be handled first");
+#endif
+      if (!PTrueCreated) {
+        PTrueCreated = true;
+        BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+            .setMIFlags(MachineInstr::FrameSetup);
+      }
+      MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+      if (!MRI.isReserved(Reg1))
+        MBB.addLiveIn(Reg1);
       if (!MRI.isReserved(Reg2))
         MBB.addLiveIn(Reg2);
-      MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+      MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0));
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
           MachineMemOperand::MOStore, Size, Alignment));
+      MIB.addReg(PnReg);
+      MIB.addReg(AArch64::SP)
+          .addImm(RPI.Offset) // [sp, #offset*scale],
+                              // where factor*scale is implicit
+          .setMIFlag(MachineInstr::FrameSetup);
+      MIB.addMemOperand(MF.getMachineMemOperand(
+          MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+          MachineMemOperand::MOStore, Size, Alignment));
+      if (NeedsWinCFI)
+        InsertSEH(MIB, TII, MachineInstr::FrameSetup);
+    } else { // The code when the pair of ZReg is not present
+      MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+      if (!MRI.isReserved(Reg1))
+        MBB.addLiveIn(Reg1);
+      if (RPI.isPaired()) {
+        if (!MRI.isReserved(Reg2))
+          MBB.addLiveIn(Reg2);
+        MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+        MIB.addMemOperand(MF.getMachineMemOperand(
+            MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
+            MachineMemOperand::MOStore, Size, Alignment));
+      }
+      MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
+          .addReg(AArch64::SP)
+          .addImm(RPI.Offset) // [sp, #offset*scale],
+                              // where factor*scale is implicit
+          .setMIFlag(MachineInstr::FrameSetup);
+      MIB.addMemOperand(MF.getMachineMemOperand(
+          MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+          MachineMemOperand::MOStore, Size, Alignment));
+      if (NeedsWinCFI)
+        InsertSEH(MIB, TII, MachineInstr::FrameSetup);
     }
-    MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
-        .addReg(AArch64::SP)
-        .addImm(RPI.Offset) // [sp, #offset*scale],
-                            // where factor*scale is implicit
-        .setMIFlag(MachineInstr::FrameSetup);
-    MIB.addMemOperand(MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
-        MachineMemOperand::MOStore, Size, Alignment));
-    if (NeedsWinCFI)
-      InsertSEH(MIB, TII, MachineInstr::FrameSetup);
-
     // Update the StackIDs of the SVE stack slots.
     MachineFrameInfo &MFI = MF.getFrameInfo();
-    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
-      MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
-
+    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
+      MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector);
+      if (RPI.isPaired())
+        MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
+    }
   }
   return true;
 }
@@ -3109,7 +3169,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     DL = MBBI->getDebugLoc();
 
   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
-
   if (homogeneousPrologEpilog(MF, &MBB)) {
     auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
                    .setMIFlag(MachineInstr::FrameDestroy);
@@ -3130,6 +3189,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
   auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR);
   std::reverse(ZPRBegin, ZPREnd);
 
+  bool PTrueCreated = false;
   for (const RegPairInfo &RPI : RegPairs) {
     unsigned Reg1 = RPI.Reg1;
     unsigned Reg2 = RPI.Reg2;
@@ -3162,7 +3222,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
       Alignment = Align(16);
       break;
     case RegPairInfo::ZPR:
-      LdrOpc = AArch64::LDR_ZXI;
+      LdrOpc = RPI.isPaired() ?
AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
       Size = 16;
       Alignment = Align(16);
       break;
@@ -3187,25 +3247,58 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
       std::swap(Reg1, Reg2);
       std::swap(FrameIdxReg1, FrameIdxReg2);
     }
-    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
-    if (RPI.isPaired()) {
-      MIB.addReg(Reg2, getDefRegState(true));
+
+    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+    if (RPI.isPaired() && RPI.isScalable()) {
+      const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+      unsigned PnReg = AFI->getPredicateRegForFillSpill();
+      assert(((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && PnReg != 0) &&
+             "Expects SVE2.1 or SME2 target and a predicate register");
+#ifdef EXPENSIVE_CHECKS
+      assert(!(PPRBegin < ZPRBegin) &&
+             "Expected callee save predicate to be handled first");
+#endif
+      if (!PTrueCreated) {
+        PTrueCreated = true;
+        BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+            .setMIFlags(MachineInstr::FrameDestroy);
+      }
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
+      MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0),
+                 getDefRegState(true));
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
           MachineMemOperand::MOLoad, Size, Alignment));
+      MIB.addReg(PnReg);
+      MIB.addReg(AArch64::SP)
+          .addImm(RPI.Offset) // [sp, #offset*scale]
+                              // where factor*scale is implicit
+          .setMIFlag(MachineInstr::FrameDestroy);
+      MIB.addMemOperand(MF.getMachineMemOperand(
+          MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+          MachineMemOperand::MOLoad, Size, Alignment));
+      if (NeedsWinCFI)
+        InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
+    } else {
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
+      if (RPI.isPaired()) {
+        MIB.addReg(Reg2, getDefRegState(true));
+        MIB.addMemOperand(MF.getMachineMemOperand(
+            MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
+            MachineMemOperand::MOLoad, Size, Alignment));
+      }
+      MIB.addReg(Reg1, getDefRegState(true));
+      MIB.addReg(AArch64::SP)
+          .addImm(RPI.Offset) // [sp, #offset*scale]
+                              // where factor*scale is implicit
+          .setMIFlag(MachineInstr::FrameDestroy);
+      MIB.addMemOperand(MF.getMachineMemOperand(
+          MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+          MachineMemOperand::MOLoad, Size, Alignment));
+      if (NeedsWinCFI)
+        InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
     }
-    MIB.addReg(Reg1, getDefRegState(true))
-        .addReg(AArch64::SP)
-        .addImm(RPI.Offset) // [sp, #offset*scale]
-                            // where factor*scale is implicit
-        .setMIFlag(MachineInstr::FrameDestroy);
-    MIB.addMemOperand(MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
-        MachineMemOperand::MOLoad, Size, Alignment));
-    if (NeedsWinCFI)
-      InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
   }
-
   return true;
 }
 
@@ -3234,6 +3327,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
 
   unsigned ExtraCSSpill = 0;
   bool HasUnpairedGPR64 = false;
+  bool HasPairZReg = false;
   // Figure out which callee-saved registers to save/restore.
   for (unsigned i = 0; CSRegs[i]; ++i) {
     const unsigned Reg = CSRegs[i];
@@ -3287,6 +3381,28 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
           !RegInfo->isReservedReg(MF, PairedReg))
         ExtraCSSpill = PairedReg;
     }
+    // Check if there is a pair of ZRegs, so it can select PReg for spill/fill
+    HasPairZReg |= (AArch64::ZPRRegClass.contains(Reg, CSRegs[i ^ 1]) &&
+                    SavedRegs.test(CSRegs[i ^ 1]));
+  }
+
+  if (HasPairZReg && (Subtarget.hasSVE2p1() || Subtarget.hasSME2())) {
+    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+    // Find a suitable predicate register for the multi-vector spill/fill
+    // instructions.
+    unsigned PnReg = findFreePredicateReg(SavedRegs);
+    if (PnReg != AArch64::NoRegister)
+      AFI->setPredicateRegForFillSpill(PnReg);
+    // If no free callee-save has been found assign one.
+    if (!AFI->getPredicateRegForFillSpill() &&
+        MF.getFunction().getCallingConv() ==
+            CallingConv::AArch64_SVE_VectorCall) {
+      SavedRegs.set(AArch64::P8);
+      AFI->setPredicateRegForFillSpill(AArch64::PN8);
+    }
+
+    assert(!RegInfo->isReservedReg(MF, AFI->getPredicateRegForFillSpill()) &&
+           "Predicate cannot be a reserved register");
   }
 
   if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d5941e6284111..df09fc5592edf 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -212,6 +212,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   // on function entry to record the initial pstate of a function.
   Register PStateSMReg = MCRegister::NoRegister;
 
+  // Holds the PN register used to build the PTRUE instruction.
+  // The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
+  unsigned PredicateRegForFillSpill = 0;
+
 public:
   AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
 
@@ -220,6 +224,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
         const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
       const override;
 
+  void setPredicateRegForFillSpill(unsigned Reg) {
+    PredicateRegForFillSpill = Reg;
+  }
+  unsigned getPredicateRegForFillSpill() const {
+    return PredicateRegForFillSpill;
+  }
+
   Register getPStateSMReg() const { return PStateSMReg; };
   void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
 
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index ea7808d73093e..3a94b0333e267 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -55,45 +55,31 @@ define @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i8_z0_z8_scalar( %unused,
@ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte 
Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, 
mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -569,20 +477,14 @@ define @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; 
CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -595,21 +497,15 @@ define @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, 
z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -677,45 +573,31 @@ define @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8_scalar( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, 
#7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: 
ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -880,20 +736,14 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, 
#13, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -906,21 +756,15 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte 
Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -988,45 +832,31 @@ define @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8_scalar( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, 
x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte 
Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -1191,20 +995,14 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; 
CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1217,21 +1015,15 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { 
z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1301,46 +1093,32 @@ define @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte 
Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; 
STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1516,19 +1270,14 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 
32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1545,20 +1294,15 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, 
pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1632,46 +1376,32 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, 
#7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte 
Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1682,19 +1412,14 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1711,20 +1436,15 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte 
Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1798,46 +1518,32 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h 
}, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { 
z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1848,19 +1554,14 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, 
[sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1877,20 +1578,15 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; 
CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1964,46 +1660,32 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: 
str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, 
#16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2014,19 +1696,14 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, 
#13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2043,20 +1720,15 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, 
[sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2130,46 +1802,32 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; 
STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; 
STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2180,19 +1838,14 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; 
CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2209,20 +1862,15 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul 
vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2296,46 +1944,32 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, 
mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte 
Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2346,19 +1980,14 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, 
z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2375,20 +2004,15 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; 
CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2462,46 +2086,32 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; 
STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; 
STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2512,19 +2122,14 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2541,20 +2146,15 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte 
Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll index 7e2d28fbf7982..8ecb7c858c6a2 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll @@ -8,45 +8,31 @@ define 
@ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str 
z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -164,20 +124,14 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, 
z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -190,21 +144,15 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] 
// 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -225,45 +173,31 @@ define @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8_scalar( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded 
Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 
16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -381,20 +289,14 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte 
Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -407,21 +309,15 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; 
CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -442,45 +338,31 @@ define @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, 
[x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 
32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -598,20 +454,14 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; 
CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -624,21 +474,15 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { 
z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -659,45 +503,31 @@ define @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; 
STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 
16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -815,20 +619,14 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { 
z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -841,21 +639,15 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 
16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -876,46 +668,32 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 
16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: 
ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -926,19 +704,14 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -955,20 +728,15 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte 
Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -992,46 +760,32 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b 
}, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b 
}, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1042,19 +796,14 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, 
mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1071,20 +820,15 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; 
CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1109,46 +853,32 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: 
str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, 
#16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1159,19 +889,14 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, 
#13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1188,20 +913,15 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, 
[sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1225,46 +945,32 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; 
STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; 
STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1275,19 +981,14 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; 
CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1304,20 +1005,15 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul 
vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1342,46 +1038,32 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, 
mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 
16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1392,19 +1074,14 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { 
z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1421,20 +1098,15 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload 
+; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1458,46 +1130,32 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; 
STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; 
STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1508,19 +1166,14 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1537,20 +1190,15 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte 
Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1575,46 +1223,32 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d 
}, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, 
pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1625,19 +1259,14 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul 
vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1654,20 +1283,15 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: 
ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1691,46 +1315,32 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b -; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z9, [sp, #15, 
mul vl] // 16-byte Folded Spill -; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ptrue pn8.b +; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; STRIDED-NEXT: ldr z8, [sp, #16, 
mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1741,19 +1351,14 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: str z10, [sp, #13, 
mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1770,20 +1375,15 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ptrue pn8.b ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload +; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, 
#24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll new file mode 100644 index 0000000000000..c62016d8ea01a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll @@ -0,0 +1,556 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=NOPAIR +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=PAIR +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=PAIR + + +declare void @my_func() + +define void @fbyte( %v) { +; NOPAIR-LABEL: fbyte: +; NOPAIR: // %bb.0: +; NOPAIR-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; NOPAIR-NEXT: addvl sp, sp, #-18 +; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 
0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG +; NOPAIR-NEXT: .cfi_offset w30, -8 +; NOPAIR-NEXT: .cfi_offset w29, -16 +; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; NOPAIR-NEXT: bl my_func +; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z12, [sp, #13, mul vl] // 
16-byte Folded Reload +; NOPAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: addvl sp, sp, #18 +; NOPAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; NOPAIR-NEXT: ret +; +; PAIR-LABEL: fbyte: +; PAIR: // %bb.0: +; PAIR-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; PAIR-NEXT: addvl sp, sp, #-18 +; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG +; PAIR-NEXT: .cfi_offset w30, -8 +; PAIR-NEXT: .cfi_offset w29, -16 +; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; PAIR-NEXT: .cfi_escape 
0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; PAIR-NEXT: bl my_func +; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p8, [sp, #11, mul vl] // 
2-byte Folded Reload +; PAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: addvl sp, sp, #18 +; PAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; PAIR-NEXT: ret + call void @my_func() + ret void +} + +define void @fhalf( %v) { +; NOPAIR-LABEL: fhalf: +; NOPAIR: // %bb.0: +; NOPAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; NOPAIR-NEXT: addvl sp, sp, #-18 +; NOPAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str 
z14, [sp, #11, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG +; NOPAIR-NEXT: .cfi_offset w30, -8 +; NOPAIR-NEXT: .cfi_offset w29, -16 +; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; NOPAIR-NEXT: bl my_func +; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z20, [sp, #5, mul vl] 
// 16-byte Folded Reload +; NOPAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: addvl sp, sp, #18 +; NOPAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; NOPAIR-NEXT: ret +; +; PAIR-LABEL: fhalf: +; PAIR: // %bb.0: +; PAIR-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; PAIR-NEXT: addvl sp, sp, #-18 +; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG +; PAIR-NEXT: .cfi_offset w30, -8 +; PAIR-NEXT: .cfi_offset w29, -16 +; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; PAIR-NEXT: .cfi_escape 
0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; PAIR-NEXT: bl my_func +; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p8, [sp, #11, mul vl] // 
2-byte Folded Reload +; PAIR-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: addvl sp, sp, #18 +; PAIR-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; PAIR-NEXT: ret + call void @my_func() + ret void +} + +;; Do NOT group Z10 +;; DO group Z8 and Z9 and save P8 +define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() { +; NOPAIR-LABEL: test_clobbers_z_p_regs: +; NOPAIR: // %bb.0: +; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; NOPAIR-NEXT: addvl sp, sp, #-4 +; NOPAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; NOPAIR-NEXT: .cfi_offset w29, -16 +; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; NOPAIR-NEXT: //APP +; NOPAIR-NEXT: //NO_APP +; NOPAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: addvl sp, 
sp, #4 +; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; NOPAIR-NEXT: ret +; +; PAIR-LABEL: test_clobbers_z_p_regs: +; PAIR: // %bb.0: +; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; PAIR-NEXT: addvl sp, sp, #-4 +; PAIR-NEXT: str p8, [sp, #5, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; PAIR-NEXT: .cfi_offset w29, -16 +; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; PAIR-NEXT: //APP +; PAIR-NEXT: //NO_APP +; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ldr p8, [sp, #5, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: addvl sp, sp, #4 +; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; PAIR-NEXT: ret + call void asm sideeffect "", "~{p4},~{p5},~{z8},~{z9},~{z10}"() + ret void +} + +;; Do NOT group Z10 +;; DO group Z8 and Z9 and use P9 +define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() { +; NOPAIR-LABEL: test_clobbers_z_p_regs2: +; NOPAIR: // %bb.0: +; NOPAIR-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; NOPAIR-NEXT: addvl sp, sp, #-4 +; NOPAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; NOPAIR-NEXT: .cfi_offset w29, -16 +; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; NOPAIR-NEXT: //APP +; NOPAIR-NEXT: //NO_APP +; NOPAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: addvl sp, sp, #4 +; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; NOPAIR-NEXT: ret +; +; PAIR-LABEL: test_clobbers_z_p_regs2: +; PAIR: // %bb.0: +; PAIR-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; PAIR-NEXT: addvl sp, sp, #-4 +; PAIR-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: ptrue pn9.b +; PAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; PAIR-NEXT: .cfi_offset w29, -16 +; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; PAIR-NEXT: //APP +; PAIR-NEXT: //NO_APP +; PAIR-NEXT: ptrue pn9.b +; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #4, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: addvl sp, sp, #4 +; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; PAIR-NEXT: ret + call void asm sideeffect "", "~{p9},~{p10},~{z8},~{z9},~{z10}"() + ret void +} + + +;; Test order of PRegs and ZRegs when there is no PReg being clobbered +define aarch64_sve_vector_pcs void @test_clobbers_z_regs() { +; NOPAIR-LABEL: test_clobbers_z_regs: +; NOPAIR: // %bb.0: +; NOPAIR-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; NOPAIR-NEXT: addvl sp, sp, #-2 +; NOPAIR-NEXT: str z9, [sp] // 16-byte Folded Spill +; NOPAIR-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; NOPAIR-NEXT: .cfi_offset w29, -16 +; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; NOPAIR-NEXT: //APP +; NOPAIR-NEXT: //NO_APP +; NOPAIR-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: addvl sp, sp, #2 +; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; NOPAIR-NEXT: ret +; +; PAIR-LABEL: test_clobbers_z_regs: +; PAIR: // %bb.0: +; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; PAIR-NEXT: addvl sp, sp, #-3 +; PAIR-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; PAIR-NEXT: .cfi_offset w29, -16 +; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; PAIR-NEXT: //APP +; PAIR-NEXT: //NO_APP +; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload +; PAIR-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: addvl sp, sp, #3 +; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; PAIR-NEXT: ret + call void asm sideeffect "", 
"~{z8},~{z9}"() + ret void +} + +;; DO NOT group Z8 and Z9 and +;; DO NOT save P8 +;; It does not belong to the allowed calling conventions +;; NOPAIR and PAIR should have the same assembly +define void @test_clobbers_z_regs_negative() { +; NOPAIR-LABEL: test_clobbers_z_regs_negative: +; NOPAIR: // %bb.0: +; NOPAIR-NEXT: stp d9, d8, [sp, #-16]! // 16-byte Folded Spill +; NOPAIR-NEXT: .cfi_def_cfa_offset 16 +; NOPAIR-NEXT: .cfi_offset b8, -8 +; NOPAIR-NEXT: .cfi_offset b9, -16 +; NOPAIR-NEXT: //APP +; NOPAIR-NEXT: //NO_APP +; NOPAIR-NEXT: ldp d9, d8, [sp], #16 // 16-byte Folded Reload +; NOPAIR-NEXT: ret +; +; PAIR-LABEL: test_clobbers_z_regs_negative: +; PAIR: // %bb.0: +; PAIR-NEXT: stp d9, d8, [sp, #-16]! // 16-byte Folded Spill +; PAIR-NEXT: .cfi_def_cfa_offset 16 +; PAIR-NEXT: .cfi_offset b8, -8 +; PAIR-NEXT: .cfi_offset b9, -16 +; PAIR-NEXT: //APP +; PAIR-NEXT: //NO_APP +; PAIR-NEXT: ldp d9, d8, [sp], #16 // 16-byte Folded Reload +; PAIR-NEXT: ret + call void asm sideeffect "", "~{z8},~{z9}"() + ret void +} + +;; Do NOT save P8 and NOT group any Z8 and Z10 register +define aarch64_sve_vector_pcs void @test_clobbers_2_z_regs_negative() { +; NOPAIR-LABEL: test_clobbers_2_z_regs_negative: +; NOPAIR: // %bb.0: +; NOPAIR-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; NOPAIR-NEXT: addvl sp, sp, #-2 +; NOPAIR-NEXT: str z10, [sp] // 16-byte Folded Spill +; NOPAIR-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; NOPAIR-NEXT: .cfi_offset w29, -16 +; NOPAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; NOPAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG +; NOPAIR-NEXT: //APP +; NOPAIR-NEXT: //NO_APP +; NOPAIR-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; NOPAIR-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; NOPAIR-NEXT: addvl sp, sp, #2 +; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; NOPAIR-NEXT: ret +; +; PAIR-LABEL: test_clobbers_2_z_regs_negative: +; PAIR: // %bb.0: +; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; PAIR-NEXT: addvl sp, sp, #-2 +; PAIR-NEXT: str z10, [sp] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; PAIR-NEXT: .cfi_offset w29, -16 +; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 16 * VG +; PAIR-NEXT: //APP +; PAIR-NEXT: //NO_APP +; PAIR-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: addvl sp, sp, #2 +; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; PAIR-NEXT: ret + call void asm sideeffect "", "~{z8},~{z10}"() + ret void +} + + +;; Nothing to do here: +;; there are no Z register pairs to save (only P10 is clobbered). +define 
aarch64_sve_vector_pcs void @test_clobbers_p_reg_negative() { +; NOPAIR-LABEL: test_clobbers_p_reg_negative: +; NOPAIR: // %bb.0: +; NOPAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; NOPAIR-NEXT: addvl sp, sp, #-1 +; NOPAIR-NEXT: str p10, [sp, #7, mul vl] // 2-byte Folded Spill +; NOPAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; NOPAIR-NEXT: .cfi_offset w29, -16 +; NOPAIR-NEXT: //APP +; NOPAIR-NEXT: //NO_APP +; NOPAIR-NEXT: ldr p10, [sp, #7, mul vl] // 2-byte Folded Reload +; NOPAIR-NEXT: addvl sp, sp, #1 +; NOPAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; NOPAIR-NEXT: ret +; +; PAIR-LABEL: test_clobbers_p_reg_negative: +; PAIR: // %bb.0: +; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; PAIR-NEXT: addvl sp, sp, #-1 +; PAIR-NEXT: str p10, [sp, #7, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; PAIR-NEXT: .cfi_offset w29, -16 +; PAIR-NEXT: //APP +; PAIR-NEXT: //NO_APP +; PAIR-NEXT: ldr p10, [sp, #7, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: addvl sp, sp, #1 +; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; PAIR-NEXT: ret + call void asm sideeffect "", "~{p10}"() + ret void +}