diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
index 94ebb59c4c7713..46bb85606a6290 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -418,7 +418,9 @@ unsigned VEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
   if (MI.getOpcode() == VE::LDrii ||    // I64
       MI.getOpcode() == VE::LDLSXrii || // I32
       MI.getOpcode() == VE::LDUrii ||   // F32
-      MI.getOpcode() == VE::LDQrii      // F128 (pseudo)
+      MI.getOpcode() == VE::LDQrii ||   // F128 (pseudo)
+      MI.getOpcode() == VE::LDVMrii ||  // VM (pseudo)
+      MI.getOpcode() == VE::LDVM512rii  // VM512 (pseudo)
   ) {
     if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
         MI.getOperand(2).getImm() == 0 && MI.getOperand(3).isImm() &&
@@ -437,10 +439,12 @@ unsigned VEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
 /// any side effects other than storing to the stack slot.
 unsigned VEInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {
-  if (MI.getOpcode() == VE::STrii ||  // I64
-      MI.getOpcode() == VE::STLrii || // I32
-      MI.getOpcode() == VE::STUrii || // F32
-      MI.getOpcode() == VE::STQrii    // F128 (pseudo)
+  if (MI.getOpcode() == VE::STrii ||    // I64
+      MI.getOpcode() == VE::STLrii ||   // I32
+      MI.getOpcode() == VE::STUrii ||   // F32
+      MI.getOpcode() == VE::STQrii ||   // F128 (pseudo)
+      MI.getOpcode() == VE::STVMrii ||  // VM (pseudo)
+      MI.getOpcode() == VE::STVM512rii  // VM512 (pseudo)
   ) {
     if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
         MI.getOperand(1).getImm() == 0 && MI.getOperand(2).isImm() &&
@@ -496,6 +500,20 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
         .addImm(0)
         .addReg(SrcReg, getKillRegState(isKill))
         .addMemOperand(MMO);
+  } else if (RC == &VE::VMRegClass) {
+    BuildMI(MBB, I, DL, get(VE::STVMrii))
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addImm(0)
+        .addReg(SrcReg, getKillRegState(isKill))
+        .addMemOperand(MMO);
+  } else if (VE::VM512RegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(VE::STVM512rii))
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addImm(0)
+        .addReg(SrcReg, getKillRegState(isKill))
+        .addMemOperand(MMO);
   } else
     report_fatal_error("Can't store this register to stack slot");
 }
@@ -539,6 +557,18 @@ void VEInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
         .addImm(0)
         .addImm(0)
         .addMemOperand(MMO);
+  } else if (RC == &VE::VMRegClass) {
+    BuildMI(MBB, I, DL, get(VE::LDVMrii), DestReg)
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addImm(0)
+        .addMemOperand(MMO);
+  } else if (VE::VM512RegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(VE::LDVM512rii), DestReg)
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addImm(0)
+        .addMemOperand(MMO);
   } else
     report_fatal_error("Can't load this register from stack slot");
 }
diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td
index 71199717a3a2bd..0b2f5039e3f352 100644
--- a/llvm/lib/Target/VE/VEInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td
@@ -25,6 +25,20 @@ def: Pat<(i64 (repl_i32 i32:$val)),
             (zero_f32 (i2l $val)),
             (SLLri (i2l $val), 32))>;
 
+///// Mask Load & Store /////
+
+// Loads and stores for v256i1 and v512i1 are implemented in two ways: the
+// STVM/STVM512 pseudo instructions are used for frame-index-related
+// load/store instructions, and custom lowering is used for all others.
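+//
+// For example, (v256i1 (load ADDRrii:$addr)) is selected to LDVMrii by the
+// patterns below.  Spill/restore code emits the same pseudos directly, and
+// frame-index elimination later expands each pseudo into four (or eight,
+// for VM512) 64-bit ld/st operations plus lvm/svm mask transfers (see
+// VERegisterInfo.cpp).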
+
+def : Pat<(v256i1 (load ADDRrii:$addr)),
+          (LDVMrii ADDRrii:$addr)>;
+def : Pat<(v512i1 (load ADDRrii:$addr)),
+          (LDVM512rii ADDRrii:$addr)>;
+def : Pat<(store v256i1:$vx, ADDRrii:$addr),
+          (STVMrii ADDRrii:$addr, $vx)>;
+def : Pat<(store v512i1:$vx, ADDRrii:$addr),
+          (STVM512rii ADDRrii:$addr, $vx)>;
 
 multiclass vbrd_elem32 {
diff --git a/llvm/lib/Target/VE/VEInstrVec.td b/llvm/lib/Target/VE/VEInstrVec.td
index 4a8476f7288a36..327ad9ceacc52d 100644
--- a/llvm/lib/Target/VE/VEInstrVec.td
+++ b/llvm/lib/Target/VE/VEInstrVec.td
@@ -2,6 +2,33 @@
 // Vector Instructions
 //===----------------------------------------------------------------------===//
 
+// Pseudo instructions for VM/VM512 spill/restore
+//
+// These pseudo instructions are used only for spill/restore, since
+// InlineSpiller assumes that the storeRegToStackSlot/loadRegFromStackSlot
+// functions emit only a single instruction.  Accordingly, those functions
+// emit either a single store/load instruction or one of these pseudo
+// store/load instructions.
+//
+// hasSideEffects = 0 is specified to disable UnmodeledSideEffects.
+
+let mayLoad = 1, hasSideEffects = 0 in {
+def LDVMrii : Pseudo<
+    (outs VM:$vmx), (ins MEMrii:$addr),
+    "# pseudo ldvm $vmx, $addr", []>;
+def LDVM512rii : Pseudo<
+    (outs VM512:$vmx), (ins MEMrii:$addr),
+    "# pseudo ldvm512 $vmx, $addr", []>;
+}
+let mayStore = 1, hasSideEffects = 0 in {
+def STVMrii : Pseudo<
+    (outs), (ins MEMrii:$addr, VM:$vmx),
+    "# pseudo stvm $addr, $vmx", []>;
+def STVM512rii : Pseudo<
+    (outs), (ins MEMrii:$addr, VM512:$vmx),
+    "# pseudo stvm512 $addr, $vmx", []>;
+}
+
 //===----------------------------------------------------------------------===//
 // Pseudo instructions for VM512 modifications
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp
index f334af128162d8..397ea09c9a024c 100644
--- a/llvm/lib/Target/VE/VERegisterInfo.cpp
+++ b/llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -180,6 +180,16 @@ class EliminateFrameIndex {
                   int FIOperandNum);
   void processLDQ(MachineInstr &MI, Register FrameReg, int64_t Offset,
                   int FIOperandNum);
+  // Expand and eliminate Frame Index of pseudo STVMrii and LDVMrii.
+  void processSTVM(MachineInstr &MI, Register FrameReg, int64_t Offset,
+                   int FIOperandNum);
+  void processLDVM(MachineInstr &MI, Register FrameReg, int64_t Offset,
+                   int FIOperandNum);
+  // Expand and eliminate Frame Index of pseudo STVM512rii and LDVM512rii.
+  void processSTVM512(MachineInstr &MI, Register FrameReg, int64_t Offset,
+                      int FIOperandNum);
+  void processLDVM512(MachineInstr &MI, Register FrameReg, int64_t Offset,
+                      int FIOperandNum);
 
 public:
   EliminateFrameIndex(const TargetInstrInfo &TII,
                       const TargetRegisterInfo &TRI,
@@ -271,6 +281,185 @@ void EliminateFrameIndex::processLDQ(MachineInstr &MI, Register FrameReg,
   replaceFI(MI, FrameReg, Offset, FIOperandNum);
 }
 
+void EliminateFrameIndex::processSTVM(MachineInstr &MI, Register FrameReg,
+                                      int64_t Offset, int FIOperandNum) {
+  assert(MI.getOpcode() == VE::STVMrii);
+  LLVM_DEBUG(dbgs() << "processSTVM: "; MI.dump());
+
+  // Original MI is:
+  //   STVMrii frame-index, 0, offset, reg (, memory operand)
+  // Convert it to:
+  //   SVMmr tmp-reg, reg, 0
+  //   STrii frame-reg, 0, offset, tmp-reg
+  //   SVMmr tmp-reg, reg, 1
+  //   STrii frame-reg, 0, offset+8, tmp-reg
+  //   SVMmr tmp-reg, reg, 2
+  //   STrii frame-reg, 0, offset+16, tmp-reg
+  //   SVMmr tmp-reg, reg, 3
+  //   STrii frame-reg, 0, offset+24, tmp-reg
+
+  prepareReplaceFI(MI, FrameReg, Offset, 24);
+
+  Register SrcReg = MI.getOperand(3).getReg();
+  bool isKill = MI.getOperand(3).isKill();
+  // FIXME: it would be better to scavenge a register here instead of
+  // reserving SX16 all of the time.
+  Register TmpReg = VE::SX16;
+  for (int i = 0; i < 3; ++i) {
+    build(VE::SVMmr, TmpReg).addReg(SrcReg).addImm(i);
+    MachineInstr *StMI =
+        build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(
+            TmpReg, getKillRegState(true));
+    replaceFI(*StMI, FrameReg, Offset, 0);
+    Offset += 8;
+  }
+  build(VE::SVMmr, TmpReg).addReg(SrcReg, getKillRegState(isKill)).addImm(3);
+  MI.setDesc(get(VE::STrii));
+  MI.getOperand(3).ChangeToRegister(TmpReg, false, false, true);
+  replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processLDVM(MachineInstr &MI, Register FrameReg,
+                                      int64_t Offset, int FIOperandNum) {
+  assert(MI.getOpcode() == VE::LDVMrii);
+  LLVM_DEBUG(dbgs() << "processLDVM: "; MI.dump());
+
+  // Original MI is:
+  //   LDVMrii reg, frame-index, 0, offset (, memory operand)
+  // Convert it to:
+  //   LDrii tmp-reg, frame-reg, 0, offset
+  //   LVMir vm, 0, tmp-reg
+  //   LDrii tmp-reg, frame-reg, 0, offset+8
+  //   LVMir_m vm, 1, tmp-reg, vm
+  //   LDrii tmp-reg, frame-reg, 0, offset+16
+  //   LVMir_m vm, 2, tmp-reg, vm
+  //   LDrii tmp-reg, frame-reg, 0, offset+24
+  //   LVMir_m vm, 3, tmp-reg, vm
+
+  prepareReplaceFI(MI, FrameReg, Offset, 24);
+
+  Register DestReg = MI.getOperand(0).getReg();
+  // FIXME: it would be better to scavenge a register here instead of
+  // reserving SX16 all of the time.
+  Register TmpReg = VE::SX16;
+  for (int i = 0; i < 4; ++i) {
+    if (i != 3) {
+      MachineInstr *LdMI =
+          build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0);
+      replaceFI(*LdMI, FrameReg, Offset, 1);
+      Offset += 8;
+    } else {
+      // The last LDrii replaces the original target instruction.
+      MI.setDesc(get(VE::LDrii));
+      MI.getOperand(0).ChangeToRegister(TmpReg, true);
+    }
+    // The first LVM is LVMir and the others are LVMir_m.  The last LVM is
+    // placed right after the target instruction.
+    if (i == 0)
+      build(VE::LVMir, DestReg).addImm(i).addReg(TmpReg, getKillRegState(true));
+    else if (i != 3)
+      build(VE::LVMir_m, DestReg)
+          .addImm(i)
+          .addReg(TmpReg, getKillRegState(true))
+          .addReg(DestReg);
+    else
+      BuildMI(*MI.getParent(), std::next(II), DL, get(VE::LVMir_m), DestReg)
+          .addImm(3)
+          .addReg(TmpReg, getKillRegState(true))
+          .addReg(DestReg);
+  }
+  replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processSTVM512(MachineInstr &MI, Register FrameReg,
+                                         int64_t Offset, int FIOperandNum) {
+  assert(MI.getOpcode() == VE::STVM512rii);
+  LLVM_DEBUG(dbgs() << "processSTVM512: "; MI.dump());
+
+  prepareReplaceFI(MI, FrameReg, Offset, 56);
+
+  Register SrcReg = MI.getOperand(3).getReg();
+  Register SrcLoReg = getSubReg(SrcReg, VE::sub_vm_odd);
+  Register SrcHiReg = getSubReg(SrcReg, VE::sub_vm_even);
+  bool isKill = MI.getOperand(3).isKill();
+  // FIXME: it would be better to scavenge a register here instead of
+  // reserving SX16 all of the time.
+  Register TmpReg = VE::SX16;
+  // Store the low part of VMP.
+  MachineInstr *LastMI = nullptr;
+  for (int i = 0; i < 4; ++i) {
+    LastMI = build(VE::SVMmr, TmpReg).addReg(SrcLoReg).addImm(i);
+    MachineInstr *StMI =
+        build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(
+            TmpReg, getKillRegState(true));
+    replaceFI(*StMI, FrameReg, Offset, 0);
+    Offset += 8;
+  }
+  if (isKill)
+    LastMI->addRegisterKilled(SrcLoReg, &TRI, true);
+  // Store the high part of VMP.
+  for (int i = 0; i < 3; ++i) {
+    build(VE::SVMmr, TmpReg).addReg(SrcHiReg).addImm(i);
+    MachineInstr *StMI =
+        build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(
+            TmpReg, getKillRegState(true));
+    replaceFI(*StMI, FrameReg, Offset, 0);
+    Offset += 8;
+  }
+  LastMI = build(VE::SVMmr, TmpReg).addReg(SrcHiReg).addImm(3);
+  if (isKill) {
+    LastMI->addRegisterKilled(SrcHiReg, &TRI, true);
+    // Also add an implicit kill of the super-register to this last MI.
+    LastMI->addRegisterKilled(SrcReg, &TRI, true);
+  }
+  MI.setDesc(get(VE::STrii));
+  MI.getOperand(3).ChangeToRegister(TmpReg, false, false, true);
+  replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processLDVM512(MachineInstr &MI, Register FrameReg,
+                                         int64_t Offset, int FIOperandNum) {
+  assert(MI.getOpcode() == VE::LDVM512rii);
+  LLVM_DEBUG(dbgs() << "processLDVM512: "; MI.dump());
+
+  prepareReplaceFI(MI, FrameReg, Offset, 56);
+
+  Register DestReg = MI.getOperand(0).getReg();
+  Register DestLoReg = getSubReg(DestReg, VE::sub_vm_odd);
+  Register DestHiReg = getSubReg(DestReg, VE::sub_vm_even);
+  // FIXME: it would be better to scavenge a register here instead of
+  // reserving SX16 all of the time.
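+  // Original MI is:
+  //   LDVM512rii reg, frame-index, 0, offset (, memory operand)
+  // Convert it to an IMPLICIT_DEF of the whole VM512 register followed by
+  // eight LDrii/LVMir_m pairs: words 0-3 fill the sub_vm_odd half and
+  // words 4-7 fill the sub_vm_even half.  As in processLDVM, the last
+  // LDrii replaces the original instruction and the final LVMir_m is
+  // inserted right after it.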
+  Register TmpReg = VE::SX16;
+  build(VE::IMPLICIT_DEF, DestReg);
+  for (int i = 0; i < 4; ++i) {
+    MachineInstr *LdMI =
+        build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0);
+    replaceFI(*LdMI, FrameReg, Offset, 1);
+    build(VE::LVMir_m, DestLoReg)
+        .addImm(i)
+        .addReg(TmpReg, getKillRegState(true))
+        .addReg(DestLoReg);
+    Offset += 8;
+  }
+  for (int i = 0; i < 3; ++i) {
+    MachineInstr *LdMI =
+        build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0);
+    replaceFI(*LdMI, FrameReg, Offset, 1);
+    build(VE::LVMir_m, DestHiReg)
+        .addImm(i)
+        .addReg(TmpReg, getKillRegState(true))
+        .addReg(DestHiReg);
+    Offset += 8;
+  }
+  MI.setDesc(get(VE::LDrii));
+  MI.getOperand(0).ChangeToRegister(TmpReg, true);
+  BuildMI(*MI.getParent(), std::next(II), DL, get(VE::LVMir_m), DestHiReg)
+      .addImm(3)
+      .addReg(TmpReg, getKillRegState(true))
+      .addReg(DestHiReg);
+  replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
 void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg,
                                     int64_t Offset, int FIOperandNum) {
   switch (MI.getOpcode()) {
@@ -280,6 +469,18 @@ void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg,
   case VE::LDQrii:
     processLDQ(MI, FrameReg, Offset, FIOperandNum);
     return;
+  case VE::STVMrii:
+    processSTVM(MI, FrameReg, Offset, FIOperandNum);
+    return;
+  case VE::LDVMrii:
+    processLDVM(MI, FrameReg, Offset, FIOperandNum);
+    return;
+  case VE::STVM512rii:
+    processSTVM512(MI, FrameReg, Offset, FIOperandNum);
+    return;
+  case VE::LDVM512rii:
+    processLDVM512(MI, FrameReg, Offset, FIOperandNum);
+    return;
   }
   prepareReplaceFI(MI, FrameReg, Offset);
   replaceFI(MI, FrameReg, Offset, FIOperandNum);
diff --git a/llvm/test/CodeGen/VE/Vector/load_stk_ldvm.ll b/llvm/test/CodeGen/VE/Vector/load_stk_ldvm.ll
new file mode 100644
index 00000000000000..18464d9e243410
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/load_stk_ldvm.ll
@@ -0,0 +1,1203 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve -mattr=+vpu | FileCheck %s
+
+;;; Test load instructions
+;;;
+;;; Note:
+;;; We test load instructions using a general stack, a stack with dynamic
+;;; allocation, a stack with dynamic allocation and alignment, and a stack
+;;; with dynamic allocation, alignment, and spill.
+;;;
+;;; First test uses a stack for a leaf function.
+;;;
+;;; |                                              | Higher address
+;;; |----------------------------------------------| <- old sp
+;;; | Local variables of fixed size                |
+;;; |----------------------------------------------| <- sp
+;;; |                                              | Lower address
+;;;
+;;; Access local variables using sp (%s11).  Note that the stack is
+;;; aligned to 16 bytes.
+;;;
+;;; Second test uses a general stack.
+;;;
+;;; |                                              | Higher address
+;;; |----------------------------------------------|
+;;; | Parameter area for this function             |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for this function   |
+;;; |----------------------------------------------|
+;;; | Return address for this function             |
+;;; |----------------------------------------------|
+;;; | Frame pointer for this function              |
+;;; |----------------------------------------------| <- fp(=old sp)
+;;; | Local variables of fixed size                |
+;;; |----------------------------------------------|
+;;; |.variable-sized.local.variables.(VLAs)........|
+;;; |..............................................|
+;;; |..............................................|
+;;; |----------------------------------------------| <- returned by alloca
+;;; | Parameter area for callee                    |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for callee          |
+;;; |----------------------------------------------|
+;;; | Return address for callee                    |
+;;; |----------------------------------------------|
+;;; | Frame pointer for callee                     |
+;;; |----------------------------------------------| <- sp
+;;; |                                              | Lower address
+;;;
+;;; Access local variables using fp (%s9), since the size of the VLA is
+;;; not known.  At the beginning of each function, 240 + data bytes are
+;;; allocated, where 240 is RSA+RA+FP (=176) plus the parameter area (=64).
+;;;
+;;; Third test uses a general stack.
+;;;
+;;; |                                              | Higher address
+;;; |----------------------------------------------|
+;;; | Parameter area for this function             |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for this function   |
+;;; |----------------------------------------------|
+;;; | Return address for this function             |
+;;; |----------------------------------------------|
+;;; | Frame pointer for this function              |
+;;; |----------------------------------------------| <- fp(=old sp)
+;;; |.empty.space.to.make.part.below.aligned.in....|
+;;; |.case.it.needs.more.than.the.standard.16-byte.| (size of this area is
+;;; |.alignment....................................| unknown at compile time)
+;;; |----------------------------------------------|
+;;; | Local variables of fixed size including spill|
+;;; | slots                                        |
+;;; |----------------------------------------------| <- bp(not defined by ABI,
+;;; |.variable-sized.local.variables.(VLAs)........|    LLVM chooses SX17)
+;;; |..............................................| (size of this area is
+;;; |..............................................| unknown at compile time)
+;;; |----------------------------------------------| <- stack top (returned by
+;;; | Parameter area for callee                    |    alloca)
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for callee          |
+;;; |----------------------------------------------|
+;;; | Return address for callee                    |
+;;; |----------------------------------------------|
+;;; | Frame pointer for callee                     |
+;;; |----------------------------------------------| <- sp
+;;; |                                              | Lower address
+;;;
+;;; Access local variables using bp (%s17), since the sizes of the alignment
+;;; padding and the VLA are not known.  At the beginning of each function,
+;;; pad(240 + data + align) bytes are allocated.  Data is then accessed
+;;; through bp + pad(240), which doesn't change even after VLAs are allocated.
+;;;
+;;; Fourth test uses a general stack with some spills.
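+;;;
+;;; In the fourth test, the loaded mask value is kept live across the calls
+;;; to dummy() and pass(), forcing the register allocator to spill it.  The
+;;; spill and restore go through the STVMrii/LDVMrii (and STVM512rii/
+;;; LDVM512rii) pseudos, whose expansion produces the svm/st and ld/lvm
+;;; sequences in the checks below.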
+;;; + +; Function Attrs: argmemonly mustprogress nofree nounwind willreturn +define fastcc <256 x i1> @load__vm256_stk() { +; CHECK-LABEL: load__vm256_stk: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -224(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB0_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: ld %s16, 192(, %s11) +; CHECK-NEXT: lvm %vm1, 0, %s16 +; CHECK-NEXT: ld %s16, 200(, %s11) +; CHECK-NEXT: lvm %vm1, 1, %s16 +; CHECK-NEXT: ld %s16, 208(, %s11) +; CHECK-NEXT: lvm %vm1, 2, %s16 +; CHECK-NEXT: ld %s16, 216(, %s11) +; CHECK-NEXT: lvm %vm1, 3, %s16 +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %1 = alloca <256 x i1>, align 32 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %1) + %2 = load volatile <256 x i1>, ptr %1, align 32 + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %1) + ret <256 x i1> %2 +} + +; Function Attrs: argmemonly mustprogress nocallback nofree nosync nounwind willreturn +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) + +; Function Attrs: argmemonly mustprogress nocallback nofree nosync nounwind willreturn +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) + +; Function Attrs: argmemonly nofree nounwind +define fastcc <256 x i1> @load__vm256_stk_big_fit() { +; CHECK-LABEL: load__vm256_stk_big_fit: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -2147483648(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: brge.l %s11, %s8, .LBB1_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: ld %s16, 2147483616(, %s11) +; CHECK-NEXT: lvm %vm1, 0, %s16 +; CHECK-NEXT: ld %s16, 2147483624(, %s11) +; CHECK-NEXT: lvm %vm1, 1, %s16 +; CHECK-NEXT: ld %s16, 2147483632(, %s11) +; CHECK-NEXT: lvm %vm1, 2, %s16 +; CHECK-NEXT: ld %s16, 2147483640(, %s11) +; CHECK-NEXT: lvm %vm1, 3, %s16 +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: lea %s1, 2147483424 +; CHECK-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld %s2, 192(%s0, %s11) +; CHECK-NEXT: lea %s0, 8(, %s0) +; CHECK-NEXT: brne.l %s0, %s1, .LBB1_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %1 = alloca <256 x i1>, align 32 + %2 = alloca [268435428 x i64], align 8 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %1) + call void @llvm.lifetime.start.p0(i64 2147483424, ptr nonnull %2) + %3 = load volatile <256 x i1>, ptr %1, align 32 + br label %5 + +4: ; preds = %5 + call void @llvm.lifetime.end.p0(i64 2147483424, ptr nonnull %2) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %1) + ret <256 x i1> %3 + +5: ; preds = %0, %5 + %6 = phi i64 [ 0, %0 ], [ %9, %5 ] + %7 = getelementptr inbounds [268435428 x i64], ptr %2, i64 0, 
i64 %6 + %8 = load volatile i64, ptr %7, align 8, !tbaa !3 + %9 = add nuw nsw i64 %6, 1 + %10 = icmp eq i64 %9, 268435428 + br i1 %10, label %4, label %5, !llvm.loop !7 +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc <256 x i1> @load__vm256_stk_big() { +; CHECK-LABEL: load__vm256_stk_big: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s13, 2147483616 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: brge.l %s11, %s8, .LBB2_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: lea %s13, -2147483648 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s13, (%s11, %s13) +; CHECK-NEXT: ld %s16, (, %s13) +; CHECK-NEXT: lvm %vm1, 0, %s16 +; CHECK-NEXT: ld %s16, 8(, %s13) +; CHECK-NEXT: lvm %vm1, 1, %s16 +; CHECK-NEXT: ld %s16, 16(, %s13) +; CHECK-NEXT: lvm %vm1, 2, %s16 +; CHECK-NEXT: ld %s16, 24(, %s13) +; CHECK-NEXT: lvm %vm1, 3, %s16 +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: lea %s1, 2147483432 +; CHECK-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld %s2, 216(%s0, %s11) +; CHECK-NEXT: lea %s0, 8(, %s0) +; CHECK-NEXT: brne.l %s0, %s1, .LBB2_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %1 = alloca <256 x i1>, align 32 + %2 = alloca [268435429 x i64], align 8 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %1) + call void @llvm.lifetime.start.p0(i64 2147483432, ptr nonnull %2) + %3 = load volatile <256 x i1>, ptr %1, align 32 + br label %5 + +4: ; preds = %5 + call void @llvm.lifetime.end.p0(i64 2147483432, ptr nonnull %2) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %1) + ret <256 x i1> %3 + +5: ; preds = %0, %5 + %6 = phi i64 [ 0, %0 ], [ %9, %5 ] + %7 = getelementptr inbounds [268435429 x i64], ptr %2, i64 0, i64 %6 + %8 = load volatile i64, ptr %7, align 8, !tbaa !3 + %9 = add nuw nsw i64 %6, 1 + %10 = icmp eq i64 %9, 268435429 + br i1 %10, label %4, label %5, !llvm.loop !9 +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc <256 x i1> @load__vm256_stk_big2() { +; CHECK-LABEL: load__vm256_stk_big2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s13, 2147483424 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: brge.l %s11, %s8, .LBB3_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: lea %s13, -2147483456 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s13, (%s11, %s13) +; CHECK-NEXT: ld %s16, (, %s13) +; CHECK-NEXT: lvm %vm1, 0, %s16 +; CHECK-NEXT: ld %s16, 8(, %s13) +; CHECK-NEXT: lvm %vm1, 1, %s16 +; CHECK-NEXT: ld %s16, 16(, %s13) +; CHECK-NEXT: lvm %vm1, 2, %s16 +; CHECK-NEXT: ld %s16, 24(, %s13) +; CHECK-NEXT: lvm %vm1, 3, %s16 +; 
CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: lea %s1, -2147483648 +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld %s2, 192(%s0, %s11) +; CHECK-NEXT: lea %s0, 8(, %s0) +; CHECK-NEXT: brne.l %s0, %s1, .LBB3_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %1 = alloca <256 x i1>, align 32 + %2 = alloca [268435456 x i64], align 8 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %1) + call void @llvm.lifetime.start.p0(i64 2147483648, ptr nonnull %2) + %3 = load volatile <256 x i1>, ptr %1, align 32 + br label %5 + +4: ; preds = %5 + call void @llvm.lifetime.end.p0(i64 2147483648, ptr nonnull %2) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %1) + ret <256 x i1> %3 + +5: ; preds = %0, %5 + %6 = phi i64 [ 0, %0 ], [ %9, %5 ] + %7 = getelementptr inbounds [268435456 x i64], ptr %2, i64 0, i64 %6 + %8 = load volatile i64, ptr %7, align 8, !tbaa !3 + %9 = add nuw nsw i64 %6, 1 + %10 = icmp eq i64 %9, 268435456 + br i1 %10, label %4, label %5, !llvm.loop !10 +} + +; Function Attrs: argmemonly mustprogress nofree nounwind willreturn +define fastcc <256 x i1> @load__vm256_stk_dyn(i64 noundef %0) { +; CHECK-LABEL: load__vm256_stk_dyn: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -272(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB4_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: sll %s0, %s0, 5 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: ld %s1, 24(, %s0) +; CHECK-NEXT: ld %s1, 16(, %s0) +; CHECK-NEXT: ld %s1, 8(, %s0) +; CHECK-NEXT: ld %s0, (, %s0) +; CHECK-NEXT: ld %s16, -32(, %s9) +; CHECK-NEXT: lvm %vm1, 0, %s16 +; CHECK-NEXT: ld %s16, -24(, %s9) +; CHECK-NEXT: lvm %vm1, 1, %s16 +; CHECK-NEXT: ld %s16, -16(, %s9) +; CHECK-NEXT: lvm %vm1, 2, %s16 +; CHECK-NEXT: ld %s16, -8(, %s9) +; CHECK-NEXT: lvm %vm1, 3, %s16 +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %2 = alloca <256 x i1>, align 8 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %2) + %3 = alloca <256 x i1>, i64 %0, align 8 + %4 = load volatile <256 x i1>, ptr %3, align 32 + %5 = load volatile <256 x i1>, ptr %2, align 32 + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %2) + ret <256 x i1> %5 +} + +; Function Attrs: argmemonly mustprogress nofree nounwind willreturn +define fastcc <256 x i1> @load__vm256_stk_dyn_align(i64 noundef %0) { +; CHECK-LABEL: load__vm256_stk_dyn_align: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -288(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB5_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: 
shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: sll %s0, %s0, 5 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: ld %s1, 24(, %s0) +; CHECK-NEXT: ld %s1, 16(, %s0) +; CHECK-NEXT: ld %s1, 8(, %s0) +; CHECK-NEXT: ld %s0, (, %s0) +; CHECK-NEXT: ld %s16, 256(, %s17) +; CHECK-NEXT: lvm %vm1, 0, %s16 +; CHECK-NEXT: ld %s16, 264(, %s17) +; CHECK-NEXT: lvm %vm1, 1, %s16 +; CHECK-NEXT: ld %s16, 272(, %s17) +; CHECK-NEXT: lvm %vm1, 2, %s16 +; CHECK-NEXT: ld %s16, 280(, %s17) +; CHECK-NEXT: lvm %vm1, 3, %s16 +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %2 = alloca <256 x i1>, align 32 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %2) + %3 = alloca <256 x i1>, i64 %0, align 8 + %4 = load volatile <256 x i1>, ptr %3, align 32 + %5 = load volatile <256 x i1>, ptr %2, align 32 + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %2) + ret <256 x i1> %5 +} + +; Function Attrs: argmemonly mustprogress nofree nounwind willreturn +define fastcc <256 x i1> @load__vm256_stk_dyn_align2(i64 noundef %0) { +; CHECK-LABEL: load__vm256_stk_dyn_align2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -320(, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB6_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: sll %s0, %s0, 5 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: ld %s1, 24(, %s0) +; CHECK-NEXT: ld %s1, 16(, %s0) +; CHECK-NEXT: ld %s1, 8(, %s0) +; CHECK-NEXT: ld %s0, (, %s0) +; CHECK-NEXT: ld %s16, 288(, %s17) +; CHECK-NEXT: lvm %vm1, 0, %s16 +; CHECK-NEXT: ld %s16, 296(, %s17) +; CHECK-NEXT: lvm %vm1, 1, %s16 +; CHECK-NEXT: ld %s16, 304(, %s17) +; CHECK-NEXT: lvm %vm1, 2, %s16 +; CHECK-NEXT: ld %s16, 312(, %s17) +; CHECK-NEXT: lvm %vm1, 3, %s16 +; CHECK-NEXT: ld %s16, 256(, %s17) +; CHECK-NEXT: lvm %vm2, 0, %s16 +; CHECK-NEXT: ld %s16, 264(, %s17) +; CHECK-NEXT: lvm %vm2, 1, %s16 +; CHECK-NEXT: ld %s16, 272(, %s17) +; CHECK-NEXT: lvm %vm2, 2, %s16 +; CHECK-NEXT: ld %s16, 280(, %s17) +; CHECK-NEXT: lvm %vm2, 3, %s16 +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %2 = alloca <256 x i1>, align 32 + %3 = alloca <256 x i1>, align 64 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %2) + %4 = alloca <256 x i1>, i64 %0, align 8 + %5 = load volatile <256 x i1>, ptr %4, align 32 + %6 = load volatile <256 x i1>, ptr %2, align 32 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %3) + %7 = load volatile <256 x i1>, ptr %3, align 64 + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %3) + 
call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %2) + ret <256 x i1> %6 +} + +; Function Attrs: nounwind +define fastcc <256 x i1> @load__vm256_stk_dyn_align_spill(i64 noundef %0) { +; CHECK-LABEL: load__vm256_stk_dyn_align_spill: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -320(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB7_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: st %s18, 48(, %s9) # 8-byte Folded Spill +; CHECK-NEXT: or %s18, 0, %s0 +; CHECK-NEXT: lea %s0, 15(, %s0) +; CHECK-NEXT: and %s0, -16, %s0 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: ld %s1, 24(, %s0) +; CHECK-NEXT: ld %s1, 16(, %s0) +; CHECK-NEXT: ld %s1, 8(, %s0) +; CHECK-NEXT: ld %s0, (, %s0) +; CHECK-NEXT: ld %s16, 288(, %s17) +; CHECK-NEXT: lvm %vm1, 0, %s16 +; CHECK-NEXT: ld %s16, 296(, %s17) +; CHECK-NEXT: lvm %vm1, 1, %s16 +; CHECK-NEXT: ld %s16, 304(, %s17) +; CHECK-NEXT: lvm %vm1, 2, %s16 +; CHECK-NEXT: ld %s16, 312(, %s17) +; CHECK-NEXT: lvm %vm1, 3, %s16 +; CHECK-NEXT: svm %s16, %vm1, 0 +; CHECK-NEXT: st %s16, 256(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 1 +; CHECK-NEXT: st %s16, 264(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 2 +; CHECK-NEXT: st %s16, 272(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 3 +; CHECK-NEXT: st %s16, 280(, %s17) # 32-byte Folded Spill +; CHECK-NEXT: lea %s0, dummy@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, dummy@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, pass@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, pass@hi(, %s0) +; CHECK-NEXT: or %s0, 0, %s18 +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: ld %s16, 256(, %s17) +; CHECK-NEXT: lvm %vm1, 0, %s16 +; CHECK-NEXT: ld %s16, 264(, %s17) +; CHECK-NEXT: lvm %vm1, 1, %s16 +; CHECK-NEXT: ld %s16, 272(, %s17) +; CHECK-NEXT: lvm %vm1, 2, %s16 +; CHECK-NEXT: ld %s16, 280(, %s17) # 32-byte Folded Reload +; CHECK-NEXT: lvm %vm1, 3, %s16 +; CHECK-NEXT: ld %s18, 48(, %s9) # 8-byte Folded Reload +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %2 = alloca <256 x i1>, align 32 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %2) + %3 = alloca i8, i64 %0, align 8 + %4 = load volatile <256 x i1>, ptr %3, align 32 + %5 = load volatile <256 x i1>, ptr %2, align 32 + tail call fastcc void @dummy() + tail call fastcc void @pass(i64 noundef %0) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %2) + ret <256 x i1> %5 +} + +declare fastcc void @dummy() + +declare fastcc void @pass(i64 noundef) + +; Function Attrs: argmemonly mustprogress nofree nounwind willreturn +define fastcc <512 x i1> @load__vm512_stk() { +; CHECK-LABEL: load__vm512_stk: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -256(, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; 
CHECK-NEXT: brge.l.t %s11, %s8, .LBB8_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: # implicit-def: $vmp1 +; CHECK-NEXT: ld %s16, 192(, %s11) +; CHECK-NEXT: lvm %vm3, 0, %s16 +; CHECK-NEXT: ld %s16, 200(, %s11) +; CHECK-NEXT: lvm %vm3, 1, %s16 +; CHECK-NEXT: ld %s16, 208(, %s11) +; CHECK-NEXT: lvm %vm3, 2, %s16 +; CHECK-NEXT: ld %s16, 216(, %s11) +; CHECK-NEXT: lvm %vm3, 3, %s16 +; CHECK-NEXT: ld %s16, 224(, %s11) +; CHECK-NEXT: lvm %vm2, 0, %s16 +; CHECK-NEXT: ld %s16, 232(, %s11) +; CHECK-NEXT: lvm %vm2, 1, %s16 +; CHECK-NEXT: ld %s16, 240(, %s11) +; CHECK-NEXT: lvm %vm2, 2, %s16 +; CHECK-NEXT: ld %s16, 248(, %s11) +; CHECK-NEXT: lvm %vm2, 3, %s16 +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %1 = alloca <512 x i1>, align 64 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %1) + %2 = load volatile <512 x i1>, ptr %1, align 64 + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %1) + ret <512 x i1> %2 +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc <512 x i1> @load__vm512_stk_big_fit() { +; CHECK-LABEL: load__vm512_stk_big_fit: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -2147483648(, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: brge.l %s11, %s8, .LBB9_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB9_4: +; CHECK-NEXT: # implicit-def: $vmp1 +; CHECK-NEXT: ld %s16, 2147483584(, %s11) +; CHECK-NEXT: lvm %vm3, 0, %s16 +; CHECK-NEXT: ld %s16, 2147483592(, %s11) +; CHECK-NEXT: lvm %vm3, 1, %s16 +; CHECK-NEXT: ld %s16, 2147483600(, %s11) +; CHECK-NEXT: lvm %vm3, 2, %s16 +; CHECK-NEXT: ld %s16, 2147483608(, %s11) +; CHECK-NEXT: lvm %vm3, 3, %s16 +; CHECK-NEXT: ld %s16, 2147483616(, %s11) +; CHECK-NEXT: lvm %vm2, 0, %s16 +; CHECK-NEXT: ld %s16, 2147483624(, %s11) +; CHECK-NEXT: lvm %vm2, 1, %s16 +; CHECK-NEXT: ld %s16, 2147483632(, %s11) +; CHECK-NEXT: lvm %vm2, 2, %s16 +; CHECK-NEXT: ld %s16, 2147483640(, %s11) +; CHECK-NEXT: lvm %vm2, 3, %s16 +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: lea %s1, 2147483392 +; CHECK-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld %s2, 192(%s0, %s11) +; CHECK-NEXT: lea %s0, 8(, %s0) +; CHECK-NEXT: brne.l %s0, %s1, .LBB9_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %1 = alloca <512 x i1>, align 64 + %2 = alloca [268435424 x i64], align 8 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %1) + call void @llvm.lifetime.start.p0(i64 2147483392, ptr nonnull %2) + %3 = load volatile <512 x i1>, ptr %1, align 64 + br label %5 + +4: ; preds = %5 + call void @llvm.lifetime.end.p0(i64 2147483392, ptr nonnull %2) + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %1) + ret <512 x i1> %3 + +5: ; preds = %0, %5 + %6 = phi i64 [ 0, %0 ], [ %9, %5 ] + %7 = getelementptr inbounds [268435424 x i64], ptr %2, i64 0, i64 
%6 + %8 = load volatile i64, ptr %7, align 8, !tbaa !3 + %9 = add nuw nsw i64 %6, 1 + %10 = icmp eq i64 %9, 268435424 + br i1 %10, label %4, label %5, !llvm.loop !11 +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc <512 x i1> @load__vm512_stk_big() { +; CHECK-LABEL: load__vm512_stk_big: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s13, 2147483584 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: brge.l %s11, %s8, .LBB10_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB10_4: +; CHECK-NEXT: lea %s13, -2147483648 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s13, (%s11, %s13) +; CHECK-NEXT: # implicit-def: $vmp1 +; CHECK-NEXT: ld %s16, (, %s13) +; CHECK-NEXT: lvm %vm3, 0, %s16 +; CHECK-NEXT: ld %s16, 8(, %s13) +; CHECK-NEXT: lvm %vm3, 1, %s16 +; CHECK-NEXT: ld %s16, 16(, %s13) +; CHECK-NEXT: lvm %vm3, 2, %s16 +; CHECK-NEXT: ld %s16, 24(, %s13) +; CHECK-NEXT: lvm %vm3, 3, %s16 +; CHECK-NEXT: ld %s16, 32(, %s13) +; CHECK-NEXT: lvm %vm2, 0, %s16 +; CHECK-NEXT: ld %s16, 40(, %s13) +; CHECK-NEXT: lvm %vm2, 1, %s16 +; CHECK-NEXT: ld %s16, 48(, %s13) +; CHECK-NEXT: lvm %vm2, 2, %s16 +; CHECK-NEXT: ld %s16, 56(, %s13) +; CHECK-NEXT: lvm %vm2, 3, %s16 +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: lea %s1, 2147483400 +; CHECK-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld %s2, 248(%s0, %s11) +; CHECK-NEXT: lea %s0, 8(, %s0) +; CHECK-NEXT: brne.l %s0, %s1, .LBB10_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %1 = alloca <512 x i1>, align 64 + %2 = alloca [268435425 x i64], align 8 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %1) + call void @llvm.lifetime.start.p0(i64 2147483400, ptr nonnull %2) + %3 = load volatile <512 x i1>, ptr %1, align 64 + br label %5 + +4: ; preds = %5 + call void @llvm.lifetime.end.p0(i64 2147483400, ptr nonnull %2) + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %1) + ret <512 x i1> %3 + +5: ; preds = %0, %5 + %6 = phi i64 [ 0, %0 ], [ %9, %5 ] + %7 = getelementptr inbounds [268435425 x i64], ptr %2, i64 0, i64 %6 + %8 = load volatile i64, ptr %7, align 8, !tbaa !3 + %9 = add nuw nsw i64 %6, 1 + %10 = icmp eq i64 %9, 268435425 + br i1 %10, label %4, label %5, !llvm.loop !12 +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc <512 x i1> @load__vm512_stk_big2() { +; CHECK-LABEL: load__vm512_stk_big2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s13, 2147483392 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: brge.l %s11, %s8, .LBB11_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB11_4: +; CHECK-NEXT: lea %s13, -2147483456 +; CHECK-NEXT: and %s13, %s13, (32)0 +; 
CHECK-NEXT: lea.sl %s13, (%s11, %s13) +; CHECK-NEXT: # implicit-def: $vmp1 +; CHECK-NEXT: ld %s16, (, %s13) +; CHECK-NEXT: lvm %vm3, 0, %s16 +; CHECK-NEXT: ld %s16, 8(, %s13) +; CHECK-NEXT: lvm %vm3, 1, %s16 +; CHECK-NEXT: ld %s16, 16(, %s13) +; CHECK-NEXT: lvm %vm3, 2, %s16 +; CHECK-NEXT: ld %s16, 24(, %s13) +; CHECK-NEXT: lvm %vm3, 3, %s16 +; CHECK-NEXT: ld %s16, 32(, %s13) +; CHECK-NEXT: lvm %vm2, 0, %s16 +; CHECK-NEXT: ld %s16, 40(, %s13) +; CHECK-NEXT: lvm %vm2, 1, %s16 +; CHECK-NEXT: ld %s16, 48(, %s13) +; CHECK-NEXT: lvm %vm2, 2, %s16 +; CHECK-NEXT: ld %s16, 56(, %s13) +; CHECK-NEXT: lvm %vm2, 3, %s16 +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: lea %s1, -2147483648 +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld %s2, 192(%s0, %s11) +; CHECK-NEXT: lea %s0, 8(, %s0) +; CHECK-NEXT: brne.l %s0, %s1, .LBB11_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %1 = alloca <512 x i1>, align 64 + %2 = alloca [268435456 x i64], align 8 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %1) + call void @llvm.lifetime.start.p0(i64 2147483648, ptr nonnull %2) + %3 = load volatile <512 x i1>, ptr %1, align 64 + br label %5 + +4: ; preds = %5 + call void @llvm.lifetime.end.p0(i64 2147483648, ptr nonnull %2) + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %1) + ret <512 x i1> %3 + +5: ; preds = %0, %5 + %6 = phi i64 [ 0, %0 ], [ %9, %5 ] + %7 = getelementptr inbounds [268435456 x i64], ptr %2, i64 0, i64 %6 + %8 = load volatile i64, ptr %7, align 8, !tbaa !3 + %9 = add nuw nsw i64 %6, 1 + %10 = icmp eq i64 %9, 268435456 + br i1 %10, label %4, label %5, !llvm.loop !13 +} + +; Function Attrs: argmemonly mustprogress nofree nounwind willreturn +define fastcc <512 x i1> @load__vm512_stk_dyn(i64 noundef %0) { +; CHECK-LABEL: load__vm512_stk_dyn: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -320(, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB12_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: sll %s0, %s0, 6 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: ld %s1, 56(, %s0) +; CHECK-NEXT: ld %s1, 48(, %s0) +; CHECK-NEXT: ld %s1, 40(, %s0) +; CHECK-NEXT: ld %s1, 32(, %s0) +; CHECK-NEXT: ld %s1, 24(, %s0) +; CHECK-NEXT: ld %s1, 16(, %s0) +; CHECK-NEXT: ld %s1, 8(, %s0) +; CHECK-NEXT: ld %s0, (, %s0) +; CHECK-NEXT: # implicit-def: $vmp1 +; CHECK-NEXT: ld %s16, 256(, %s17) +; CHECK-NEXT: lvm %vm3, 0, %s16 +; CHECK-NEXT: ld %s16, 264(, %s17) +; CHECK-NEXT: lvm %vm3, 1, %s16 +; CHECK-NEXT: ld %s16, 272(, %s17) +; CHECK-NEXT: lvm %vm3, 2, %s16 +; CHECK-NEXT: ld %s16, 280(, %s17) +; CHECK-NEXT: lvm %vm3, 3, %s16 +; CHECK-NEXT: ld %s16, 288(, %s17) +; CHECK-NEXT: lvm %vm2, 0, %s16 +; CHECK-NEXT: ld %s16, 296(, %s17) +; CHECK-NEXT: lvm %vm2, 1, %s16 +; CHECK-NEXT: ld %s16, 304(, %s17) +; CHECK-NEXT: lvm %vm2, 2, 
%s16 +; CHECK-NEXT: ld %s16, 312(, %s17) +; CHECK-NEXT: lvm %vm2, 3, %s16 +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %2 = alloca <512 x i1>, align 64 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %2) + %3 = alloca <512 x i1>, i64 %0, align 8 + %4 = load volatile <512 x i1>, ptr %3, align 64 + %5 = load volatile <512 x i1>, ptr %2, align 64 + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %2) + ret <512 x i1> %5 +} + +; Function Attrs: argmemonly mustprogress nofree nounwind willreturn +define fastcc <512 x i1> @load__vm512_stk_dyn_align(i64 noundef %0) { +; CHECK-LABEL: load__vm512_stk_dyn_align: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -320(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB13_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB13_2: +; CHECK-NEXT: sll %s0, %s0, 6 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: ld %s1, 56(, %s0) +; CHECK-NEXT: ld %s1, 48(, %s0) +; CHECK-NEXT: ld %s1, 40(, %s0) +; CHECK-NEXT: ld %s1, 32(, %s0) +; CHECK-NEXT: ld %s1, 24(, %s0) +; CHECK-NEXT: ld %s1, 16(, %s0) +; CHECK-NEXT: ld %s1, 8(, %s0) +; CHECK-NEXT: ld %s0, (, %s0) +; CHECK-NEXT: # implicit-def: $vmp1 +; CHECK-NEXT: ld %s16, 256(, %s17) +; CHECK-NEXT: lvm %vm3, 0, %s16 +; CHECK-NEXT: ld %s16, 264(, %s17) +; CHECK-NEXT: lvm %vm3, 1, %s16 +; CHECK-NEXT: ld %s16, 272(, %s17) +; CHECK-NEXT: lvm %vm3, 2, %s16 +; CHECK-NEXT: ld %s16, 280(, %s17) +; CHECK-NEXT: lvm %vm3, 3, %s16 +; CHECK-NEXT: ld %s16, 288(, %s17) +; CHECK-NEXT: lvm %vm2, 0, %s16 +; CHECK-NEXT: ld %s16, 296(, %s17) +; CHECK-NEXT: lvm %vm2, 1, %s16 +; CHECK-NEXT: ld %s16, 304(, %s17) +; CHECK-NEXT: lvm %vm2, 2, %s16 +; CHECK-NEXT: ld %s16, 312(, %s17) +; CHECK-NEXT: lvm %vm2, 3, %s16 +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %2 = alloca <512 x i1>, align 32 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %2) + %3 = alloca <512 x i1>, i64 %0, align 8 + %4 = load volatile <512 x i1>, ptr %3, align 64 + %5 = load volatile <512 x i1>, ptr %2, align 32 + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %2) + ret <512 x i1> %5 +} + +; Function Attrs: argmemonly mustprogress nofree nounwind willreturn +define fastcc <512 x i1> @load__vm512_stk_dyn_align2(i64 noundef %0) { +; CHECK-LABEL: load__vm512_stk_dyn_align2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -384(, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB14_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; 
CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: sll %s0, %s0, 6 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: ld %s1, 56(, %s0) +; CHECK-NEXT: ld %s1, 48(, %s0) +; CHECK-NEXT: ld %s1, 40(, %s0) +; CHECK-NEXT: ld %s1, 32(, %s0) +; CHECK-NEXT: ld %s1, 24(, %s0) +; CHECK-NEXT: ld %s1, 16(, %s0) +; CHECK-NEXT: ld %s1, 8(, %s0) +; CHECK-NEXT: ld %s0, (, %s0) +; CHECK-NEXT: # implicit-def: $vmp1 +; CHECK-NEXT: ld %s16, 320(, %s17) +; CHECK-NEXT: lvm %vm3, 0, %s16 +; CHECK-NEXT: ld %s16, 328(, %s17) +; CHECK-NEXT: lvm %vm3, 1, %s16 +; CHECK-NEXT: ld %s16, 336(, %s17) +; CHECK-NEXT: lvm %vm3, 2, %s16 +; CHECK-NEXT: ld %s16, 344(, %s17) +; CHECK-NEXT: lvm %vm3, 3, %s16 +; CHECK-NEXT: ld %s16, 352(, %s17) +; CHECK-NEXT: lvm %vm2, 0, %s16 +; CHECK-NEXT: ld %s16, 360(, %s17) +; CHECK-NEXT: lvm %vm2, 1, %s16 +; CHECK-NEXT: ld %s16, 368(, %s17) +; CHECK-NEXT: lvm %vm2, 2, %s16 +; CHECK-NEXT: ld %s16, 376(, %s17) +; CHECK-NEXT: lvm %vm2, 3, %s16 +; CHECK-NEXT: # implicit-def: $vmp2 +; CHECK-NEXT: ld %s16, 256(, %s17) +; CHECK-NEXT: lvm %vm5, 0, %s16 +; CHECK-NEXT: ld %s16, 264(, %s17) +; CHECK-NEXT: lvm %vm5, 1, %s16 +; CHECK-NEXT: ld %s16, 272(, %s17) +; CHECK-NEXT: lvm %vm5, 2, %s16 +; CHECK-NEXT: ld %s16, 280(, %s17) +; CHECK-NEXT: lvm %vm5, 3, %s16 +; CHECK-NEXT: ld %s16, 288(, %s17) +; CHECK-NEXT: lvm %vm4, 0, %s16 +; CHECK-NEXT: ld %s16, 296(, %s17) +; CHECK-NEXT: lvm %vm4, 1, %s16 +; CHECK-NEXT: ld %s16, 304(, %s17) +; CHECK-NEXT: lvm %vm4, 2, %s16 +; CHECK-NEXT: ld %s16, 312(, %s17) +; CHECK-NEXT: lvm %vm4, 3, %s16 +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %2 = alloca <512 x i1>, align 32 + %3 = alloca <512 x i1>, align 64 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %2) + %4 = alloca <512 x i1>, i64 %0, align 8 + %5 = load volatile <512 x i1>, ptr %4, align 64 + %6 = load volatile <512 x i1>, ptr %2, align 32 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %3) + %7 = load volatile <512 x i1>, ptr %3, align 64 + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %3) + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %2) + ret <512 x i1> %6 +} + +; Function Attrs: nounwind +define fastcc <512 x i1> @load__vm512_stk_dyn_align_spill(i64 noundef %0) { +; CHECK-LABEL: load__vm512_stk_dyn_align_spill: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -384(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB15_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB15_2: +; CHECK-NEXT: st %s18, 48(, %s9) # 8-byte Folded Spill +; CHECK-NEXT: or %s18, 0, %s0 +; CHECK-NEXT: sll %s0, %s0, 6 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; 
CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: ld %s1, 56(, %s0) +; CHECK-NEXT: ld %s1, 48(, %s0) +; CHECK-NEXT: ld %s1, 40(, %s0) +; CHECK-NEXT: ld %s1, 32(, %s0) +; CHECK-NEXT: ld %s1, 24(, %s0) +; CHECK-NEXT: ld %s1, 16(, %s0) +; CHECK-NEXT: ld %s1, 8(, %s0) +; CHECK-NEXT: ld %s0, (, %s0) +; CHECK-NEXT: # implicit-def: $vmp1 +; CHECK-NEXT: ld %s16, 320(, %s17) +; CHECK-NEXT: lvm %vm3, 0, %s16 +; CHECK-NEXT: ld %s16, 328(, %s17) +; CHECK-NEXT: lvm %vm3, 1, %s16 +; CHECK-NEXT: ld %s16, 336(, %s17) +; CHECK-NEXT: lvm %vm3, 2, %s16 +; CHECK-NEXT: ld %s16, 344(, %s17) +; CHECK-NEXT: lvm %vm3, 3, %s16 +; CHECK-NEXT: ld %s16, 352(, %s17) +; CHECK-NEXT: lvm %vm2, 0, %s16 +; CHECK-NEXT: ld %s16, 360(, %s17) +; CHECK-NEXT: lvm %vm2, 1, %s16 +; CHECK-NEXT: ld %s16, 368(, %s17) +; CHECK-NEXT: lvm %vm2, 2, %s16 +; CHECK-NEXT: ld %s16, 376(, %s17) +; CHECK-NEXT: lvm %vm2, 3, %s16 +; CHECK-NEXT: svm %s16, %vm3, 0 +; CHECK-NEXT: st %s16, 256(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 1 +; CHECK-NEXT: st %s16, 264(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 2 +; CHECK-NEXT: st %s16, 272(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 3 +; CHECK-NEXT: st %s16, 280(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 0 +; CHECK-NEXT: st %s16, 288(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 1 +; CHECK-NEXT: st %s16, 296(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 2 +; CHECK-NEXT: st %s16, 304(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 3 +; CHECK-NEXT: st %s16, 312(, %s17) # 64-byte Folded Spill +; CHECK-NEXT: lea %s0, dummy@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, dummy@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, pass@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, pass@hi(, %s0) +; CHECK-NEXT: or %s0, 0, %s18 +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: # implicit-def: $vmp1 +; CHECK-NEXT: ld %s16, 256(, %s17) +; CHECK-NEXT: lvm %vm3, 0, %s16 +; CHECK-NEXT: ld %s16, 264(, %s17) +; CHECK-NEXT: lvm %vm3, 1, %s16 +; CHECK-NEXT: ld %s16, 272(, %s17) +; CHECK-NEXT: lvm %vm3, 2, %s16 +; CHECK-NEXT: ld %s16, 280(, %s17) +; CHECK-NEXT: lvm %vm3, 3, %s16 +; CHECK-NEXT: ld %s16, 288(, %s17) +; CHECK-NEXT: lvm %vm2, 0, %s16 +; CHECK-NEXT: ld %s16, 296(, %s17) +; CHECK-NEXT: lvm %vm2, 1, %s16 +; CHECK-NEXT: ld %s16, 304(, %s17) +; CHECK-NEXT: lvm %vm2, 2, %s16 +; CHECK-NEXT: ld %s16, 312(, %s17) # 64-byte Folded Reload +; CHECK-NEXT: lvm %vm2, 3, %s16 +; CHECK-NEXT: ld %s18, 48(, %s9) # 8-byte Folded Reload +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %2 = alloca <512 x i1>, align 32 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %2) + %3 = alloca <512 x i1>, i64 %0, align 8 + %4 = load volatile <512 x i1>, ptr %3, align 64 + %5 = load volatile <512 x i1>, ptr %2, align 32 + tail call fastcc void @dummy() + tail call fastcc void @pass(i64 noundef %0) + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %2) + ret <512 x i1> %5 +} + +!2 = !{!"clang version 15.0.0 (git@kaz7.github.com:sx-aurora-dev/llvm-project.git 50263c9e9cc3714bcd816eaea8822d3e010a0f19)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"long", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} +!7 = distinct !{!7, !8} +!8 = !{!"llvm.loop.mustprogress"} +!9 = distinct !{!9, !8} +!10 = distinct !{!10, !8} +!11 = distinct !{!11, !8} +!12 = distinct !{!12, !8} +!13 = distinct !{!13, !8} diff --git a/llvm/test/CodeGen/VE/Vector/store_stk_stvm.ll 
b/llvm/test/CodeGen/VE/Vector/store_stk_stvm.ll
new file mode 100644
index 00000000000000..5a443bb3ee9e1a
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/store_stk_stvm.ll
@@ -0,0 +1,1241 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve -mattr=+vpu | FileCheck %s
+
+;;; Test store instructions
+;;;
+;;; Note:
+;;; We test store instructions using a general stack, a stack with dynamic
+;;; allocation, a stack with dynamic allocation and alignment, and a stack
+;;; with dynamic allocation, alignment, and spill.
+;;;
+;;; First test uses a stack for a leaf function.
+;;;
+;;; |                                              | Higher address
+;;; |----------------------------------------------| <- old sp
+;;; | Local variables of fixed size                |
+;;; |----------------------------------------------| <- sp
+;;; |                                              | Lower address
+;;;
+;;; Access local variables using sp (%s11).  Note that the stack is
+;;; aligned to 16 bytes.
+;;;
+;;; Second test uses a general stack.
+;;;
+;;; |                                              | Higher address
+;;; |----------------------------------------------|
+;;; | Parameter area for this function             |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for this function   |
+;;; |----------------------------------------------|
+;;; | Return address for this function             |
+;;; |----------------------------------------------|
+;;; | Frame pointer for this function              |
+;;; |----------------------------------------------| <- fp(=old sp)
+;;; | Local variables of fixed size                |
+;;; |----------------------------------------------|
+;;; |.variable-sized.local.variables.(VLAs)........|
+;;; |..............................................|
+;;; |..............................................|
+;;; |----------------------------------------------| <- returned by alloca
+;;; | Parameter area for callee                    |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for callee          |
+;;; |----------------------------------------------|
+;;; | Return address for callee                    |
+;;; |----------------------------------------------|
+;;; | Frame pointer for callee                     |
+;;; |----------------------------------------------| <- sp
+;;; |                                              | Lower address
+;;;
+;;; Access local variables using fp (%s9), since the size of the VLA is
+;;; not known.  At the beginning of each function, 240 + data bytes are
+;;; allocated, where 240 is RSA+RA+FP (=176) plus the parameter area (=64).
+;;;
+;;; Third test uses a general stack.
+;;;
+;;; | | Higher address
+;;; |----------------------------------------------|
+;;; | Parameter area for this function |
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for this function |
+;;; |----------------------------------------------|
+;;; | Return address for this function |
+;;; |----------------------------------------------|
+;;; | Frame pointer for this function |
+;;; |----------------------------------------------| <- fp(=old sp)
+;;; |.empty.space.to.make.part.below.aligned.in....|
+;;; |.case.it.needs.more.than.the.standard.16-byte.| (size of this area is
+;;; |.alignment....................................| unknown at compile time)
+;;; |----------------------------------------------|
+;;; | Local variables of fixed size including spill|
+;;; | slots |
+;;; |----------------------------------------------| <- bp(not defined by ABI,
+;;; |.variable-sized.local.variables.(VLAs)........| LLVM chooses SX17)
+;;; |..............................................| (size of this area is
+;;; |..............................................| unknown at compile time)
+;;; |----------------------------------------------| <- stack top (returned by
+;;; | Parameter area for callee | alloca)
+;;; |----------------------------------------------|
+;;; | Register save area (RSA) for callee |
+;;; |----------------------------------------------|
+;;; | Return address for callee |
+;;; |----------------------------------------------|
+;;; | Frame pointer for callee |
+;;; |----------------------------------------------| <- sp
+;;; | | Lower address
+;;;
+;;; Access local variables using bp (%s17) since neither the extra
+;;; alignment nor the size of the VLA is known. At the beginning of the
+;;; function, allocate pad(240 + data + align) bytes. Then access data
+;;; through bp + pad(240), since this address doesn't change even when
+;;; the VLA is dynamically allocated.
+;;;
+;;; Fourth test uses a general stack with some spills.
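+;;;
+;;; As a worked example of pad() (our own arithmetic, matching the CHECK
+;;; lines of store__vm256_stk_dyn_align below): the reserved area is 240
+;;; bytes and the v256i1 data needs 32 bytes, so 240 + 32 = 272, which
+;;; rounds up to 288 under the requested 32-byte alignment. Hence the
+;;; prologue emits "lea %s11, -288(, %s11)", and the data lives at
+;;; bp + pad(240) = bp + 256, i.e. the stores at 256..280(, %s17).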
+;;; + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm256_stk(<256 x i1> noundef %0) { +; CHECK-LABEL: store__vm256_stk: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -224(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB0_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: svm %s16, %vm1, 0 +; CHECK-NEXT: st %s16, 192(, %s11) +; CHECK-NEXT: svm %s16, %vm1, 1 +; CHECK-NEXT: st %s16, 200(, %s11) +; CHECK-NEXT: svm %s16, %vm1, 2 +; CHECK-NEXT: st %s16, 208(, %s11) +; CHECK-NEXT: svm %s16, %vm1, 3 +; CHECK-NEXT: st %s16, 216(, %s11) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %2 = alloca <256 x i1>, align 32 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %2) + store volatile <256 x i1> %0, ptr %2, align 32, !tbaa !3 + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %2) + ret void +} + +; Function Attrs: argmemonly mustprogress nocallback nofree nosync nounwind willreturn +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) + +; Function Attrs: argmemonly mustprogress nocallback nofree nosync nounwind willreturn +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm256_stk_big_fit(<256 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm256_stk_big_fit: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -2147483648(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: brge.l %s11, %s8, .LBB1_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: svm %s16, %vm1, 0 +; CHECK-NEXT: st %s16, 2147483616(, %s11) +; CHECK-NEXT: svm %s16, %vm1, 1 +; CHECK-NEXT: st %s16, 2147483624(, %s11) +; CHECK-NEXT: svm %s16, %vm1, 2 +; CHECK-NEXT: st %s16, 2147483632(, %s11) +; CHECK-NEXT: svm %s16, %vm1, 3 +; CHECK-NEXT: st %s16, 2147483640(, %s11) +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: lea %s2, 2147483424 +; CHECK-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: st %s0, 192(%s1, %s11) +; CHECK-NEXT: lea %s1, 8(, %s1) +; CHECK-NEXT: brne.l %s1, %s2, .LBB1_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <256 x i1>, align 32 + %4 = alloca [268435428 x i64], align 8 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %3) + call void @llvm.lifetime.start.p0(i64 2147483424, ptr nonnull %4) + store volatile <256 x i1> %0, ptr %3, align 32, !tbaa !3 + br label %6 + +5: ; preds = %6 + call void @llvm.lifetime.end.p0(i64 2147483424, ptr nonnull %4) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %3) + ret void + +6: ; preds = %2, %6 + %7 = phi i64 [ 0, %2 ], [ %9, %6 ] + %8 = getelementptr inbounds 
[268435428 x i64], ptr %4, i64 0, i64 %7 + store volatile i64 %1, ptr %8, align 8, !tbaa !6 + %9 = add nuw nsw i64 %7, 1 + %10 = icmp eq i64 %9, 268435428 + br i1 %10, label %5, label %6, !llvm.loop !8 +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm256_stk_big(<256 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm256_stk_big: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s13, 2147483616 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: brge.l %s11, %s8, .LBB2_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: lea %s13, -2147483648 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s13, (%s11, %s13) +; CHECK-NEXT: svm %s16, %vm1, 0 +; CHECK-NEXT: st %s16, (, %s13) +; CHECK-NEXT: svm %s16, %vm1, 1 +; CHECK-NEXT: st %s16, 8(, %s13) +; CHECK-NEXT: svm %s16, %vm1, 2 +; CHECK-NEXT: st %s16, 16(, %s13) +; CHECK-NEXT: svm %s16, %vm1, 3 +; CHECK-NEXT: st %s16, 24(, %s13) +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: lea %s2, 2147483432 +; CHECK-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: st %s0, 216(%s1, %s11) +; CHECK-NEXT: lea %s1, 8(, %s1) +; CHECK-NEXT: brne.l %s1, %s2, .LBB2_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <256 x i1>, align 32 + %4 = alloca [268435429 x i64], align 8 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %3) + call void @llvm.lifetime.start.p0(i64 2147483432, ptr nonnull %4) + store volatile <256 x i1> %0, ptr %3, align 32, !tbaa !3 + br label %6 + +5: ; preds = %6 + call void @llvm.lifetime.end.p0(i64 2147483432, ptr nonnull %4) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %3) + ret void + +6: ; preds = %2, %6 + %7 = phi i64 [ 0, %2 ], [ %9, %6 ] + %8 = getelementptr inbounds [268435429 x i64], ptr %4, i64 0, i64 %7 + store volatile i64 %1, ptr %8, align 8, !tbaa !6 + %9 = add nuw nsw i64 %7, 1 + %10 = icmp eq i64 %9, 268435429 + br i1 %10, label %5, label %6, !llvm.loop !8 +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm256_stk_big2(<256 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm256_stk_big2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s13, 2147483424 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: brge.l %s11, %s8, .LBB3_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: lea %s13, -2147483456 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s13, (%s11, %s13) +; CHECK-NEXT: svm %s16, %vm1, 0 +; CHECK-NEXT: st %s16, (, %s13) +; CHECK-NEXT: svm %s16, %vm1, 1 +; CHECK-NEXT: st %s16, 8(, %s13) +; CHECK-NEXT: svm %s16, %vm1, 2 +; CHECK-NEXT: 
st %s16, 16(, %s13) +; CHECK-NEXT: svm %s16, %vm1, 3 +; CHECK-NEXT: st %s16, 24(, %s13) +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: lea %s2, -2147483648 +; CHECK-NEXT: and %s2, %s2, (32)0 +; CHECK-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: st %s0, 192(%s1, %s11) +; CHECK-NEXT: lea %s1, 8(, %s1) +; CHECK-NEXT: brne.l %s1, %s2, .LBB3_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <256 x i1>, align 32 + %4 = alloca [268435456 x i64], align 8 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %3) + call void @llvm.lifetime.start.p0(i64 2147483648, ptr nonnull %4) + store volatile <256 x i1> %0, ptr %3, align 32, !tbaa !3 + br label %6 + +5: ; preds = %6 + call void @llvm.lifetime.end.p0(i64 2147483648, ptr nonnull %4) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %3) + ret void + +6: ; preds = %2, %6 + %7 = phi i64 [ 0, %2 ], [ %9, %6 ] + %8 = getelementptr inbounds [268435456 x i64], ptr %4, i64 0, i64 %7 + store volatile i64 %1, ptr %8, align 8, !tbaa !6 + %9 = add nuw nsw i64 %7, 1 + %10 = icmp eq i64 %9, 268435456 + br i1 %10, label %5, label %6, !llvm.loop !10 +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm256_stk_dyn(<256 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm256_stk_dyn: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -272(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB4_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: sll %s0, %s0, 5 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: svm %s1, %vm1, 3 +; CHECK-NEXT: st %s1, 24(, %s0) +; CHECK-NEXT: svm %s1, %vm1, 2 +; CHECK-NEXT: st %s1, 16(, %s0) +; CHECK-NEXT: svm %s1, %vm1, 1 +; CHECK-NEXT: st %s1, 8(, %s0) +; CHECK-NEXT: svm %s1, %vm1, 0 +; CHECK-NEXT: st %s1, (, %s0) +; CHECK-NEXT: svm %s16, %vm1, 0 +; CHECK-NEXT: st %s16, -32(, %s9) +; CHECK-NEXT: svm %s16, %vm1, 1 +; CHECK-NEXT: st %s16, -24(, %s9) +; CHECK-NEXT: svm %s16, %vm1, 2 +; CHECK-NEXT: st %s16, -16(, %s9) +; CHECK-NEXT: svm %s16, %vm1, 3 +; CHECK-NEXT: st %s16, -8(, %s9) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <256 x i1>, align 8 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %3) + %4 = alloca <256 x i1>, i64 %1, align 8 + store volatile <256 x i1> %0, ptr %4, align 32, !tbaa !3 + store volatile <256 x i1> %0, ptr %3, align 32, !tbaa !3 + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %3) + ret void +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm256_stk_dyn_align(<256 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm256_stk_dyn_align: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -288(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: or %s17, 0, 
%s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB5_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: sll %s0, %s0, 5 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: svm %s1, %vm1, 3 +; CHECK-NEXT: st %s1, 24(, %s0) +; CHECK-NEXT: svm %s1, %vm1, 2 +; CHECK-NEXT: st %s1, 16(, %s0) +; CHECK-NEXT: svm %s1, %vm1, 1 +; CHECK-NEXT: st %s1, 8(, %s0) +; CHECK-NEXT: svm %s1, %vm1, 0 +; CHECK-NEXT: st %s1, (, %s0) +; CHECK-NEXT: svm %s16, %vm1, 0 +; CHECK-NEXT: st %s16, 256(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 1 +; CHECK-NEXT: st %s16, 264(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 2 +; CHECK-NEXT: st %s16, 272(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 3 +; CHECK-NEXT: st %s16, 280(, %s17) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <256 x i1>, align 32 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %3) + %4 = alloca <256 x i1>, i64 %1, align 8 + store volatile <256 x i1> %0, ptr %4, align 32, !tbaa !3 + store volatile <256 x i1> %0, ptr %3, align 32, !tbaa !3 + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %3) + ret void +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm256_stk_dyn_align2(<256 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm256_stk_dyn_align2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -320(, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB6_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: sll %s0, %s0, 5 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: svm %s1, %vm1, 3 +; CHECK-NEXT: st %s1, 24(, %s0) +; CHECK-NEXT: svm %s1, %vm1, 2 +; CHECK-NEXT: st %s1, 16(, %s0) +; CHECK-NEXT: svm %s1, %vm1, 1 +; CHECK-NEXT: st %s1, 8(, %s0) +; CHECK-NEXT: svm %s1, %vm1, 0 +; CHECK-NEXT: st %s1, (, %s0) +; CHECK-NEXT: svm %s16, %vm1, 0 +; CHECK-NEXT: st %s16, 288(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 1 +; CHECK-NEXT: st %s16, 296(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 2 +; CHECK-NEXT: st %s16, 304(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 3 +; CHECK-NEXT: st %s16, 312(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 0 +; CHECK-NEXT: st %s16, 256(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 1 +; CHECK-NEXT: st %s16, 264(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 2 +; CHECK-NEXT: st %s16, 272(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 3 +; CHECK-NEXT: st %s16, 280(, %s17) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, 
%s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <256 x i1>, align 32 + %4 = alloca <256 x i1>, align 64 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %3) + %5 = alloca <256 x i1>, i64 %1, align 8 + store volatile <256 x i1> %0, ptr %5, align 32, !tbaa !3 + store volatile <256 x i1> %0, ptr %3, align 32, !tbaa !3 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %4) + store volatile <256 x i1> %0, ptr %4, align 64, !tbaa !3 + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %4) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %3) + ret void +} + +; Function Attrs: nounwind +define fastcc void @store__vm256_stk_dyn_align_spill(<256 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm256_stk_dyn_align_spill: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -320(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB7_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: st %s18, 48(, %s9) # 8-byte Folded Spill +; CHECK-NEXT: st %s19, 56(, %s9) # 8-byte Folded Spill +; CHECK-NEXT: or %s18, 0, %s0 +; CHECK-NEXT: svm %s16, %vm1, 0 +; CHECK-NEXT: st %s16, 256(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 1 +; CHECK-NEXT: st %s16, 264(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 2 +; CHECK-NEXT: st %s16, 272(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 3 +; CHECK-NEXT: st %s16, 280(, %s17) # 32-byte Folded Spill +; CHECK-NEXT: sll %s0, %s0, 5 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s19, 240(, %s11) +; CHECK-NEXT: lea %s0, dummy@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, dummy@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, pass@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, pass@hi(, %s0) +; CHECK-NEXT: or %s0, 0, %s18 +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: ld %s16, 256(, %s17) +; CHECK-NEXT: lvm %vm1, 0, %s16 +; CHECK-NEXT: ld %s16, 264(, %s17) +; CHECK-NEXT: lvm %vm1, 1, %s16 +; CHECK-NEXT: ld %s16, 272(, %s17) +; CHECK-NEXT: lvm %vm1, 2, %s16 +; CHECK-NEXT: ld %s16, 280(, %s17) # 32-byte Folded Reload +; CHECK-NEXT: lvm %vm1, 3, %s16 +; CHECK-NEXT: svm %s0, %vm1, 3 +; CHECK-NEXT: st %s0, 24(, %s19) +; CHECK-NEXT: svm %s0, %vm1, 2 +; CHECK-NEXT: st %s0, 16(, %s19) +; CHECK-NEXT: svm %s0, %vm1, 1 +; CHECK-NEXT: st %s0, 8(, %s19) +; CHECK-NEXT: svm %s0, %vm1, 0 +; CHECK-NEXT: st %s0, (, %s19) +; CHECK-NEXT: svm %s16, %vm1, 0 +; CHECK-NEXT: st %s16, 288(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 1 +; CHECK-NEXT: st %s16, 296(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 2 +; CHECK-NEXT: st %s16, 304(, %s17) +; CHECK-NEXT: svm %s16, %vm1, 3 +; CHECK-NEXT: st %s16, 312(, %s17) +; CHECK-NEXT: ld %s19, 56(, %s9) # 8-byte Folded Reload +; CHECK-NEXT: ld %s18, 48(, %s9) # 8-byte Folded Reload +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <256 x i1>, align 32 + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull 
%3) + %4 = alloca <256 x i1>, i64 %1, align 8 + tail call fastcc void @dummy() + tail call fastcc void @pass(i64 noundef %1) + store volatile <256 x i1> %0, ptr %4, align 32, !tbaa !3 + store volatile <256 x i1> %0, ptr %3, align 32, !tbaa !3 + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %3) + ret void +} + +declare fastcc void @dummy() + +declare fastcc void @pass(i64 noundef) + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm512_stk(<512 x i1> noundef %0) { +; CHECK-LABEL: store__vm512_stk: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -256(, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB8_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: svm %s16, %vm3, 0 +; CHECK-NEXT: st %s16, 192(, %s11) +; CHECK-NEXT: svm %s16, %vm3, 1 +; CHECK-NEXT: st %s16, 200(, %s11) +; CHECK-NEXT: svm %s16, %vm3, 2 +; CHECK-NEXT: st %s16, 208(, %s11) +; CHECK-NEXT: svm %s16, %vm3, 3 +; CHECK-NEXT: st %s16, 216(, %s11) +; CHECK-NEXT: svm %s16, %vm2, 0 +; CHECK-NEXT: st %s16, 224(, %s11) +; CHECK-NEXT: svm %s16, %vm2, 1 +; CHECK-NEXT: st %s16, 232(, %s11) +; CHECK-NEXT: svm %s16, %vm2, 2 +; CHECK-NEXT: st %s16, 240(, %s11) +; CHECK-NEXT: svm %s16, %vm2, 3 +; CHECK-NEXT: st %s16, 248(, %s11) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %2 = alloca <512 x i1>, align 64 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %2) + store volatile <512 x i1> %0, ptr %2, align 64, !tbaa !3 + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %2) + ret void +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm512_stk_bc(<512 x i1> noundef %0) { +; CHECK-LABEL: store__vm512_stk_bc: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -320(, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB9_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB9_2: +; CHECK-NEXT: svm %s16, %vm3, 0 +; CHECK-NEXT: st %s16, 192(, %s11) +; CHECK-NEXT: svm %s16, %vm3, 1 +; CHECK-NEXT: st %s16, 200(, %s11) +; CHECK-NEXT: svm %s16, %vm3, 2 +; CHECK-NEXT: st %s16, 208(, %s11) +; CHECK-NEXT: svm %s16, %vm3, 3 +; CHECK-NEXT: st %s16, 216(, %s11) +; CHECK-NEXT: svm %s16, %vm2, 0 +; CHECK-NEXT: st %s16, 224(, %s11) +; CHECK-NEXT: svm %s16, %vm2, 1 +; CHECK-NEXT: st %s16, 232(, %s11) +; CHECK-NEXT: svm %s16, %vm2, 2 +; CHECK-NEXT: st %s16, 240(, %s11) +; CHECK-NEXT: svm %s16, %vm2, 3 +; CHECK-NEXT: st %s16, 248(, %s11) +; CHECK-NEXT: ld %s0, 192(, %s11) +; CHECK-NEXT: ld %s1, 200(, %s11) +; CHECK-NEXT: ld %s2, 208(, %s11) +; CHECK-NEXT: ld %s3, 216(, %s11) +; CHECK-NEXT: ld %s4, 248(, %s11) +; CHECK-NEXT: ld %s5, 240(, %s11) +; CHECK-NEXT: ld %s6, 232(, %s11) +; CHECK-NEXT: ld %s7, 224(, %s11) +; CHECK-NEXT: st %s4, 312(, %s11) +; CHECK-NEXT: st %s5, 304(, 
%s11) +; CHECK-NEXT: st %s6, 296(, %s11) +; CHECK-NEXT: st %s7, 288(, %s11) +; CHECK-NEXT: st %s3, 280(, %s11) +; CHECK-NEXT: st %s2, 272(, %s11) +; CHECK-NEXT: st %s1, 264(, %s11) +; CHECK-NEXT: st %s0, 256(, %s11) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %2 = alloca i512, align 64 + %3 = bitcast <512 x i1> %0 to i512 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %2) + store volatile i512 %3, ptr %2, align 64, !tbaa !3 + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %2) + ret void +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm512_stk_big(<512 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm512_stk_big: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s13, 2147483392 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: brge.l %s11, %s8, .LBB10_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB10_4: +; CHECK-NEXT: lea %s13, -2147483456 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s13, (%s11, %s13) +; CHECK-NEXT: svm %s16, %vm3, 0 +; CHECK-NEXT: st %s16, (, %s13) +; CHECK-NEXT: svm %s16, %vm3, 1 +; CHECK-NEXT: st %s16, 8(, %s13) +; CHECK-NEXT: svm %s16, %vm3, 2 +; CHECK-NEXT: st %s16, 16(, %s13) +; CHECK-NEXT: svm %s16, %vm3, 3 +; CHECK-NEXT: st %s16, 24(, %s13) +; CHECK-NEXT: svm %s16, %vm2, 0 +; CHECK-NEXT: st %s16, 32(, %s13) +; CHECK-NEXT: svm %s16, %vm2, 1 +; CHECK-NEXT: st %s16, 40(, %s13) +; CHECK-NEXT: svm %s16, %vm2, 2 +; CHECK-NEXT: st %s16, 48(, %s13) +; CHECK-NEXT: svm %s16, %vm2, 3 +; CHECK-NEXT: st %s16, 56(, %s13) +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: lea %s2, 2147483640 +; CHECK-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: st %s0, 200(%s1, %s11) +; CHECK-NEXT: lea %s1, 8(, %s1) +; CHECK-NEXT: brne.l %s1, %s2, .LBB10_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <512 x i1>, align 64 + %4 = alloca [268435455 x i64], align 8 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %3) + call void @llvm.lifetime.start.p0(i64 2147483640, ptr nonnull %4) + store volatile <512 x i1> %0, ptr %3, align 64, !tbaa !3 + br label %6 + +5: ; preds = %6 + call void @llvm.lifetime.end.p0(i64 2147483640, ptr nonnull %4) + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %3) + ret void + +6: ; preds = %2, %6 + %7 = phi i64 [ 0, %2 ], [ %9, %6 ] + %8 = getelementptr inbounds [268435455 x i64], ptr %4, i64 0, i64 %7 + store volatile i64 %1, ptr %8, align 8, !tbaa !6 + %9 = add nuw nsw i64 %7, 1 + %10 = icmp eq i64 %9, 268435455 + br i1 %10, label %5, label %6, !llvm.loop !11 +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm512_stk_big2(<512 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm512_stk_big2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s13, 2147483392 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s11, -1(%s13, %s11) +; 
CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: brge.l %s11, %s8, .LBB11_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB11_4: +; CHECK-NEXT: lea %s13, -2147483456 +; CHECK-NEXT: and %s13, %s13, (32)0 +; CHECK-NEXT: lea.sl %s13, (%s11, %s13) +; CHECK-NEXT: svm %s16, %vm3, 0 +; CHECK-NEXT: st %s16, (, %s13) +; CHECK-NEXT: svm %s16, %vm3, 1 +; CHECK-NEXT: st %s16, 8(, %s13) +; CHECK-NEXT: svm %s16, %vm3, 2 +; CHECK-NEXT: st %s16, 16(, %s13) +; CHECK-NEXT: svm %s16, %vm3, 3 +; CHECK-NEXT: st %s16, 24(, %s13) +; CHECK-NEXT: svm %s16, %vm2, 0 +; CHECK-NEXT: st %s16, 32(, %s13) +; CHECK-NEXT: svm %s16, %vm2, 1 +; CHECK-NEXT: st %s16, 40(, %s13) +; CHECK-NEXT: svm %s16, %vm2, 2 +; CHECK-NEXT: st %s16, 48(, %s13) +; CHECK-NEXT: svm %s16, %vm2, 3 +; CHECK-NEXT: st %s16, 56(, %s13) +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: lea %s2, -2147483648 +; CHECK-NEXT: and %s2, %s2, (32)0 +; CHECK-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: st %s0, 192(%s1, %s11) +; CHECK-NEXT: lea %s1, 8(, %s1) +; CHECK-NEXT: brne.l %s1, %s2, .LBB11_1 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <512 x i1>, align 64 + %4 = alloca [268435456 x i64], align 8 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %3) + call void @llvm.lifetime.start.p0(i64 2147483648, ptr nonnull %4) + store volatile <512 x i1> %0, ptr %3, align 64, !tbaa !3 + br label %6 + +5: ; preds = %6 + call void @llvm.lifetime.end.p0(i64 2147483648, ptr nonnull %4) + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %3) + ret void + +6: ; preds = %2, %6 + %7 = phi i64 [ 0, %2 ], [ %9, %6 ] + %8 = getelementptr inbounds [268435456 x i64], ptr %4, i64 0, i64 %7 + store volatile i64 %1, ptr %8, align 8, !tbaa !6 + %9 = add nuw nsw i64 %7, 1 + %10 = icmp eq i64 %9, 268435456 + br i1 %10, label %5, label %6, !llvm.loop !12 +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm512_stk_dyn(<512 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm512_stk_dyn: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -320(, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB12_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: sll %s0, %s0, 6 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: svm %s1, %vm2, 3 +; CHECK-NEXT: st %s1, 56(, %s0) +; CHECK-NEXT: svm %s1, %vm2, 2 +; CHECK-NEXT: st %s1, 48(, %s0) +; CHECK-NEXT: svm %s1, %vm2, 1 +; CHECK-NEXT: st %s1, 40(, %s0) +; CHECK-NEXT: svm %s1, %vm2, 0 +; CHECK-NEXT: st %s1, 32(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 3 +; CHECK-NEXT: st %s1, 24(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 2 +; 
CHECK-NEXT: st %s1, 16(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 1 +; CHECK-NEXT: st %s1, 8(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 0 +; CHECK-NEXT: st %s1, (, %s0) +; CHECK-NEXT: svm %s16, %vm3, 0 +; CHECK-NEXT: st %s16, 256(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 1 +; CHECK-NEXT: st %s16, 264(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 2 +; CHECK-NEXT: st %s16, 272(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 3 +; CHECK-NEXT: st %s16, 280(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 0 +; CHECK-NEXT: st %s16, 288(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 1 +; CHECK-NEXT: st %s16, 296(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 2 +; CHECK-NEXT: st %s16, 304(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 3 +; CHECK-NEXT: st %s16, 312(, %s17) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <512 x i1>, align 64 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %3) + %4 = alloca <512 x i1>, i64 %1, align 8 + store volatile <512 x i1> %0, ptr %4, align 64, !tbaa !3 + store volatile <512 x i1> %0, ptr %3, align 64, !tbaa !3 + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %3) + ret void +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm512_stk_dyn_align(<512 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm512_stk_dyn_align: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -320(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB13_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB13_2: +; CHECK-NEXT: sll %s0, %s0, 6 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: svm %s1, %vm2, 3 +; CHECK-NEXT: st %s1, 56(, %s0) +; CHECK-NEXT: svm %s1, %vm2, 2 +; CHECK-NEXT: st %s1, 48(, %s0) +; CHECK-NEXT: svm %s1, %vm2, 1 +; CHECK-NEXT: st %s1, 40(, %s0) +; CHECK-NEXT: svm %s1, %vm2, 0 +; CHECK-NEXT: st %s1, 32(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 3 +; CHECK-NEXT: st %s1, 24(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 2 +; CHECK-NEXT: st %s1, 16(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 1 +; CHECK-NEXT: st %s1, 8(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 0 +; CHECK-NEXT: st %s1, (, %s0) +; CHECK-NEXT: svm %s16, %vm3, 0 +; CHECK-NEXT: st %s16, 256(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 1 +; CHECK-NEXT: st %s16, 264(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 2 +; CHECK-NEXT: st %s16, 272(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 3 +; CHECK-NEXT: st %s16, 280(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 0 +; CHECK-NEXT: st %s16, 288(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 1 +; CHECK-NEXT: st %s16, 296(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 2 +; CHECK-NEXT: st %s16, 304(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 3 +; CHECK-NEXT: st %s16, 312(, %s17) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <512 x i1>, align 32 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %3) + %4 = alloca 
<512 x i1>, i64 %1, align 8 + store volatile <512 x i1> %0, ptr %4, align 64, !tbaa !3 + store volatile <512 x i1> %0, ptr %3, align 32, !tbaa !3 + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %3) + ret void +} + +; Function Attrs: argmemonly nofree nounwind +define fastcc void @store__vm512_stk_dyn_align2(<512 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm512_stk_dyn_align2: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -384(, %s11) +; CHECK-NEXT: and %s11, %s11, (58)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB14_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: lea %s0, 15(, %s0) +; CHECK-NEXT: and %s0, -16, %s0 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, 240(, %s11) +; CHECK-NEXT: svm %s1, %vm2, 3 +; CHECK-NEXT: st %s1, 56(, %s0) +; CHECK-NEXT: svm %s1, %vm2, 2 +; CHECK-NEXT: st %s1, 48(, %s0) +; CHECK-NEXT: svm %s1, %vm2, 1 +; CHECK-NEXT: st %s1, 40(, %s0) +; CHECK-NEXT: svm %s1, %vm2, 0 +; CHECK-NEXT: st %s1, 32(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 3 +; CHECK-NEXT: st %s1, 24(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 2 +; CHECK-NEXT: st %s1, 16(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 1 +; CHECK-NEXT: st %s1, 8(, %s0) +; CHECK-NEXT: svm %s1, %vm3, 0 +; CHECK-NEXT: st %s1, (, %s0) +; CHECK-NEXT: svm %s16, %vm3, 0 +; CHECK-NEXT: st %s16, 320(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 1 +; CHECK-NEXT: st %s16, 328(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 2 +; CHECK-NEXT: st %s16, 336(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 3 +; CHECK-NEXT: st %s16, 344(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 0 +; CHECK-NEXT: st %s16, 352(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 1 +; CHECK-NEXT: st %s16, 360(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 2 +; CHECK-NEXT: st %s16, 368(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 3 +; CHECK-NEXT: st %s16, 376(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 0 +; CHECK-NEXT: st %s16, 256(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 1 +; CHECK-NEXT: st %s16, 264(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 2 +; CHECK-NEXT: st %s16, 272(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 3 +; CHECK-NEXT: st %s16, 280(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 0 +; CHECK-NEXT: st %s16, 288(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 1 +; CHECK-NEXT: st %s16, 296(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 2 +; CHECK-NEXT: st %s16, 304(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 3 +; CHECK-NEXT: st %s16, 312(, %s17) +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <512 x i1>, align 32 + %4 = alloca <512 x i1>, align 64 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %3) + %5 = alloca i8, i64 %1, align 8 + store volatile <512 x i1> %0, ptr %5, align 64, !tbaa !3 + store volatile <512 x i1> %0, ptr %3, align 32, !tbaa !3 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %4) + store volatile <512 x i1> %0, ptr %4, align 64, !tbaa !3 + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %4) + call void @llvm.lifetime.end.p0(i64 
64, ptr nonnull %3) + ret void +} + +; Function Attrs: nounwind +define fastcc void @store__vm512_stk_dyn_align_spill(<512 x i1> noundef %0, i64 noundef %1) { +; CHECK-LABEL: store__vm512_stk_dyn_align_spill: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: st %s17, 40(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -384(, %s11) +; CHECK-NEXT: and %s11, %s11, (59)1 +; CHECK-NEXT: or %s17, 0, %s11 +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB15_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB15_2: +; CHECK-NEXT: st %s18, 48(, %s9) # 8-byte Folded Spill +; CHECK-NEXT: st %s19, 56(, %s9) # 8-byte Folded Spill +; CHECK-NEXT: or %s18, 0, %s0 +; CHECK-NEXT: svm %s16, %vm3, 0 +; CHECK-NEXT: st %s16, 256(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 1 +; CHECK-NEXT: st %s16, 264(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 2 +; CHECK-NEXT: st %s16, 272(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 3 +; CHECK-NEXT: st %s16, 280(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 0 +; CHECK-NEXT: st %s16, 288(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 1 +; CHECK-NEXT: st %s16, 296(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 2 +; CHECK-NEXT: st %s16, 304(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 3 +; CHECK-NEXT: st %s16, 312(, %s17) # 64-byte Folded Spill +; CHECK-NEXT: sll %s0, %s0, 6 +; CHECK-NEXT: lea %s1, __ve_grow_stack@lo +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: lea.sl %s12, __ve_grow_stack@hi(, %s1) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s19, 240(, %s11) +; CHECK-NEXT: lea %s0, dummy@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, dummy@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: lea %s0, pass@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, pass@hi(, %s0) +; CHECK-NEXT: or %s0, 0, %s18 +; CHECK-NEXT: bsic %s10, (, %s12) +; CHECK-NEXT: # implicit-def: $vmp1 +; CHECK-NEXT: ld %s16, 256(, %s17) +; CHECK-NEXT: lvm %vm3, 0, %s16 +; CHECK-NEXT: ld %s16, 264(, %s17) +; CHECK-NEXT: lvm %vm3, 1, %s16 +; CHECK-NEXT: ld %s16, 272(, %s17) +; CHECK-NEXT: lvm %vm3, 2, %s16 +; CHECK-NEXT: ld %s16, 280(, %s17) +; CHECK-NEXT: lvm %vm3, 3, %s16 +; CHECK-NEXT: ld %s16, 288(, %s17) +; CHECK-NEXT: lvm %vm2, 0, %s16 +; CHECK-NEXT: ld %s16, 296(, %s17) +; CHECK-NEXT: lvm %vm2, 1, %s16 +; CHECK-NEXT: ld %s16, 304(, %s17) +; CHECK-NEXT: lvm %vm2, 2, %s16 +; CHECK-NEXT: ld %s16, 312(, %s17) # 64-byte Folded Reload +; CHECK-NEXT: lvm %vm2, 3, %s16 +; CHECK-NEXT: svm %s0, %vm2, 3 +; CHECK-NEXT: st %s0, 56(, %s19) +; CHECK-NEXT: svm %s0, %vm2, 2 +; CHECK-NEXT: st %s0, 48(, %s19) +; CHECK-NEXT: svm %s0, %vm2, 1 +; CHECK-NEXT: st %s0, 40(, %s19) +; CHECK-NEXT: svm %s0, %vm2, 0 +; CHECK-NEXT: st %s0, 32(, %s19) +; CHECK-NEXT: svm %s0, %vm3, 3 +; CHECK-NEXT: st %s0, 24(, %s19) +; CHECK-NEXT: svm %s0, %vm3, 2 +; CHECK-NEXT: st %s0, 16(, %s19) +; CHECK-NEXT: svm %s0, %vm3, 1 +; CHECK-NEXT: st %s0, 8(, %s19) +; CHECK-NEXT: svm %s0, %vm3, 0 +; CHECK-NEXT: st %s0, (, %s19) +; CHECK-NEXT: svm %s16, %vm3, 0 +; CHECK-NEXT: st %s16, 320(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 1 +; CHECK-NEXT: st %s16, 328(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 2 +; CHECK-NEXT: st %s16, 336(, %s17) +; CHECK-NEXT: svm %s16, %vm3, 3 +; CHECK-NEXT: st %s16, 344(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 0 +; CHECK-NEXT: st 
%s16, 352(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 1 +; CHECK-NEXT: st %s16, 360(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 2 +; CHECK-NEXT: st %s16, 368(, %s17) +; CHECK-NEXT: svm %s16, %vm2, 3 +; CHECK-NEXT: st %s16, 376(, %s17) +; CHECK-NEXT: ld %s19, 56(, %s9) # 8-byte Folded Reload +; CHECK-NEXT: ld %s18, 48(, %s9) # 8-byte Folded Reload +; CHECK-NEXT: or %s11, 0, %s9 +; CHECK-NEXT: ld %s17, 40(, %s11) +; CHECK-NEXT: ld %s10, 8(, %s11) +; CHECK-NEXT: ld %s9, (, %s11) +; CHECK-NEXT: b.l.t (, %s10) + %3 = alloca <512 x i1>, align 32 + call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %3) + %4 = alloca <512 x i1>, i64 %1, align 8 + tail call fastcc void @dummy() + tail call fastcc void @pass(i64 noundef %1) + store volatile <512 x i1> %0, ptr %4, align 64, !tbaa !3 + store volatile <512 x i1> %0, ptr %3, align 32, !tbaa !3 + call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %3) + ret void +} + +!2 = !{!"clang version 15.0.0 (git@kaz7.github.com:sx-aurora-dev/llvm-project.git 6c510cbf7e17baa380bf8a181c3b43145fd50980)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = !{!7, !7, i64 0} +!7 = !{!"long", !4, i64 0} +!8 = distinct !{!8, !9} +!9 = !{!"llvm.loop.mustprogress"} +!10 = distinct !{!10, !9} +!11 = distinct !{!11, !9} +!12 = distinct !{!12, !9}
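
A minimal sketch (ours, not part of the committed tests) of the pattern the functions above exercise: a mask value kept live across calls cannot stay in its VM register, so it is spilled with svm/st and reloaded with ld/lvm, as in the "# 64-byte Folded Spill" / "# 64-byte Folded Reload" sequences matched above. Using the same external @dummy/@pass helpers the tests declare, IR like the following, run through llc -mtriple=ve -mattr=+vpu, should reproduce that code (the function name is ours):

; Sketch only: keep a v512i1 mask live across two calls; the calls are
; assumed to clobber the VM registers, forcing a VM512 spill/reload.
define fastcc <512 x i1> @sketch_spill_vm512(ptr %p, i64 noundef %n) {
  %m = load volatile <512 x i1>, ptr %p, align 64
  tail call fastcc void @dummy()
  tail call fastcc void @pass(i64 noundef %n)
  ret <512 x i1> %m                 ; %m must survive both calls
}

declare fastcc void @dummy()
declare fastcc void @pass(i64 noundef)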